diff --git a/CMakeLists.txt b/CMakeLists.txt index 93b884dd9..17dced38c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ # 3. cmake .. # 4. make -j -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.12) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(ReadVersion) diff --git a/HISTORY.md b/HISTORY.md index d4fc843a2..a8bcdef13 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,28 @@ # Rocksdb Change Log > NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` +## 9.8.0 (10/25/2024) +### New Features +* All non-`block_cache` options in `BlockBasedTableOptions` are now mutable with `DB::SetOptions()`. See also Bug Fixes below. +* When using iterators with BlobDB, it is now possible to load large values on demand, i.e. only if they are actually needed by the application. This can save I/O in use cases where the values associated with certain keys are not needed. For more details, see the new read option `allow_unprepared_value` and the iterator API `PrepareValue` (a usage sketch follows below). +* Add a new file ingestion option `IngestExternalFileOptions::fill_cache` to support not adding blocks from ingested files into the block cache during file ingestion. +* The option `allow_unprepared_value` is now also supported for multi-column-family iterators (i.e. `CoalescingIterator` and `AttributeGroupIterator`). +* When a file with just one range deletion (standalone range deletion file) is ingested via bulk loading, it will be marked for compaction. During compaction, this type of file can be used to directly filter out input files that are not protected by any snapshot and are completely deleted by the standalone range deletion file. + ### Behavior Changes +* During file ingestion, level assignment for overlapping files is done in multiple batches, so that they can potentially be assigned to lower levels instead of always landing on L0. +* The OPTIONS file to be loaded by a remote worker is now preserved so that it does not get purged by the primary host. A technique similar to the one that protects new SST files from getting purged is used: `min_options_file_numbers_` is tracked the same way `pending_outputs_` is. +* Trim `readahead_size` during scans so that data blocks containing keys that are not in the same prefix as the seek key in `Seek()` are not prefetched when `ReadOptions::auto_readahead_size=true` (the default) and `ReadOptions::prefix_same_as_start = true`. +* Assigning levels for external files is done in the same way for universal compaction and leveled compaction. The old behavior tended to assign files to L0 while the new behavior assigns the files to the lowest level possible. + ### Bug Fixes +* Fix a longstanding race condition in SetOptions for `block_based_table_factory` options. The fix has some subtle behavior changes because the TableFactory is copied and replaced on a change with SetOptions, including requiring an `Iterator::Refresh()` for an existing Iterator to use the latest options. +* Fix undercounting of allocated memory in the compressed secondary cache due to looking at the compressed block size rather than the actual memory allocated, which could be larger due to internal fragmentation. +* `GetApproximateMemTableStats()` could return disastrously bad estimates 5-25% of the time. The function has been re-engineered to return much better estimates with similar CPU cost.
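As an editor's illustration of the `allow_unprepared_value` / `PrepareValue` API described in the entries above (not part of the patch; `WantValueFor` and `UseKeyValue` are hypothetical application hooks):

```cpp
// Minimal usage sketch, assuming an open rocksdb::DB* `db` with
// enable_blob_files=true. WantValueFor/UseKeyValue are hypothetical
// placeholders for application logic.
#include <memory>
#include "rocksdb/db.h"

bool WantValueFor(const rocksdb::Slice& key);                        // hypothetical
void UseKeyValue(const rocksdb::Slice& k, const rocksdb::Slice& v);  // hypothetical

rocksdb::Status ScanWithOnDemandValues(rocksdb::DB* db) {
  rocksdb::ReadOptions read_options;
  // Defer loading (potentially large) blob values until PrepareValue().
  read_options.allow_unprepared_value = true;

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(read_options));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // key() is valid here, but value() may still be empty/unprepared.
    if (!WantValueFor(it->key())) {
      continue;  // Value I/O is skipped entirely for this key.
    }
    // Load the value on demand; on failure the iterator is invalidated
    // and it->status() carries the error (cf. the IOError test below).
    if (!it->PrepareValue()) {
      break;
    }
    UseKeyValue(it->key(), it->value());
  }
  return it->status();
}
```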
+* Skip insertion of compressed blocks in the secondary cache if the `lowest_used_cache_tier` DB option is `kVolatileTier`. +* Fix an issue in level compaction where a small CF with small compaction debt can cause the DB to allow parallel compactions. (#13054) +* Several DB option settings could be lost through `GetOptionsFromString()`, possibly elsewhere as well. Affected options, now fixed: `background_close_inactive_wals`, `write_dbid_to_manifest`, `write_identity_file`, `prefix_seek_opt_in_only` + ## 9.7.0 (09/20/2024) ### New Features * Make Cache a customizable class that can be instantiated by the object registry. diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index 0ce29a695..e802c7759 100755 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from __future__ import absolute_import, division, print_function, unicode_literals try: from builtins import str @@ -132,7 +131,7 @@ def generate_targets(repo_path, deps_map): if len(sys.argv) >= 2: # Heuristically quote and canonicalize whitespace for inclusion # in how the file was generated. - extra_argv = " '{0}'".format(" ".join(sys.argv[1].split())) + extra_argv = " '{}'".format(" ".join(sys.argv[1].split())) TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv) @@ -213,7 +212,7 @@ def generate_targets(repo_path, deps_map): for src in src_mk.get("MICROBENCH_SOURCES", []): name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0] TARGETS.add_binary(name, [src], [], extra_bench_libs=True) - print("Extra dependencies:\n{0}".format(json.dumps(deps_map))) + print(f"Extra dependencies:\n{json.dumps(deps_map)}") # Dictionary test executable name -> relative source file path test_source_map = {} diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index f6e35593d..e62eaf958 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -1,5 +1,4 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from __future__ import absolute_import, division, print_function, unicode_literals try: from builtins import object, str diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 08f58628a..4e58d1210 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -1,5 +1,4 @@ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from __future__ import absolute_import, division, print_function, unicode_literals rocksdb_target_header_template = """# This file \100generated by: #$ python3 buckifier/buckify_rocksdb.py{extra_argv} diff --git a/buckifier/util.py b/buckifier/util.py index be197efd0..69bbc0329 100644 --- a/buckifier/util.py +++ b/buckifier/util.py @@ -2,7 +2,6 @@ """ This module keeps commonly used components. """ -from __future__ import absolute_import, division, print_function, unicode_literals try: from builtins import object diff --git a/build_tools/amalgamate.py b/build_tools/amalgamate.py index f79e9075e..d4fafbb20 100755 --- a/build_tools/amalgamate.py +++ b/build_tools/amalgamate.py @@ -25,7 +25,6 @@ # # The solution is to move the include out of the #ifdef.
-from __future__ import print_function import argparse import re @@ -62,7 +61,7 @@ def expand_include( included.add(include_path) with open(include_path) as f: - print('#line 1 "{}"'.format(include_path), file=source_out) + print(f'#line 1 "{include_path}"', file=source_out) process_file( f, include_path, source_out, header_out, include_paths, public_include_paths ) @@ -118,7 +117,7 @@ def process_file( ) if expanded: - print('#line {} "{}"'.format(line + 1, abs_path), file=source_out) + print(f'#line {line + 1} "{abs_path}"', file=source_out) elif text != "#pragma once\n": source_out.write(text) @@ -157,8 +156,8 @@ def main(): with open(filename) as f, open(args.source_out, "w") as source_out, open( args.header_out, "w" ) as header_out: - print('#line 1 "{}"'.format(filename), file=source_out) - print('#include "{}"'.format(header_out.name), file=source_out) + print(f'#line 1 "{filename}"', file=source_out) + print(f'#include "{header_out.name}"', file=source_out) process_file( f, abs_path, source_out, header_out, include_paths, public_include_paths ) diff --git a/build_tools/benchmark_log_tool.py b/build_tools/benchmark_log_tool.py index d1ad45911..116740d33 100755 --- a/build_tools/benchmark_log_tool.py +++ b/build_tools/benchmark_log_tool.py @@ -102,7 +102,7 @@ def conform_opensearch(row): class ResultParser: - def __init__(self, field="(\w|[+-:.%])+", intrafield="(\s)+", separator="\t"): + def __init__(self, field=r"(\w|[+-:.%])+", intrafield=r"(\s)+", separator="\t"): self.field = re.compile(field) self.intra = re.compile(intrafield) self.sep = re.compile(separator) @@ -159,7 +159,7 @@ def parse(self, lines): def load_report_from_tsv(filename: str): - file = open(filename, "r") + file = open(filename) contents = file.readlines() file.close() parser = ResultParser() diff --git a/build_tools/error_filter.py b/build_tools/error_filter.py index d9cb1099c..b610f7c33 100644 --- a/build_tools/error_filter.py +++ b/build_tools/error_filter.py @@ -9,7 +9,6 @@ - Prints those error messages to stdout """ -from __future__ import absolute_import, division, print_function, unicode_literals import re import sys @@ -43,7 +42,7 @@ def parse_error(self, line): return None gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line) if gtest_fail_match: - return "%s failed: %s" % (self._last_gtest_name, gtest_fail_match.group(1)) + return "{} failed: {}".format(self._last_gtest_name, gtest_fail_match.group(1)) return None @@ -66,52 +65,52 @@ def __init__(self): # format (link error): # ':: error: ' # The below regex catches both - super(CompilerErrorParser, self).__init__(r"\S+:\d+: error:") + super().__init__(r"\S+:\d+: error:") class ScanBuildErrorParser(MatchErrorParser): def __init__(self): - super(ScanBuildErrorParser, self).__init__(r"scan-build: \d+ bugs found.$") + super().__init__(r"scan-build: \d+ bugs found.$") class DbCrashErrorParser(MatchErrorParser): def __init__(self): - super(DbCrashErrorParser, self).__init__(r"\*\*\*.*\^$|TEST FAILED.") + super().__init__(r"\*\*\*.*\^$|TEST FAILED.") class WriteStressErrorParser(MatchErrorParser): def __init__(self): - super(WriteStressErrorParser, self).__init__( + super().__init__( r"ERROR: write_stress died with exitcode=\d+" ) class AsanErrorParser(MatchErrorParser): def __init__(self): - super(AsanErrorParser, self).__init__(r"==\d+==ERROR: AddressSanitizer:") + super().__init__(r"==\d+==ERROR: AddressSanitizer:") class UbsanErrorParser(MatchErrorParser): def __init__(self): # format: '::: runtime error: ' - super(UbsanErrorParser, 
self).__init__(r"\S+:\d+:\d+: runtime error:") + super().__init__(r"\S+:\d+:\d+: runtime error:") class ValgrindErrorParser(MatchErrorParser): def __init__(self): # just grab the summary, valgrind doesn't clearly distinguish errors # from other log messages. - super(ValgrindErrorParser, self).__init__(r"==\d+== ERROR SUMMARY:") + super().__init__(r"==\d+== ERROR SUMMARY:") class CompatErrorParser(MatchErrorParser): def __init__(self): - super(CompatErrorParser, self).__init__(r"==== .*[Ee]rror.* ====$") + super().__init__(r"==== .*[Ee]rror.* ====$") class TsanErrorParser(MatchErrorParser): def __init__(self): - super(TsanErrorParser, self).__init__(r"WARNING: ThreadSanitizer:") + super().__init__(r"WARNING: ThreadSanitizer:") _TEST_NAME_TO_PARSERS = { diff --git a/cache/cache.cc b/cache/cache.cc index cf5febb70..1f120d28e 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -133,7 +133,9 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, std::shared_ptr* result) { Status status; std::shared_ptr cache; - if (value.find("://") == std::string::npos) { + if (StartsWith(value, "null")) { + cache = nullptr; + } else if (value.find("://") == std::string::npos) { if (value.find('=') == std::string::npos) { cache = NewLRUCache(ParseSizeT(value)); } else { diff --git a/coverage/parse_gcov_output.py b/coverage/parse_gcov_output.py index b9788ec81..8a1056c51 100644 --- a/coverage/parse_gcov_output.py +++ b/coverage/parse_gcov_output.py @@ -1,7 +1,6 @@ #!/usr/bin/env python # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -from __future__ import print_function import optparse import re @@ -109,11 +108,11 @@ def report_coverage(): # Check if we need to display coverage info for interested files. if len(interested_files): - per_file_coverage = dict( - (fname, per_file_coverage[fname]) + per_file_coverage = { + fname: per_file_coverage[fname] for fname in interested_files if fname in per_file_coverage - ) + } # If we only interested in several files, it makes no sense to report # the total_coverage total_coverage = None diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index a3b285564..a2dc15a01 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -83,6 +83,8 @@ class ArenaWrappedDBIter : public Iterator { Status Refresh() override; Status Refresh(const Snapshot*) override; + bool PrepareValue() override { return db_iter_->PrepareValue(); } + void Init(Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, const MutableCFOptions& mutable_cf_options, const Version* version, diff --git a/db/attribute_group_iterator_impl.h b/db/attribute_group_iterator_impl.h index 3977fe428..c6dc2722a 100644 --- a/db/attribute_group_iterator_impl.h +++ b/db/attribute_group_iterator_impl.h @@ -13,14 +13,11 @@ namespace ROCKSDB_NAMESPACE { class AttributeGroupIteratorImpl : public AttributeGroupIterator { public: AttributeGroupIteratorImpl( - const Comparator* comparator, + const Comparator* comparator, bool allow_unprepared_value, const std::vector& column_families, const std::vector& child_iterators) - : impl_( - comparator, column_families, child_iterators, [this]() { Reset(); }, - [this](const autovector& items) { - AddToAttributeGroups(items); - }) {} + : impl_(comparator, allow_unprepared_value, column_families, + child_iterators, ResetFunc(this), PopulateFunc(this)) {} ~AttributeGroupIteratorImpl() override {} // No copy allowed @@ -45,8 +42,36 @@ class AttributeGroupIteratorImpl : public AttributeGroupIterator { 
void Reset() { attribute_groups_.clear(); } + bool PrepareValue() override { return impl_.PrepareValue(); } + private: - MultiCfIteratorImpl impl_; + class ResetFunc { + public: + explicit ResetFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {} + + void operator()() const { + assert(iter_); + iter_->Reset(); + } + + private: + AttributeGroupIteratorImpl* iter_; + }; + + class PopulateFunc { + public: + explicit PopulateFunc(AttributeGroupIteratorImpl* iter) : iter_(iter) {} + + void operator()(const autovector& items) const { + assert(iter_); + iter_->AddToAttributeGroups(items); + } + + private: + AttributeGroupIteratorImpl* iter_; + }; + + MultiCfIteratorImpl impl_; IteratorAttributeGroups attribute_groups_; void AddToAttributeGroups(const autovector& items); }; diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index b524982e5..8eef7328d 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -20,23 +20,24 @@ namespace ROCKSDB_NAMESPACE { -BlobSource::BlobSource(const ImmutableOptions* immutable_options, +BlobSource::BlobSource(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options, const std::string& db_id, const std::string& db_session_id, BlobFileCache* blob_file_cache) : db_id_(db_id), db_session_id_(db_session_id), - statistics_(immutable_options->statistics.get()), + statistics_(immutable_options.statistics.get()), blob_file_cache_(blob_file_cache), - blob_cache_(immutable_options->blob_cache), - lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) { + blob_cache_(immutable_options.blob_cache), + lowest_used_cache_tier_(immutable_options.lowest_used_cache_tier) { auto bbto = - immutable_options->table_factory->GetOptions(); + mutable_cf_options.table_factory->GetOptions(); if (bbto && bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache) .charged == CacheEntryRoleOptions::Decision::kEnabled) { blob_cache_ = SharedCacheInterface{std::make_shared( - immutable_options->blob_cache, bbto->block_cache)}; + immutable_options.blob_cache, bbto->block_cache)}; } } diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index d5e009b54..6811d3e41 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -21,6 +21,7 @@ namespace ROCKSDB_NAMESPACE { struct ImmutableOptions; +struct MutableCFOptions; class Status; class FilePrefetchBuffer; class Slice; @@ -31,7 +32,10 @@ class Slice; // storage with minimal cost. class BlobSource { public: - BlobSource(const ImmutableOptions* immutable_options, + // NOTE: db_id, db_session_id, and blob_file_cache are saved by reference or + // pointer. 
+ BlobSource(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options, const std::string& db_id, const std::string& db_session_id, BlobFileCache* blob_file_cache); diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index a12c210fc..d0e9def7d 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -148,6 +148,7 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -193,8 +194,8 @@ TEST_F(BlobSourceTest, GetBlobsFromCache) { backing_cache.get(), &immutable_options, &file_options, column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ReadOptions read_options; read_options.verify_checksums = true; @@ -464,6 +465,7 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -498,8 +500,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { backing_cache.get(), &immutable_options, &file_options, column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ReadOptions read_options; read_options.verify_checksums = true; @@ -589,6 +591,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -644,8 +647,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) { backing_cache.get(), &immutable_options, &file_options, column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ReadOptions read_options; read_options.verify_checksums = true; @@ -782,6 +785,7 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ -827,8 +831,8 @@ TEST_F(BlobSourceTest, MultiGetBlobsFromCache) { backing_cache.get(), &immutable_options, &file_options, column_family_id, blob_file_read_hist, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ReadOptions read_options; read_options.verify_checksums = true; @@ -1105,6 +1109,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr uint32_t column_family_id = 1; constexpr bool has_ttl = false; @@ 
-1137,8 +1142,8 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { backing_cache.get(), &immutable_options, &file_options, column_family_id, blob_file_read_hist, nullptr /*IOTracer*/)); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); CacheHandleGuard file_reader; ReadOptions read_options; @@ -1405,6 +1410,7 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); constexpr ExpirationRange expiration_range; @@ -1426,8 +1432,8 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { backing_cache.get(), &immutable_options, &file_options, kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ConcurrentCacheReservationManager* cache_res_mgr = static_cast(blob_source.GetBlobCache()) @@ -1519,6 +1525,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); + MutableCFOptions mutable_cf_options(options_); + constexpr size_t blob_size = 24 << 10; // 24KB for (size_t i = 0; i < kNumBlobs; ++i) { blob_file_size_ -= blobs_[i].size(); // old blob size @@ -1546,8 +1554,8 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) { backing_cache.get(), &immutable_options, &file_options, kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/); - BlobSource blob_source(&immutable_options, db_id_, db_session_id_, - blob_file_cache.get()); + BlobSource blob_source(immutable_options, mutable_cf_options, db_id_, + db_session_id_, blob_file_cache.get()); ConcurrentCacheReservationManager* cache_res_mgr = static_cast(blob_source.GetBlobCache()) diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index 0e17df7aa..49f454ed2 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -374,6 +374,115 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) { } } +TEST_F(DBBlobBasicTest, IterateBlobsAllowUnpreparedValue) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + Reopen(options); + + constexpr size_t num_blobs = 5; + std::vector keys; + std::vector blobs; + + for (size_t i = 0; i < num_blobs; ++i) { + keys.emplace_back("key" + std::to_string(i)); + blobs.emplace_back("blob" + std::to_string(i)); + ASSERT_OK(Put(keys[i], blobs[i])); + } + + ASSERT_OK(Flush()); + + ReadOptions read_options; + read_options.allow_unprepared_value = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + + { + size_t i = 0; + + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_TRUE(iter->value().empty()); + ASSERT_OK(iter->status()); + + ASSERT_TRUE(iter->PrepareValue()); + + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_EQ(iter->value(), blobs[i]); + ASSERT_OK(iter->status()); + + ++i; + } + + ASSERT_OK(iter->status()); + ASSERT_EQ(i, num_blobs); + } + + { + size_t i = 0; + + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]); + ASSERT_TRUE(iter->value().empty()); + ASSERT_OK(iter->status()); + + 
ASSERT_TRUE(iter->PrepareValue()); + + ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]); + ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]); + ASSERT_OK(iter->status()); + + ++i; + } + + ASSERT_OK(iter->status()); + ASSERT_EQ(i, num_blobs); + } + + { + size_t i = 1; + + for (iter->Seek(keys[i]); iter->Valid(); iter->Next()) { + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_TRUE(iter->value().empty()); + ASSERT_OK(iter->status()); + + ASSERT_TRUE(iter->PrepareValue()); + + ASSERT_EQ(iter->key(), keys[i]); + ASSERT_EQ(iter->value(), blobs[i]); + ASSERT_OK(iter->status()); + + ++i; + } + + ASSERT_OK(iter->status()); + ASSERT_EQ(i, num_blobs); + } + + { + size_t i = 1; + + for (iter->SeekForPrev(keys[num_blobs - 1 - i]); iter->Valid(); + iter->Prev()) { + ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]); + ASSERT_TRUE(iter->value().empty()); + ASSERT_OK(iter->status()); + + ASSERT_TRUE(iter->PrepareValue()); + + ASSERT_EQ(iter->key(), keys[num_blobs - 1 - i]); + ASSERT_EQ(iter->value(), blobs[num_blobs - 1 - i]); + ASSERT_OK(iter->status()); + + ++i; + } + + ASSERT_OK(iter->status()); + ASSERT_EQ(i, num_blobs); + } +} + TEST_F(DBBlobBasicTest, MultiGetBlobs) { constexpr size_t min_blob_size = 6; @@ -1655,6 +1764,46 @@ TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) { SyncPoint::GetInstance()->ClearAllCallBacks(); } +TEST_P(DBBlobBasicIOErrorTest, IterateBlobsAllowUnpreparedValue_IOError) { + Options options; + options.env = fault_injection_env_.get(); + options.enable_blob_files = true; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(Put(key, blob_value)); + + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) { + fault_injection_env_->SetFilesystemActive(false, + Status::IOError(sync_point_)); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions read_options; + read_options.allow_unprepared_value = true; + + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->SeekToFirst(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), key); + ASSERT_TRUE(iter->value().empty()); + ASSERT_OK(iter->status()); + + ASSERT_FALSE(iter->PrepareValue()); + + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsIOError()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) { Options options = GetDefaultOptions(); diff --git a/db/builder.cc b/db/builder.cc index 5bc75b060..8c2c624b0 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -53,7 +53,7 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, assert((tboptions.column_family_id == TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) == tboptions.column_family_name.empty()); - return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file); + return tboptions.moptions.table_factory->NewTableBuilder(tboptions, file); } Status BuildTable( @@ -206,10 +206,6 @@ Status BuildTable( /*compaction=*/nullptr, compaction_filter.get(), /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); - const size_t ts_sz = ucmp->timestamp_size(); - const bool logical_strip_timestamp = - ts_sz > 0 && !ioptions.persist_user_defined_timestamps; - SequenceNumber smallest_preferred_seqno = kMaxSequenceNumber; std::string key_after_flush_buf; std::string value_buf; @@ -222,16 +218,6 @@ Status BuildTable( Slice key_after_flush = key_after_flush_buf; Slice 
value_after_flush = value; - // If user defined timestamps will be stripped from user key after flush, - // the in memory version of the key act logically the same as one with a - // minimum timestamp. We update the timestamp here so file boundary and - // output validator, block builder all see the effect of the stripping. - if (logical_strip_timestamp) { - key_after_flush_buf.clear(); - ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz); - key_after_flush = key_after_flush_buf; - } - if (ikey.type == kTypeValuePreferredSeqno) { auto [unpacked_value, unix_write_time] = ParsePackedValueWithWriteTime(value); @@ -291,11 +277,7 @@ Status BuildTable( Slice last_tombstone_start_user_key{}; for (range_del_it->SeekToFirst(); range_del_it->Valid(); range_del_it->Next()) { - // When user timestamp should not be persisted, we logically strip a - // range tombstone's start and end key's timestamp (replace it with min - // timestamp) before passing them along to table builder and to update - // file boundaries. - auto tombstone = range_del_it->Tombstone(logical_strip_timestamp); + auto tombstone = range_del_it->Tombstone(); std::pair kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); InternalKey tombstone_end = tombstone.SerializeEndKey(); @@ -438,8 +420,7 @@ Status BuildTable( // the goal is to cache it here for further user reads. std::unique_ptr it(table_cache->NewIterator( tboptions.read_options, file_options, tboptions.internal_comparator, - *meta, nullptr /* range_del_agg */, - mutable_cf_options.prefix_extractor, nullptr, + *meta, nullptr /* range_del_agg */, mutable_cf_options, nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), TableReaderCaller::kFlush, /*arena=*/nullptr, @@ -447,8 +428,7 @@ Status BuildTable( MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, - /*allow_unprepared_value*/ false, - mutable_cf_options.block_protection_bytes_per_key)); + /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, diff --git a/db/coalescing_iterator.h b/db/coalescing_iterator.h index a4d156a6d..c4a1c831e 100644 --- a/db/coalescing_iterator.h +++ b/db/coalescing_iterator.h @@ -12,14 +12,11 @@ namespace ROCKSDB_NAMESPACE { // EXPERIMENTAL class CoalescingIterator : public Iterator { public: - CoalescingIterator(const Comparator* comparator, + CoalescingIterator(const Comparator* comparator, bool allow_unprepared_value, const std::vector& column_families, const std::vector& child_iterators) - : impl_( - comparator, column_families, child_iterators, [this]() { Reset(); }, - [this](const autovector& items) { - Coalesce(items); - }) {} + : impl_(comparator, allow_unprepared_value, column_families, + child_iterators, ResetFunc(this), PopulateFunc(this)) {} ~CoalescingIterator() override {} // No copy allowed @@ -50,8 +47,36 @@ class CoalescingIterator : public Iterator { wide_columns_.clear(); } + bool PrepareValue() override { return impl_.PrepareValue(); } + private: - MultiCfIteratorImpl impl_; + class ResetFunc { + public: + explicit ResetFunc(CoalescingIterator* iter) : iter_(iter) {} + + void operator()() const { + assert(iter_); + iter_->Reset(); + } + + private: + CoalescingIterator* iter_; + }; + + class PopulateFunc { + public: + explicit PopulateFunc(CoalescingIterator* iter) : iter_(iter) {} + + void operator()(const autovector& items) const { + assert(iter_); + 
iter_->Coalesce(items); + } + + private: + CoalescingIterator* iter_; + }; + + MultiCfIteratorImpl impl_; Slice value_; WideColumns wide_columns_; diff --git a/db/column_family.cc b/db/column_family.cc index 8abad2941..f8b6a55a5 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -595,8 +595,8 @@ ColumnFamilyData::ColumnFamilyData( blob_file_cache_.reset( new BlobFileCache(_table_cache, ioptions(), soptions(), id_, internal_stats_->GetBlobFileReadHist(), io_tracer)); - blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id, - blob_file_cache_.get())); + blob_source_.reset(new BlobSource(ioptions_, mutable_cf_options_, db_id, + db_session_id, blob_file_cache_.get())); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( @@ -901,7 +901,11 @@ uint64_t GetPendingCompactionBytesForCompactionSpeedup( return slowdown_threshold; } - uint64_t size_threshold = bottommost_files_size / kBottommostSizeDivisor; + // Prevent a small CF from triggering parallel compactions for other CFs. + // Require compaction debt to be more than a full L0 to Lbase compaction. + const uint64_t kMinDebtSize = 2 * mutable_cf_options.max_bytes_for_level_base; + uint64_t size_threshold = + std::max(bottommost_files_size / kBottommostSizeDivisor, kMinDebtSize); return std::min(size_threshold, slowdown_threshold); } @@ -1172,10 +1176,12 @@ bool ColumnFamilyData::NeedsCompaction() const { Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, - const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { + const MutableDBOptions& mutable_db_options, + const std::vector& existing_snapshots, + const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer) { auto* result = compaction_picker_->PickCompaction( - GetName(), mutable_options, mutable_db_options, current_->storage_info(), - log_buffer); + GetName(), mutable_options, mutable_db_options, existing_snapshots, + snapshot_checker, current_->storage_info(), log_buffer); if (result != nullptr) { result->FinalizeInputInfo(current_); } diff --git a/db/column_family.h b/db/column_family.h index e4b7adde8..ff038d8df 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -16,6 +16,7 @@ #include "cache/cache_reservation_manager.h" #include "db/memtable_list.h" +#include "db/snapshot_checker.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" #include "db/write_batch_internal.h" @@ -385,9 +386,9 @@ class ColumnFamilyData { uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held + // REQUIRE: DB mutex held void SetMemtable(MemTable* new_mem) { - uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1; - new_mem->SetID(memtable_id); + new_mem->SetID(++last_memtable_id_); mem_ = new_mem; } @@ -407,9 +408,11 @@ class ColumnFamilyData { // REQUIRES: DB mutex held bool NeedsCompaction() const; // REQUIRES: DB mutex held - Compaction* PickCompaction(const MutableCFOptions& mutable_options, - const MutableDBOptions& mutable_db_options, - LogBuffer* log_buffer); + Compaction* PickCompaction( + const MutableCFOptions& mutable_options, + const MutableDBOptions& mutable_db_options, + const std::vector& existing_snapshots, + const SnapshotChecker* snapshot_checker, LogBuffer* log_buffer); // Check if the passed range overlap with any running compactions. 
// REQUIRES: DB mutex held @@ -669,7 +672,7 @@ class ColumnFamilyData { bool allow_2pc_; // Memtable id to track flush. - std::atomic last_memtable_id_; + uint64_t last_memtable_id_; // Directories corresponding to cf_paths. std::vector> data_dirs_; diff --git a/db/column_family_test.cc b/db/column_family_test.cc index d7751992b..29ff2d15a 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -3012,19 +3012,25 @@ TEST_P(ColumnFamilyTest, CompactionSpeedupForCompactionDebt) { ASSERT_OK(db_->Flush(FlushOptions())); { - // 1MB debt is way bigger than bottommost data so definitely triggers - // speedup. VersionStorageInfo* vstorage = cfd->current()->storage_info(); - vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */, - dbmu); - RecalculateWriteStallConditions(cfd, mutable_cf_options); - ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); - // Eight bytes is way smaller than bottommost data so definitely does not // trigger speedup. vstorage->TEST_set_estimated_compaction_needed_bytes(8, dbmu); RecalculateWriteStallConditions(cfd, mutable_cf_options); ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + // 1MB is much larger than bottommost level size. However, since it's too + // small in terms of absolute size, it does not trigger parallel compaction + // in this case (see GetPendingCompactionBytesForCompactionSpeedup()). + vstorage->TEST_set_estimated_compaction_needed_bytes(1048576 /* 1MB */, + dbmu); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes( + 2 * mutable_cf_options.max_bytes_for_level_base, dbmu); + RecalculateWriteStallConditions(cfd, mutable_cf_options); + ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed()); } } @@ -3870,6 +3876,91 @@ TEST_F(ManualFlushSkipRetainUDTTest, ManualFlush) { Close(); } +TEST_F(ManualFlushSkipRetainUDTTest, FlushRemovesStaleEntries) { + column_family_options_.max_write_buffer_number = 4; + Open(); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0))); + + ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); + ColumnFamilyData* cfd = + static_cast_with_check(cfh)->cfd(); + for (int version = 0; version < 100; version++) { + if (version == 50) { + ASSERT_OK(static_cast_with_check(db_)->TEST_SwitchMemtable(cfd)); + } + ASSERT_OK( + Put(0, "foo", EncodeAsUint64(version), "v" + std::to_string(version))); + } + + ASSERT_OK(Flush(0)); + TablePropertiesCollection tables_properties; + ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties)); + ASSERT_EQ(1, tables_properties.size()); + std::shared_ptr table_properties = + tables_properties.begin()->second; + ASSERT_EQ(1, table_properties->num_entries); + ASSERT_EQ(0, table_properties->num_deletions); + ASSERT_EQ(0, table_properties->num_range_deletions); + CheckEffectiveCutoffTime(100); + CheckAutomaticFlushRetainUDT(101); + + Close(); +} + +TEST_F(ManualFlushSkipRetainUDTTest, RangeDeletionFlushRemovesStaleEntries) { + column_family_options_.max_write_buffer_number = 4; + Open(); + // TODO(yuzhangyu): a non 0 full history ts low is needed for this garbage + // collection to kick in. This doesn't work well for the very first flush of + // the column family. Not a big issue, but would be nice to improve this. 
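As a side note on the parallel-compaction fix above (`kMinDebtSize` in `GetPendingCompactionBytesForCompactionSpeedup` and the updated `CompactionSpeedupForCompactionDebt` test): a standalone restatement of the new gate, with the divisor constant's value assumed for illustration:

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of the new speedup gate; not the actual RocksDB function, and
// kBottommostSizeDivisor's real value is an assumption here.
uint64_t SpeedupDebtThreshold(uint64_t bottommost_files_size,
                              uint64_t max_bytes_for_level_base,
                              uint64_t slowdown_threshold) {
  constexpr uint64_t kBottommostSizeDivisor = 8;  // assumed
  // Require debt to exceed a full L0->Lbase compaction so that a small CF
  // cannot trigger parallel compactions for the whole DB (#13054).
  const uint64_t kMinDebtSize = 2 * max_bytes_for_level_base;
  const uint64_t size_threshold =
      std::max(bottommost_files_size / kBottommostSizeDivisor, kMinDebtSize);
  return std::min(size_threshold, slowdown_threshold);
}
// With the default max_bytes_for_level_base (256MB), compaction debt must
// exceed 512MB before speedup kicks in, which is why the 1MB debt in the
// test above no longer allows 6 background compactions.
```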
+ ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(9))); + + for (int i = 10; i < 100; i++) { + ASSERT_OK(Put(0, "foo" + std::to_string(i), EncodeAsUint64(i), + "val" + std::to_string(i))); + if (i % 2 == 1) { + ASSERT_OK(db_->DeleteRange(WriteOptions(), "foo" + std::to_string(i - 1), + "foo" + std::to_string(i), EncodeAsUint64(i))); + } + } + + ASSERT_OK(Flush(0)); + CheckEffectiveCutoffTime(100); + std::string read_ts = EncodeAsUint64(100); + std::string min_ts = EncodeAsUint64(0); + ReadOptions ropts; + Slice read_ts_slice = read_ts; + std::string value; + ropts.timestamp = &read_ts_slice; + { + Iterator* iter = db_->NewIterator(ropts); + iter->SeekToFirst(); + int i = 11; + while (iter->Valid()) { + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("foo" + std::to_string(i), iter->key()); + ASSERT_EQ("val" + std::to_string(i), iter->value()); + ASSERT_EQ(min_ts, iter->timestamp()); + iter->Next(); + i += 2; + } + ASSERT_OK(iter->status()); + delete iter; + } + TablePropertiesCollection tables_properties; + ASSERT_OK(db_->GetPropertiesOfAllTables(&tables_properties)); + ASSERT_EQ(1, tables_properties.size()); + std::shared_ptr table_properties = + tables_properties.begin()->second; + // 45 point data + 45 range deletions. 45 obsolete point data are garbage + // collected. + ASSERT_EQ(90, table_properties->num_entries); + ASSERT_EQ(45, table_properties->num_deletions); + ASSERT_EQ(45, table_properties->num_range_deletions); + + Close(); +} + TEST_F(ManualFlushSkipRetainUDTTest, ManualCompaction) { Open(); ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], EncodeAsUint64(0))); diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index fc76f93f1..035195f9d 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -283,9 +283,10 @@ Compaction::Compaction( uint32_t _output_path_id, CompressionType _compression, CompressionOptions _compression_opts, Temperature _output_temperature, uint32_t _max_subcompactions, std::vector _grandparents, - bool _manual_compaction, const std::string& _trim_ts, double _score, - bool _deletion_compaction, bool l0_files_might_overlap, - CompactionReason _compaction_reason, + std::optional _earliest_snapshot, + const SnapshotChecker* _snapshot_checker, bool _manual_compaction, + const std::string& _trim_ts, double _score, bool _deletion_compaction, + bool l0_files_might_overlap, CompactionReason _compaction_reason, BlobGarbageCollectionPolicy _blob_garbage_collection_policy, double _blob_garbage_collection_age_cutoff) : input_vstorage_(vstorage), @@ -307,6 +308,8 @@ Compaction::Compaction( l0_files_might_overlap_(l0_files_might_overlap), inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))), grandparents_(std::move(_grandparents)), + earliest_snapshot_(_earliest_snapshot), + snapshot_checker_(_snapshot_checker), score_(_score), bottommost_level_( // For simplicity, we don't support the concept of "bottommost level" @@ -367,9 +370,13 @@ Compaction::Compaction( // setup input_levels_ { input_levels_.resize(num_input_levels()); - for (size_t which = 0; which < num_input_levels(); which++) { - DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, - &arena_); + if (earliest_snapshot_.has_value()) { + FilterInputsForCompactionIterator(); + } else { + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); + } } } @@ -745,8 +752,10 @@ void Compaction::ResetNextCompactionIndex() { } namespace { -int 
InputSummary(const std::vector& files, char* output, +int InputSummary(const std::vector& files, + const std::vector& files_filtered, char* output, int len) { + assert(files_filtered.empty() || (files.size() == files_filtered.size())); *output = '\0'; int write = 0; for (size_t i = 0; i < files.size(); i++) { @@ -754,8 +763,14 @@ int InputSummary(const std::vector& files, char* output, int ret; char sztxt[16]; AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); - ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", - files.at(i)->fd.GetNumber(), sztxt); + if (files_filtered.empty()) { + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", + files.at(i)->fd.GetNumber(), sztxt); + } else { + ret = snprintf(output + write, sz, "%" PRIu64 "(%s filtered:%s) ", + files.at(i)->fd.GetNumber(), sztxt, + files_filtered.at(i) ? "true" : "false"); + } if (ret < 0 || ret >= sz) { break; } @@ -781,8 +796,15 @@ void Compaction::Summary(char* output, int len) { return; } } - write += - InputSummary(inputs_[level_iter].files, output + write, len - write); + + assert(non_start_level_input_files_filtered_.empty() || + non_start_level_input_files_filtered_.size() == inputs_.size() - 1); + write += InputSummary( + inputs_[level_iter].files, + (level_iter == 0 || non_start_level_input_files_filtered_.empty()) + ? std::vector{} + : non_start_level_input_files_filtered_[level_iter - 1], + output + write, len - write); if (write < 0 || write >= len) { return; } @@ -865,7 +887,7 @@ bool Compaction::ShouldFormSubcompactions() const { return false; } - if (cfd_->ioptions()->table_factory->Name() == + if (mutable_cf_options_.table_factory->Name() == TableFactory::kPlainTableName()) { return false; } @@ -991,4 +1013,69 @@ int Compaction::EvaluatePenultimateLevel( return penultimate_level; } +void Compaction::FilterInputsForCompactionIterator() { + assert(earliest_snapshot_.has_value()); + // cfd_ is not populated at Compaction construction time, so get it from + // VersionStorageInfo instead. + assert(input_vstorage_); + const auto* ucmp = input_vstorage_->user_comparator(); + assert(ucmp); + // Simply comparing file boundaries is not as safe when user-defined + // timestamps are enabled because we would also need to compare timestamps + // to know for sure, although entries with higher timestamps are supposed + // to have higher sequence numbers for the same user key (without + // timestamp). + assert(ucmp->timestamp_size() == 0); + size_t num_input_levels = inputs_.size(); + // TODO(yuzhangyu): filtering of older L0 file by new L0 file is not + // supported yet. + FileMetaData* rangedel_candidate = inputs_[0].level == 0 + ?
inputs_[0].files.back() + : inputs_[0].files.front(); + assert(rangedel_candidate); + if (!rangedel_candidate->FileIsStandAloneRangeTombstone() || + !DataIsDefinitelyInSnapshot(rangedel_candidate->fd.smallest_seqno, + earliest_snapshot_.value(), + snapshot_checker_)) { + for (size_t level = 0; level < num_input_levels; level++) { + DoGenerateLevelFilesBrief(&input_levels_[level], inputs_[level].files, + &arena_); + } + return; + } + + Slice rangedel_start_ukey = rangedel_candidate->smallest.user_key(); + Slice rangedel_end_ukey = rangedel_candidate->largest.user_key(); + SequenceNumber rangedel_seqno = rangedel_candidate->fd.smallest_seqno; + + std::vector> non_start_level_input_files; + non_start_level_input_files.reserve(num_input_levels - 1); + non_start_level_input_files_filtered_.reserve(num_input_levels - 1); + for (size_t level = 1; level < num_input_levels; level++) { + non_start_level_input_files.emplace_back(); + non_start_level_input_files_filtered_.emplace_back(); + for (FileMetaData* file : inputs_[level].files) { + non_start_level_input_files_filtered_.back().push_back(false); + // When range data and point data have the same sequence number, point + // data wins. The range deletion end key is exclusive, so check that it + // is bigger than the file's right boundary user key. + if (rangedel_seqno > file->fd.largest_seqno && + ucmp->CompareWithoutTimestamp(rangedel_start_ukey, + file->smallest.user_key()) <= 0 && + ucmp->CompareWithoutTimestamp(rangedel_end_ukey, + file->largest.user_key()) > 0) { + non_start_level_input_files_filtered_.back().back() = true; + } else { + non_start_level_input_files.back().push_back(file); + } + } + } + + DoGenerateLevelFilesBrief(&input_levels_[0], inputs_[0].files, &arena_); + assert(non_start_level_input_files.size() == num_input_levels - 1); + for (size_t level = 1; level < num_input_levels; level++) { + DoGenerateLevelFilesBrief(&input_levels_[level], + non_start_level_input_files[level - 1], &arena_); + } +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 633e68a9e..d5e8f06c6 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -8,6 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once + +#include "db/snapshot_checker.h" #include "db/version_set.h" #include "memory/arena.h" #include "options/cf_options.h" @@ -90,6 +92,8 @@ class Compaction { CompressionOptions compression_opts, Temperature output_temperature, uint32_t max_subcompactions, std::vector grandparents, + std::optional earliest_snapshot, + const SnapshotChecker* snapshot_checker, bool manual_compaction = false, const std::string& trim_ts = "", double score = -1, bool deletion_compaction = false, bool l0_files_might_overlap = true, @@ -460,6 +464,13 @@ class Compaction { // `Compaction::WithinPenultimateLevelOutputRange()`. void PopulatePenultimateLevelOutputRange(); + // If the oldest snapshot is specified at Compaction construction time, we + // have an opportunity to optimize the inputs for the compaction iterator: + // when a standalone range deletion file on the start level is recognized + // and can be determined to completely shadow some input files on a + // non-start level, those files will be filtered out and not fed to the + // compaction iterator. + void FilterInputsForCompactionIterator(); + // Get the atomic file boundaries for all files in the compaction.
Necessary // in order to avoid the scenario described in // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and @@ -510,12 +521,27 @@ class Compaction { // Compaction input files organized by level. Constant after construction const std::vector inputs_; - // A copy of inputs_, organized more closely in memory + // All files from inputs_ that are not filtered and will be fed to compaction + // iterator, organized more closely in memory. autovector input_levels_; // State used to check for number of overlapping grandparent files // (grandparent == "output_level_ + 1") std::vector grandparents_; + + // The earliest snapshot and snapshot checker at compaction picking time. + // These fields are only set for deletion-triggered compactions picked in + // universal compaction, and only when user-defined timestamps are not + // enabled. They are used to possibly filter out some non-start-level input + // files. + std::optional earliest_snapshot_; + const SnapshotChecker* snapshot_checker_; + + // Markers for which non-start-level input files are filtered out, if + // applicable. Only applicable if earliest_snapshot_ is provided and the + // input start level has a standalone range deletion file. + std::vector> non_start_level_input_files_filtered_; + + // bool standalone_range_tombstones_used_for_filtering_inputs_; const double score_; // score that was used to pick this compaction. // Is this compaction creating a file in the bottom most level? diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index 39def1ebc..b9b4766c4 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -540,18 +540,12 @@ class CompactionIterator { inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot) { - return ((seq) <= (snapshot) && - (snapshot_checker_ == nullptr || - LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == - SnapshotCheckerResult::kInSnapshot))); + return DataIsDefinitelyInSnapshot(seq, snapshot, snapshot_checker_); } inline bool CompactionIterator::DefinitelyNotInSnapshot( SequenceNumber seq, SequenceNumber snapshot) { - return ((seq) > (snapshot) || - (snapshot_checker_ != nullptr && - UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) == - SnapshotCheckerResult::kNotInSnapshot))); + return DataIsDefinitelyNotInSnapshot(seq, snapshot, snapshot_checker_); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index b4b4eeace..7aa2bb217 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -469,7 +469,7 @@ void CompactionJob::GenSubcompactionBoundaries() { ReadOptions read_options(Env::IOActivity::kCompaction); read_options.rate_limiter_priority = GetRateLimiterPriority(); auto* c = compact_->compaction; - if (c->immutable_options()->table_factory->Name() == + if (c->mutable_cf_options()->table_factory->Name() == TableFactory::kPlainTableName()) { return; } @@ -506,9 +506,7 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - read_options, icomp, *f, - c->mutable_cf_options()->block_protection_bytes_per_key, - my_anchors); + read_options, icomp, *f, *c->mutable_cf_options(), my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -711,8 +709,6 @@ Status CompactionJob::Run() { }
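Restating the per-file check at the heart of `FilterInputsForCompactionIterator` above as an isolated predicate (a hypothetical helper with the same semantics, not the actual RocksDB code):

```cpp
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"
#include "rocksdb/types.h"

// A non-start-level input file may be dropped from the compaction
// iterator's inputs when the standalone range tombstone is newer than
// everything in the file and its [start, end) user key range covers the
// whole file. Hypothetical restatement; parameter names are illustrative.
bool ShadowedByStandaloneTombstone(const rocksdb::Comparator* ucmp,
                                   const rocksdb::Slice& del_start_ukey,
                                   const rocksdb::Slice& del_end_ukey,
                                   rocksdb::SequenceNumber del_seqno,
                                   const rocksdb::Slice& file_smallest_ukey,
                                   const rocksdb::Slice& file_largest_ukey,
                                   rocksdb::SequenceNumber file_largest_seqno) {
  // Point data wins a seqno tie with range data, hence strict inequality.
  return del_seqno > file_largest_seqno &&
         ucmp->CompareWithoutTimestamp(del_start_ukey, file_smallest_ukey) <=
             0 &&
         // The tombstone end key is exclusive, so it must be strictly
         // greater than the file's largest user key.
         ucmp->CompareWithoutTimestamp(del_end_ukey, file_largest_ukey) > 0;
}
```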
} ColumnFamilyData* cfd = compact_->compaction->column_family_data(); - auto& prefix_extractor = - compact_->compaction->mutable_cf_options()->prefix_extractor; std::atomic next_file_idx(0); auto verify_table = [&](Status& output_status) { while (true) { @@ -733,7 +729,8 @@ Status CompactionJob::Run() { InternalIterator* iter = cfd->table_cache()->NewIterator( verify_table_read_options, file_options_, cfd->internal_comparator(), files_output[file_idx]->meta, - /*range_del_agg=*/nullptr, prefix_extractor, + /*range_del_agg=*/nullptr, + *compact_->compaction->mutable_cf_options(), /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -743,9 +740,7 @@ Status CompactionJob::Run() { *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false, - compact_->compaction->mutable_cf_options() - ->block_protection_bytes_per_key); + /*allow_unprepared_value=*/false); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { @@ -806,6 +801,12 @@ Status CompactionJob::Run() { } } + // Before the compaction starts, is_remote_compaction was set to true if + // compaction_service is set. We now know whether each sub_compaction was + // done remotely or not. Reset is_remote_compaction back to false and allow + // AggregateCompactionStats() to set the right value. + compaction_job_stats_->is_remote_compaction = false; + // Finish up all bookkeeping to unify the subcompaction results. compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); uint64_t num_input_range_del = 0; @@ -1084,6 +1085,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } // fallback to local compaction assert(comp_status == CompactionServiceJobStatus::kUseLocal); + sub_compact->compaction_job_stats.is_remote_compaction = false; } uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); @@ -2001,10 +2003,12 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { bool has_error = false; const ReadOptions read_options(Env::IOActivity::kCompaction); const auto& input_table_properties = compaction->GetInputTableProperties(); + // TODO(yuzhangyu): add dedicated stats for filtered files. 
for (int input_level = 0; input_level < static_cast(compaction->num_input_levels()); ++input_level) { - size_t num_input_files = compaction->num_input_files(input_level); + const LevelFilesBrief* flevel = compaction->input_levels(input_level); + size_t num_input_files = flevel->num_files; uint64_t* bytes_read; if (compaction->level(input_level) != compaction->output_level()) { compaction_stats_.stats.num_input_files_in_non_output_levels += @@ -2016,7 +2020,7 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { bytes_read = &compaction_stats_.stats.bytes_read_output_level; } for (size_t i = 0; i < num_input_files; ++i) { - const FileMetaData* file_meta = compaction->input(input_level, i); + const FileMetaData* file_meta = flevel->files[i].file_metadata; *bytes_read += file_meta->fd.GetFileSize(); uint64_t file_input_entries = file_meta->num_entries; uint64_t file_num_range_del = file_meta->num_range_deletions; diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index dd3b53737..c519b5959 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -209,6 +209,8 @@ class CompactionJob { // Returns true iff compaction_stats_.stats.num_input_records and // num_input_range_del are calculated successfully. bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr); + virtual void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const; void LogCompaction(); virtual void RecordCompactionIOStats(); void CleanupCompaction(); @@ -279,8 +281,7 @@ class CompactionJob { bool* compaction_released); Status OpenCompactionOutputFile(SubcompactionState* sub_compact, CompactionOutputs& outputs); - void UpdateCompactionJobStats( - const InternalStats::CompactionStats& stats) const; + void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); @@ -385,7 +386,7 @@ struct CompactionServiceInput { // files needed for this compaction, for both input level files and output // level files. 
std::vector input_files; - int output_level; + int output_level = 0; // db_id is used to generate unique id of sst on the remote compactor std::string db_id; @@ -396,6 +397,8 @@ struct CompactionServiceInput { bool has_end = false; std::string end; + uint64_t options_file_number = 0; + // serialization interface to read and write the object static Status Read(const std::string& data_str, CompactionServiceInput* obj); Status Write(std::string* output); @@ -413,20 +416,25 @@ struct CompactionServiceOutputFile { SequenceNumber largest_seqno; std::string smallest_internal_key; std::string largest_internal_key; - uint64_t oldest_ancester_time; - uint64_t file_creation_time; - uint64_t epoch_number; + uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; + uint64_t file_creation_time = kUnknownFileCreationTime; + uint64_t epoch_number = kUnknownEpochNumber; + std::string file_checksum = kUnknownFileChecksum; + std::string file_checksum_func_name = kUnknownFileChecksumFuncName; uint64_t paranoid_hash; bool marked_for_compaction; - UniqueId64x2 unique_id; + UniqueId64x2 unique_id{}; + TableProperties table_properties; CompactionServiceOutputFile() = default; CompactionServiceOutputFile( const std::string& name, SequenceNumber smallest, SequenceNumber largest, std::string _smallest_internal_key, std::string _largest_internal_key, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - uint64_t _epoch_number, uint64_t _paranoid_hash, - bool _marked_for_compaction, UniqueId64x2 _unique_id) + uint64_t _epoch_number, const std::string& _file_checksum, + const std::string& _file_checksum_func_name, uint64_t _paranoid_hash, + bool _marked_for_compaction, UniqueId64x2 _unique_id, + const std::shared_ptr& _table_properties) : file_name(name), smallest_seqno(smallest), largest_seqno(largest), @@ -435,9 +443,12 @@ struct CompactionServiceOutputFile { oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), epoch_number(_epoch_number), + file_checksum(_file_checksum), + file_checksum_func_name(_file_checksum_func_name), paranoid_hash(_paranoid_hash), marked_for_compaction(_marked_for_compaction), - unique_id(std::move(_unique_id)) {} + unique_id(std::move(_unique_id)), + table_properties(*_table_properties.get()) {} }; // CompactionServiceResult contains the compaction result from a different db @@ -446,14 +457,11 @@ struct CompactionServiceOutputFile { struct CompactionServiceResult { Status status; std::vector output_files; - int output_level; + int output_level = 0; // location of the output files std::string output_path; - // some statistics about the compaction - uint64_t num_output_records = 0; - uint64_t total_bytes = 0; uint64_t bytes_read = 0; uint64_t bytes_written = 0; CompactionJobStats stats; @@ -499,6 +507,9 @@ class CompactionServiceCompactionJob : private CompactionJob { protected: void RecordCompactionIOStats() override; + void UpdateCompactionJobStats( + const InternalStats::CompactionStats& stats) const override; + private: // Get table file name in output_path std::string GetTableFileName(uint64_t file_number) override; diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index f8a78bc9b..cf981cc95 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -250,6 +250,7 @@ class CompactionJobTestBase : public testing::Test { } else { assert(false); } + mutable_cf_options_.table_factory = cf_options_.table_factory; } std::string GenerateFileName(uint64_t file_number) { @@ 
@@ -651,7 +652,8 @@ class CompactionJobTestBase : public testing::Test {
         mutable_cf_options_.target_file_size_base,
         mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
         cfd->GetLatestMutableCFOptions()->compression_opts,
-        Temperature::kUnknown, max_subcompactions, grandparents, true);
+        Temperature::kUnknown, max_subcompactions, grandparents,
+        /*earliest_snapshot*/ std::nullopt, /*snapshot_checker*/ nullptr, true);
     compaction.FinalizeInputInfo(cfd->current());

     assert(db_options_.info_log);
@@ -1656,20 +1658,44 @@ TEST_F(CompactionJobTest, ResultSerialization) {
   };
   result.status =
       status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
+
+  std::string file_checksum = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+  std::string file_checksum_func_name = "MyAwesomeChecksumGenerator";
   while (!rnd.OneIn(10)) {
+    TableProperties tp;
+    tp.user_collected_properties.emplace(
+        "UCP_Key1", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
+    tp.user_collected_properties.emplace(
+        "UCP_Key2", rnd.RandomString(rnd.Uniform(kStrMaxLen)));
+    tp.readable_properties.emplace("RP_Key1",
+                                   rnd.RandomString(rnd.Uniform(kStrMaxLen)));
+    tp.readable_properties.emplace("RP_K2y2",
+                                   rnd.RandomString(rnd.Uniform(kStrMaxLen)));
+
+    std::shared_ptr<TableProperties> table_properties =
+        std::make_shared<TableProperties>(tp);
+
     UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
     result.output_files.emplace_back(
-        rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
-        rnd64.Uniform(UINT64_MAX),
-        rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
-        rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
-        rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
-        rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+        rnd.RandomString(rnd.Uniform(kStrMaxLen)) /* file_name */,
+        rnd64.Uniform(UINT64_MAX) /* smallest_seqno */,
+        rnd64.Uniform(UINT64_MAX) /* largest_seqno */,
+        rnd.RandomBinaryString(
+            rnd.Uniform(kStrMaxLen)) /* smallest_internal_key */,
+        rnd.RandomBinaryString(
+            rnd.Uniform(kStrMaxLen)) /* largest_internal_key */,
+        rnd64.Uniform(UINT64_MAX) /* oldest_ancester_time */,
+        rnd64.Uniform(UINT64_MAX) /* file_creation_time */,
+        rnd64.Uniform(UINT64_MAX) /* epoch_number */,
+        file_checksum /* file_checksum */,
+        file_checksum_func_name /* file_checksum_func_name */,
+        rnd64.Uniform(UINT64_MAX) /* paranoid_hash */,
+        rnd.OneIn(2) /* marked_for_compaction */, id /* unique_id */,
+        table_properties);
   }
   result.output_level = rnd.Uniform(10);
   result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
-  result.num_output_records = rnd64.Uniform(UINT64_MAX);
-  result.total_bytes = rnd64.Uniform(UINT64_MAX);
+  result.stats.num_output_records = rnd64.Uniform(UINT64_MAX);
   result.bytes_read = 123;
   result.bytes_written = rnd64.Uniform(UINT64_MAX);
   result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
@@ -1686,6 +1712,21 @@ TEST_F(CompactionJobTest, ResultSerialization) {
   ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
   ASSERT_TRUE(deserialized1.TEST_Equals(&result));

+  for (size_t i = 0; i < result.output_files.size(); i++) {
+    for (const auto& prop :
+         result.output_files[i].table_properties.user_collected_properties) {
+      ASSERT_EQ(deserialized1.output_files[i]
+                    .table_properties.user_collected_properties[prop.first],
+                prop.second);
+    }
+    for (const auto& prop :
+         result.output_files[i].table_properties.readable_properties) {
+      ASSERT_EQ(deserialized1.output_files[i]
+                    .table_properties.readable_properties[prop.first],
+                prop.second);
+    }
+  }
+
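The assertions above follow the usual round-trip recipe for newly serialized fields: populate, `Write()`, `Read()` back, compare. A self-contained sketch of the same recipe over a plain string map, with hypothetical `Encode`/`Decode` helpers (this is not RocksDB's actual wire format):

```cpp
#include <cassert>
#include <map>
#include <sstream>
#include <string>

using Props = std::map<std::string, std::string>;

// Encode as "key=value" lines; good enough for a sketch (assumes keys and
// values contain no '=' or '\n').
std::string Encode(const Props& p) {
  std::ostringstream out;
  for (const auto& kv : p) out << kv.first << '=' << kv.second << '\n';
  return out.str();
}

Props Decode(const std::string& data) {
  Props p;
  std::istringstream in(data);
  std::string line;
  while (std::getline(in, line)) {
    auto pos = line.find('=');
    p[line.substr(0, pos)] = line.substr(pos + 1);
  }
  return p;
}

int main() {
  Props original{{"UCP_Key1", "v1"}, {"UCP_Key2", "v2"}};
  Props decoded = Decode(Encode(original));
  assert(decoded == original);  // the round trip must be lossless
}
```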
  // Test mismatch
  deserialized1.stats.num_input_files += 10;
  std::string mismatch;
@@ -1700,6 +1741,10 @@ TEST_F(CompactionJobTest, ResultSerialization) {
    ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
    ASSERT_EQ(mismatch, "output_files.unique_id");
    deserialized_tmp.status.PermitUncheckedError();
+
+    ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum, file_checksum);
+    ASSERT_EQ(deserialized_tmp.output_files[0].file_checksum_func_name,
+              file_checksum_func_name);
  }

  // Test unknown field
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 1b02fb0e9..a3c3552ed 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -62,8 +62,9 @@ class CompactionOutputs {
   }

   // TODO: Remove it when remote compaction support tiered compaction
-  void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+  void AddBytesWritten(uint64_t bytes) { stats_.bytes_written += bytes; }
   void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+  void SetNumOutputFiles(uint64_t num) { stats_.num_output_files = num; }

   // TODO: Move the BlobDB builder into CompactionOutputs
   const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
@@ -107,6 +108,12 @@ class CompactionOutputs {
   Status Finish(const Status& intput_status,
                 const SeqnoToTimeMapping& seqno_to_time_mapping);

+  // Update output table properties from already populated TableProperties.
+  // Used for remote compaction
+  void UpdateTableProperties(const TableProperties& table_properties) {
+    current_output().table_properties =
+        std::make_shared<TableProperties>(table_properties);
+  }

   // Update output table properties from table builder
   void UpdateTableProperties() {
     current_output().table_properties =
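The new `UpdateTableProperties(const TableProperties&)` overload above gives remote compaction a way to install properties computed elsewhere, while local compaction keeps deriving them from the table builder. A toy model of offering both paths (`Output`/`Builder`/`FileProps` are hypothetical stand-ins, not the real classes):

```cpp
#include <cstdint>
#include <memory>

struct FileProps {  // hypothetical stand-in for a properties struct
  uint64_t num_entries = 0;
};

struct Builder {  // hypothetical stand-in for a local table builder
  FileProps GetTableProperties() const { return FileProps{123}; }
};

class Output {
 public:
  // Local compaction: derive properties from the builder that wrote the file.
  void UpdateProps(const Builder& b) {
    props_ = std::make_shared<FileProps>(b.GetTableProperties());
  }
  // Remote compaction: adopt properties shipped back in the result payload.
  void UpdateProps(const FileProps& remote) {
    props_ = std::make_shared<FileProps>(remote);
  }

 private:
  std::shared_ptr<FileProps> props_;
};
```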
diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc
index cc47060b5..fe1dbcdb9 100644
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@@ -380,7 +380,8 @@ Compaction* CompactionPicker::CompactFiles(
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
       mutable_cf_options.default_write_temperature,
       compact_options.max_subcompactions,
-      /* grandparents */ {}, true);
+      /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr, true);
   RegisterCompaction(c);
   return c;
 }
@@ -677,7 +678,9 @@ Compaction* CompactionPicker::CompactRange(
         GetCompressionOptions(mutable_cf_options, vstorage, output_level),
         mutable_cf_options.default_write_temperature,
         compact_range_options.max_subcompactions,
-        /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+        /* grandparents */ {}, /* earliest_snapshot */ std::nullopt,
+        /* snapshot_checker */ nullptr,
+        /* is manual */ true, trim_ts, /* score */ -1,
         /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
         CompactionReason::kUnknown,
         compact_range_options.blob_garbage_collection_policy,
@@ -866,6 +869,7 @@ Compaction* CompactionPicker::CompactRange(
       GetCompressionOptions(mutable_cf_options, vstorage, output_level),
       mutable_cf_options.default_write_temperature,
       compact_range_options.max_subcompactions, std::move(grandparents),
+      /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
       /* is manual */ true, trim_ts, /* score */ -1,
       /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
       CompactionReason::kUnknown,
@@ -1171,7 +1175,8 @@ void CompactionPicker::UnregisterCompaction(Compaction* c) {
 void CompactionPicker::PickFilesMarkedForCompaction(
     const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
-    int* output_level, CompactionInputFiles* start_level_inputs) {
+    int* output_level, CompactionInputFiles* start_level_inputs,
+    std::function<bool(const FileMetaData*)> skip_marked_file) {
   if (vstorage->FilesMarkedForCompaction().empty()) {
     return;
   }
@@ -1181,6 +1186,9 @@ void CompactionPicker::PickFilesMarkedForCompaction(
     // If this assert() fails that means that some function marked some
     // files as being_compacted, but didn't call ComputeCompactionScore()
     assert(!level_file.second->being_compacted);
+    if (skip_marked_file(level_file.second)) {
+      return false;
+    }
     *start_level = level_file.first;
     *output_level =
         (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h
index 087595a8a..6729cda0a 100644
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@@ -16,6 +16,7 @@
 #include <vector>

 #include "db/compaction/compaction.h"
+#include "db/snapshot_checker.h"
 #include "db/version_set.h"
 #include "options/cf_options.h"
 #include "rocksdb/env.h"
@@ -55,17 +56,17 @@ class CompactionPicker {
   // Returns nullptr if there is no compaction to be done.
   // Otherwise returns a pointer to a heap-allocated object that
   // describes the compaction.  Caller should delete the result.
-  virtual Compaction* PickCompaction(const std::string& cf_name,
-                                     const MutableCFOptions& mutable_cf_options,
-                                     const MutableDBOptions& mutable_db_options,
-                                     VersionStorageInfo* vstorage,
-                                     LogBuffer* log_buffer) = 0;
-
-  // Return a compaction object for compacting the range [begin,end] in
-  // the specified level. Returns nullptr if there is nothing in that
-  // level that overlaps the specified range. Caller should delete
-  // the result.
-  //
+  // Currently, only universal compaction will query existing snapshots and
+  // pass it to aid compaction picking. And it's only passed when user-defined
+  // timestamps is not enabled. The other compaction styles do not pass or use
+  // `existing_snapshots` or `snapshot_checker`.
+  virtual Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options,
+      const std::vector<SequenceNumber>& existing_snapshots,
+      const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer) = 0;
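Per the new comment above, only universal compaction currently consumes `existing_snapshots`/`snapshot_checker`. The visibility rule such a picker can apply looks roughly like this sketch: data may still be needed by a snapshot taken after the write but before the covering deletion (`SeqNum` and the helper are hypothetical simplifications, not the real picker logic):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

using SeqNum = uint64_t;

// A snapshot sees exactly the versions with seqno <= snapshot. So a snapshot
// still needs a data version if it was taken at or after the data was written
// (snap >= data_seqno) but before the range deletion (snap < tombstone_seqno).
bool SnapshotStillNeedsData(SeqNum data_seqno, SeqNum tombstone_seqno,
                            const std::vector<SeqNum>& snapshots) {
  return std::any_of(snapshots.begin(), snapshots.end(), [&](SeqNum snap) {
    return snap >= data_seqno && snap < tombstone_seqno;
  });
}
```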
+  // The returned Compaction might not include the whole requested range.
   // In that case, compaction_end will be set to the next key that needs
   // compacting. In case the compaction will compact the whole range,
@@ -203,10 +204,11 @@ class CompactionPicker {
       const CompactionInputFiles& output_level_inputs,
       std::vector<FileMetaData*>* grandparents);

-  void PickFilesMarkedForCompaction(const std::string& cf_name,
-                                    VersionStorageInfo* vstorage,
-                                    int* start_level, int* output_level,
-                                    CompactionInputFiles* start_level_inputs);
+  void PickFilesMarkedForCompaction(
+      const std::string& cf_name, VersionStorageInfo* vstorage,
+      int* start_level, int* output_level,
+      CompactionInputFiles* start_level_inputs,
+      std::function<bool(const FileMetaData*)> skip_marked_file);

   bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
                              CompactionInputFiles* start_level_inputs,
@@ -257,11 +259,13 @@ class NullCompactionPicker : public CompactionPicker {
   virtual ~NullCompactionPicker() {}

   // Always return "nullptr"
-  Compaction* PickCompaction(const std::string& /*cf_name*/,
-                             const MutableCFOptions& /*mutable_cf_options*/,
-                             const MutableDBOptions& /*mutable_db_options*/,
-                             VersionStorageInfo* /*vstorage*/,
-                             LogBuffer* /* log_buffer */) override {
+  Compaction* PickCompaction(
+      const std::string& /*cf_name*/,
+      const MutableCFOptions& /*mutable_cf_options*/,
+      const MutableDBOptions& /*mutable_db_options*/,
+      const std::vector<SequenceNumber>& /*existing_snapshots*/,
+      const SnapshotChecker* /*snapshot_checker*/,
+      VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */) override {
     return nullptr;
   }
diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc
index 29f4462d4..12cf60e0e 100644
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@@ -118,7 +118,9 @@ Compaction* FIFOCompactionPicker::PickTTLCompaction(
       std::move(inputs), 0, 0, 0, 0, kNoCompression,
       mutable_cf_options.compression_opts,
       mutable_cf_options.default_write_temperature,
-      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
+      /* is manual */ false,
       /* trim_ts */ "", vstorage->CompactionScore(0),
       /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
       CompactionReason::kFIFOTtl);
@@ -188,7 +190,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
       0 /* output path ID */, mutable_cf_options.compression,
       mutable_cf_options.compression_opts,
       mutable_cf_options.default_write_temperature,
-      0 /* max_subcompactions */, {}, /* is manual */ false,
+      0 /* max_subcompactions */, {},
+      /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr, /* is manual */ false,
       /* trim_ts */ "", vstorage->CompactionScore(0),
       /* is deletion compaction */ false,
       /* l0_files_might_overlap */ true,
@@ -284,7 +288,9 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
       /* output_path_id */ 0, kNoCompression,
       mutable_cf_options.compression_opts,
       mutable_cf_options.default_write_temperature,
-      /* max_subcompactions */ 0, {}, /* is manual */ false,
+      /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
+      /* is manual */ false,
       /* trim_ts */ "", vstorage->CompactionScore(0),
       /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
       CompactionReason::kFIFOMaxSize);
@@ -410,8 +416,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
       0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
       mutable_cf_options.compression, mutable_cf_options.compression_opts,
       compaction_target_temp,
-      /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
-      vstorage->CompactionScore(0),
+      /* max_subcompactions */ 0, {}, /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
+      /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0),
       /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
       CompactionReason::kChangeTemperature);
   return c;
@@ -419,7 +426,9 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(

 Compaction* FIFOCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
-    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    const MutableDBOptions& mutable_db_options,
+    const std::vector<SequenceNumber>& /* existing_snapshots */,
+    const SnapshotChecker* /* snapshot_checker */, VersionStorageInfo* vstorage,
     LogBuffer* log_buffer) {
   Compaction* c = nullptr;
   if (mutable_cf_options.ttl > 0) {
@@ -454,8 +463,10 @@ Compaction* FIFOCompactionPicker::CompactRange(
   assert(output_level == 0);
   *compaction_end = nullptr;
   LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
-  Compaction* c = PickCompaction(cf_name, mutable_cf_options,
-                                 mutable_db_options, vstorage, &log_buffer);
+  Compaction* c =
+      PickCompaction(cf_name, mutable_cf_options, mutable_db_options,
+                     /*existing_snapshots*/ {}, /*snapshot_checker*/ nullptr,
+                     vstorage, &log_buffer);
   log_buffer.FlushBufferToLog();
   return c;
 }
diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h
index cd7e56e8b..4dd1053e1 100644
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@@ -18,11 +18,12 @@ class FIFOCompactionPicker : public CompactionPicker {
                        const InternalKeyComparator* icmp)
       : CompactionPicker(ioptions, icmp) {}

-  Compaction* PickCompaction(const std::string& cf_name,
-                             const MutableCFOptions& mutable_cf_options,
-                             const MutableDBOptions& mutable_db_options,
-                             VersionStorageInfo* version,
-                             LogBuffer* log_buffer) override;
+  Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options,
+      const std::vector<SequenceNumber>& /* existing_snapshots */,
+      const SnapshotChecker* /* snapshot_checker */,
+      VersionStorageInfo* version, LogBuffer* log_buffer) override;

   Compaction* CompactRange(const std::string& cf_name,
                            const MutableCFOptions& mutable_cf_options,
diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc
index ae289ac3f..262ebba85 100644
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@@ -262,7 +262,10 @@ void LevelCompactionBuilder::SetupInitialFiles() {
   parent_index_ = base_index_ = -1;

   compaction_picker_->PickFilesMarkedForCompaction(
-      cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
+      cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_,
+      /*skip_marked_file*/ [](const FileMetaData* /* file */) {
+        return false;
+      });
   if (!start_level_inputs_.empty()) {
     compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
     return;
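`LevelCompactionBuilder` above opts out of the new hook by passing a predicate that always returns false (skip nothing). A sketch of what a non-trivial `skip_marked_file` filter could look like (`FileMeta` and its flag are hypothetical; RocksDB's real predicate operates on `FileMetaData`):

```cpp
#include <functional>

struct FileMeta {
  bool being_protected = false;  // e.g. still visible to a live snapshot
};

using SkipFn = std::function<bool(const FileMeta*)>;

// The trivial predicate: consider every marked file.
const SkipFn kSkipNothing = [](const FileMeta*) { return false; };

// A selective predicate: leave protected files for a later pick.
const SkipFn kSkipProtected = [](const FileMeta* f) {
  return f->being_protected;
};
```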
@@ -554,7 +557,9 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
                        vstorage_->base_level()),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
       mutable_cf_options_.default_write_temperature,
-      /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+      /* max_subcompactions */ 0, std::move(grandparents_),
+      /* earliest_snapshot */ std::nullopt, /* snapshot_checker */ nullptr,
+      is_manual_,
       /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
       l0_files_might_overlap, compaction_reason_);
@@ -967,7 +972,9 @@ bool LevelCompactionBuilder::PickSizeBasedIntraL0Compaction() {

 Compaction* LevelCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
-    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    const MutableDBOptions& mutable_db_options,
+    const std::vector<SequenceNumber>& /*existing_snapshots */,
+    const SnapshotChecker* /*snapshot_checker*/, VersionStorageInfo* vstorage,
     LogBuffer* log_buffer) {
   LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer,
                                  mutable_cf_options, ioptions_,
diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h
index e822e3396..9cb41dfb6 100644
--- a/db/compaction/compaction_picker_level.h
+++ b/db/compaction/compaction_picker_level.h
@@ -20,11 +20,12 @@ class LevelCompactionPicker : public CompactionPicker {
   LevelCompactionPicker(const ImmutableOptions& ioptions,
                         const InternalKeyComparator* icmp)
       : CompactionPicker(ioptions, icmp) {}
-  Compaction* PickCompaction(const std::string& cf_name,
-                             const MutableCFOptions& mutable_cf_options,
-                             const MutableDBOptions& mutable_db_options,
-                             VersionStorageInfo* vstorage,
-                             LogBuffer* log_buffer) override;
+  Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options,
+      const std::vector<SequenceNumber>& /* existing_snapshots */,
+      const SnapshotChecker* /* snapshot_checker */,
+      VersionStorageInfo* vstorage, LogBuffer* log_buffer) override;

   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
 };
diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc
index bec8e9182..3cf0c7377 100644
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@@ -232,8 +232,9 @@ TEST_F(CompactionPickerTest, Empty) {
   NewVersionStorage(6, kCompactionStyleLevel);
   UpdateVersionStorageInfo();
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() == nullptr);
 }

@@ -244,8 +245,9 @@ TEST_F(CompactionPickerTest, Single) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() == nullptr);
 }

@@ -258,8 +260,9 @@ TEST_F(CompactionPickerTest, Level0Trigger) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -272,8 +275,9 @@ TEST_F(CompactionPickerTest, Level1Trigger) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
@@ -291,8 +295,9 @@ TEST_F(CompactionPickerTest, Level1Trigger2) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(2U, compaction->num_input_files(1));
@@ -323,8 +328,9 @@ TEST_F(CompactionPickerTest, LevelMaxScore) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
@@ -371,8 +377,9 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -396,8 +403,9 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
   ASSERT_EQ(vstorage_->base_level(), num_levels - 2);

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -422,8 +430,9 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
   ASSERT_EQ(vstorage_->base_level(), num_levels - 3);

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -452,8 +461,9 @@ TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
   ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_files(0));
   ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
@@ -485,8 +495,9 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
@@ -540,8 +551,9 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));

   // output level should be the one above the bottom-most
   ASSERT_EQ(1, compaction->output_level());
@@ -583,8 +595,9 @@ TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(!compaction->is_trivial_move());
 }
@@ -610,8 +623,9 @@ TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction->is_trivial_move());
 }
@@ -639,8 +653,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));

   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
@@ -670,8 +685,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_FALSE(compaction);
 }
@@ -697,8 +713,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_FALSE(compaction);
 }
@@ -728,8 +745,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(!compaction ||
               compaction->start_level() != compaction->output_level());
 }
@@ -749,8 +767,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(0, compaction->start_level());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -774,8 +793,9 @@ TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->start_level());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -812,8 +832,9 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -854,8 +875,9 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -896,8 +918,9 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(2, compaction->start_level());
@@ -944,8 +967,9 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -988,8 +1012,9 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {

   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction);
   ASSERT_EQ(4, compaction->output_level());
   ASSERT_EQ(3, compaction->start_level());
@@ -1040,8 +1065,9 @@ TEST_F(CompactionPickerTest,
   ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
   std::unique_ptr<Compaction> compaction(
       universal_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kUniversalSizeAmplification);
@@ -1109,8 +1135,9 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
   ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
   std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kChangeTemperature);
@@ -1155,8 +1182,9 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
   ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
   std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kChangeTemperature);
@@ -1201,8 +1229,9 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
   ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
   std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kChangeTemperature);
@@ -1247,8 +1276,9 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
   ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
   std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kChangeTemperature);
@@ -1297,8 +1327,9 @@ TEST_F(CompactionPickerTest, FIFOToHotAndWarm) {
   ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
   std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(compaction->compaction_reason(),
             CompactionReason::kChangeTemperature);
@@ -1329,8 +1360,9 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Pick file 8 because it overlaps with 0 files on level 3.
@@ -1362,8 +1394,9 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 7 because overlapping ratio is the biggest.
@@ -1390,8 +1423,9 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1418,8 +1452,9 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 6 because overlapping ratio is the biggest.
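Every updated call site in this test file threads the same two extra arguments through `PickCompaction`. If this churn grows, a small fixture helper could centralize the boilerplate; a hypothetical sketch (not part of this diff, and it assumes the fixture members used by the tests above):

```cpp
// Hypothetical CompactionPickerTest helper; cf_name_, mutable_cf_options_,
// mutable_db_options_, vstorage_, and log_buffer_ are the existing fixture
// members. A future signature change would then touch one place.
std::unique_ptr<Compaction> Pick(CompactionPicker& picker) {
  return std::unique_ptr<Compaction>(picker.PickCompaction(
      cf_name_, mutable_cf_options_, mutable_db_options_,
      /*existing_snapshots=*/{}, /*snapshot_checker=*/nullptr,
      vstorage_.get(), &log_buffer_));
}
```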
@@ -1454,8 +1489,9 @@ TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
       LevelCompactionPicker(ioptions_, &icmp_);
   std::unique_ptr<Compaction> compaction(
       local_level_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   // Since the max bytes for level 2 is 120M, picking one file to compact
   // makes the post-compaction level size less than 120M, there is exactly one
@@ -1494,8 +1530,9 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
       LevelCompactionPicker(ioptions_, &icmp_);
   std::unique_ptr<Compaction> compaction(
       local_level_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);

   // The maximum compaction bytes is very large in this case so we can igore its
@@ -1537,8 +1574,9 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
       LevelCompactionPicker(ioptions_, &icmp_);
   std::unique_ptr<Compaction> compaction(
       local_level_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);

   // The maximum compaction bytes is only 2500 bytes now. Even though we are
@@ -1581,8 +1619,9 @@ TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
       LevelCompactionPicker(ioptions_, &icmp_);
   std::unique_ptr<Compaction> compaction(
       local_level_compaction_picker.PickCompaction(
-          cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-          &log_buffer_));
+          cf_name_, mutable_cf_options_, mutable_db_options_,
+          /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+          vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);

   // Cannot pick more files since we reach the last file in level 2
@@ -1640,8 +1679,9 @@ TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_files(0));
   // Picking file 8 because overlapping ratio is the biggest.
@@ -1668,8 +1708,9 @@ TEST_F(CompactionPickerTest, ParentIndexResetBug) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
 }

 // This test checks ExpandWhileOverlapping() by having overlapping user keys
@@ -1686,8 +1727,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1706,8 +1748,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(2U, compaction->num_input_files(0));
@@ -1734,8 +1777,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -1765,8 +1809,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1789,8 +1834,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
@@ -1811,8 +1857,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1832,8 +1879,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_GE(1U, compaction->num_input_files(0));
@@ -1861,8 +1909,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(3U, compaction->num_input_files(0));
@@ -1894,8 +1943,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(5U, compaction->num_input_files(0));
@@ -1935,8 +1985,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -1974,8 +2025,9 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2081,8 +2133,9 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
   ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
   ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() == nullptr);
 }
@@ -2112,8 +2165,9 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
   ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
   ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
@@ -2146,8 +2200,9 @@ TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
   ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
   ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
 }
@@ -2447,8 +2502,9 @@ TEST_F(CompactionPickerTest, CompactionLimitWhenAddFileFromInputLevel) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(4U, compaction->num_input_files(0));
@@ -2482,8 +2538,9 @@ TEST_F(CompactionPickerTest, HitCompactionLimitWhenAddFileFromInputLevel) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2510,8 +2567,9 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
 }
@@ -2535,8 +2593,9 @@ TEST_F(CompactionPickerTest, L0TrivialMove1) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(2, compaction->num_input_files(0));
@@ -2564,8 +2623,9 @@ TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2590,8 +2650,9 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(4, compaction->num_input_files(0));
@@ -2618,8 +2679,9 @@ TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
   // File #2 should be picked first, and expand both directions to include
   // files #1 and #3.
   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(3, compaction->num_input_files(0));
@@ -2648,8 +2710,9 @@ TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(1, compaction->num_input_levels());
   ASSERT_EQ(1, compaction->num_input_files(0));
@@ -2676,8 +2739,9 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   // No trivial move, because partitioning is applied
   ASSERT_TRUE(!compaction->IsTrivialMove());
@@ -2699,8 +2763,9 @@ TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_FALSE(compaction->IsTrivialMove());
 }
@@ -2728,8 +2793,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2762,8 +2828,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2795,8 +2862,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2821,8 +2889,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2851,8 +2920,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2885,8 +2955,9 @@ TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_TRUE(compaction->IsTrivialMove());
   ASSERT_EQ(1, compaction->num_input_levels());
@@ -2920,8 +2991,9 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   UpdateVersionStorageInfo();

   std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+      vstorage_.get(), &log_buffer_));
   ASSERT_TRUE(compaction.get() != nullptr);
   ASSERT_EQ(2U, compaction->num_input_levels());
   ASSERT_EQ(1U, compaction->num_input_files(0));
@@ -2930,8 +3002,9 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
   ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));

   compaction.reset(level_compaction_picker.PickCompaction(
-      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-      &log_buffer_));
+      cf_name_, mutable_cf_options_, mutable_db_options_,
+      /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(2U, compaction->num_input_levels()); ASSERT_EQ(1U, compaction->num_input_files(0)); @@ -2940,8 +3013,9 @@ TEST_F(CompactionPickerTest, CacheNextCompactionIndex) { ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */)); compaction.reset(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() == nullptr); ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */)); } @@ -2966,8 +3040,9 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) { UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(5U, compaction->num_input_files(0)); @@ -2997,8 +3072,9 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); ASSERT_EQ(1U, compaction->num_input_levels()); ASSERT_EQ(4U, compaction->num_input_files(0)); @@ -3043,8 +3119,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a compaction to reduce sorted runs @@ -3065,8 +3142,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { std::unique_ptr compaction2( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_FALSE(compaction2); } @@ -3094,8 +3172,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3122,8 +3201,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { std::unique_ptr compaction2( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* 
snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_FALSE(compaction2); } @@ -3163,8 +3243,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3194,8 +3275,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) { random_index = 0; std::unique_ptr compaction2( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_FALSE(compaction2); DeleteVersionStorage(); } @@ -3220,8 +3302,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3257,8 +3340,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3310,8 +3394,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); // Validate that its a delete triggered compaction @@ -3341,8 +3426,9 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { std::unique_ptr compaction2( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction2); ASSERT_EQ(3U, compaction->num_input_files(0)); ASSERT_TRUE(file_map_[1].first->being_compacted); @@ -3420,8 +3506,9 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); // Make sure it's a size amp compaction and includes all files ASSERT_EQ(compaction->compaction_reason(), @@ -3456,8 +3543,9 @@ TEST_F(CompactionPickerTest, 
UniversalSizeRatioTierCompactionLastLevel) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); // Internally, size amp compaction is evaluated before size ratio compaction. // Here to make sure it's size ratio compaction instead of size amp @@ -3496,8 +3584,9 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); // size amp compaction is still triggered even preclude_last_level is set ASSERT_EQ(compaction->compaction_reason(), @@ -3530,8 +3619,9 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); // It's a Size Amp compaction, but doesn't include the last level file and // output to the penultimate level. @@ -3638,8 +3728,9 @@ TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) { std::unique_ptr compaction( universal_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); assert(compaction); ASSERT_TRUE(!compaction->is_trivial_move()); } @@ -4169,8 +4260,9 @@ TEST_F(CompactionPickerTest, UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); ASSERT_EQ(num_levels - 2, compaction->start_level()); ASSERT_EQ(num_levels - 1, compaction->output_level()); @@ -4178,9 +4270,10 @@ TEST_F(CompactionPickerTest, ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber()); std::unique_ptr second_compaction( - level_compaction_picker.PickCompaction(cf_name_, mutable_cf_options_, - mutable_db_options_, - vstorage_.get(), &log_buffer_)); + level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(second_compaction); ASSERT_EQ(num_levels - 1, compaction->output_level()); ASSERT_EQ(num_levels - 2, compaction->start_level()); @@ -4225,8 +4318,9 @@ TEST_F(CompactionPickerTest, UpdateVersionStorageInfo(); std::unique_ptr compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_)); + cf_name_, mutable_cf_options_, mutable_db_options_, + /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr, + vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction); 
ASSERT_EQ(num_levels - 3, compaction->start_level());
   ASSERT_EQ(num_levels - 2, compaction->output_level());
@@ -4274,8 +4368,9 @@ TEST_F(CompactionPickerTest, IntraL0WhenL0IsSmall) {
     LevelCompactionPicker compaction_picker(ioptions_, &icmp_);
 
     std::unique_ptr<Compaction> compaction(compaction_picker.PickCompaction(
-        cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-        &log_buffer_));
+        cf_name_, mutable_cf_options_, mutable_db_options_,
+        /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+        vstorage_.get(), &log_buffer_));
     ASSERT_TRUE(compaction.get() != nullptr);
     ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
               compaction->compaction_reason());
@@ -4351,6 +4446,7 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpLargeDB) {
     std::unique_ptr<Compaction> compaction(
         universal_compaction_picker.PickCompaction(
             cf_name_, mutable_cf_options_, mutable_db_options_,
+            /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
            vstorage_.get(), &log_buffer_));
     if (i == kMaxRuns) {
       // There are in total i + 1 > kMaxRuns sorted runs.
@@ -4397,8 +4493,9 @@ TEST_F(CompactionPickerTest, UniversalMaxReadAmpSmallDB) {
     ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
     std::unique_ptr<Compaction> compaction(
         universal_compaction_picker.PickCompaction(
-            cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
-            &log_buffer_));
+            cf_name_, mutable_cf_options_, mutable_db_options_,
+            /*existing_snapshots=*/{}, /* snapshot_checker */ nullptr,
+            vstorage_.get(), &log_buffer_));
     ASSERT_EQ(nullptr, compaction);
   }
 }
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 0b5426149..c1ad96bd8 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -35,7 +35,9 @@ class UniversalCompactionBuilder {
   UniversalCompactionBuilder(
       const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
       const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
-      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      const MutableDBOptions& mutable_db_options,
+      const std::vector<SequenceNumber>& existing_snapshots,
+      const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
       UniversalCompactionPicker* picker, LogBuffer* log_buffer)
       : ioptions_(ioptions),
         icmp_(icmp),
@@ -44,7 +46,19 @@ class UniversalCompactionBuilder {
         mutable_db_options_(mutable_db_options),
         vstorage_(vstorage),
         picker_(picker),
-        log_buffer_(log_buffer) {}
+        log_buffer_(log_buffer) {
+    assert(icmp_);
+    const auto* ucmp = icmp_->user_comparator();
+    assert(ucmp);
+    // These parameters are only passed when user-defined timestamp is not
+    // enabled.
+    if (ucmp->timestamp_size() == 0) {
+      earliest_snapshot_ = existing_snapshots.empty()
+                               ? kMaxSequenceNumber
+                               : existing_snapshots.at(0);
+      snapshot_checker_ = snapshot_checker;
+    }
+  }
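
For reference, a hedged sketch (not part of this patch) of how a caller supplies the two new arguments. The updated tests above pass `{}` and `nullptr`, which leaves `earliest_snapshot_` at `kMaxSequenceNumber` and effectively disables the gating; a DB would forward its live snapshot list, assumed here to be sorted oldest-first, which is why `.at(0)` in the constructor yields the earliest snapshot:

```cpp
// Illustrative values only; a real caller forwards the DB's snapshot state.
std::vector<SequenceNumber> existing_snapshots = {100, 250, 400};  // oldest first
const SnapshotChecker* snapshot_checker = nullptr;  // none configured
std::unique_ptr<Compaction> c(universal_compaction_picker.PickCompaction(
    cf_name, mutable_cf_options, mutable_db_options, existing_snapshots,
    snapshot_checker, vstorage, &log_buffer));
// Inside the builder: earliest_snapshot_ = existing_snapshots.empty()
//     ? kMaxSequenceNumber
//     : existing_snapshots.at(0);   // == 100 in this sketch
```
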
   // Form and return the compaction object. The caller owns the returned
   // object.
   Compaction* PickCompaction();
@@ -52,12 +66,15 @@
  private:
   struct SortedRun {
     SortedRun(int _level, FileMetaData* _file, uint64_t _size,
-              uint64_t _compensated_file_size, bool _being_compacted)
+              uint64_t _compensated_file_size, bool _being_compacted,
+              bool _level_has_marked_standalone_rangedel)
         : level(_level),
           file(_file),
           size(_size),
           compensated_file_size(_compensated_file_size),
-          being_compacted(_being_compacted) {
+          being_compacted(_being_compacted),
+          level_has_marked_standalone_rangedel(
+              _level_has_marked_standalone_rangedel) {
       assert(compensated_file_size > 0);
       assert(level != 0 || file != nullptr);
     }
@@ -79,6 +96,10 @@
     uint64_t size;
     uint64_t compensated_file_size;
     bool being_compacted;
+    // True if this level has any file that is a standalone range deletion
+    // file marked for compaction. Best effort is made to let only deletion
+    // triggered compaction pick this type of file.
+    bool level_has_marked_standalone_rangedel;
   };
 
   // Pick Universal compaction to limit read amplification
@@ -98,6 +119,11 @@
 
   Compaction* PickDeleteTriggeredCompaction();
 
+  // Returns true if the given file (that is marked for compaction) should be
+  // skipped from being picked for now. We do this to make the best use of
+  // standalone range tombstone files.
+  bool ShouldSkipMarkedFile(const FileMetaData* file) const;
+
   // Form a compaction from the sorted run indicated by start_index to the
   // oldest sorted run.
   // The caller is responsible for making sure that those files are not in
@@ -234,8 +260,18 @@
   VersionStorageInfo* vstorage_;
   UniversalCompactionPicker* picker_;
   LogBuffer* log_buffer_;
-
-  static std::vector<SortedRun> CalculateSortedRuns(
+  // Optional earliest snapshot at time of compaction picking. This is only
+  // provided if the column family doesn't enable user-defined timestamps,
+  // and this information is only passed to `Compaction` picked by deletion
+  // triggered compaction for possible optimizations.
+  std::optional<SequenceNumber> earliest_snapshot_;
+  const SnapshotChecker* snapshot_checker_;
+  // Mapping from file id to its index in the sorted runs for the files that
+  // are marked for compaction. This is only populated when snapshot info is
+  // populated.
+  std::map<uint64_t, size_t> file_marked_for_compaction_to_sorted_run_index_;
+
+  std::vector<SortedRun> CalculateSortedRuns(
       const VersionStorageInfo& vstorage, int last_level,
       uint64_t* max_run_size);
@@ -394,11 +430,13 @@ bool UniversalCompactionPicker::NeedsCompaction(
 Compaction* UniversalCompactionPicker::PickCompaction(
     const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
-    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+    const MutableDBOptions& mutable_db_options,
+    const std::vector<SequenceNumber>& existing_snapshots,
+    const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
     LogBuffer* log_buffer) {
-  UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
-                                     mutable_cf_options, mutable_db_options,
-                                     vstorage, this, log_buffer);
+  UniversalCompactionBuilder builder(
+      ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options,
+      existing_snapshots, snapshot_checker, vstorage, this, log_buffer);
   return builder.PickCompaction();
 }
@@ -448,14 +486,20 @@ UniversalCompactionBuilder::CalculateSortedRuns(
   *max_run_size = 0;
   std::vector<SortedRun> ret;
   for (FileMetaData* f : vstorage.LevelFiles(0)) {
-    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
-                     f->being_compacted);
+    if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
+      file_marked_for_compaction_to_sorted_run_index_.emplace(f->fd.GetNumber(),
+                                                              ret.size());
+    }
+    ret.emplace_back(
+        0, f, f->fd.GetFileSize(), f->compensated_file_size, f->being_compacted,
+        f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
     *max_run_size = std::max(*max_run_size, f->fd.GetFileSize());
   }
   for (int level = 1; level <= last_level; level++) {
     uint64_t total_compensated_size = 0U;
     uint64_t total_size = 0U;
     bool being_compacted = false;
+    bool level_has_marked_standalone_rangedel = false;
     for (FileMetaData* f : vstorage.LevelFiles(level)) {
       total_compensated_size += f->compensated_file_size;
       total_size += f->fd.GetFileSize();
@@ -467,16 +511,57 @@ UniversalCompactionBuilder::CalculateSortedRuns(
       if (f->being_compacted) {
         being_compacted = f->being_compacted;
       }
+      level_has_marked_standalone_rangedel =
+          level_has_marked_standalone_rangedel ||
+          (f->marked_for_compaction && f->FileIsStandAloneRangeTombstone());
+      if (earliest_snapshot_.has_value() && f->marked_for_compaction) {
+        file_marked_for_compaction_to_sorted_run_index_.emplace(
+            f->fd.GetNumber(), ret.size());
+      }
     }
     if (total_compensated_size > 0) {
       ret.emplace_back(level, nullptr, total_size, total_compensated_size,
-                       being_compacted);
+                       being_compacted, level_has_marked_standalone_rangedel);
     }
     *max_run_size = std::max(*max_run_size, total_size);
   }
   return ret;
 }
 
+bool UniversalCompactionBuilder::ShouldSkipMarkedFile(
+    const FileMetaData* file) const {
+  assert(file->marked_for_compaction);
+  if (!earliest_snapshot_.has_value()) {
+    return false;
+  }
+  if (!file->FileIsStandAloneRangeTombstone()) {
+    return false;
+  }
+  // Skip until the earliest snapshot advances to or above this standalone
+  // range tombstone file. `DB::ReleaseSnapshot` will re-examine and schedule
+  // compaction for it.
+  if (!DataIsDefinitelyInSnapshot(file->fd.largest_seqno,
+                                  earliest_snapshot_.value(),
+                                  snapshot_checker_)) {
+    return true;
+  }
+
+  auto iter = file_marked_for_compaction_to_sorted_run_index_.find(
+      file->fd.GetNumber());
+  assert(iter != file_marked_for_compaction_to_sorted_run_index_.end());
+  size_t idx = iter->second;
+  const SortedRun* succeeding_sorted_run =
+      idx < sorted_runs_.size() - 1 ? &sorted_runs_[idx + 1] : nullptr;
+  // A marked standalone range tombstone file is best used if it's in the
+  // start input level. Skip to let that compaction happen first.
+  if (succeeding_sorted_run &&
+      succeeding_sorted_run->level_has_marked_standalone_rangedel) {
+    return true;
+  }
+
+  return false;
+}
+
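The first gate above only admits a standalone range tombstone file once no live snapshot can still read through it. A small worked example, with hypothetical sequence numbers, of how `DataIsDefinitelyInSnapshot` drives that decision:

```cpp
// Hypothetical scenario: the standalone range tombstone file was ingested
// with largest_seqno == 120, and sc is the (possibly null) SnapshotChecker.
//
//   earliest snapshot == 100 -> DataIsDefinitelyInSnapshot(120, 100, sc) is
//       false: a snapshot older than the tombstone still exists, so the
//       file is skipped for now and revisited on DB::ReleaseSnapshot.
//   earliest snapshot == 300 -> DataIsDefinitelyInSnapshot(120, 300, sc) is
//       true: every live snapshot already sees the tombstone, so the file
//       may be picked, subject to the succeeding-run check that follows.
//   no snapshots at all      -> earliest_snapshot_ == kMaxSequenceNumber,
//       which behaves like the second case.
```
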
 // Universal style of compaction. Pick files that are contiguous in
 // time-range to compact.
 Compaction* UniversalCompactionBuilder::PickCompaction() {
@@ -580,7 +665,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
   // Get the total number of sorted runs that are not being compacted
   int num_sr_not_compacted = 0;
   for (size_t i = 0; i < sorted_runs_.size(); i++) {
-    if (sorted_runs_[i].being_compacted == false) {
+    if (sorted_runs_[i].being_compacted == false &&
+        !sorted_runs_[i].level_has_marked_standalone_rangedel) {
       num_sr_not_compacted++;
     }
   }
@@ -743,16 +829,24 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
   for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
     sr = &sorted_runs_[loop];
 
-    if (!sr->being_compacted) {
+    if (!sr->being_compacted && !sr->level_has_marked_standalone_rangedel) {
       candidate_count = 1;
       break;
     }
     char file_num_buf[kFormatFileNumberBufSize];
     sr->Dump(file_num_buf, sizeof(file_num_buf));
-    ROCKS_LOG_BUFFER(log_buffer_,
-                     "[%s] Universal: %s"
-                     "[%d] being compacted, skipping",
-                     cf_name_.c_str(), file_num_buf, loop);
+    if (sr->being_compacted) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: %s"
+                       "[%d] being compacted, skipping",
+                       cf_name_.c_str(), file_num_buf, loop);
+    } else if (sr->level_has_marked_standalone_rangedel) {
+      ROCKS_LOG_BUFFER(log_buffer_,
+                       "[%s] Universal: %s"
+                       "[%d] has standalone range tombstone files marked for "
+                       "compaction, skipping",
+                       cf_name_.c_str(), file_num_buf, loop);
+    }
 
     sr = nullptr;
   }
@@ -773,7 +867,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
          candidate_count < max_files_to_compact && i < sorted_runs_.size();
          i++) {
       const SortedRun* succeeding_sr = &sorted_runs_[i];
-      if (succeeding_sr->being_compacted) {
+      if (succeeding_sr->being_compacted ||
+          succeeding_sr->level_has_marked_standalone_rangedel) {
         break;
       }
       // Pick files if the total/last candidate file size (increased by the
@@ -923,6 +1018,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
                            output_level, enable_compression),
       mutable_cf_options_.default_write_temperature,
       /* max_subcompactions */ 0, grandparents,
+      /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
       /* is manual */ false,
       /* trim_ts */ "", score_, false /* deletion_compaction */,
       /* l0_files_might_overlap */ true, compaction_reason);
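Only the delete-triggered picker later in this file forwards real snapshot state into the `Compaction` it builds; every other pick passes `std::nullopt`. A hedged sketch of the two call shapes (argument lists abbreviated, `...` stands for the unchanged parameters shown in the hunks):

```cpp
// Size-ratio / size-amp / periodic picks: no snapshot-based filtering.
new Compaction(..., /* max_subcompactions */ 0, grandparents,
               /* earliest_snapshot */ std::nullopt,
               /* snapshot_checker */ nullptr,
               /* is manual */ false, ...);

// Delete-triggered picks: the builder's captured snapshot state is
// forwarded so the compaction can directly filter out input files that a
// standalone range tombstone fully covers.
new Compaction(..., /* max_subcompactions */ 0, grandparents,
               earliest_snapshot_, snapshot_checker_,
               /* is manual */ false, ...);
```
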
@@ -939,7 +1036,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
   const size_t end_index = ShouldSkipLastSortedRunForSizeAmpCompaction()
                                ? sorted_runs_.size() - 2
                                : sorted_runs_.size() - 1;
-  if (sorted_runs_[end_index].being_compacted) {
+  if (sorted_runs_[end_index].being_compacted ||
+      sorted_runs_[end_index].level_has_marked_standalone_rangedel) {
     return nullptr;
   }
   const uint64_t base_sr_size = sorted_runs_[end_index].size;
@@ -950,14 +1048,23 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
   // Get longest span (i.e., [start_index, end_index]) of available sorted runs
   while (start_index > 0) {
     const SortedRun* sr = &sorted_runs_[start_index - 1];
-    if (sr->being_compacted) {
+    if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
       char file_num_buf[kFormatFileNumberBufSize];
       sr->Dump(file_num_buf, sizeof(file_num_buf), true);
-      ROCKS_LOG_BUFFER(
-          log_buffer_,
-          "[%s] Universal: stopping at sorted run undergoing compaction: "
-          "%s[%" ROCKSDB_PRIszt "]",
-          cf_name_.c_str(), file_num_buf, start_index - 1);
+      if (sr->being_compacted) {
+        ROCKS_LOG_BUFFER(
+            log_buffer_,
+            "[%s] Universal: stopping at sorted run undergoing compaction: "
+            "%s[%" ROCKSDB_PRIszt "]",
+            cf_name_.c_str(), file_num_buf, start_index - 1);
+      } else if (sr->level_has_marked_standalone_rangedel) {
+        ROCKS_LOG_BUFFER(
+            log_buffer_,
+            "[%s] Universal: stopping at sorted run that has standalone range "
+            "tombstone files marked for compaction: "
+            "%s[%" ROCKSDB_PRIszt "]",
+            cf_name_.c_str(), file_num_buf, start_index - 1);
+      }
       break;
     }
     candidate_size += sr->compensated_file_size;
@@ -1257,7 +1364,10 @@ Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
       mutable_cf_options_.default_write_temperature,
-      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* max_subcompactions */ 0, /* grandparents */ {},
+      /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
+      /* is manual */ false,
       /* trim_ts */ "", score_, false /* deletion_compaction */,
       /* l0_files_might_overlap */ true,
       CompactionReason::kUniversalSizeAmplification);
@@ -1288,7 +1398,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
         continue;
       }
       FileMetaData* f = vstorage_->LevelFiles(0)[loop];
-      if (f->marked_for_compaction) {
+      if (f->marked_for_compaction && !ShouldSkipMarkedFile(f)) {
         start_level_inputs.files.push_back(f);
         start_index = static_cast<int>(loop);
         // Consider this as the first candidate.
@@ -1302,7 +1412,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
 
     for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
       SortedRun* sr = &sorted_runs_[loop];
-      if (sr->being_compacted) {
+      if (sr->being_compacted || sr->level_has_marked_standalone_rangedel) {
         break;
       }
 
@@ -1321,7 +1431,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
     // leveled. We pick one of the files marked for compaction and compact with
     // overlapping files in the adjacent level.
     picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
-                                          &output_level, &start_level_inputs);
+                                          &output_level, &start_level_inputs,
+                                          [this](const FileMetaData* file) {
+                                            return ShouldSkipMarkedFile(file);
+                                          });
     if (start_level_inputs.empty()) {
       return nullptr;
     }
@@ -1401,7 +1514,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
       GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
       mutable_cf_options_.default_write_temperature,
-      /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+      /* max_subcompactions */ 0, grandparents, earliest_snapshot_,
+      snapshot_checker_,
+      /* is manual */ false,
       /* trim_ts */ "", score_, false /* deletion_compaction */,
       /* l0_files_might_overlap */ true,
       CompactionReason::kFilesMarkedForCompaction);
@@ -1494,7 +1609,10 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
       GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
                             true /* enable_compression */),
       mutable_cf_options_.default_write_temperature,
-      /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+      /* max_subcompactions */ 0, /* grandparents */ {},
+      /* earliest_snapshot */ std::nullopt,
+      /* snapshot_checker */ nullptr,
+      /* is manual */ false,
       /* trim_ts */ "", score_, false /* deletion_compaction */,
       /* l0_files_might_overlap */ true, compaction_reason);
 }
@@ -1515,7 +1633,8 @@ Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
   // included in the compaction.
 
   size_t start_index = sorted_runs_.size();
-  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+  while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted &&
+         !sorted_runs_[start_index - 1].level_has_marked_standalone_rangedel) {
    start_index--;
   }
   if (start_index == sorted_runs_.size()) {
diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h
index b6103088f..18c0f27af 100644
--- a/db/compaction/compaction_picker_universal.h
+++ b/db/compaction/compaction_picker_universal.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include "db/compaction/compaction_picker.h"
+#include "db/snapshot_checker.h"
 
 namespace ROCKSDB_NAMESPACE {
 class UniversalCompactionPicker : public CompactionPicker {
@@ -17,11 +18,12 @@ class UniversalCompactionPicker : public CompactionPicker {
   UniversalCompactionPicker(const ImmutableOptions& ioptions,
                             const InternalKeyComparator* icmp)
       : CompactionPicker(ioptions, icmp) {}
-  Compaction* PickCompaction(const std::string& cf_name,
-                             const MutableCFOptions& mutable_cf_options,
-                             const MutableDBOptions& mutable_db_options,
-                             VersionStorageInfo* vstorage,
-                             LogBuffer* log_buffer) override;
+  Compaction* PickCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options,
+      const std::vector<SequenceNumber>& existing_snapshots,
+      const SnapshotChecker* snapshot_checker, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer) override;
   int MaxOutputLevel() const override { return NumberLevels() - 1; }
 
   bool NeedsCompaction(const VersionStorageInfo* vstorage) const override;
diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc
index b34c7e662..ff6dd9182 100644
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@@ -48,6 +48,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
   compaction_input.has_end = sub_compact->end.has_value();
   compaction_input.end =
       compaction_input.has_end ? sub_compact->end->ToString() : "";
+  compaction_input.options_file_number =
+      sub_compact->compaction->input_version()
+          ->version_set()
+          ->options_file_number();
+
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService",
+      &compaction_input);
 
   std::string compaction_input_binary;
   Status s = compaction_input.Write(&compaction_input_binary);
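The new `options_file_number` field lets a remote worker locate the exact OPTIONS file that the primary pinned for this job. A hedged sketch of the worker-side lookup; `dbname` and `env` are assumed to be the shared DB path and environment visible to the worker, and the `PreservedOptionsRemoteCompaction` test later in this patch uses the same `OptionsFileName` / `LoadOptionsFromFile` pair:

```cpp
// Sketch only: resolve and load the preserved options snapshot on a worker.
std::string options_file =
    OptionsFileName(dbname, compaction_input.options_file_number);
DBOptions db_options;
std::vector<ColumnFamilyDescriptor> cf_descs;
ConfigOptions config_options;
config_options.env = env;
// Loads the options as they were when the primary scheduled this compaction,
// even if the primary has since changed options and rolled a new file.
Status s = LoadOptionsFromFile(config_options, options_file, &db_options,
                               &cf_descs);
```
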
@@ -195,6 +203,8 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     meta.oldest_ancester_time = file.oldest_ancester_time;
     meta.file_creation_time = file.file_creation_time;
     meta.epoch_number = file.epoch_number;
+    meta.file_checksum = file.file_checksum;
+    meta.file_checksum_func_name = file.file_checksum_func_name;
     meta.marked_for_compaction = file.marked_for_compaction;
     meta.unique_id = file.unique_id;
 
@@ -202,11 +212,14 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService(
     sub_compact->Current().AddOutput(std::move(meta),
                                      cfd->internal_comparator(), false, true,
                                      file.paranoid_hash);
+    sub_compact->Current().UpdateTableProperties(file.table_properties);
   }
   sub_compact->compaction_job_stats = compaction_result.stats;
   sub_compact->Current().SetNumOutputRecords(
-      compaction_result.num_output_records);
-  sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
+      compaction_result.stats.num_output_records);
+  sub_compact->Current().SetNumOutputFiles(
+      compaction_result.stats.num_output_files);
+  sub_compact->Current().AddBytesWritten(compaction_result.bytes_written);
   RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
   RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
              compaction_result.bytes_written);
@@ -226,6 +239,18 @@ void CompactionServiceCompactionJob::RecordCompactionIOStats() {
   CompactionJob::RecordCompactionIOStats();
 }
 
+void CompactionServiceCompactionJob::UpdateCompactionJobStats(
+    const InternalStats::CompactionStats& stats) const {
+  compaction_job_stats_->elapsed_micros = stats.micros;
+
+  // output information only in remote compaction
+  compaction_job_stats_->total_output_bytes = stats.bytes_written;
+  compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+  compaction_job_stats_->num_output_records = stats.num_output_records;
+  compaction_job_stats_->num_output_files = stats.num_output_files;
+  compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+}
+
 CompactionServiceCompactionJob::CompactionServiceCompactionJob(
     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
     const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
@@ -280,6 +305,9 @@ Status CompactionServiceCompactionJob::Run() {
   log_buffer_->FlushBufferToLog();
   LogCompaction();
 
+  compaction_result_->stats.Reset();
+
   const uint64_t start_micros = db_options_.clock->NowMicros();
   c->GetOrInitInputTableProperties();
 
@@ -320,39 +348,48 @@ Status CompactionServiceCompactionJob::Run() {
   if (status.ok()) {
     status = io_s;
   }
-  if (status.ok()) {
-    // TODO: Add verify_table()
-  }
-
-  // Finish up all book-keeping to unify the subcompaction results
-  compact_->AggregateCompactionStats(compaction_stats_,
-                                     *compaction_job_stats_);
-  UpdateCompactionStats();
-  RecordCompactionIOStats();
 
   LogFlush(db_options_.info_log);
   compact_->status = status;
   compact_->status.PermitUncheckedError();
 
-  // Build compaction result
+  // Build Compaction Job Stats
+
+  // 1.
Aggregate CompactionOutputStats into Internal Compaction Stats + // (compaction_stats_) and aggregate Compaction Job Stats + // (compaction_job_stats_) from the sub compactions + compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); + + // 2. Update the Output information in the Compaction Job Stats with + // aggregated Internal Compaction Stats. + UpdateCompactionJobStats(compaction_stats_.stats); + + // 3. Set fields that are not propagated as part of aggregations above + compaction_result_->stats.is_manual_compaction = c->is_manual_compaction(); + compaction_result_->stats.is_full_compaction = c->is_full_compaction(); + compaction_result_->stats.is_remote_compaction = true; + + // 4. Update IO Stats that are not part of the aggregations above (bytes_read, + // bytes_written) + RecordCompactionIOStats(); + + // Build Output compaction_result_->output_level = compact_->compaction->output_level(); compaction_result_->output_path = output_path_; - compaction_result_->stats.is_remote_compaction = true; for (const auto& output_file : sub_compact->GetOutputs()) { auto& meta = output_file.meta; compaction_result_->output_files.emplace_back( MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(), meta.oldest_ancester_time, - meta.file_creation_time, meta.epoch_number, - output_file.validator.GetHash(), meta.marked_for_compaction, - meta.unique_id); + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, output_file.validator.GetHash(), + meta.marked_for_compaction, meta.unique_id, + output_file.table_properties); } - InternalStats::CompactionStatsFull compaction_stats; - sub_compact->AggregateCompactionStats(compaction_stats); - compaction_result_->num_output_records = - compaction_stats.stats.num_output_records; - compaction_result_->total_bytes = compaction_stats.TotalBytesWritten(); + TEST_SYNC_POINT_CALLBACK("CompactionServiceCompactionJob::Run:0", + &compaction_result_); return status; } @@ -435,6 +472,163 @@ static std::unordered_map cs_input_type_info = { {"end", {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"options_file_number", + {offsetof(struct CompactionServiceInput, options_file_number), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, +}; +static std::unordered_map + table_properties_type_info = { + {"orig_file_number", + {offsetof(struct TableProperties, orig_file_number), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"data_size", + {offsetof(struct TableProperties, data_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_size", + {offsetof(struct TableProperties, index_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"index_partitions", + {offsetof(struct TableProperties, index_partitions), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"top_level_index_size", + {offsetof(struct TableProperties, top_level_index_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_key_is_user_key", + {offsetof(struct TableProperties, index_key_is_user_key), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"index_value_is_delta_encoded", + 
{offsetof(struct TableProperties, index_value_is_delta_encoded), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"filter_size", + {offsetof(struct TableProperties, filter_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"raw_key_size", + {offsetof(struct TableProperties, raw_key_size), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"raw_value_size", + {offsetof(struct TableProperties, raw_value_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_data_blocks", + {offsetof(struct TableProperties, num_data_blocks), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_entries", + {offsetof(struct TableProperties, num_entries), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_filter_entries", + {offsetof(struct TableProperties, num_filter_entries), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_deletions", + {offsetof(struct TableProperties, num_deletions), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"num_merge_operands", + {offsetof(struct TableProperties, num_merge_operands), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"num_range_deletions", + {offsetof(struct TableProperties, num_range_deletions), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"format_version", + {offsetof(struct TableProperties, format_version), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"fixed_key_len", + {offsetof(struct TableProperties, fixed_key_len), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"column_family_id", + {offsetof(struct TableProperties, column_family_id), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"creation_time", + {offsetof(struct TableProperties, creation_time), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"oldest_key_time", + {offsetof(struct TableProperties, oldest_key_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_creation_time", + {offsetof(struct TableProperties, file_creation_time), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"slow_compression_estimated_data_size", + {offsetof(struct TableProperties, + slow_compression_estimated_data_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"fast_compression_estimated_data_size", + {offsetof(struct TableProperties, + fast_compression_estimated_data_size), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"external_sst_file_global_seqno_offset", + {offsetof(struct TableProperties, + external_sst_file_global_seqno_offset), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"tail_start_offset", + {offsetof(struct TableProperties, tail_start_offset), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"user_defined_timestamps_persisted", + {offsetof(struct TableProperties, user_defined_timestamps_persisted), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"key_largest_seqno", + 
{offsetof(struct TableProperties, key_largest_seqno), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"db_id", + {offsetof(struct TableProperties, db_id), OptionType::kEncodedString}}, + {"db_session_id", + {offsetof(struct TableProperties, db_session_id), + OptionType::kEncodedString}}, + {"db_host_id", + {offsetof(struct TableProperties, db_host_id), + OptionType::kEncodedString}}, + {"column_family_name", + {offsetof(struct TableProperties, column_family_name), + OptionType::kEncodedString}}, + {"filter_policy_name", + {offsetof(struct TableProperties, filter_policy_name), + OptionType::kEncodedString}}, + {"comparator_name", + {offsetof(struct TableProperties, comparator_name), + OptionType::kEncodedString}}, + {"merge_operator_name", + {offsetof(struct TableProperties, merge_operator_name), + OptionType::kEncodedString}}, + {"prefix_extractor_name", + {offsetof(struct TableProperties, prefix_extractor_name), + OptionType::kEncodedString}}, + {"property_collectors_names", + {offsetof(struct TableProperties, property_collectors_names), + OptionType::kEncodedString}}, + {"compression_name", + {offsetof(struct TableProperties, compression_name), + OptionType::kEncodedString}}, + {"compression_options", + {offsetof(struct TableProperties, compression_options), + OptionType::kEncodedString}}, + {"seqno_to_time_mapping", + {offsetof(struct TableProperties, seqno_to_time_mapping), + OptionType::kEncodedString}}, + {"user_collected_properties", + OptionTypeInfo::StringMap( + offsetof(struct TableProperties, user_collected_properties), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, + {"readable_properties", + OptionTypeInfo::StringMap( + offsetof(struct TableProperties, readable_properties), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, }; static std::unordered_map @@ -471,6 +665,14 @@ static std::unordered_map {offsetof(struct CompactionServiceOutputFile, epoch_number), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"file_checksum", + {offsetof(struct CompactionServiceOutputFile, file_checksum), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, + {"file_checksum_func_name", + {offsetof(struct CompactionServiceOutputFile, file_checksum_func_name), + OptionType::kEncodedString, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"paranoid_hash", {offsetof(struct CompactionServiceOutputFile, paranoid_hash), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -484,6 +686,11 @@ static std::unordered_map offsetof(struct CompactionServiceOutputFile, unique_id), OptionVerificationType::kNormal, OptionTypeFlags::kNone, {0, OptionType::kUInt64T})}, + {"table_properties", + OptionTypeInfo::Struct( + "table_properties", &table_properties_type_info, + offsetof(struct CompactionServiceOutputFile, table_properties), + OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, }; static std::unordered_map @@ -703,14 +910,6 @@ static std::unordered_map cs_result_type_info = { {offsetof(struct CompactionServiceResult, output_path), OptionType::kEncodedString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, - {"num_output_records", - {offsetof(struct CompactionServiceResult, num_output_records), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, - {"total_bytes", - {offsetof(struct CompactionServiceResult, total_bytes), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, 
{"bytes_read", {offsetof(struct CompactionServiceResult, bytes_read), OptionType::kUInt64T, OptionVerificationType::kNormal, diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index bb53a4029..ad3fa1a5c 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -3,9 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). - #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/utilities/options_util.h" #include "table/unique_id_impl.h" namespace ROCKSDB_NAMESPACE { @@ -349,7 +349,27 @@ TEST_F(CompactionServiceTest, BasicCompactions) { } else { ASSERT_OK(result.status); } + ASSERT_GE(result.stats.elapsed_micros, 1); + ASSERT_GE(result.stats.cpu_micros, 1); + + ASSERT_EQ(20, result.stats.num_output_records); + ASSERT_EQ(result.output_files.size(), result.stats.num_output_files); + + uint64_t total_size = 0; + for (auto output_file : result.output_files) { + std::string file_name = result.output_path + "/" + output_file.file_name; + + uint64_t file_size = 0; + ASSERT_OK(options.env->GetFileSize(file_name, &file_size)); + ASSERT_GT(file_size, 0); + total_size += file_size; + } + ASSERT_EQ(total_size, result.stats.total_output_bytes); + ASSERT_TRUE(result.stats.is_remote_compaction); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_FALSE(result.stats.is_full_compaction); + Close(); } @@ -396,6 +416,501 @@ TEST_F(CompactionServiceTest, ManualCompaction) { ASSERT_TRUE(result.stats.is_remote_compaction); } +TEST_F(CompactionServiceTest, PreservedOptionsLocalCompaction) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { + auto compaction = static_cast(arg); + std::string options_file_name = OptionsFileName( + dbname_, + compaction->input_version()->version_set()->options_file_number()); + + // Change option twice to make sure the very first OPTIONS file gets + // purged + ASSERT_OK(dbfull()->SetOptions( + {{"level0_file_num_compaction_trigger", "4"}})); + ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger); + ASSERT_OK(dbfull()->SetOptions( + {{"level0_file_num_compaction_trigger", "6"}})); + ASSERT_EQ(6, dbfull()->GetOptions().level0_file_num_compaction_trigger); + dbfull()->TEST_DeleteObsoleteFiles(); + + // For non-remote compactions, OPTIONS file can be deleted while + // using option at the start of the compaction + Status s = env_->FileExists(options_file_name); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsNotFound()); + // Should be old value + ASSERT_EQ(2, compaction->mutable_cf_options() + ->level0_file_num_compaction_trigger); + ASSERT_TRUE(dbfull()->min_options_file_numbers_.empty()); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok()); +} + +TEST_F(CompactionServiceTest, PreservedOptionsRemoteCompaction) { + // For non-remote compaction do not preserve options file + Options options = CurrentOptions(); + 
options.level0_file_num_compaction_trigger = 2; + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + Random rnd(301); + for (auto i = 0; i < 2; ++i) { + for (auto j = 0; j < 10; ++j) { + ASSERT_OK( + Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024))); + } + ASSERT_OK(Flush()); + } + + bool is_primary_called = false; + // This will be called twice. One from primary and one from remote. + // Try changing the option when called from remote. Otherwise, the new option + // will be used + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", [&](void* /*arg*/) { + if (!is_primary_called) { + is_primary_called = true; + return; + } + // Change the option right before the compaction run + ASSERT_OK(dbfull()->SetOptions( + {{"level0_file_num_compaction_trigger", "4"}})); + ASSERT_EQ(4, dbfull()->GetOptions().level0_file_num_compaction_trigger); + dbfull()->TEST_DeleteObsoleteFiles(); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceJob::ProcessKeyValueCompactionWithCompactionService", + [&](void* arg) { + auto input = static_cast(arg); + std::string options_file_name = + OptionsFileName(dbname_, input->options_file_number); + + ASSERT_OK(env_->FileExists(options_file_name)); + ASSERT_FALSE(dbfull()->min_options_file_numbers_.empty()); + ASSERT_EQ(dbfull()->min_options_file_numbers_.front(), + input->options_file_number); + + DBOptions db_options; + ConfigOptions config_options; + std::vector all_column_families; + config_options.env = env_; + ASSERT_OK(LoadOptionsFromFile(config_options, options_file_name, + &db_options, &all_column_families)); + bool has_cf = false; + for (auto& cf : all_column_families) { + if (cf.name == input->cf_name) { + // Should be old value + ASSERT_EQ(2, cf.options.level0_file_num_compaction_trigger); + has_cf = true; + } + } + ASSERT_TRUE(has_cf); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) { + auto compaction = static_cast(arg); + ASSERT_EQ(2, compaction->mutable_cf_options() + ->level0_file_num_compaction_trigger); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.ok()); + + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); +} + +class EventVerifier : public EventListener { + public: + explicit EventVerifier(uint64_t expected_num_input_records, + size_t expected_num_input_files, + uint64_t expected_num_output_records, + size_t expected_num_output_files, + const std::string& expected_smallest_output_key_prefix, + const std::string& expected_largest_output_key_prefix, + bool expected_is_remote_compaction_on_begin, + bool expected_is_remote_compaction_on_complete) + : expected_num_input_records_(expected_num_input_records), + expected_num_input_files_(expected_num_input_files), + expected_num_output_records_(expected_num_output_records), + expected_num_output_files_(expected_num_output_files), + expected_smallest_output_key_prefix_( + expected_smallest_output_key_prefix), + expected_largest_output_key_prefix_(expected_largest_output_key_prefix), + expected_is_remote_compaction_on_begin_( + 
expected_is_remote_compaction_on_begin), + expected_is_remote_compaction_on_complete_( + expected_is_remote_compaction_on_complete) {} + void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_EQ(expected_num_input_files_, ci.input_files.size()); + ASSERT_EQ(expected_num_input_files_, ci.input_file_infos.size()); + ASSERT_EQ(expected_is_remote_compaction_on_begin_, + ci.stats.is_remote_compaction); + ASSERT_TRUE(ci.stats.is_manual_compaction); + ASSERT_FALSE(ci.stats.is_full_compaction); + } + void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { + ASSERT_GT(ci.stats.elapsed_micros, 0); + ASSERT_GT(ci.stats.cpu_micros, 0); + ASSERT_EQ(expected_num_input_records_, ci.stats.num_input_records); + ASSERT_EQ(expected_num_input_files_, ci.stats.num_input_files); + ASSERT_EQ(expected_num_output_records_, ci.stats.num_output_records); + ASSERT_EQ(expected_num_output_files_, ci.stats.num_output_files); + ASSERT_EQ(expected_smallest_output_key_prefix_, + ci.stats.smallest_output_key_prefix); + ASSERT_EQ(expected_largest_output_key_prefix_, + ci.stats.largest_output_key_prefix); + ASSERT_GT(ci.stats.total_input_bytes, 0); + ASSERT_GT(ci.stats.total_output_bytes, 0); + ASSERT_EQ(ci.stats.num_input_records, + ci.stats.num_output_records + ci.stats.num_records_replaced); + ASSERT_EQ(expected_is_remote_compaction_on_complete_, + ci.stats.is_remote_compaction); + ASSERT_TRUE(ci.stats.is_manual_compaction); + ASSERT_FALSE(ci.stats.is_full_compaction); + } + + private: + uint64_t expected_num_input_records_; + size_t expected_num_input_files_; + uint64_t expected_num_output_records_; + size_t expected_num_output_files_; + std::string expected_smallest_output_key_prefix_; + std::string expected_largest_output_key_prefix_; + bool expected_is_remote_compaction_on_begin_; + bool expected_is_remote_compaction_on_complete_; +}; + +TEST_F(CompactionServiceTest, VerifyStats) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + auto event_verifier = std::make_shared( + 30 /* expected_num_input_records */, 3 /* expected_num_input_files */, + 20 /* expected_num_output_records */, 1 /* expected_num_output_files */, + "key00000" /* expected_smallest_output_key_prefix */, + "key00001" /* expected_largest_output_key_prefix */, + true /* expected_is_remote_compaction_on_begin */, + true /* expected_is_remote_compaction_on_complete */); + options.listeners.push_back(event_verifier); + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(0); + std::string end_str = Key(1); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + VerifyTestData(); + + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); +} + +TEST_F(CompactionServiceTest, VerifyStatsLocalFallback) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + auto event_verifier = std::make_shared( + 30 /* expected_num_input_records */, 3 /* expected_num_input_files */, + 20 /* expected_num_output_records */, 1 /* expected_num_output_files */, + "key00000" /* expected_smallest_output_key_prefix */, + "key00001" /* expected_largest_output_key_prefix */, + true /* 
expected_is_remote_compaction_on_begin */, + false /* expected_is_remote_compaction_on_complete */); + options.listeners.push_back(event_verifier); + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal); + + std::string start_str = Key(0); + std::string end_str = Key(1); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + // Remote Compaction did not happen + ASSERT_EQ(my_cs->GetCompactionNum(), comp_num); + VerifyTestData(); +} + +TEST_F(CompactionServiceTest, CorruptedOutput) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceCompactionJob::Run:0", [&](void* arg) { + CompactionServiceResult* compaction_result = + *(static_cast(arg)); + ASSERT_TRUE(compaction_result != nullptr && + !compaction_result->output_files.empty()); + // Corrupt files here + for (const auto& output_file : compaction_result->output_files) { + std::string file_name = + compaction_result->output_path + "/" + output_file.file_name; + + uint64_t file_size = 0; + Status s = options.env->GetFileSize(file_name, &file_size); + ASSERT_OK(s); + ASSERT_GT(file_size, 0); + + ASSERT_OK(test::CorruptFile(env_, file_name, 0, + static_cast(file_size), + true /* verifyChecksum */)); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // CompactRange() should fail + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); + + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // On the worker side, the compaction is considered success + // Verification is done on the primary side + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); +} + +TEST_F(CompactionServiceTest, CorruptedOutputParanoidFileCheck) { + for (bool paranoid_file_check_enabled : {false, true}) { + SCOPED_TRACE("paranoid_file_check_enabled=" + + std::to_string(paranoid_file_check_enabled)); + + Options options = CurrentOptions(); + Destroy(options); + options.disable_auto_compactions = true; + options.paranoid_file_checks = paranoid_file_check_enabled; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceCompactionJob::Run:0", [&](void* arg) { + CompactionServiceResult* compaction_result = + *(static_cast(arg)); + ASSERT_TRUE(compaction_result != nullptr && + !compaction_result->output_files.empty()); + // Corrupt files here + for (const auto& output_file : compaction_result->output_files) { + std::string file_name = + compaction_result->output_path + "/" + 
output_file.file_name; + + // Corrupt a very small range of bytes. This corruption is so small + // that it isn't caught by the default light-weight check + ASSERT_OK(test::CorruptFile(env_, file_name, 0, 1, + false /* verifyChecksum */)); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + if (paranoid_file_check_enabled) { + ASSERT_NOK(s); + ASSERT_EQ(Status::Corruption("Paranoid checksums do not match"), s); + } else { + // CompactRange() goes through if paranoid file check is not enabled + ASSERT_OK(s); + } + + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // On the worker side, the compaction is considered successful + // Verification is done on the primary side + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); + } +} + +TEST_F(CompactionServiceTest, TruncatedOutput) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceCompactionJob::Run:0", [&](void* arg) { + CompactionServiceResult* compaction_result = + *(static_cast<CompactionServiceResult**>(arg)); + ASSERT_TRUE(compaction_result != nullptr && + !compaction_result->output_files.empty()); + // Truncate files here + for (const auto& output_file : compaction_result->output_files) { + std::string file_name = + compaction_result->output_path + "/" + output_file.file_name; + + uint64_t file_size = 0; + Status s = options.env->GetFileSize(file_name, &file_size); + ASSERT_OK(s); + ASSERT_GT(file_size, 0); + + ASSERT_OK(test::TruncateFile(env_, file_name, file_size / 2)); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // CompactRange() should fail + Status s = db_->CompactRange(CompactRangeOptions(), &start, &end); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsCorruption()); + + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // On the worker side, the compaction is considered successful + // Verification is done on the primary side + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); +} + +TEST_F(CompactionServiceTest, CustomFileChecksum) { + Options options = CurrentOptions(); + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + ReopenWithCompactionService(&options); + GenerateTestData(); + + auto my_cs = GetCompactionService(); + + std::string start_str = Key(15); + std::string end_str = Key(45); + Slice start(start_str); + Slice end(end_str); + uint64_t comp_num = my_cs->GetCompactionNum(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionServiceCompactionJob::Run:0", [&](void* arg) { + CompactionServiceResult* compaction_result = + *(static_cast<CompactionServiceResult**>(arg)); + ASSERT_TRUE(compaction_result != nullptr && + !compaction_result->output_files.empty()); + // Validate Checksum files here + for 
(const auto& output_file : compaction_result->output_files) { + std::string file_name = + compaction_result->output_path + "/" + output_file.file_name; + + FileChecksumGenContext gen_context; + gen_context.file_name = file_name; + std::unique_ptr<FileChecksumGenerator> file_checksum_gen = + options.file_checksum_gen_factory->CreateFileChecksumGenerator( + gen_context); + + std::unique_ptr<SequentialFile> file_reader; + uint64_t file_size = 0; + Status s = options.env->GetFileSize(file_name, &file_size); + ASSERT_OK(s); + ASSERT_GT(file_size, 0); + + s = options.env->NewSequentialFile(file_name, &file_reader, + EnvOptions()); + ASSERT_OK(s); + + Slice result; + std::unique_ptr<char[]> scratch(new char[file_size]); + s = file_reader->Read(file_size, &result, scratch.get()); + ASSERT_OK(s); + + file_checksum_gen->Update(scratch.get(), result.size()); + file_checksum_gen->Finalize(); + + // Verify actual checksum and the func name + ASSERT_EQ(file_checksum_gen->Name(), + output_file.file_checksum_func_name); + ASSERT_EQ(file_checksum_gen->GetChecksum(), + output_file.file_checksum); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end)); + ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + CompactionServiceResult result; + my_cs->GetResult(&result); + ASSERT_OK(result.status); + ASSERT_TRUE(result.stats.is_manual_compaction); + ASSERT_TRUE(result.stats.is_remote_compaction); +} + TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) { Options options = CurrentOptions(); options.disable_auto_compactions = true; diff --git a/db/compaction/compaction_state.cc b/db/compaction/compaction_state.cc index ee4b0c189..bf016d04b 100644 --- a/db/compaction/compaction_state.cc +++ b/db/compaction/compaction_state.cc @@ -39,7 +39,7 @@ void CompactionState::AggregateCompactionStats( InternalStats::CompactionStatsFull& compaction_stats, CompactionJobStats& compaction_job_stats) { for (const auto& sc : sub_compact_states) { - sc.AggregateCompactionStats(compaction_stats); + sc.AggregateCompactionOutputStats(compaction_stats); compaction_job_stats.Add(sc.compaction_job_stats); } } diff --git a/db/compaction/subcompaction_state.cc b/db/compaction/subcompaction_state.cc index 0c56471e9..aae446351 100644 --- a/db/compaction/subcompaction_state.cc +++ b/db/compaction/subcompaction_state.cc @@ -13,7 +13,7 @@ #include "rocksdb/sst_partitioner.h" namespace ROCKSDB_NAMESPACE { -void SubcompactionState::AggregateCompactionStats( +void SubcompactionState::AggregateCompactionOutputStats( InternalStats::CompactionStatsFull& compaction_stats) const { compaction_stats.stats.Add(compaction_outputs_.stats_); if (HasPenultimateLevelOutputs()) { diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h index b933a62a5..252fdfb8a 100644 --- a/db/compaction/subcompaction_state.h +++ b/db/compaction/subcompaction_state.h @@ -179,7 +179,7 @@ class SubcompactionState { void Cleanup(Cache* cache); - void AggregateCompactionStats( + void AggregateCompactionOutputStats( InternalStats::CompactionStatsFull& compaction_stats) const; CompactionOutputs& Current() const { diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index 93b979121..6be3c63eb 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -2512,6 +2512,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) { start_time + 
kSecondsPerRecording * (i + 1)); } } + ASSERT_EQ(kNumKeys, i); ASSERT_OK(iter->status()); } @@ -2531,12 +2532,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromMemtables) { } } ASSERT_OK(iter->status()); + ASSERT_EQ(-1, i); } // Reopen the DB and disable the seqno to time recording, data with user // specified write time can still get a write time before it's flushed. options.preserve_internal_time_seconds = 0; - DestroyAndReopen(options); + Reopen(options); ASSERT_OK(TimedPut(Key(kKeyWithWriteTime), rnd.RandomString(100), kUserSpecifiedWriteTime)); { @@ -2613,6 +2615,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) { } } ASSERT_OK(iter->status()); + ASSERT_EQ(kNumKeys, i); } // Backward iteration @@ -2632,12 +2635,13 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) { } } ASSERT_OK(iter->status()); + ASSERT_EQ(-1, i); } // Reopen the DB and disable the seqno to time recording. Data retrieved from // SST files still have write time available. options.preserve_internal_time_seconds = 0; - DestroyAndReopen(options); + Reopen(options); dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(kSecondsPerRecording); }); @@ -2663,6 +2667,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) { start_time + kSecondsPerRecording * (i + 1)); } } + ASSERT_EQ(kNumKeys, i); ASSERT_OK(iter->status()); } @@ -2686,6 +2691,7 @@ TEST_P(IteratorWriteTimeTest, ReadFromSstFile) { VerifyKeyAndWriteTime(iter.get(), Key(i), 0); } ASSERT_OK(iter->status()); + ASSERT_EQ(kNumKeys, i); } Close(); } diff --git a/db/convenience.cc b/db/convenience.cc index cd02b15f8..84d3f99a9 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -87,7 +87,7 @@ Status VerifySstFileChecksumInternal(const Options& options, options.block_protection_bytes_per_key, false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, -1 /* level */); reader_options.largest_seqno = largest_seqno; - s = ioptions.table_factory->NewTableReader( + s = options.table_factory->NewTableReader( read_options, reader_options, std::move(file_reader), file_size, &table_reader, false /* prefetch_index_and_filter_in_cache */); if (!s.ok()) { diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index fdcfc2884..adb2d599a 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -563,7 +563,7 @@ TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) { } } -TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) { +TEST_F(DBBlockCacheTest, DynamicOptions) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -578,39 +578,74 @@ TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) { DestroyAndReopen(options); std::string value(kValueSize, 'a'); + auto st = options.statistics; - for (size_t i = 1; i <= 5; i++) { - ASSERT_OK(Put(std::to_string(i), value)); - ASSERT_OK(Flush()); - ASSERT_EQ(1, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + size_t i = 1; + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); - ASSERT_EQ(value, Get(std::to_string(i))); - ASSERT_EQ(0, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); - ASSERT_EQ( - 0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); - ASSERT_EQ(1, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); - } + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + 
ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + ++i; ASSERT_OK(dbfull()->SetOptions( {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}})); - for (size_t i = 6; i <= kNumBlocks; i++) { - ASSERT_OK(Put(std::to_string(i), value)); - ASSERT_OK(Flush()); - ASSERT_EQ(0, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); - ASSERT_EQ(value, Get(std::to_string(i))); - ASSERT_EQ(1, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); - ASSERT_EQ( - 1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); - ASSERT_EQ(0, - options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); - } + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + + ++i; + ASSERT_OK(dbfull()->SetOptions({{"block_based_table_factory", + "{prepopulate_block_cache=kFlushOnly;}"}})); + + ASSERT_OK(Put(std::to_string(i), value)); + ASSERT_OK(Flush()); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + ASSERT_EQ(value, Get(std::to_string(i))); + ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + + ++i; + // NOT YET SUPPORTED + // FIXME: find a way to make this fail again (until well supported) + // ASSERT_NOK(dbfull()->SetOptions( + // {{"block_based_table_factory", "{block_cache=null;}"}})); + + // ASSERT_OK(Put(std::to_string(i), value)); + // ASSERT_OK(Flush()); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + // ASSERT_EQ(value, Get(std::to_string(i))); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); + + // ++i; + + // NOT YET SUPPORTED + // FIXME: find a way to make this fail again (until well supported) + // ASSERT_NOK(dbfull()->SetOptions( + // {{"block_based_table_factory", "{block_cache=1M;}"}})); + + // ASSERT_OK(Put(std::to_string(i), value)); + // ASSERT_OK(Flush()); + // ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + + // ASSERT_EQ(value, Get(std::to_string(i))); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD)); + // ASSERT_EQ(0, st->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS)); + // ASSERT_EQ(1, st->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT)); } #endif diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index f0b3c4066..66fbaab02 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -1975,10 +1975,24 @@ TEST_F(DBBloomFilterTest, MutatingRibbonFilterPolicy) { if (configs.empty()) { break; } + std::string factory_field = + (v[0] & 1) ? 
"table_factory" : "block_based_table_factory"; + // Some irrelevant SetOptions to be sure they don't interfere + ASSERT_OK(db_->SetOptions({{"level0_file_num_compaction_trigger", "10"}})); ASSERT_OK( - db_->SetOptions({{"table_factory.filter_policy.bloom_before_level", + db_->SetOptions({{"block_based_table_factory", "{block_size=1234}"}})); + ASSERT_OK(db_->SetOptions({{factory_field + ".block_size", "12345"}})); + + // Test the mutable field we're interested in + ASSERT_OK( + db_->SetOptions({{factory_field + ".filter_policy.bloom_before_level", configs.back().first}})); + // FilterPolicy pointer should not have changed + ASSERT_EQ(db_->GetOptions() + .table_factory->GetOptions() + ->filter_policy.get(), + table_options.filter_policy.get()); // Ensure original object is mutated std::string val; @@ -1991,6 +2005,59 @@ TEST_F(DBBloomFilterTest, MutatingRibbonFilterPolicy) { } } +TEST_F(DBBloomFilterTest, MutableFilterPolicy) { + // Test that BlockBasedTableOptions::filter_policy is mutable (replaceable) + // with SetOptions. + + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + auto& stats = *options.statistics; + BlockBasedTableOptions table_options; + // First config, to make sure there's no issues with this shared ptr + // etc. when the DB switches filter policies. + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + double expected_bpk = 10.0; + // Other configs to try + std::vector> configs = { + {"ribbonfilter:10:-1", 7.0}, {"bloomfilter:5", 5.0}, {"nullptr", 0.0}}; + + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.level0_file_num_compaction_trigger = + static_cast(configs.size()) + 2; + + ASSERT_OK(TryReopen(options)); + + char v[] = "a"; + + for (;; ++(v[0])) { + const int maxKey = 8000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(Key(i), v)); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Get(Key(i)), v); + } + + uint64_t filter_bytes = + stats.getAndResetTickerCount(BLOCK_CACHE_FILTER_BYTES_INSERT); + + EXPECT_NEAR(filter_bytes * 8.0 / maxKey, expected_bpk, 0.3); + + if (configs.empty()) { + break; + } + + ASSERT_OK( + db_->SetOptions({{"block_based_table_factory", + "{filter_policy=" + configs.back().first + "}"}})); + expected_bpk = configs.back().second; + configs.pop_back(); + } +} + class SliceTransformLimitedDomain : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomain"; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index b218bd0ba..a9cde476a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1153,6 +1153,13 @@ void DBImpl::DumpStats() { continue; } + auto* table_factory = + cfd->GetCurrentMutableCFOptions()->table_factory.get(); + assert(table_factory != nullptr); + // FIXME: need to a shared_ptr if/when block_cache is going to be mutable + Cache* cache = + table_factory->GetOptions(TableFactory::kBlockCacheOpts()); + // Release DB mutex for gathering cache entry stats. Pass over all // column families for this first so that other stats are dumped // near-atomically. 
@@ -1161,10 +1168,6 @@ // Probe block cache for problems (if not already via another CF) if (immutable_db_options_.info_log) { - auto* table_factory = cfd->ioptions()->table_factory.get(); - assert(table_factory != nullptr); - Cache* cache = - table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts()); if (cache && probed_caches.insert(cache).second) { cache->ReportProblems(immutable_db_options_.info_log); } @@ -3989,9 +3992,9 @@ std::unique_ptr<IterType> DBImpl::NewMultiCfIterator( if (!s.ok()) { return error_iterator_func(s); } - return std::make_unique<ImplType>(column_families[0]->GetComparator(), - column_families, - std::move(child_iterators)); + return std::make_unique<ImplType>( + column_families[0]->GetComparator(), _read_options.allow_unprepared_value, + column_families, std::move(child_iterators)); } Status DBImpl::NewIterators( @@ -4295,8 +4298,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { } // Avoid to go through every column family by checking a global threshold // first. + CfdList cf_scheduled; if (oldest_snapshot > bottommost_files_mark_threshold_) { - CfdList cf_scheduled; for (auto* cfd : *versions_->GetColumnFamilySet()) { if (!cfd->ioptions()->allow_ingest_behind) { cfd->current()->storage_info()->UpdateOldestSnapshot( @@ -4328,6 +4331,24 @@ } bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold; } + + // Avoid to go through every column family by checking a global threshold + // first. + if (oldest_snapshot >= standalone_range_deletion_files_mark_threshold_) { + for (auto* cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped() || CfdListContains(cf_scheduled, cfd)) { + continue; + } + if (oldest_snapshot >= + cfd->current() + ->storage_info() + ->standalone_range_tombstone_files_mark_threshold()) { + EnqueuePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + cf_scheduled.push_back(cfd); + } + } + } } delete casted_s; } @@ -4779,6 +4800,24 @@ void DBImpl::ReleaseFileNumberFromPendingOutputs( } } +std::list<uint64_t>::iterator DBImpl::CaptureOptionsFileNumber() { + // We need to remember the iterator of our insert, because after the + // compaction is done, we need to remove that element from + // min_options_file_numbers_. 
+ min_options_file_numbers_.push_back(versions_->options_file_number()); + auto min_options_file_numbers_inserted_elem = min_options_file_numbers_.end(); + --min_options_file_numbers_inserted_elem; + return min_options_file_numbers_inserted_elem; +} + +void DBImpl::ReleaseOptionsFileNumber( + std::unique_ptr<std::list<uint64_t>::iterator>& v) { + if (v.get() != nullptr) { + min_options_file_numbers_.erase(*v.get()); + v.reset(); + } +} + Status DBImpl::GetUpdatesSince( SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter, const TransactionLogIterator::ReadOptions& read_options) { @@ -5811,7 +5850,6 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector<IngestExternalFileArgs>& args) { // TODO: plumb Env::IOActivity, Env::IOPriority - const ReadOptions read_options; const WriteOptions write_options; if (args.empty()) { @@ -5837,6 +5875,10 @@ snprintf(err_msg, 128, "external_files[%zu] is empty", i); return Status::InvalidArgument(err_msg); } + if (i && args[i].options.fill_cache != args[i - 1].options.fill_cache) { + return Status::InvalidArgument( + "fill_cache should be the same across ingestion options."); + } } for (const auto& arg : args) { const IngestExternalFileOptions& ingest_opts = arg.options; @@ -6024,6 +6066,8 @@ } } if (status.ok()) { + ReadOptions read_options; + read_options.fill_cache = args[0].options.fill_cache; autovector<ColumnFamilyData*> cfds_to_commit; autovector<const MutableCFOptions*> mutable_cf_options_list; autovector<autovector<VersionEdit*>> edit_lists; diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 218c1851e..b81110fa9 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -853,6 +853,8 @@ class DBImpl : public DB { uint64_t GetObsoleteSstFilesSize(); + uint64_t MinOptionsFileNumberToKeep(); + // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than @@ -1197,9 +1199,7 @@ uint64_t TEST_total_log_size() const { return total_log_size_; } - // Returns column family name to ImmutableCFOptions map. - Status TEST_GetAllImmutableCFOptions( - std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map); + void TEST_GetAllBlockCaches(std::unordered_set<const Cache*>* cache_set); // Return the lastest MutableCFOptions of a column family Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family, @@ -1694,6 +1694,8 @@ friend class XFTransactionWriteHandler; friend class DBBlobIndexTest; friend class WriteUnpreparedTransactionTest_RecoveryTest_Test; + friend class CompactionServiceTest_PreservedOptionsLocalCompaction_Test; + friend class CompactionServiceTest_PreservedOptionsRemoteCompaction_Test; #endif struct CompactionState; @@ -1965,6 +1967,12 @@ void ReleaseFileNumberFromPendingOutputs( std::unique_ptr<std::list<uint64_t>::iterator>& v); + // Similar to pending_outputs_, preserve the OPTIONS file. Used for remote + // compaction. + std::list<uint64_t>::iterator CaptureOptionsFileNumber(); + void ReleaseOptionsFileNumber( + std::unique_ptr<std::list<uint64_t>::iterator>& v); + // Sets bg error if there is an error writing to WAL. IOStatus SyncClosedWals(const WriteOptions& write_options, JobContext* job_context, VersionEdit* synced_wals, @@ -2076,17 +2084,18 @@ // memtable pending flush. // resuming_from_bg_err indicates whether the caller is attempting to resume // from background error. 
- Status WaitForFlushMemTable(ColumnFamilyData* cfd, - const uint64_t* flush_memtable_id = nullptr, - bool resuming_from_bg_err = false) { + Status WaitForFlushMemTable( + ColumnFamilyData* cfd, const uint64_t* flush_memtable_id = nullptr, + bool resuming_from_bg_err = false, + std::optional<FlushReason> flush_reason = std::nullopt) { return WaitForFlushMemTables({cfd}, {flush_memtable_id}, - resuming_from_bg_err); + resuming_from_bg_err, flush_reason); } // Wait for memtables to be flushed for multiple column families. Status WaitForFlushMemTables( const autovector<ColumnFamilyData*>& cfds, const autovector<const uint64_t*>& flush_memtable_ids, - bool resuming_from_bg_err); + bool resuming_from_bg_err, std::optional<FlushReason> flush_reason); inline void WaitForPendingWrites() { mutex_.AssertHeld(); @@ -2755,6 +2764,11 @@ // State is protected with db mutex. std::list<uint64_t> pending_outputs_; + // Similar to pending_outputs_, FindObsoleteFiles()/PurgeObsoleteFiles() never + // deletes any OPTIONS file whose number is bigger than any of the file + // numbers in min_options_file_numbers_. + std::list<uint64_t> min_options_file_numbers_; + // flush_queue_ and compaction_queue_ hold column families that we need to // flush and compact, respectively. // A column family is inserted into flush_queue_ when it satisfies condition @@ -2877,6 +2891,11 @@ // garbages, among all column families. SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; + // The min threshold to trigger compactions for standalone range deletion + // files that are marked for compaction. + SequenceNumber standalone_range_deletion_files_mark_threshold_ = + kMaxSequenceNumber; + LogsWithPrepTracker logs_with_prep_tracker_; // Callback for compaction to check if a key is visible to a snapshot. diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 3fb8af447..c75178525 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1561,6 +1561,12 @@ Status DBImpl::CompactFilesImpl( compaction_job.Prepare(); + std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem; + if (immutable_db_options().compaction_service != nullptr) { + min_options_file_number_elem.reset( + new std::list<uint64_t>::iterator(CaptureOptionsFileNumber())); + } + mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); @@ -1570,6 +1576,10 @@ TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); + if (immutable_db_options().compaction_service != nullptr) { + ReleaseOptionsFileNumber(min_options_file_number_elem); + } + bool compaction_released = false; Status status = compaction_job.Install(*c->mutable_cf_options(), &compaction_released); @@ -1852,8 +1862,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { mutable_cf_options.compression_opts, mutable_cf_options.default_write_temperature, 0 /* max_subcompactions, not applicable */, - {} /* grandparents, not applicable */, false /* is manual */, - "" /* trim_ts */, -1 /* score, not applicable */, + {} /* grandparents, not applicable */, + std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */, + false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, false /* is deletion compaction, not applicable */, false /* l0_files_might_overlap, not applicable */, CompactionReason::kRefitLevel)); @@ -2407,7 +2418,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables( cfds, flush_memtable_ids, - 
flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); + flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */, + flush_reason); InstrumentedMutexLock lock_guard(&mutex_); for (auto* tmp_cfd : cfds) { tmp_cfd->UnrefAndTryDelete(); @@ -2549,7 +2561,8 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables( cfds, flush_memtable_ids, - flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); + flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */, + flush_reason); InstrumentedMutexLock lock_guard(&mutex_); for (auto* cfd : cfds) { cfd->UnrefAndTryDelete(); @@ -2612,7 +2625,7 @@ Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason, flush_memtable_id_ptrs.push_back(&flush_memtable_id); } s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs, - true /* resuming_from_bg_err */); + true /* resuming_from_bg_err */, flush_reason); mutex_.Lock(); } @@ -2712,7 +2725,7 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, Status DBImpl::WaitForFlushMemTables( const autovector<ColumnFamilyData*>& cfds, const autovector<const uint64_t*>& flush_memtable_ids, - bool resuming_from_bg_err) { + bool resuming_from_bg_err, std::optional<FlushReason> flush_reason) { int num = static_cast<int>(cfds.size()); // Wait until the compaction completes InstrumentedMutexLock l(&mutex_); @@ -2750,7 +2763,15 @@ (flush_memtable_ids[i] != nullptr && cfds[i]->imm()->GetEarliestMemTableID() > *flush_memtable_ids[i])) { - ++num_finished; + // Make file ingestion's flush wait until the SuperVersion is also + // updated, since after the flush it does a range overlap check and + // file level assignment against the current SuperVersion. + if (!flush_reason.has_value() || + flush_reason.value() != FlushReason::kExternalFileIngestion || + cfds[i]->GetSuperVersion()->imm->GetID() == + cfds[i]->imm()->current()->GetID()) { + ++num_finished; + } } } if (1 == num_dropped && 1 == num) { @@ -3541,6 +3562,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, is_manual && manual_compaction->disallow_trivial_move; CompactionJobStats compaction_job_stats; + // Set is_remote_compaction to true on the CompactionBegin event if + // compaction_service is set, except for trivial moves. At this point we do + // not know whether the remote compaction will actually be scheduled + // successfully or fall back to local; the CompactionCompleted event reports + // where the compaction actually happened. + compaction_job_stats.is_remote_compaction = + immutable_db_options().compaction_service != nullptr; + Status status; if (!error_handler_.IsBGWorkStopped()) { if (shutting_down_.load(std::memory_order_acquire)) { @@ -3661,8 +3690,20 @@ // compaction is not necessary. Need to make sure mutex is held // until we make a copy in the following code TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction"); + SnapshotChecker* snapshot_checker = nullptr; + std::vector<SequenceNumber> snapshot_seqs; + // This info is not useful for other scenarios, so save querying existing + // snapshots for those cases. 
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal && + cfd->user_comparator()->timestamp_size() == 0) { + SequenceNumber earliest_write_conflict_snapshot; + GetSnapshotContext(job_context, &snapshot_seqs, + &earliest_write_conflict_snapshot, + &snapshot_checker); + assert(is_snapshot_supported_ || snapshots_.empty()); + } c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_, - log_buffer)); + snapshot_seqs, snapshot_checker, log_buffer)); TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction"); if (c != nullptr) { @@ -3766,6 +3807,8 @@ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); + // Trivial moves do not get compacted remotely + compaction_job_stats.is_remote_compaction = false; NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); @@ -3901,6 +3944,12 @@ &bg_bottom_compaction_scheduled_); compaction_job.Prepare(); + std::unique_ptr<std::list<uint64_t>::iterator> min_options_file_number_elem; + if (immutable_db_options().compaction_service != nullptr) { + min_options_file_number_elem.reset( + new std::list<uint64_t>::iterator(CaptureOptionsFileNumber())); + } + NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); mutex_.Unlock(); @@ -3910,6 +3959,11 @@ compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); + + if (immutable_db_options().compaction_service != nullptr) { + ReleaseOptionsFileNumber(min_options_file_number_elem); + } + status = compaction_job.Install(*c->mutable_cf_options(), &compaction_released); io_s = compaction_job.io_status(); @@ -4256,12 +4310,18 @@ void DBImpl::InstallSuperVersionAndScheduleWork( // newer snapshot created and released frequently, the compaction will be // triggered soon anyway. 
bottommost_files_mark_threshold_ = kMaxSequenceNumber; + standalone_range_deletion_files_mark_threshold_ = kMaxSequenceNumber; for (auto* my_cfd : *versions_->GetColumnFamilySet()) { if (!my_cfd->ioptions()->allow_ingest_behind) { bottommost_files_mark_threshold_ = std::min( bottommost_files_mark_threshold_, my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); } + standalone_range_deletion_files_mark_threshold_ = + std::min(standalone_range_deletion_files_mark_threshold_, + my_cfd->current() + ->storage_info() + ->standalone_range_tombstone_files_mark_threshold()); } // Whenever we install new SuperVersion, we might need to issue new flushes or diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index 790a50d7a..dac5e0035 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -232,23 +232,16 @@ uint64_t DBImpl::TEST_LogfileNumber() { return logfile_number_; } -Status DBImpl::TEST_GetAllImmutableCFOptions( - std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) { - std::vector<std::string> cf_names; - std::vector<const ImmutableCFOptions*> iopts; - { - InstrumentedMutexLock l(&mutex_); - for (auto cfd : *versions_->GetColumnFamilySet()) { - cf_names.push_back(cfd->GetName()); - iopts.push_back(cfd->ioptions()); +void DBImpl::TEST_GetAllBlockCaches( + std::unordered_set<const Cache*>* cache_set) { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (const auto bbto = + cfd->GetCurrentMutableCFOptions() + ->table_factory->GetOptions<BlockBasedTableOptions>()) { + cache_set->insert(bbto->block_cache.get()); } } - iopts_map->clear(); - for (size_t i = 0; i < cf_names.size(); ++i) { - iopts_map->insert({cf_names[i], iopts[i]}); - } - - return Status::OK(); } uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() { diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 3812e1fcb..ff3054d10 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -43,6 +43,14 @@ uint64_t DBImpl::GetObsoleteSstFilesSize() { return versions_->GetObsoleteSstFilesSize(); } +uint64_t DBImpl::MinOptionsFileNumberToKeep() { + mutex_.AssertHeld(); + if (!min_options_file_numbers_.empty()) { + return *min_options_file_numbers_.begin(); + } + return std::numeric_limits<uint64_t>::max(); +} + Status DBImpl::DisableFileDeletions() { Status s; int my_disable_delete_obsolete_files; @@ -147,6 +155,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // here but later find newer generated unfinalized files while scanning. job_context->min_pending_output = MinObsoleteSstNumberToKeep(); job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine(); + job_context->min_options_file_number = MinOptionsFileNumberToKeep(); // Get obsolete files. This function will also update the list of // pending files in VersionSet(). @@ -498,7 +507,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { dbname_); // File numbers of most recent two OPTIONS file in candidate_files (found in - // previos FindObsoleteFiles(full_scan=true)) + // previous FindObsoleteFiles(full_scan=true)) // At this point, there must not be any duplicate file numbers in // candidate_files. uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min(); @@ -519,6 +528,11 @@ } } + // For remote compactions, we need to keep the OPTIONS file that may be + // referenced by the remote worker + + optsfile_num2 = std::min(optsfile_num2, state.min_options_file_number); + // Close WALs before trying to delete them. 
for (const auto w : state.logs_to_free) { // TODO: maybe check the return value of Close. diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index dec61c050..6727839ee 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1667,10 +1667,19 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, Arena arena; Status s; TableProperties table_properties; + const auto* ucmp = cfd->internal_comparator().user_comparator(); + assert(ucmp); + const size_t ts_sz = ucmp->timestamp_size(); + const bool logical_strip_timestamp = + ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps; { ScopedArenaPtr<InternalIterator> iter( - mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, - /*prefix_extractor=*/nullptr)); + logical_strip_timestamp + ? mem->NewTimestampStrippingIterator( + ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr, ts_sz) + : mem->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] [WriteLevel0TableForRecovery]" " Level-0 table #%" PRIu64 ": started", @@ -1705,11 +1714,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>> range_del_iters; auto range_del_iter = - // This is called during recovery, where a live memtable is flushed - // directly. In this case, no fragmented tombstone list is cached in - // this memtable yet. - mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, - false /* immutable_memtable */); + logical_strip_timestamp + ? mem->NewTimestampStrippingRangeTombstoneIterator( + ro, kMaxSequenceNumber, ts_sz) + // This is called during recovery, where a live memtable is + // flushed directly. In this case, no fragmented tombstone list is + // cached in this memtable yet. + : mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, + false /* immutable_memtable */); if (range_del_iter != nullptr) { range_del_iters.emplace_back(range_del_iter); } @@ -1795,9 +1807,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, // For UDT in memtable only feature, move up the cutoff timestamp whenever // a flush happens. - const Comparator* ucmp = cfd->user_comparator(); - size_t ts_sz = ucmp->timestamp_size(); - if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) { + if (logical_strip_timestamp) { Slice mem_newest_udt = mem->GetNewestUDT(); std::string full_history_ts_low = cfd->GetFullHistoryTsLow(); if (full_history_ts_low.empty() || diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index fb7ea1110..d333b70d4 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -247,9 +247,7 @@ Status DBImplSecondary::RecoverLogFiles( if (cfd == nullptr) { continue; } - if (cfds_changed->count(cfd) == 0) { - cfds_changed->insert(cfd); - } + cfds_changed->insert(cfd); const std::vector<FileMetaData*>& l0_files = cfd->current()->storage_info()->LevelFiles(0); SequenceNumber seq = @@ -951,21 +949,19 @@ Status DB::OpenAndCompact( return s; } - // 2. Load the options from latest OPTIONS file + // 2. Load the options DBOptions db_options; ConfigOptions config_options; config_options.env = override_options.env; std::vector<ColumnFamilyDescriptor> all_column_families; - s = LoadLatestOptions(config_options, name, &db_options, - &all_column_families); - // In a very rare scenario, loading options may fail if the options changed by - // the primary host at the same time. Just retry once for now. 
- if (!s.ok()) { - s = LoadLatestOptions(config_options, name, &db_options, + + std::string options_file_name = + OptionsFileName(name, compaction_input.options_file_number); + + s = LoadOptionsFromFile(config_options, options_file_name, &db_options, &all_column_families); - if (!s.ok()) { - return s; + if (!s.ok()) { + return s; } } // 3. Override pointer configurations in DBOptions with diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index e28d3fa91..248ddb88d 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -735,17 +735,6 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, size_t total_byte_size = 0; if (w.status.ok()) { - // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock - // grabs but does not seem thread-safe. - if (tracer_) { - InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { - for (auto* writer : wal_write_group) { - // TODO: maybe handle the tracing status? - tracer_->Write(writer->batch).PermitUncheckedError(); - } - } - } SequenceNumber next_sequence = current_sequence; for (auto* writer : wal_write_group) { assert(writer); @@ -760,6 +749,22 @@ } } } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : wal_write_group) { + if (writer->CallbackFailed()) { + // When optimistic txn conflict checking fails, we should + // not record to the trace. + continue; + } + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } if (w.disable_wal) { has_unpersisted_data_.store(true, std::memory_order_relaxed); } @@ -1005,19 +1010,6 @@ Status DBImpl::WriteImplWALOnly( WriteThread::WriteGroup write_group; uint64_t last_sequence; write_thread->EnterAsBatchGroupLeader(&w, &write_group); - // Note: no need to update last_batch_group_size_ here since the batch writes - // to WAL only - // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock - // grabs but does not seem thread-safe. - if (tracer_) { - InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { - for (auto* writer : write_group) { - // TODO: maybe handle the tracing status? - tracer_->Write(writer->batch).PermitUncheckedError(); - } - } - } size_t pre_release_callback_cnt = 0; size_t total_byte_size = 0; @@ -1032,6 +1024,23 @@ } } + // Note: no need to update last_batch_group_size_ here since the batch writes + // to WAL only + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } + const bool concurrent_update = true; // Update stats while we are an exclusive group leader, so we know // that nobody else can be writing to these particular stats. 
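The `db_iter.cc` changes that follow implement the new `ReadOptions::allow_unprepared_value` / `Iterator::PrepareValue()` protocol for loading blob values on demand. A sketch of the intended caller pattern, assuming a hypothetical key filter (the `"key0"` prefix is illustrative):

```cpp
#include <cstddef>
#include <memory>

#include "rocksdb/db.h"

// Sketch: scan with deferred (lazy) value loading. Only keys passing the
// hypothetical Wanted() predicate pay the blob-read I/O.
size_t SumWantedValueSizes(rocksdb::DB* db) {
  auto Wanted = [](const rocksdb::Slice& key) {
    return key.starts_with("key0");  // illustrative filter
  };
  rocksdb::ReadOptions ro;
  ro.allow_unprepared_value = true;  // values may be unprepared at first
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  size_t total = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    if (!Wanted(it->key())) {
      continue;  // skipped keys never trigger a blob fetch
    }
    if (!it->PrepareValue()) {
      break;  // blob retrieval failed; it->status() has the error
    }
    total += it->value().size();  // value()/columns() now safe to use
  }
  return total;
}
```
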
diff --git a/db/db_iter.cc b/db/db_iter.cc index e02586377..97f6f7a07 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -52,7 +52,9 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, user_comparator_(cmp), merge_operator_(ioptions.merge_operator.get()), iter_(iter), - version_(version), + blob_reader_(version, read_options.read_tier, + read_options.verify_checksums, read_options.fill_cache, + read_options.io_activity), read_callback_(read_callback), sequence_(s), statistics_(ioptions.stats), @@ -71,13 +73,10 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, expect_total_order_inner_iter_(prefix_extractor_ == nullptr || read_options.total_order_seek || read_options.auto_prefix_mode), - read_tier_(read_options.read_tier), - fill_cache_(read_options.fill_cache), - verify_checksums_(read_options.verify_checksums), expose_blob_index_(expose_blob_index), + allow_unprepared_value_(read_options.allow_unprepared_value), is_blob_(false), arena_mode_(arena_mode), - io_activity_(read_options.io_activity), cfh_(cfh), timestamp_ub_(read_options.timestamp), timestamp_lb_(read_options.iter_start_ts), @@ -151,7 +150,7 @@ void DBIter::Next() { PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); local_stats_.skip_count_ += num_internal_keys_skipped_; local_stats_.skip_count_--; @@ -194,29 +193,21 @@ void DBIter::Next() { } } -bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, - const Slice& blob_index) { - assert(!is_blob_); +Status DBIter::BlobReader::RetrieveAndSetBlobValue(const Slice& user_key, + const Slice& blob_index) { assert(blob_value_.empty()); - if (expose_blob_index_) { // Stacked BlobDB implementation - is_blob_ = true; - return true; - } - if (!version_) { - status_ = Status::Corruption("Encountered unexpected blob index."); - valid_ = false; - return false; + return Status::Corruption("Encountered unexpected blob index."); } // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to // avoid having to copy options back and forth. 
- // TODO: plumb Env::IOActivity, Env::IOPriority + // TODO: plumb Env::IOPriority ReadOptions read_options; read_options.read_tier = read_tier_; - read_options.fill_cache = fill_cache_; read_options.verify_checksums = verify_checksums_; + read_options.fill_cache = fill_cache_; read_options.io_activity = io_activity_; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; constexpr uint64_t* bytes_read = nullptr; @@ -224,16 +215,51 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, const Status s = version_->GetBlob(read_options, user_key, blob_index, prefetch_buffer, &blob_value_, bytes_read); + if (!s.ok()) { + return s; + } + + return Status::OK(); +} + +bool DBIter::SetValueAndColumnsFromBlobImpl(const Slice& user_key, + const Slice& blob_index) { + const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index); if (!s.ok()) { status_ = s; valid_ = false; + is_blob_ = false; return false; } - is_blob_ = true; + SetValueAndColumnsFromPlain(blob_reader_.GetBlobValue()); + return true; } +bool DBIter::SetValueAndColumnsFromBlob(const Slice& user_key, + const Slice& blob_index) { + assert(!is_blob_); + is_blob_ = true; + + if (expose_blob_index_) { + SetValueAndColumnsFromPlain(blob_index); + return true; + } + + if (allow_unprepared_value_) { + assert(value_.empty()); + assert(wide_columns_.empty()); + + assert(lazy_blob_index_.empty()); + lazy_blob_index_ = blob_index; + + return true; + } + + return SetValueAndColumnsFromBlobImpl(user_key, blob_index); +} + bool DBIter::SetValueAndColumnsFromEntity(Slice slice) { assert(value_.empty()); assert(wide_columns_.empty()); @@ -279,6 +305,24 @@ bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status, return true; } +bool DBIter::PrepareValue() { + assert(valid_); + + if (lazy_blob_index_.empty()) { + return true; + } + + assert(allow_unprepared_value_); + assert(is_blob_); + + const bool result = + SetValueAndColumnsFromBlobImpl(saved_key_.GetUserKey(), lazy_blob_index_); + + lazy_blob_index_.clear(); + + return result; +} + // PRE: saved_key_ has the current user key if skipping_saved_key // POST: saved_key_ should have the next user key if valid_, // if the current entry is a result of merge @@ -408,7 +452,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, case kTypeValuePreferredSeqno: case kTypeBlobIndex: case kTypeWideColumnEntity: - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } if (timestamp_lb_) { @@ -420,12 +464,9 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, } if (ikey_.type == kTypeBlobIndex) { - if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) { + if (!SetValueAndColumnsFromBlob(ikey_.user_key, iter_.value())) { return false; } - - SetValueAndColumnsFromPlain(expose_blob_index_ ? 
iter_.value() - : blob_value_); } else if (ikey_.type == kTypeWideColumnEntity) { if (!SetValueAndColumnsFromEntity(iter_.value())) { return false; @@ -445,7 +486,7 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, return true; break; case kTypeMerge: - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } saved_key_.SetUserKey( @@ -590,7 +631,7 @@ bool DBIter::MergeValuesNewToOld() { iter_.Next(); break; } - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } @@ -619,23 +660,9 @@ bool DBIter::MergeValuesNewToOld() { iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (kTypeBlobIndex == ikey.type) { - if (expose_blob_index_) { - status_ = - Status::NotSupported("BlobDB does not support merge operator."); - valid_ = false; + if (!MergeWithBlobBaseValue(iter_.value(), ikey.user_key)) { return false; } - // hit a put, merge the put value with operands and store the - // final result in saved_value_. We are done! - if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { - return false; - } - valid_ = true; - if (!MergeWithPlainBaseValue(blob_value_, ikey.user_key)) { - return false; - } - - ResetBlobValue(); // iter_ is positioned after put iter_.Next(); @@ -643,6 +670,7 @@ bool DBIter::MergeValuesNewToOld() { valid_ = false; return false; } + return true; } else if (kTypeWideColumnEntity == ikey.type) { if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) { @@ -689,7 +717,7 @@ void DBIter::Prev() { PERF_COUNTER_ADD(iter_prev_count, 1); PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); bool ok = true; @@ -926,7 +954,7 @@ bool DBIter::FindValueForCurrentKey() { return FindValueForCurrentKeyUsingSeek(); } - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } @@ -1041,21 +1069,9 @@ bool DBIter::FindValueForCurrentKey() { } return true; } else if (last_not_merge_type == kTypeBlobIndex) { - if (expose_blob_index_) { - status_ = - Status::NotSupported("BlobDB does not support merge operator."); - valid_ = false; + if (!MergeWithBlobBaseValue(pinned_value_, saved_key_.GetUserKey())) { return false; } - if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { - return false; - } - valid_ = true; - if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) { - return false; - } - - ResetBlobValue(); return true; } else if (last_not_merge_type == kTypeWideColumnEntity) { @@ -1080,13 +1096,9 @@ bool DBIter::FindValueForCurrentKey() { break; case kTypeBlobIndex: - if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) { + if (!SetValueAndColumnsFromBlob(saved_key_.GetUserKey(), pinned_value_)) { return false; } - - SetValueAndColumnsFromPlain(expose_blob_index_ ? 
pinned_value_ - : blob_value_); - break; case kTypeWideColumnEntity: if (!SetValueAndColumnsFromEntity(pinned_value_)) { @@ -1173,7 +1185,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } return true; } - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } if (timestamp_size_ > 0) { @@ -1190,12 +1202,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { pinned_value_ = iter_.value(); } if (ikey.type == kTypeBlobIndex) { - if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) { + if (!SetValueAndColumnsFromBlob(ikey.user_key, pinned_value_)) { return false; } - - SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_ - : blob_value_); } else if (ikey.type == kTypeWideColumnEntity) { if (!SetValueAndColumnsFromEntity(pinned_value_)) { return false; @@ -1243,7 +1252,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { ikey.type == kTypeDeletionWithTimestamp) { break; } - if (!PrepareValue()) { + if (!PrepareValueInternal()) { return false; } @@ -1261,21 +1270,9 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); PERF_COUNTER_ADD(internal_merge_count, 1); } else if (ikey.type == kTypeBlobIndex) { - if (expose_blob_index_) { - status_ = - Status::NotSupported("BlobDB does not support merge operator."); - valid_ = false; - return false; - } - if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) { + if (!MergeWithBlobBaseValue(iter_.value(), saved_key_.GetUserKey())) { return false; } - valid_ = true; - if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) { - return false; - } - - ResetBlobValue(); return true; } else if (ikey.type == kTypeWideColumnEntity) { @@ -1342,6 +1339,35 @@ bool DBIter::MergeWithPlainBaseValue(const Slice& value, return SetValueAndColumnsFromMergeResult(s, result_type); } +bool DBIter::MergeWithBlobBaseValue(const Slice& blob_index, + const Slice& user_key) { + assert(!is_blob_); + + if (expose_blob_index_) { + status_ = + Status::NotSupported("Legacy BlobDB does not support merge operator."); + valid_ = false; + return false; + } + + const Status s = blob_reader_.RetrieveAndSetBlobValue(user_key, blob_index); + if (!s.ok()) { + status_ = s; + valid_ = false; + return false; + } + + valid_ = true; + + if (!MergeWithPlainBaseValue(blob_reader_.GetBlobValue(), user_key)) { + return false; + } + + blob_reader_.ResetBlobValue(); + + return true; +} + bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key) { // `op_failure_scope` (an output parameter) is not provided (set to nullptr) @@ -1531,7 +1557,7 @@ void DBIter::Seek(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); @@ -1607,7 +1633,7 @@ void DBIter::SeekForPrev(const Slice& target) { status_ = Status::OK(); ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); @@ -1668,7 +1694,7 @@ void DBIter::SeekToFirst() { status_.PermitUncheckedError(); direction_ = kForward; ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); @@ -1731,7 +1757,7 @@ void DBIter::SeekToLast() { status_.PermitUncheckedError(); direction_ = kReverse; ReleaseTempPinnedData(); - ResetBlobValue(); + ResetBlobData(); ResetValueAndColumns(); ResetInternalKeysSkippedCounter(); ClearSavedValue(); diff --git a/db/db_iter.h 
b/db/db_iter.h index e27791923..084ed80d4 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -218,7 +218,34 @@ class DBIter final : public Iterator { } void set_valid(bool v) { valid_ = v; } + bool PrepareValue() override; + private: + class BlobReader { + public: + BlobReader(const Version* version, ReadTier read_tier, + bool verify_checksums, bool fill_cache, + Env::IOActivity io_activity) + : version_(version), + read_tier_(read_tier), + verify_checksums_(verify_checksums), + fill_cache_(fill_cache), + io_activity_(io_activity) {} + + const Slice& GetBlobValue() const { return blob_value_; } + Status RetrieveAndSetBlobValue(const Slice& user_key, + const Slice& blob_index); + void ResetBlobValue() { blob_value_.Reset(); } + + private: + PinnableSlice blob_value_; + const Version* version_; + ReadTier read_tier_; + bool verify_checksums_; + bool fill_cache_; + Env::IOActivity io_activity_; + }; + // For all methods in this block: // PRE: iter_->Valid() && status_.ok() // Return false if there was an error, and status() is non-ok, valid_ = false; @@ -299,15 +326,6 @@ class DBIter final : public Iterator { : user_comparator_.CompareWithoutTimestamp(a, b); } - // Retrieves the blob value for the specified user key using the given blob - // index when using the integrated BlobDB implementation. - bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index); - - void ResetBlobValue() { - is_blob_ = false; - blob_value_.Reset(); - } - void SetValueAndColumnsFromPlain(const Slice& slice) { assert(value_.empty()); assert(wide_columns_.empty()); @@ -316,6 +334,11 @@ class DBIter final : public Iterator { wide_columns_.emplace_back(kDefaultWideColumnName, slice); } + bool SetValueAndColumnsFromBlobImpl(const Slice& user_key, + const Slice& blob_index); + bool SetValueAndColumnsFromBlob(const Slice& user_key, + const Slice& blob_index); + bool SetValueAndColumnsFromEntity(Slice slice); bool SetValueAndColumnsFromMergeResult(const Status& merge_status, @@ -326,14 +349,21 @@ class DBIter final : public Iterator { wide_columns_.clear(); } + void ResetBlobData() { + blob_reader_.ResetBlobValue(); + lazy_blob_index_.clear(); + is_blob_ = false; + } + // The following methods perform the actual merge operation for the - // no base value/plain base value/wide-column base value cases. + // no/plain/blob/wide-column base value cases. // If user-defined timestamp is enabled, `user_key` includes timestamp. bool MergeWithNoBaseValue(const Slice& user_key); bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key); + bool MergeWithBlobBaseValue(const Slice& blob_index, const Slice& user_key); bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key); - bool PrepareValue() { + bool PrepareValueInternal() { if (!iter_.PrepareValue()) { assert(!iter_.status().ok()); valid_ = false; @@ -356,7 +386,7 @@ class DBIter final : public Iterator { UserComparatorWrapper user_comparator_; const MergeOperator* const merge_operator_; IteratorWrapper iter_; - const Version* version_; + BlobReader blob_reader_; ReadCallback* read_callback_; // Max visible sequence number. It is normally the snapshot seq unless we have // uncommitted data in db as in WriteUnCommitted. @@ -376,7 +406,6 @@ class DBIter final : public Iterator { std::string saved_value_; Slice pinned_value_; // for prefix seek mode to support prev() - PinnableSlice blob_value_; // Value of the default column Slice value_; // All columns (i.e. 
name-value pairs) @@ -410,15 +439,13 @@ // Expect the inner iterator to maintain a total order. // prefix_extractor_ must be non-NULL if the value is false. const bool expect_total_order_inner_iter_; - ReadTier read_tier_; - bool fill_cache_; - bool verify_checksums_; // Whether the iterator is allowed to expose blob references. Set to true when // the stacked BlobDB implementation is used, false otherwise. bool expose_blob_index_; + bool allow_unprepared_value_; + Slice lazy_blob_index_; bool is_blob_; bool arena_mode_; - const Env::IOActivity io_activity_; // List of operands for merge operator. MergeContext merge_context_; LocalStatistics local_stats_; diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 1be7a5064..e2e96e59f 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -231,21 +231,33 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) { ASSERT_OK(dbfull()->SetOptions( cfh, {{"table_factory.block_size", "16384"}, {"table_factory.block_restart_interval", "11"}})); + // Old c_bbto + ASSERT_EQ(c_bbto->block_size, 8192); + ASSERT_EQ(c_bbto->block_restart_interval, 7); + // New c_bbto + c_opts = dbfull()->GetOptions(cfh); + c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>(); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 11); // Now set an option that is not mutable - options should not change - ASSERT_NOK( - dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + // FIXME: find a way to make this fail again + // ASSERT_NOK( + // dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>()); ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 11); // Set some that are mutable and some that are not - options should not change - ASSERT_NOK(dbfull()->SetOptions( - cfh, {{"table_factory.no_block_cache", "false"}, - {"table_factory.block_size", "8192"}, - {"table_factory.block_restart_interval", "7"}})); + // FIXME: find a way to make this fail again + // ASSERT_NOK(dbfull()->SetOptions( + // cfh, {{"table_factory.no_block_cache", "false"}, + // {"table_factory.block_size", "8192"}, + // {"table_factory.block_restart_interval", "7"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>()); ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 11); @@ -256,6 +268,8 @@ cfh, {{"table_factory.block_size", "8192"}, {"table_factory.does_not_exist", "true"}, {"table_factory.block_restart_interval", "7"}})); + c_opts = dbfull()->GetOptions(cfh); + ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions<BlockBasedTableOptions>()); ASSERT_EQ(c_bbto->no_block_cache, true); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 11); @@ -271,6 +285,7 @@ {"table_factory.block_restart_interval", "13"}})); c_opts = dbfull()->GetOptions(cfh); ASSERT_EQ(c_opts.blob_file_size, 32768); + c_bbto = c_opts.table_factory->GetOptions<BlockBasedTableOptions>(); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 13); // Set some on the table and a bad one on the ColumnFamily - options should @@ -279,6 +294,7 @@ cfh, {{"table_factory.block_size", "1024"}, {"no_such_option", "32768"}, 
{"table_factory.block_restart_interval", "7"}})); + ASSERT_EQ(c_bbto, c_opts.table_factory->GetOptions()); ASSERT_EQ(c_bbto->block_size, 16384); ASSERT_EQ(c_bbto->block_restart_interval, 13); } diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 6c33d41df..ba1ed5c95 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -244,7 +244,7 @@ TEST_F(DBSecondaryTest, SimpleInternalCompaction) { ASSERT_EQ(largest.user_key().ToString(), "foo"); ASSERT_EQ(result.output_level, 1); ASSERT_EQ(result.output_path, this->secondary_path_); - ASSERT_EQ(result.num_output_records, 2); + ASSERT_EQ(result.stats.num_output_records, 2); ASSERT_GT(result.bytes_written, 0); ASSERT_OK(result.status); } diff --git a/db/db_test.cc b/db/db_test.cc index fbcff5b48..ba65a7b50 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1826,21 +1826,30 @@ TEST_F(DBTest, GetApproximateMemTableStats) { uint64_t count; uint64_t size; + // Because Random::GetTLSInstance() seed is reset in DBTestBase, + // this test is deterministic. + std::string start = Key(50); std::string end = Key(60); Range r(start, end); db_->GetApproximateMemTableStats(r, &count, &size); - ASSERT_GT(count, 0); - ASSERT_LE(count, N); - ASSERT_GT(size, 6000); - ASSERT_LT(size, 204800); + // When actual count is <= 10, it returns that as the minimum + EXPECT_EQ(count, 10); + EXPECT_EQ(size, 10440); + + start = Key(20); + end = Key(100); + r = Range(start, end); + db_->GetApproximateMemTableStats(r, &count, &size); + EXPECT_EQ(count, 72); + EXPECT_EQ(size, 75168); start = Key(500); end = Key(600); r = Range(start, end); db_->GetApproximateMemTableStats(r, &count, &size); - ASSERT_EQ(count, 0); - ASSERT_EQ(size, 0); + EXPECT_EQ(count, 0); + EXPECT_EQ(size, 0); ASSERT_OK(Flush()); @@ -1848,8 +1857,8 @@ TEST_F(DBTest, GetApproximateMemTableStats) { end = Key(60); r = Range(start, end); db_->GetApproximateMemTableStats(r, &count, &size); - ASSERT_EQ(count, 0); - ASSERT_EQ(size, 0); + EXPECT_EQ(count, 0); + EXPECT_EQ(size, 0); for (int i = 0; i < N; i++) { ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024))); @@ -1857,10 +1866,11 @@ TEST_F(DBTest, GetApproximateMemTableStats) { start = Key(100); end = Key(1020); + // Actually 20 keys in the range ^^ r = Range(start, end); db_->GetApproximateMemTableStats(r, &count, &size); - ASSERT_GT(count, 20); - ASSERT_GT(size, 6000); + EXPECT_EQ(count, 20); + EXPECT_EQ(size, 20880); } TEST_F(DBTest, ApproximateSizes) { @@ -5169,10 +5179,14 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) { options.max_bytes_for_level_multiplier = 4; options.max_background_compactions = 1; options.num_levels = 5; + options.statistics = CreateDBStatistics(); options.compression_per_level.resize(3); + // No compression for L0 options.compression_per_level[0] = kNoCompression; + // No compression for the Ln whre L0 is compacted to options.compression_per_level[1] = kNoCompression; + // Snpapy compression for Ln+1 options.compression_per_level[2] = kSnappyCompression; OnFileDeletionListener* listener = new OnFileDeletionListener(); @@ -5181,7 +5195,7 @@ TEST_F(DBTest, DynamicLevelCompressionPerLevel) { DestroyAndReopen(options); // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should - // be compressed, so total data size should be more than 80K. + // be compressed, so there shouldn't be any compression. 
   OnFileDeletionListener* listener = new OnFileDeletionListener();
@@ -5181,7 +5195,7 @@
   DestroyAndReopen(options);

   // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
-  // be compressed, so total data size should be more than 80K.
+  // be compressed, so there shouldn't be any compression.
   for (int i = 0; i < 20; i++) {
     ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
   }
   ASSERT_OK(Flush());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());

   ASSERT_EQ(NumTableFilesAtLevel(1), 0);
   ASSERT_EQ(NumTableFilesAtLevel(2), 0);
   ASSERT_EQ(NumTableFilesAtLevel(3), 0);
-  // Assuming each files' metadata is at least 50 bytes/
-  ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+  ASSERT_TRUE(NumTableFilesAtLevel(0) > 0 || NumTableFilesAtLevel(4) > 0);
+
+  // Verify there was no compression
+  auto num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_EQ(num_block_compressed, 0);

-  // Insert 400KB. Some data will be compressed
+  // Insert 400KB; some files will end up in L3. According to the
+  // above compression settings for each level, there will be some compression.
+  ASSERT_OK(options.statistics->Reset());
+  ASSERT_EQ(num_block_compressed, 0);
   for (int i = 21; i < 120; i++) {
     ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
   }

   ASSERT_OK(Flush());
   ASSERT_OK(dbfull()->TEST_WaitForCompact());
   ASSERT_EQ(NumTableFilesAtLevel(1), 0);
   ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_GE(NumTableFilesAtLevel(4), 1);
+
+  // Verify there was compression
+  num_block_compressed =
+      options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+  ASSERT_GT(num_block_compressed, 0);

-  ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
-            120U * 4000U + 50U * 24);
   // Make sure data in files in L3 is not compacted by removing all files
   // in L4 and calculate number of rows
   ASSERT_OK(dbfull()->SetOptions({
@@ -5224,6 +5250,12 @@
     num_keys++;
   }
   ASSERT_OK(iter->status());
+
+  ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+  ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+  ASSERT_GE(NumTableFilesAtLevel(3), 1);
+  ASSERT_EQ(NumTableFilesAtLevel(4), 0);
+
   ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
 }
diff --git a/db/dbformat.cc b/db/dbformat.cc
index 4a613c7d4..f5dba3fcb 100644
--- a/db/dbformat.cc
+++ b/db/dbformat.cc
@@ -272,11 +272,23 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
 void IterKey::EnlargeBuffer(size_t key_size) {
   // If size is smaller than buffer size, continue using current buffer,
-  // or the static allocated one, as default
+  // or the inline one, as default
   assert(key_size > buf_size_);
   // Need to enlarge the buffer.
   ResetBuffer();
   buf_ = new char[key_size];
   buf_size_ = key_size;
 }
+
+void IterKey::EnlargeSecondaryBufferIfNeeded(size_t key_size) {
+  // If size is smaller than buffer size, continue using current buffer,
+  // or the inline one, as default
+  if (key_size <= secondary_buf_size_) {
+    return;
+  }
+  // Need to enlarge the secondary buffer.
+  ResetSecondaryBuffer();
+  secondary_buf_ = new char[key_size];
+  secondary_buf_size_ = key_size;
+}
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/db/dbformat.h b/db/dbformat.h
index 3f8715780..f3a9b9a1a 100644
--- a/db/dbformat.h
+++ b/db/dbformat.h
@@ -10,6 +10,7 @@
 #pragma once

 #include
+#include <array>
 #include
 #include
 #include
@@ -562,18 +563,28 @@ inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
 //    allocation for smaller keys.
 // 3. It tracks user key or internal key, and allows conversion between them.
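The `IterKey` rework that follows assembles a key out of up to five (pointer, length) pieces and pads an 8-byte minimum timestamp in a single copy pass, instead of materializing temporary `std::string`s. A standalone sketch of that slice-assembly idea for the simple user-key case; the types and helper below are hypothetical, not the RocksDB classes:

    #include <array>
    #include <cstddef>
    #include <cstring>

    struct SlicePiece {
      const char* data;
      size_t size;
    };

    // Build user_key = shared-prefix + non-shared-suffix + 8-byte min timestamp
    // into a caller-owned buffer `out` (assumed large enough), returning the
    // total number of bytes written.
    inline size_t AssembleKeyWithMinTimestamp(const char* shared,
                                              size_t shared_len,
                                              const char* non_shared,
                                              size_t non_shared_len, char* out) {
      static constexpr char kTsMin[8] = {0, 0, 0, 0, 0, 0, 0, 0};
      const std::array<SlicePiece, 3> pieces = {{{shared, shared_len},
                                                 {non_shared, non_shared_len},
                                                 {kTsMin, sizeof(kTsMin)}}};
      char* dst = out;
      // One ordered copy pass over the pieces; no heap-allocated temporaries.
      for (const SlicePiece& p : pieces) {
        std::memcpy(dst, p.data, p.size);
        dst += p.size;
      }
      return static_cast<size_t>(dst - out);
    }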
class IterKey {
+  static constexpr size_t kInlineBufferSize = 39;
+  // This is only used by user-defined timestamps in MemTable only feature,
+  // which only supports uint64_t timestamps.
+  static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00";
+
 public:
  IterKey()
      : buf_(space_),
        key_(buf_),
        key_size_(0),
-        buf_size_(sizeof(space_)),
-        is_user_key_(true) {}
+        buf_size_(kInlineBufferSize),
+        is_user_key_(true),
+        secondary_buf_(space_for_secondary_buf_),
+        secondary_buf_size_(kInlineBufferSize) {}

  // No copying allowed
  IterKey(const IterKey&) = delete;
  void operator=(const IterKey&) = delete;

-  ~IterKey() { ResetBuffer(); }
+  ~IterKey() {
+    ResetBuffer();
+    ResetSecondaryBuffer();
+  }

  // The bool will be picked up by the next calls to SetKey
  void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
@@ -641,13 +652,15 @@
                            const char* non_shared_data,
                            const size_t non_shared_len, const size_t ts_sz) {
-    std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
-    std::string key_with_ts;
-    std::vector<Slice> key_parts_with_ts;
+    // This function is only used by the UDT in memtable feature, which only
+    // supports built-in comparators with uint64 timestamps.
+    assert(ts_sz == sizeof(uint64_t));
+    size_t next_key_slice_index = 0;
    if (IsUserKey()) {
-      key_parts_with_ts = {Slice(key_, shared_len),
-                           Slice(non_shared_data, non_shared_len),
-                           Slice(kTsMin)};
+      key_slices_[next_key_slice_index++] = Slice(key_, shared_len);
+      key_slices_[next_key_slice_index++] =
+          Slice(non_shared_data, non_shared_len);
+      key_slices_[next_key_slice_index++] = Slice(kTsMin, ts_sz);
    } else {
      assert(shared_len + non_shared_len >= kNumInternalBytes);
      // Invariant: shared_user_key_len + shared_internal_bytes_len = shared_len
@@ -664,30 +677,46 @@
      // One Slice among the three Slices will get split into two Slices, plus
      // a timestamp slice.
-      key_parts_with_ts.reserve(5);
      bool ts_added = false;
      // Add slice parts and find the right location to add the min timestamp.
      MaybeAddKeyPartsWithTimestamp(
          key_, shared_user_key_len,
          shared_internal_bytes_len + non_shared_len < kNumInternalBytes,
-          shared_len + non_shared_len - kNumInternalBytes, kTsMin,
-          key_parts_with_ts, &ts_added);
+          shared_len + non_shared_len - kNumInternalBytes, ts_sz,
+          &next_key_slice_index, &ts_added);
      MaybeAddKeyPartsWithTimestamp(
          key_ + user_key_len, shared_internal_bytes_len,
          non_shared_len < kNumInternalBytes,
-          shared_internal_bytes_len + non_shared_len - kNumInternalBytes,
-          kTsMin, key_parts_with_ts, &ts_added);
+          shared_internal_bytes_len + non_shared_len - kNumInternalBytes, ts_sz,
+          &next_key_slice_index, &ts_added);
      MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len,
                                    non_shared_len >= kNumInternalBytes,
-                                    non_shared_len - kNumInternalBytes, kTsMin,
-                                    key_parts_with_ts, &ts_added);
+                                    non_shared_len - kNumInternalBytes, ts_sz,
+                                    &next_key_slice_index, &ts_added);
      assert(ts_added);
    }
+    SetKeyImpl(next_key_slice_index,
+               /* total_bytes= */ shared_len + non_shared_len + ts_sz);
+  }

-    Slice new_key(SliceParts(&key_parts_with_ts.front(),
-                             static_cast<int>(key_parts_with_ts.size())),
-                  &key_with_ts);
-    SetKey(new_key);
+  Slice SetKeyWithPaddedMinTimestamp(const Slice& key, size_t ts_sz) {
+    // This function is only used by the UDT in memtable feature, which only
+    // supports built-in comparators with uint64 timestamps.
+    assert(ts_sz == sizeof(uint64_t));
+    size_t num_key_slices = 0;
+    if (is_user_key_) {
+      key_slices_[0] = key;
+      key_slices_[1] = Slice(kTsMin, ts_sz);
+      num_key_slices = 2;
+    } else {
+      assert(key.size() >= kNumInternalBytes);
+      size_t user_key_size = key.size() - kNumInternalBytes;
+      key_slices_[0] = Slice(key.data(), user_key_size);
+      key_slices_[1] = Slice(kTsMin, ts_sz);
+      key_slices_[2] = Slice(key.data() + user_key_size, kNumInternalBytes);
+      num_key_slices = 3;
+    }
+    return SetKeyImpl(num_key_slices, key.size() + ts_sz);
  }

  Slice SetKey(const Slice& key, bool copy = true) {
@@ -718,15 +747,6 @@
    return Slice(key_, key_n);
  }

-  // Copy the key into IterKey own buf_
-  void OwnKey() {
-    assert(IsKeyPinned() == true);
-
-    Reserve(key_size_);
-    memcpy(buf_, key_, key_size_);
-    key_ = buf_;
-  }
-
  // Update the sequence number in the internal key. Guarantees not to
  // invalidate slices to the key (and the user key).
  void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
@@ -738,10 +758,15 @@
                ts->size());
    }
    uint64_t newval = (seq << 8) | t;
-    EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+    if (key_ == buf_) {
+      EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+    } else {
+      assert(key_ == secondary_buf_);
+      EncodeFixed64(&secondary_buf_[key_size_ - kNumInternalBytes], newval);
+    }
  }

-  bool IsKeyPinned() const { return (key_ != buf_); }
+  bool IsKeyPinned() const { return key_ != buf_ && key_ != secondary_buf_; }

  // If `ts` is provided, user_key should not contain timestamp,
  // and `ts` is appended after user_key.
@@ -806,8 +831,24 @@
  const char* key_;
  size_t key_size_;
  size_t buf_size_;
-  char space_[39];  // Avoid allocation for short keys
+  char space_[kInlineBufferSize];  // Avoid allocation for short keys
  bool is_user_key_;
+  // Below variables are only used by user-defined timestamps in MemTable only
+  // feature for iterating keys in an index block or a data block.
+  //
+  // We will alternate between buf_ and secondary_buf_ to hold the key. key_
+  // will be updated accordingly to point to the right one. This is to avoid
+  // an extra copy when we need to copy some shared bytes from the previous key
+  // (delta encoding), and we need to pad a min timestamp at the right location.
+  char space_for_secondary_buf_[kInlineBufferSize];  // Avoid allocation for
+                                                     // short keys
+  char* secondary_buf_;
+  size_t secondary_buf_size_;
+  // Used to track the pieces that together make the whole key. We then copy
+  // these pieces in order either into buf_ or secondary_buf_ depending on
+  // where the previous key is held.
+  std::array<Slice, 5> key_slices_;
+  // End of variables used by user-defined timestamps in MemTable only feature.

  Slice SetKeyImpl(const Slice& key, bool copy) {
    size_t size = key.size();
@@ -824,18 +865,64 @@
    return Slice(key_, key_size_);
  }

+  Slice SetKeyImpl(size_t num_key_slices, size_t total_bytes) {
+    assert(num_key_slices <= 5);
+    char* buf_start = nullptr;
+    if (key_ == buf_) {
+      // If the previous key is in buf_, we copy key_slices_ in order into
+      // secondary_buf_.
+      EnlargeSecondaryBufferIfNeeded(total_bytes);
+      buf_start = secondary_buf_;
+      key_ = secondary_buf_;
+    } else {
+      // Copy key_slices_ in order into buf_.
+ EnlargeBufferIfNeeded(total_bytes); + buf_start = buf_; + key_ = buf_; + } +#ifndef NDEBUG + size_t actual_total_bytes = 0; +#endif // NDEBUG + for (size_t i = 0; i < num_key_slices; i++) { + size_t key_slice_size = key_slices_[i].size(); + memcpy(buf_start, key_slices_[i].data(), key_slice_size); + buf_start += key_slice_size; +#ifndef NDEBUG + actual_total_bytes += key_slice_size; +#endif // NDEBUG + } +#ifndef NDEBUG + assert(actual_total_bytes == total_bytes); +#endif // NDEBUG + key_size_ = total_bytes; + return Slice(key_, key_size_); + } + void ResetBuffer() { + if (key_ == buf_) { + key_size_ = 0; + } if (buf_ != space_) { delete[] buf_; buf_ = space_; } - buf_size_ = sizeof(space_); - key_size_ = 0; + buf_size_ = kInlineBufferSize; + } + + void ResetSecondaryBuffer() { + if (key_ == secondary_buf_) { + key_size_ = 0; + } + if (secondary_buf_ != space_for_secondary_buf_) { + delete[] secondary_buf_; + secondary_buf_ = space_for_secondary_buf_; + } + secondary_buf_size_ = kInlineBufferSize; } // Enlarge the buffer size if needed based on key_size. - // By default, static allocated buffer is used. Once there is a key - // larger than the static allocated buffer, another buffer is dynamically + // By default, inline buffer is used. Once there is a key + // larger than the inline buffer, another buffer is dynamically // allocated, until a larger key buffer is requested. In that case, we // reallocate buffer and delete the old one. void EnlargeBufferIfNeeded(size_t key_size) { @@ -846,23 +933,27 @@ class IterKey { } } + void EnlargeSecondaryBufferIfNeeded(size_t key_size); + void EnlargeBuffer(size_t key_size); void MaybeAddKeyPartsWithTimestamp(const char* slice_data, const size_t slice_sz, bool add_timestamp, - const size_t left_sz, - const std::string& min_timestamp, - std::vector& key_parts, + const size_t left_sz, const size_t ts_sz, + size_t* next_key_slice_idx, bool* ts_added) { + assert(next_key_slice_idx); if (add_timestamp && !*ts_added) { assert(slice_sz >= left_sz); - key_parts.emplace_back(slice_data, left_sz); - key_parts.emplace_back(min_timestamp); - key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz); + key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, left_sz); + key_slices_[(*next_key_slice_idx)++] = Slice(kTsMin, ts_sz); + key_slices_[(*next_key_slice_idx)++] = + Slice(slice_data + left_sz, slice_sz - left_sz); *ts_added = true; } else { - key_parts.emplace_back(slice_data, slice_sz); + key_slices_[(*next_key_slice_idx)++] = Slice(slice_data, slice_sz); } + assert(*next_key_slice_idx <= 5); } }; @@ -936,22 +1027,13 @@ struct RangeTombstone { // User-defined timestamp is enabled, `sk` and `ek` should be user key // with timestamp, `ts` will replace the timestamps in `sk` and // `ek`. - // When `logical_strip_timestamp` is true, the timestamps in `sk` and `ek` - // will be replaced with min timestamp. 
- RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts, - bool logical_strip_timestamp) - : seq_(sn) { + RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts) : seq_(sn) { const size_t ts_sz = ts.size(); assert(ts_sz > 0); pinned_start_key_.reserve(sk.size()); pinned_end_key_.reserve(ek.size()); - if (logical_strip_timestamp) { - AppendUserKeyWithMinTimestamp(&pinned_start_key_, sk, ts_sz); - AppendUserKeyWithMinTimestamp(&pinned_end_key_, ek, ts_sz); - } else { - AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts); - AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts); - } + AppendUserKeyWithDifferentTimestamp(&pinned_start_key_, sk, ts); + AppendUserKeyWithDifferentTimestamp(&pinned_end_key_, ek, ts); start_key_ = pinned_start_key_; end_key_ = pinned_end_key_; ts_ = Slice(pinned_start_key_.data() + sk.size() - ts_sz, ts_sz); diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 0d260fbf5..1c57102c3 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1790,8 +1790,8 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { SstFileWriter sst_file_writer(EnvOptions(), options); std::string file1 = sst_files_dir_ + "file1.sst"; ASSERT_OK(sst_file_writer.Open(file1)); - ASSERT_OK(sst_file_writer.Put("a", "z")); - ASSERT_OK(sst_file_writer.Put("i", "m")); + ASSERT_OK(sst_file_writer.Put("a", "a1")); + ASSERT_OK(sst_file_writer.Put("i", "i1")); ExternalSstFileInfo file1_info; ASSERT_OK(sst_file_writer.Finish(&file1_info)); files.push_back(std::move(file1)); @@ -1800,16 +1800,29 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { SstFileWriter sst_file_writer(EnvOptions(), options); std::string file2 = sst_files_dir_ + "file2.sst"; ASSERT_OK(sst_file_writer.Open(file2)); - ASSERT_OK(sst_file_writer.Put("i", "k")); + ASSERT_OK(sst_file_writer.Put("i", "i2")); ExternalSstFileInfo file2_info; ASSERT_OK(sst_file_writer.Finish(&file2_info)); files.push_back(std::move(file2)); } + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + ASSERT_OK(sst_file_writer.Put("j", "j1")); + ASSERT_OK(sst_file_writer.Put("m", "m1")); + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + files.push_back(std::move(file3)); + } + IngestExternalFileOptions ifo; ASSERT_OK(db_->IngestExternalFile(files, ifo)); - ASSERT_EQ(Get("a"), "z"); - ASSERT_EQ(Get("i"), "k"); + ASSERT_EQ(Get("a"), "a1"); + ASSERT_EQ(Get("i"), "i2"); + ASSERT_EQ(Get("j"), "j1"); + ASSERT_EQ(Get("m"), "m1"); int total_keys = 0; Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1817,10 +1830,355 @@ TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) { ASSERT_OK(iter->status()); total_keys++; } + ASSERT_OK(iter->status()); delete iter; - ASSERT_EQ(total_keys, 2); + ASSERT_EQ(total_keys, 4); - ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + ASSERT_EQ(2, NumTableFilesAtLevel(5)); +} + +TEST_F(ExternalSSTFileBasicTest, AtomicReplaceDataWithStandaloneRangeDeletion) { + Options options = CurrentOptions(); + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); + + std::vector files; + { + // Writes first version of data in range partitioned files. 
+ SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "a1")); + ASSERT_OK(sst_file_writer.Put("b", "b1")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("x", "x1")); + ASSERT_OK(sst_file_writer.Put("y", "y1")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "a1"); + ASSERT_EQ(Get("b"), "b1"); + ASSERT_EQ(Get("x"), "x1"); + ASSERT_EQ(Get("y"), "y1"); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + + { + // Atomically delete old version of data with one range delete file. + // And a new batch of range partitioned files with new version of data. + files.clear(); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.DeleteRange("a", "z")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + ASSERT_OK(sst_file_writer.Put("a", "a2")); + ASSERT_OK(sst_file_writer.Put("b", "b2")); + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + files.push_back(std::move(file3)); + + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + ASSERT_OK(sst_file_writer.Put("x", "x2")); + ASSERT_OK(sst_file_writer.Put("y", "y2")); + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + files.push_back(std::move(file4)); + } + + const Snapshot* snapshot = db_->GetSnapshot(); + + auto seqno_before_ingestion = db_->GetLatestSequenceNumber(); + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + // Overlapping files each occupy one new sequence number. + ASSERT_EQ(db_->GetLatestSequenceNumber(), seqno_before_ingestion + 3); + + // Check old version of data, big range deletion, new version of data are + // on separate levels. 
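The level assertions just below encode the intended post-ingestion shape: the second ingestion's range deletion overlaps the old data, and the new data files overlap the range deletion, so each batch lands on its own level with newer data above older. Roughly, as inferred from the checks that follow:

    L4: [a -> a2, b -> b2]  [x -> x2, y -> y2]   // new data, ingested last
    L5: [DeleteRange("a", "z")]                  // standalone range deletion
    L6: [a -> a1, b -> b1]  [x -> x1, y -> y1]   // old data, first ingestion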
+ ASSERT_EQ(2, NumTableFilesAtLevel(4)); + ASSERT_EQ(1, NumTableFilesAtLevel(5)); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(4)); + ASSERT_EQ(1, NumTableFilesAtLevel(5)); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + + bool compaction_iter_input_checked = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + [&](void* arg) { + size_t* num_input_files = static_cast(arg); + EXPECT_EQ(1, *num_input_files); + compaction_iter_input_checked = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + db_->ReleaseSnapshot(snapshot); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_EQ(0, NumTableFilesAtLevel(6)); + ASSERT_TRUE(compaction_iter_input_checked); + + ASSERT_EQ(Get("a"), "a2"); + ASSERT_EQ(Get("b"), "b2"); + ASSERT_EQ(Get("x"), "x2"); + ASSERT_EQ(Get("y"), "y2"); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileBasicTest, + PartiallyReplaceDataWithOneStandaloneRangeDeletion) { + Options options = CurrentOptions(); + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); + + std::vector files; + { + // Writes first version of data in range partitioned files. + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "a1")); + ASSERT_OK(sst_file_writer.Put("b", "b1")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("x", "x1")); + ASSERT_OK(sst_file_writer.Put("y", "y")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "a1"); + ASSERT_EQ(Get("b"), "b1"); + ASSERT_EQ(Get("x"), "x1"); + ASSERT_EQ(Get("y"), "y"); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + + { + // Partially delete old version of data with one range delete file. And + // add new version of data for deleted range. 
+ files.clear(); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.DeleteRange("a", "y")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + ASSERT_OK(sst_file_writer.Put("a", "a2")); + ASSERT_OK(sst_file_writer.Put("b", "b2")); + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + files.push_back(std::move(file3)); + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + ASSERT_OK(sst_file_writer.Put("h", "h1")); + ASSERT_OK(sst_file_writer.Put("x", "x2")); + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + files.push_back(std::move(file4)); + } + + bool compaction_iter_input_checked = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + [&](void* arg) { + size_t* num_input_files = static_cast(arg); + EXPECT_EQ(2, *num_input_files); + compaction_iter_input_checked = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + ASSERT_TRUE(compaction_iter_input_checked); + + ASSERT_EQ(Get("a"), "a2"); + ASSERT_EQ(Get("b"), "b2"); + ASSERT_EQ(Get("h"), "h1"); + ASSERT_EQ(Get("x"), "x2"); + ASSERT_EQ(Get("y"), "y"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileBasicTest, + PartiallyReplaceDataWithMultipleStandaloneRangeDeletions) { + Options options = CurrentOptions(); + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); + + std::vector files; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "a1")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.Put("h", "h")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + std::string file3 = sst_files_dir_ + "file3.sst"; + ASSERT_OK(sst_file_writer.Open(file3)); + ASSERT_OK(sst_file_writer.Put("x", "x1")); + ExternalSstFileInfo file3_info; + ASSERT_OK(sst_file_writer.Finish(&file3_info)); + files.push_back(std::move(file3)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "a1"); + ASSERT_EQ(Get("h"), "h"); + ASSERT_EQ(Get("x"), "x1"); + ASSERT_EQ(3, NumTableFilesAtLevel(6)); + + { + files.clear(); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file4 = sst_files_dir_ + "file4.sst"; + ASSERT_OK(sst_file_writer.Open(file4)); + ASSERT_OK(sst_file_writer.DeleteRange("a", "b")); + ExternalSstFileInfo file4_info; + ASSERT_OK(sst_file_writer.Finish(&file4_info)); + files.push_back(std::move(file4)); + std::string file5 = sst_files_dir_ + "file5.sst"; + 
ASSERT_OK(sst_file_writer.Open(file5)); + ASSERT_OK(sst_file_writer.DeleteRange("x", "y")); + ExternalSstFileInfo file5_info; + ASSERT_OK(sst_file_writer.Finish(&file5_info)); + files.push_back(std::move(file5)); + std::string file6 = sst_files_dir_ + "file6.sst"; + ASSERT_OK(sst_file_writer.Open(file6)); + ASSERT_OK(sst_file_writer.Put("a", "a2")); + ExternalSstFileInfo file6_info; + ASSERT_OK(sst_file_writer.Finish(&file6_info)); + files.push_back(std::move(file6)); + std::string file7 = sst_files_dir_ + "file7.sst"; + ASSERT_OK(sst_file_writer.Open(file7)); + ASSERT_OK(sst_file_writer.Put("x", "x2")); + ExternalSstFileInfo file7_info; + ASSERT_OK(sst_file_writer.Finish(&file7_info)); + files.push_back(std::move(file7)); + } + + int num_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + [&](void* arg) { + size_t* num_input_files = static_cast(arg); + EXPECT_EQ(1, *num_input_files); + num_compactions += 1; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(2, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + ASSERT_EQ(2, num_compactions); + + ASSERT_EQ(Get("a"), "a2"); + ASSERT_EQ(Get("h"), "h"); + ASSERT_EQ(Get("x"), "x2"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(ExternalSSTFileBasicTest, StandaloneRangeDeletionEndKeyIsExclusive) { + Options options = CurrentOptions(); + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + DestroyAndReopen(options); + + std::vector files; + { + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file1 = sst_files_dir_ + "file1.sst"; + ASSERT_OK(sst_file_writer.Open(file1)); + ASSERT_OK(sst_file_writer.Put("a", "a")); + ASSERT_OK(sst_file_writer.Put("b", "b")); + ExternalSstFileInfo file1_info; + ASSERT_OK(sst_file_writer.Finish(&file1_info)); + files.push_back(std::move(file1)); + } + + IngestExternalFileOptions ifo; + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + ASSERT_EQ(Get("a"), "a"); + ASSERT_EQ(Get("b"), "b"); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + { + // A standalone range deletion with its exclusive end matching the range end + // of file doesn't fully delete it. + files.clear(); + SstFileWriter sst_file_writer(EnvOptions(), options); + std::string file2 = sst_files_dir_ + "file2.sst"; + ASSERT_OK(sst_file_writer.Open(file2)); + ASSERT_OK(sst_file_writer.DeleteRange("a", "b")); + ExternalSstFileInfo file2_info; + ASSERT_OK(sst_file_writer.Finish(&file2_info)); + files.push_back(std::move(file2)); + } + + bool compaction_iter_input_checked = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + [&](void* arg) { + size_t* num_input_files = static_cast(arg); + // Standalone range deletion file for ["a", "b") + file with ["a", "b"]. 
+ EXPECT_EQ(2, *num_input_files); + compaction_iter_input_checked = true; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(db_->IngestExternalFile(files, ifo)); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, NumTableFilesAtLevel(4)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + ASSERT_TRUE(compaction_iter_input_checked); + + ASSERT_EQ(Get("a"), "NOT_FOUND"); + ASSERT_EQ(Get("b"), "b"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { @@ -2045,7 +2403,7 @@ TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) { ifo.fail_if_not_bottommost_level = true; ifo.snapshot_consistency = true; const Status s = db_->IngestExternalFile({file_path}, ifo); - ASSERT_TRUE(s.IsTryAgain()); + ASSERT_TRUE(s.ok()); } // Test level compaction diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 7e5a97562..63a5f6fc8 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -67,7 +67,6 @@ Status ExternalSstFileIngestionJob::Prepare( files_to_ingest_.emplace_back(std::move(file_to_ingest)); } - const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); auto num_files = files_to_ingest_.size(); if (num_files == 0) { return Status::InvalidArgument("The list of files is empty"); @@ -78,16 +77,12 @@ Status ExternalSstFileIngestionJob::Prepare( sorted_files.push_back(&files_to_ingest_[i]); } - std::sort( - sorted_files.begin(), sorted_files.end(), - [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) { - return sstableKeyCompare(ucmp, info1->smallest_internal_key, - info2->smallest_internal_key) < 0; - }); + std::sort(sorted_files.begin(), sorted_files.end(), file_range_checker_); for (size_t i = 0; i + 1 < num_files; i++) { - if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key, - sorted_files[i + 1]->smallest_internal_key) >= 0) { + if (file_range_checker_.OverlapsWithPrev(sorted_files[i], + sorted_files[i + 1], + /* ranges_sorted= */ true)) { files_overlap_ = true; break; } @@ -100,7 +95,7 @@ Status ExternalSstFileIngestionJob::Prepare( "behind mode."); } - if (ucmp->timestamp_size() > 0 && files_overlap_) { + if (ucmp_->timestamp_size() > 0 && files_overlap_) { return Status::NotSupported( "Files with overlapping ranges cannot be ingested to column " "family with user-defined timestamp enabled."); @@ -336,9 +331,35 @@ Status ExternalSstFileIngestionJob::Prepare( } } + if (status.ok()) { + DivideInputFilesIntoBatches(); + } + return status; } +void ExternalSstFileIngestionJob::DivideInputFilesIntoBatches() { + if (!files_overlap_) { + // No overlap, treat as one batch without the need of tracking overall batch + // range. 
+ file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ false); + for (auto& file : files_to_ingest_) { + file_batches_to_ingest_.back().AddFile(&file, file_range_checker_); + } + return; + } + + file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); + for (auto& file : files_to_ingest_) { + if (file_range_checker_.OverlapsWithPrev(&file_batches_to_ingest_.back(), + &file, + /* ranges_sorted= */ false)) { + file_batches_to_ingest_.emplace_back(/* _track_batch_range= */ true); + } + file_batches_to_ingest_.back().AddFile(&file, file_range_checker_); + } +} + Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, SuperVersion* super_version) { size_t n = files_to_ingest_.size(); @@ -353,9 +374,7 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, if (!ingestion_options_.allow_blocking_flush) { status = Status::InvalidArgument("External file requires flush"); } - auto ucmp = cfd_->user_comparator(); - assert(ucmp); - if (ucmp->timestamp_size() > 0) { + if (ucmp_->timestamp_size() > 0) { status = Status::InvalidArgument( "Column family enables user-defined timestamps, please make " "sure the key range (without timestamp) of external file does not " @@ -397,14 +416,39 @@ Status ExternalSstFileIngestionJob::Run() { edit_.SetColumnFamily(cfd_->GetID()); // The levels that the files will be ingested into - for (IngestedFileInfo& f : files_to_ingest_) { + std::optional prev_batch_uppermost_level; + for (auto& batch : file_batches_to_ingest_) { + int batch_uppermost_level = 0; + status = AssignLevelsForOneBatch(batch, super_version, force_global_seqno, + &last_seqno, &batch_uppermost_level, + prev_batch_uppermost_level); + if (!status.ok()) { + return status; + } + + prev_batch_uppermost_level = batch_uppermost_level; + } + + CreateEquivalentFileIngestingCompactions(); + return status; +} + +Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch( + FileBatchInfo& batch, SuperVersion* super_version, bool force_global_seqno, + SequenceNumber* last_seqno, int* batch_uppermost_level, + std::optional prev_batch_uppermost_level) { + Status status; + assert(batch_uppermost_level); + *batch_uppermost_level = std::numeric_limits::max(); + for (IngestedFileInfo* file : batch.files) { + assert(file); SequenceNumber assigned_seqno = 0; if (ingestion_options_.ingest_behind) { - status = CheckLevelForIngestedBehindFile(&f); + status = CheckLevelForIngestedBehindFile(file); } else { status = AssignLevelAndSeqnoForIngestedFile( super_version, force_global_seqno, cfd_->ioptions()->compaction_style, - last_seqno, &f, &assigned_seqno); + *last_seqno, file, &assigned_seqno, prev_batch_uppermost_level); } // Modify the smallest/largest internal key to include the sequence number @@ -413,38 +457,38 @@ Status ExternalSstFileIngestionJob::Run() { // exclusive endpoint. 
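Stepping back to `DivideInputFilesIntoBatches` above, a small worked example of the rule: with input files in user order F1 = [a, c], F2 = [b, d], F3 = [e, f], batch 1 starts with F1; F2 overlaps batch 1's accumulated range, so it opens batch 2; F3 does not overlap batch 2's range [b, d], so it joins batch 2 and extends that batch's range to [b, f]. Files never overlap within a batch, while ranges may overlap across batches, which is why each batch's uppermost output level constrains where the next batch may be placed.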
ParsedInternalKey smallest_parsed, largest_parsed; if (status.ok()) { - status = ParseInternalKey(*f.smallest_internal_key.rep(), + status = ParseInternalKey(*(file->smallest_internal_key.rep()), &smallest_parsed, false /* log_err_key */); } if (status.ok()) { - status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed, - false /* log_err_key */); + status = ParseInternalKey(*(file->largest_internal_key.rep()), + &largest_parsed, false /* log_err_key */); } if (!status.ok()) { return status; } if (smallest_parsed.sequence == 0 && assigned_seqno != 0) { - UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno, + UpdateInternalKey(file->smallest_internal_key.rep(), assigned_seqno, smallest_parsed.type); } if (largest_parsed.sequence == 0 && assigned_seqno != 0) { - UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno, + UpdateInternalKey(file->largest_internal_key.rep(), assigned_seqno, largest_parsed.type); } - status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno); + status = AssignGlobalSeqnoForIngestedFile(file, assigned_seqno); if (!status.ok()) { return status; } TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run", &assigned_seqno); - assert(assigned_seqno == 0 || assigned_seqno == last_seqno + 1); - if (assigned_seqno > last_seqno) { - last_seqno = assigned_seqno; + assert(assigned_seqno == 0 || assigned_seqno == *last_seqno + 1); + if (assigned_seqno > *last_seqno) { + *last_seqno = assigned_seqno; ++consumed_seqno_count_; } - status = GenerateChecksumForIngestedFile(&f); + status = GenerateChecksumForIngestedFile(file); if (!status.ok()) { return status; } @@ -459,31 +503,40 @@ Status ExternalSstFileIngestionJob::Run() { static_cast(temp_current_time); } uint64_t tail_size = 0; - bool contain_no_data_blocks = f.table_properties.num_entries > 0 && - (f.table_properties.num_entries == - f.table_properties.num_range_deletions); - if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { - uint64_t file_size = f.fd.GetFileSize(); - assert(f.table_properties.tail_start_offset <= file_size); - tail_size = file_size - f.table_properties.tail_start_offset; + bool contain_no_data_blocks = file->table_properties.num_entries > 0 && + (file->table_properties.num_entries == + file->table_properties.num_range_deletions); + if (file->table_properties.tail_start_offset > 0 || + contain_no_data_blocks) { + uint64_t file_size = file->fd.GetFileSize(); + assert(file->table_properties.tail_start_offset <= file_size); + tail_size = file_size - file->table_properties.tail_start_offset; } + bool marked_for_compaction = + file->table_properties.num_range_deletions == 1 && + (file->table_properties.num_entries == + file->table_properties.num_range_deletions); FileMetaData f_metadata( - f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), - f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, - f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, - oldest_ancester_time, current_time, + file->fd.GetNumber(), file->fd.GetPathId(), file->fd.GetFileSize(), + file->smallest_internal_key, file->largest_internal_key, + file->assigned_seqno, file->assigned_seqno, false, + file->file_temperature, kInvalidBlobFileNumber, oldest_ancester_time, + current_time, ingestion_options_.ingest_behind ? 
kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), - f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size, - f.user_defined_timestamps_persisted); - f_metadata.temperature = f.file_temperature; - edit_.AddFile(f.picked_level, f_metadata); + file->file_checksum, file->file_checksum_func_name, file->unique_id, 0, + tail_size, file->user_defined_timestamps_persisted); + f_metadata.temperature = file->file_temperature; + f_metadata.marked_for_compaction = marked_for_compaction; + edit_.AddFile(file->picked_level, f_metadata); + + *batch_uppermost_level = + std::min(*batch_uppermost_level, file->picked_level); } - CreateEquivalentFileIngestingCompactions(); - return status; + return Status::OK(); } void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { @@ -519,20 +572,17 @@ void ExternalSstFileIngestionJob::CreateEquivalentFileIngestingCompactions() { file_ingesting_compactions_.push_back(new Compaction( cfd_->current()->storage_info(), *cfd_->ioptions(), mutable_cf_options, mutable_db_options_, {input}, output_level, - MaxFileSizeForLevel( - mutable_cf_options, output_level, - cfd_->ioptions()->compaction_style) /* output file size - limit, - * not applicable - */ - , + /* output file size limit not applicable */ + MaxFileSizeForLevel(mutable_cf_options, output_level, + cfd_->ioptions()->compaction_style), LLONG_MAX /* max compaction bytes, not applicable */, 0 /* output path ID, not applicable */, mutable_cf_options.compression, mutable_cf_options.compression_opts, mutable_cf_options.default_write_temperature, 0 /* max_subcompaction, not applicable */, - {} /* grandparents, not applicable */, false /* is manual */, - "" /* trim_ts */, -1 /* score, not applicable */, + {} /* grandparents, not applicable */, + std::nullopt /* earliest_snapshot */, nullptr /* snapshot_checker */, + false /* is manual */, "" /* trim_ts */, -1 /* score, not applicable */, false /* is deletion compaction, not applicable */, files_overlap_ /* l0_files_might_overlap, not applicable */, CompactionReason::kExternalSstIngestion)); @@ -679,7 +729,10 @@ Status ExternalSstFileIngestionJob::ResetTableReader( new RandomAccessFileReader(std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); table_reader->reset(); - status = cfd_->ioptions()->table_factory->NewTableReader( + ReadOptions ro; + ro.fill_cache = ingestion_options_.fill_cache; + status = sv->mutable_cf_options.table_factory->NewTableReader( + ro, TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), @@ -691,7 +744,9 @@ Status ExternalSstFileIngestionJob::ResetTableReader( /*cur_file_num*/ new_file_number, /* unique_id */ {}, /* largest_seqno */ 0, /* tail_size */ 0, user_defined_timestamps_persisted), - std::move(sst_file_reader), file_to_ingest->file_size, table_reader); + std::move(sst_file_reader), file_to_ingest->file_size, table_reader, + // No need to prefetch index/filter if caching is not needed. 
+ /*prefetch_index_and_filter_in_cache=*/ingestion_options_.fill_cache); return status; } @@ -707,6 +762,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties( // Get table version auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion); if (version_iter == uprops.end()) { + assert(!SstFileWriter::CreatedBySstFileWriter(*props)); if (!ingestion_options_.allow_db_generated_files) { return Status::Corruption("External file version not found"); } else { @@ -715,6 +771,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties( file_to_ingest->version = 0; } } else { + assert(SstFileWriter::CreatedBySstFileWriter(*props)); file_to_ingest->version = DecodeFixed32(version_iter->second.c_str()); } @@ -787,9 +844,7 @@ Status ExternalSstFileIngestionJob::SanityCheckTableProperties( // `TableReader` is initialized with `user_defined_timestamps_persisted` flag // to be true. If its value changed to false after this sanity check, we // need to reset the `TableReader`. - auto ucmp = cfd_->user_comparator(); - assert(ucmp); - if (ucmp->timestamp_size() > 0 && + if (ucmp_->timestamp_size() > 0 && !file_to_ingest->user_defined_timestamps_persisted) { s = ResetTableReader(external_file, new_file_number, file_to_ingest->user_defined_timestamps_persisted, sv, @@ -839,6 +894,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; + ro.fill_cache = ingestion_options_.fill_cache; status = table_reader->VerifyChecksum( ro, TableReaderCaller::kExternalSSTIngestion); if (!status.ok()) { @@ -849,16 +905,12 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( ParsedInternalKey key; // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; + ro.fill_cache = ingestion_options_.fill_cache; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); // Get first (smallest) and last (largest) key from file. - file_to_ingest->smallest_internal_key = - InternalKey("", 0, ValueType::kTypeValue); - file_to_ingest->largest_internal_key = - InternalKey("", 0, ValueType::kTypeValue); - bool bounds_set = false; bool allow_data_in_errors = db_options_.allow_data_in_errors; iter->SeekToFirst(); if (iter->Valid()) { @@ -874,7 +926,8 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->smallest_internal_key.SetFrom(key); Slice largest; - if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) { + if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") == + 0) { // PlainTable iterator does not support SeekToLast(). largest = iter->key(); for (; iter->Valid(); iter->Next()) { @@ -908,8 +961,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( return Status::Corruption("External file has non zero sequence number"); } file_to_ingest->largest_internal_key.SetFrom(key); - - bounds_set = true; } else if (!iter->status().ok()) { return iter->status(); } @@ -946,7 +997,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( table_reader->NewRangeTombstoneIterator(ro)); // We may need to adjust these key bounds, depending on whether any range // deletion tombstones extend past them. 
- const Comparator* ucmp = cfd_->user_comparator(); if (range_del_iter != nullptr) { for (range_del_iter->SeekToFirst(); range_del_iter->Valid(); range_del_iter->Next()) { @@ -962,24 +1012,13 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( "number."); } RangeTombstone tombstone(key, range_del_iter->value()); - - InternalKey start_key = tombstone.SerializeKey(); - if (!bounds_set || - sstableKeyCompare(ucmp, start_key, - file_to_ingest->smallest_internal_key) < 0) { - file_to_ingest->smallest_internal_key = start_key; - } - InternalKey end_key = tombstone.SerializeEndKey(); - if (!bounds_set || - sstableKeyCompare(ucmp, end_key, - file_to_ingest->largest_internal_key) > 0) { - file_to_ingest->largest_internal_key = end_key; - } - bounds_set = true; + file_range_checker_.MaybeUpdateRange(tombstone.SerializeKey(), + tombstone.SerializeEndKey(), + file_to_ingest); } } - const size_t ts_sz = ucmp->timestamp_size(); + const size_t ts_sz = ucmp_->timestamp_size(); Slice smallest = file_to_ingest->smallest_internal_key.user_key(); Slice largest = file_to_ingest->largest_internal_key.user_key(); if (ts_sz > 0) { @@ -1008,16 +1047,19 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style, SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest, - SequenceNumber* assigned_seqno) { + SequenceNumber* assigned_seqno, + std::optional prev_batch_uppermost_level) { Status status; *assigned_seqno = 0; - auto ucmp = cfd_->user_comparator(); - const size_t ts_sz = ucmp->timestamp_size(); + const size_t ts_sz = ucmp_->timestamp_size(); + assert(!prev_batch_uppermost_level.has_value() || + prev_batch_uppermost_level.value() < cfd_->NumberLevels()); + bool must_assign_to_l0 = prev_batch_uppermost_level.has_value() && + prev_batch_uppermost_level.value() == 0; if (force_global_seqno || files_overlap_ || - compaction_style == kCompactionStyleFIFO) { + compaction_style == kCompactionStyleFIFO || must_assign_to_l0) { *assigned_seqno = last_seqno + 1; - // If files overlap, we have to ingest them at level 0. - if (files_overlap_ || compaction_style == kCompactionStyleFIFO) { + if (compaction_style == kCompactionStyleFIFO || must_assign_to_l0) { assert(ts_sz == 0); file_to_ingest->picked_level = 0; if (ingestion_options_.fail_if_not_bottommost_level && @@ -1034,11 +1076,16 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( Arena arena; // TODO: plumb Env::IOActivity, Env::IOPriority ReadOptions ro; + ro.fill_cache = ingestion_options_.fill_cache; ro.total_order_seek = true; int target_level = 0; auto* vstorage = cfd_->current()->storage_info(); + assert(!must_assign_to_l0); + int exclusive_end_level = prev_batch_uppermost_level.has_value() + ? 
prev_batch_uppermost_level.value() + : cfd_->NumberLevels(); - for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) { + for (int lvl = 0; lvl < exclusive_end_level; lvl++) { if (lvl > 0 && lvl < vstorage->base_level()) { continue; } @@ -1065,8 +1112,6 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( overlap_with_db = true; break; } - } else if (compaction_style == kCompactionStyleUniversal) { - continue; } // We don't overlap with any keys in this level, but we still need to check diff --git a/db/external_sst_file_ingestion_job.h b/db/external_sst_file_ingestion_job.h index 46d43fd21..df66e9e91 100644 --- a/db/external_sst_file_ingestion_job.h +++ b/db/external_sst_file_ingestion_job.h @@ -25,13 +25,74 @@ namespace ROCKSDB_NAMESPACE { class Directories; class SystemClock; -struct IngestedFileInfo { - // External file path - std::string external_file_path; - // Smallest internal key in external file +struct KeyRangeInfo { + // Smallest internal key in an external file or for a batch of external files. InternalKey smallest_internal_key; - // Largest internal key in external file + // Largest internal key in an external file or for a batch of external files. InternalKey largest_internal_key; + + bool empty() const { + return smallest_internal_key.size() == 0 && + largest_internal_key.size() == 0; + } +}; + +// Helper class to apply SST file key range checks to the external files. +class ExternalFileRangeChecker { + public: + explicit ExternalFileRangeChecker(const Comparator* ucmp) : ucmp_(ucmp) {} + + // Operator used for sorting ranges. + bool operator()(const KeyRangeInfo* prev_range, + const KeyRangeInfo* range) const { + assert(prev_range); + assert(range); + return sstableKeyCompare(ucmp_, prev_range->smallest_internal_key, + range->smallest_internal_key) < 0; + } + + // Check whether `range` overlaps with `prev_range`. `ranges_sorted` can be + // set to true when the inputs are already sorted based on the sorting logic + // provided by this checker's operator(), which can help simplify the check. + bool OverlapsWithPrev(const KeyRangeInfo* prev_range, + const KeyRangeInfo* range, + bool ranges_sorted = false) const { + assert(prev_range); + assert(range); + if (prev_range->empty() || range->empty()) { + return false; + } + if (ranges_sorted) { + return sstableKeyCompare(ucmp_, prev_range->largest_internal_key, + range->smallest_internal_key) >= 0; + } + + return sstableKeyCompare(ucmp_, prev_range->largest_internal_key, + range->smallest_internal_key) >= 0 && + sstableKeyCompare(ucmp_, prev_range->smallest_internal_key, + range->largest_internal_key) <= 0; + } + + void MaybeUpdateRange(const InternalKey& start_key, + const InternalKey& end_key, KeyRangeInfo* range) const { + assert(range); + if (range->smallest_internal_key.size() == 0 || + sstableKeyCompare(ucmp_, start_key, range->smallest_internal_key) < 0) { + range->smallest_internal_key = start_key; + } + if (range->largest_internal_key.size() == 0 || + sstableKeyCompare(ucmp_, end_key, range->largest_internal_key) > 0) { + range->largest_internal_key = end_key; + } + } + + private: + const Comparator* ucmp_; +}; + +struct IngestedFileInfo : public KeyRangeInfo { + // External file path + std::string external_file_path; // NOTE: use below two fields for all `*Overlap*` types of checks instead of // smallest_internal_key.user_key() and largest_internal_key.user_key(). // The smallest / largest user key contained in the file for key range checks. 
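One detail worth spelling out in `OverlapsWithPrev` above: when the ranges are already sorted by smallest key, `prev_range->smallest <= range->smallest` is known, so the two-sided interval test collapses to the single `largest >= smallest` comparison. A generic sketch of the same logic on integer intervals (hypothetical helper, for illustration only):

    struct Interval {
      int lo;
      int hi;  // inclusive
    };

    // Two intervals overlap iff each one starts no later than the other ends.
    inline bool Overlaps(const Interval& a, const Interval& b) {
      return a.hi >= b.lo && a.lo <= b.hi;
    }

    // If we also know a.lo <= b.lo (sorted inputs), then a.lo <= b.hi follows
    // for any non-empty b, and a single comparison suffices.
    inline bool OverlapsSorted(const Interval& a, const Interval& b) {
      return a.hi >= b.lo;
    }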
@@ -94,6 +155,30 @@ struct IngestedFileInfo {
  bool user_defined_timestamps_persisted = true;
};

+// A batch of files.
+struct FileBatchInfo : public KeyRangeInfo {
+  autovector<IngestedFileInfo*> files;
+  // When true, `smallest_internal_key` and `largest_internal_key` will be
+  // tracked and updated as new files get added via `AddFile`. When false, we
+  // bypass this tracking. This is used when all input external files have
+  // already been checked and do not overlap, and they just need to be added
+  // into one default batch.
+  bool track_batch_range;
+
+  void AddFile(IngestedFileInfo* file,
+               const ExternalFileRangeChecker& key_range_checker) {
+    assert(file);
+    files.push_back(file);
+    if (track_batch_range) {
+      key_range_checker.MaybeUpdateRange(file->smallest_internal_key,
+                                         file->largest_internal_key, this);
+    }
+  }
+
+  explicit FileBatchInfo(bool _track_batch_range)
+      : track_batch_range(_track_batch_range) {}
+};
+
class ExternalSstFileIngestionJob {
 public:
  ExternalSstFileIngestionJob(
@@ -108,6 +193,8 @@
        fs_(db_options.fs, io_tracer),
        versions_(versions),
        cfd_(cfd),
+        ucmp_(cfd ? cfd->user_comparator() : nullptr),
+        file_range_checker_(ucmp_),
        db_options_(db_options),
        mutable_db_options_(mutable_db_options),
        env_options_(env_options),
@@ -119,6 +206,8 @@
        consumed_seqno_count_(0),
        io_tracer_(io_tracer) {
    assert(directories != nullptr);
+    assert(cfd_);
+    assert(ucmp_);
  }

  ~ExternalSstFileIngestionJob() { UnregisterRange(); }
@@ -194,15 +283,38 @@
                                          IngestedFileInfo* file_to_ingest,
                                          SuperVersion* sv);

+  // If the input files' key ranges overlap each other, this function divides
+  // them, in the user-specified order, into multiple batches such that the
+  // files within a batch do not overlap with each other, though key ranges
+  // could overlap between batches.
+  // If the input files' key ranges don't overlap each other, they always form
+  // just one batch.
+  void DivideInputFilesIntoBatches();
+
+  // Assign levels for the files in one batch. The files within one batch do
+  // not overlap, and we assign a level to each file one after another.
+  // If `prev_batch_uppermost_level` is specified, all files in this batch will
+  // be assigned to levels that are higher than `prev_batch_uppermost_level`.
+  // The uppermost level used by this batch of files is tracked too, so that it
+  // can be used by the next batch.
+  // REQUIRES: Mutex held
+  Status AssignLevelsForOneBatch(FileBatchInfo& batch,
+                                 SuperVersion* super_version,
+                                 bool force_global_seqno,
+                                 SequenceNumber* last_seqno,
+                                 int* batch_uppermost_level,
+                                 std::optional<int> prev_batch_uppermost_level);
+
  // Assign `file_to_ingest` the appropriate sequence number and the lowest
  // possible level that it can be ingested to according to compaction_style.
+  // If `prev_batch_uppermost_level` is specified, the file will only be
+  // assigned to levels that are higher than `prev_batch_uppermost_level`.
// REQUIRES: Mutex held - Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv, - bool force_global_seqno, - CompactionStyle compaction_style, - SequenceNumber last_seqno, - IngestedFileInfo* file_to_ingest, - SequenceNumber* assigned_seqno); + Status AssignLevelAndSeqnoForIngestedFile( + SuperVersion* sv, bool force_global_seqno, + CompactionStyle compaction_style, SequenceNumber last_seqno, + IngestedFileInfo* file_to_ingest, SequenceNumber* assigned_seqno, + std::optional prev_batch_uppermost_level); // File that we want to ingest behind always goes to the lowest level; // we just check that it fits in the level, that DB allows ingest_behind, @@ -237,11 +349,14 @@ class ExternalSstFileIngestionJob { FileSystemPtr fs_; VersionSet* versions_; ColumnFamilyData* cfd_; + const Comparator* ucmp_; + ExternalFileRangeChecker file_range_checker_; const ImmutableDBOptions& db_options_; const MutableDBOptions& mutable_db_options_; const EnvOptions& env_options_; SnapshotList* db_snapshots_; autovector files_to_ingest_; + std::vector file_batches_to_ingest_; const IngestExternalFileOptions& ingestion_options_; Directories* directories_; EventLogger* event_logger_; diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index 17793c493..de261af7a 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#include + #include #include @@ -150,7 +152,7 @@ class ExternalSSTFileTest bool verify_checksums_before_ingest = true, bool ingest_behind = false, bool sort_data = false, std::map* true_data = nullptr, - ColumnFamilyHandle* cfh = nullptr) { + ColumnFamilyHandle* cfh = nullptr, bool fill_cache = false) { // Generate a file id if not provided if (file_id == -1) { file_id = last_file_id_ + 1; @@ -194,6 +196,7 @@ class ExternalSSTFileTest ifo.write_global_seqno = allow_global_seqno ? 
if (cfh) { s = db_->IngestExternalFile(cfh, {file_path}, ifo); } else { s = db_->IngestExternalFile({file_path}, ifo); } @@ -267,15 +270,15 @@ class ExternalSSTFileTest bool verify_checksums_before_ingest = true, bool ingest_behind = false, bool sort_data = false, std::map<std::string, std::string>* true_data = nullptr, - ColumnFamilyHandle* cfh = nullptr) { + ColumnFamilyHandle* cfh = nullptr, bool fill_cache = false) { std::vector<std::pair<std::string, std::string>> file_data; for (auto& k : keys) { file_data.emplace_back(Key(k), Key(k) + std::to_string(file_id)); } - return GenerateAndAddExternalFile(options, file_data, file_id, - allow_global_seqno, write_global_seqno, - verify_checksums_before_ingest, - ingest_behind, sort_data, true_data, cfh); + return GenerateAndAddExternalFile( + options, file_data, file_id, allow_global_seqno, write_global_seqno, + verify_checksums_before_ingest, ingest_behind, sort_data, true_data, + cfh, fill_cache); } Status DeprecatedAddFile(const std::vector<std::string>& files, @@ -314,6 +317,49 @@ TEST_F(ExternalSSTFileTest, ComparatorMismatch) { ASSERT_NOK(DeprecatedAddFile({file})); } +TEST_F(ExternalSSTFileTest, NoBlockCache) { + LRUCacheOptions co; + co.capacity = 32 << 20; + std::shared_ptr<Cache> cache = NewLRUCache(co); + BlockBasedTableOptions table_options; + table_options.block_cache = cache; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.cache_index_and_filter_blocks = true; + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + + size_t usage_before_ingestion = cache->GetUsage(); + std::map<std::string, std::string> true_data; + // Ingest with fill_cache = true + ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}, -1, false, false, true, + false, false, &true_data, nullptr, + /*fill_cache=*/true)); + ASSERT_EQ(FilesPerLevel(), "0,0,0,0,0,0,1"); + EXPECT_GT(cache->GetUsage(), usage_before_ingestion); + + TablePropertiesCollection tp; + ASSERT_OK(db_->GetPropertiesOfAllTables(&tp)); + for (const auto& entry : tp) { + EXPECT_GT(entry.second->index_size, 0); + EXPECT_GT(entry.second->filter_size, 0); + } + + usage_before_ingestion = cache->GetUsage(); + // Ingest with fill_cache = false + ASSERT_OK(GenerateAndAddExternalFile(options, {3, 4}, -1, false, false, true, + false, false, &true_data, nullptr, + /*fill_cache=*/false)); + EXPECT_EQ(usage_before_ingestion, cache->GetUsage()); + + tp.clear(); + ASSERT_OK(db_->GetPropertiesOfAllTables(&tp)); + for (const auto& entry : tp) { + EXPECT_GT(entry.second->index_size, 0); + EXPECT_GT(entry.second->filter_size, 0); + } +} + TEST_F(ExternalSSTFileTest, Basic) { do { Options options = CurrentOptions(); @@ -1941,9 +1987,9 @@ TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedUniversal) { options, file_data, -1, true, write_global_seqno, verify_checksums_before_ingest, false, false, &true_data)); - // This file overlap with files in L4, we will ingest it into the last - // non-overlapping and non-empty level, in this case, it's L0. - ASSERT_EQ("3,0,0,0,3", FilesPerLevel()); + // This file overlaps with files in L4, so we will ingest it into the + // closest non-overlapping level, in this case, L3.
+ ASSERT_EQ("2,0,0,1,3", FilesPerLevel()); size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); diff --git a/db/flush_job.cc b/db/flush_job.cc index 8206bd298..f7d585a3b 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -858,6 +858,12 @@ Status FlushJob::WriteLevel0Table() { meta_.temperature = mutable_cf_options_.default_write_temperature; file_options_.temperature = meta_.temperature; + const auto* ucmp = cfd_->internal_comparator().user_comparator(); + assert(ucmp); + const size_t ts_sz = ucmp->timestamp_size(); + const bool logical_strip_timestamp = + ts_sz > 0 && !cfd_->ioptions()->persist_user_defined_timestamps; + std::vector blob_file_additions; { @@ -893,10 +899,21 @@ Status FlushJob::WriteLevel0Table() { db_options_.info_log, "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n", cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, - &arena, /*prefix_extractor=*/nullptr)); - auto* range_del_iter = m->NewRangeTombstoneIterator( - ro, kMaxSequenceNumber, true /* immutable_memtable */); + if (logical_strip_timestamp) { + memtables.push_back(m->NewTimestampStrippingIterator( + ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr, ts_sz)); + } else { + memtables.push_back( + m->NewIterator(ro, /*seqno_to_time_mapping=*/nullptr, &arena, + /*prefix_extractor=*/nullptr)); + } + auto* range_del_iter = + logical_strip_timestamp + ? m->NewTimestampStrippingRangeTombstoneIterator( + ro, kMaxSequenceNumber, ts_sz) + : m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber, + true /* immutable_memtable */); if (range_del_iter != nullptr) { range_del_iters.emplace_back(range_del_iter); } diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index ce21ec648..ee8e7d14a 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -874,9 +874,15 @@ TEST_P(FlushJobTimestampTest, NoKeyExpired) { expected_full_history_ts_low = full_history_ts_low; } InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); - InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); - CheckFileMetaData(cfd, smallest, largest, &fmeta); - CheckFullHistoryTsLow(cfd, expected_full_history_ts_low); + if (!persist_udt_) { + InternalKey largest(largest_key, curr_seq_ - 1, ValueType::kTypeValue); + CheckFileMetaData(cfd, smallest, largest, &fmeta); + CheckFullHistoryTsLow(cfd, expected_full_history_ts_low); + } else { + InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); + CheckFileMetaData(cfd, smallest, largest, &fmeta); + CheckFullHistoryTsLow(cfd, expected_full_history_ts_low); + } } job_context.Clean(); ASSERT_TRUE(to_delete.empty()); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 0bf7c15ab..f111e2fc2 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -32,11 +32,11 @@ namespace ROCKSDB_NAMESPACE { // iter.Next() class ForwardLevelIterator : public InternalIterator { public: - ForwardLevelIterator( - const ColumnFamilyData* const cfd, const ReadOptions& read_options, - const std::vector& files, - const std::shared_ptr& prefix_extractor, - bool allow_unprepared_value, uint8_t block_protection_bytes_per_key) + ForwardLevelIterator(const ColumnFamilyData* const cfd, + const ReadOptions& read_options, + const std::vector& files, + const MutableCFOptions& mutable_cf_options, + bool allow_unprepared_value) : cfd_(cfd), read_options_(read_options), files_(files), @@ -44,9 +44,8 @@ class ForwardLevelIterator : 
public InternalIterator { file_index_(std::numeric_limits::max()), file_iter_(nullptr), pinned_iters_mgr_(nullptr), - prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value), - block_protection_bytes_per_key_(block_protection_bytes_per_key) { + mutable_cf_options_(mutable_cf_options), + allow_unprepared_value_(allow_unprepared_value) { status_.PermitUncheckedError(); // Allow uninitialized status through } @@ -83,13 +82,12 @@ class ForwardLevelIterator : public InternalIterator { read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - prefix_extractor_, /*table_reader_ptr=*/nullptr, + mutable_cf_options_, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_, - block_protection_bytes_per_key_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -167,6 +165,10 @@ class ForwardLevelIterator : public InternalIterator { assert(valid_); return file_iter_->value(); } + uint64_t write_unix_time() const override { + assert(valid_); + return file_iter_->write_unix_time(); + } Status status() const override { if (!status_.ok()) { return status_; @@ -210,10 +212,9 @@ class ForwardLevelIterator : public InternalIterator { Status status_; InternalIterator* file_iter_; PinnedIteratorsManager* pinned_iters_mgr_; - // Kept alive by ForwardIterator::sv_->mutable_cf_options - const std::shared_ptr& prefix_extractor_; + const MutableCFOptions& mutable_cf_options_; + const bool allow_unprepared_value_; - const uint8_t block_protection_bytes_per_key_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -746,14 +747,13 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0, read_options_.ignore_range_deletions ? nullptr : &range_del_agg, - sv_->mutable_cf_options.prefix_extractor, + sv_->mutable_cf_options, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_, - sv_->mutable_cf_options.block_protection_bytes_per_key)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } BuildLevelIterators(vstorage, sv_); current_ = nullptr; @@ -834,14 +834,13 @@ void ForwardIterator::RenewIterators() { read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0_files_new[inew], read_options_.ignore_range_deletions ? 
nullptr : &range_del_agg, - svnew->mutable_cf_options.prefix_extractor, + svnew->mutable_cf_options, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_, - svnew->mutable_cf_options.block_protection_bytes_per_key)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); } for (auto* f : l0_iters_) { @@ -884,9 +883,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, } } else { level_iters_.push_back(new ForwardLevelIterator( - cfd_, read_options_, level_files, - sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_, - sv->mutable_cf_options.block_protection_bytes_per_key)); + cfd_, read_options_, level_files, sv->mutable_cf_options, + allow_unprepared_value_)); } } } @@ -901,15 +899,13 @@ void ForwardIterator::ResetIncompleteIterators() { DeleteIterator(l0_iters_[i]); l0_iters_[i] = cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), - *l0_files[i], /*range_del_agg=*/nullptr, - sv_->mutable_cf_options.prefix_extractor, + *l0_files[i], /*range_del_agg=*/nullptr, sv_->mutable_cf_options, /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_, - sv_->mutable_cf_options.block_protection_bytes_per_key); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 270f889a9..73ea9f5c8 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -329,7 +329,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // TODO(yuzhangyu): User-defined timestamps doesn't support importing column // family. Pass in the correct `user_defined_timestamps_persisted` flag for // creating `TableReaderOptions` when the support is there. - status = cfd_->ioptions()->table_factory->NewTableReader( + status = sv->mutable_cf_options.table_factory->NewTableReader( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), @@ -371,7 +371,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( if (iter->Valid()) { file_to_import->smallest_internal_key.DecodeFrom(iter->key()); Slice largest; - if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) { + if (strcmp(sv->mutable_cf_options.table_factory->Name(), "PlainTable") == + 0) { // PlainTable iterator does not support SeekToLast(). 
largest = iter->key(); for (; iter->Valid(); iter->Next()) { diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 89fdbb7e3..c659ba6ae 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -951,6 +951,8 @@ TEST_F(ImportColumnFamilyTest, AssignEpochNumberToMultipleCF) { Options options = CurrentOptions(); options.level_compaction_dynamic_level_bytes = true; options.max_background_jobs = 8; + // Always allow parallel compaction + options.soft_pending_compaction_bytes_limit = 10; env_->SetBackgroundThreads(2, Env::LOW); env_->SetBackgroundThreads(0, Env::BOTTOM); CreateAndReopenWithCF({"CF1", "CF2"}, options); diff --git a/db/internal_stats.cc b/db/internal_stats.cc index f4447591e..8baa5b18d 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1495,8 +1495,10 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, } Cache* InternalStats::GetBlockCacheForStats() { - auto* table_factory = cfd_->ioptions()->table_factory.get(); + // NOTE: called during startup before GetCurrentMutableCFOptions() is ready + auto* table_factory = cfd_->GetLatestMutableCFOptions()->table_factory.get(); assert(table_factory != nullptr); + // FIXME: need to use a shared_ptr if/when block_cache is going to be mutable return table_factory->GetOptions(TableFactory::kBlockCacheOpts()); } @@ -2161,7 +2163,8 @@ class BlockCachePropertyAggregator : public IntPropertyAggregator { virtual ~BlockCachePropertyAggregator() override = default; void Add(ColumnFamilyData* cfd, uint64_t value) override { - auto* table_factory = cfd->ioptions()->table_factory.get(); + auto* table_factory = + cfd->GetCurrentMutableCFOptions()->table_factory.get(); assert(table_factory != nullptr); Cache* cache = table_factory->GetOptions(TableFactory::kBlockCacheOpts()); diff --git a/db/job_context.h b/db/job_context.h index 272b79a21..a0cb3c815 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -202,6 +202,10 @@ struct JobContext { // that corresponds to the set of files in 'live'. uint64_t manifest_file_number; uint64_t pending_manifest_file_number; + + // Used for remote compaction: prevents OPTIONS files from getting + // purged by PurgeObsoleteFiles() on the primary host. + uint64_t min_options_file_number; uint64_t log_number; uint64_t prev_log_number; diff --git a/db/memtable.cc b/db/memtable.cc index 5ba0a0dac..42117c639 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -613,6 +613,135 @@ InternalIterator* MemTable::NewIterator( seqno_to_time_mapping, arena, prefix_extractor); } +// An iterator wrapper around a MemTableIterator that logically strips each +// key's user-defined timestamp. +class TimestampStrippingIterator : public InternalIterator { + public: + TimestampStrippingIterator( + MemTableIterator::Kind kind, const MemTable& memtable, + const ReadOptions& read_options, + UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena, + const SliceTransform* cf_prefix_extractor, size_t ts_sz) + : arena_mode_(arena != nullptr), kind_(kind), ts_sz_(ts_sz) { + assert(ts_sz_ != 0); + void* mem = arena ?
arena->AllocateAligned(sizeof(MemTableIterator)) : + operator new(sizeof(MemTableIterator)); + iter_ = new (mem) + MemTableIterator(kind, memtable, read_options, seqno_to_time_mapping, + arena, cf_prefix_extractor); + } + + // No copying allowed + TimestampStrippingIterator(const TimestampStrippingIterator&) = delete; + void operator=(const TimestampStrippingIterator&) = delete; + + ~TimestampStrippingIterator() override { + if (arena_mode_) { + iter_->~MemTableIterator(); + } else { + delete iter_; + } + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + iter_->SetPinnedItersMgr(pinned_iters_mgr); + } + + bool Valid() const override { return iter_->Valid(); } + void Seek(const Slice& k) override { + iter_->Seek(k); + UpdateKeyAndValueBuffer(); + } + void SeekForPrev(const Slice& k) override { + iter_->SeekForPrev(k); + UpdateKeyAndValueBuffer(); + } + void SeekToFirst() override { + iter_->SeekToFirst(); + UpdateKeyAndValueBuffer(); + } + void SeekToLast() override { + iter_->SeekToLast(); + UpdateKeyAndValueBuffer(); + } + void Next() override { + iter_->Next(); + UpdateKeyAndValueBuffer(); + } + bool NextAndGetResult(IterateResult* result) override { + iter_->Next(); + UpdateKeyAndValueBuffer(); + bool is_valid = Valid(); + if (is_valid) { + result->key = key(); + result->bound_check_result = IterBoundCheck::kUnknown; + result->value_prepared = true; + } + return is_valid; + } + void Prev() override { + iter_->Prev(); + UpdateKeyAndValueBuffer(); + } + Slice key() const override { + assert(Valid()); + return key_buf_; + } + + uint64_t write_unix_time() const override { return iter_->write_unix_time(); } + Slice value() const override { + if (kind_ == MemTableIterator::Kind::kRangeDelEntries) { + return value_buf_; + } + return iter_->value(); + } + Status status() const override { return iter_->status(); } + bool IsKeyPinned() const override { + // Key is only in a buffer that is updated in each iteration. 
+ return false; + } + bool IsValuePinned() const override { + if (kind_ == MemTableIterator::Kind::kRangeDelEntries) { + return false; + } + return iter_->IsValuePinned(); + } + + private: + void UpdateKeyAndValueBuffer() { + key_buf_.clear(); + if (kind_ == MemTableIterator::Kind::kRangeDelEntries) { + value_buf_.clear(); + } + if (!Valid()) { + return; + } + Slice original_key = iter_->key(); + ReplaceInternalKeyWithMinTimestamp(&key_buf_, original_key, ts_sz_); + if (kind_ == MemTableIterator::Kind::kRangeDelEntries) { + Slice original_value = iter_->value(); + AppendUserKeyWithMinTimestamp(&value_buf_, original_value, ts_sz_); + } + } + bool arena_mode_; + MemTableIterator::Kind kind_; + size_t ts_sz_; + MemTableIterator* iter_; + std::string key_buf_; + std::string value_buf_; +}; + +InternalIterator* MemTable::NewTimestampStrippingIterator( + const ReadOptions& read_options, + UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor, size_t ts_sz) { + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(TimestampStrippingIterator)); + return new (mem) TimestampStrippingIterator( + MemTableIterator::kPointEntries, *this, read_options, + seqno_to_time_mapping, arena, prefix_extractor, ts_sz); +} + FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( const ReadOptions& read_options, SequenceNumber read_seq, bool immutable_memtable) { @@ -624,6 +753,30 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( immutable_memtable); } +FragmentedRangeTombstoneIterator* +MemTable::NewTimestampStrippingRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq, size_t ts_sz) { + if (read_options.ignore_range_deletions || + is_range_del_table_empty_.load(std::memory_order_relaxed)) { + return nullptr; + } + if (!timestamp_stripping_fragmented_range_tombstone_list_) { + // TODO: plumb Env::IOActivity, Env::IOPriority + auto* unfragmented_iter = new TimestampStrippingIterator( + MemTableIterator::kRangeDelEntries, *this, ReadOptions(), + /*seqno_to_time_mapping*/ nullptr, /* arena */ nullptr, + /* prefix_extractor */ nullptr, ts_sz); + + timestamp_stripping_fragmented_range_tombstone_list_ = + std::make_unique<FragmentedRangeTombstoneList>( + std::unique_ptr<InternalIterator>(unfragmented_iter), + comparator_.comparator); + } + return new FragmentedRangeTombstoneIterator( + timestamp_stripping_fragmented_range_tombstone_list_.get(), + comparator_.comparator, read_seq, read_options.timestamp); +} + FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal( const ReadOptions& read_options, SequenceNumber read_seq, bool immutable_memtable) { diff --git a/db/memtable.h b/db/memtable.h index 194b4543c..1ed82f456 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -213,6 +213,14 @@ class MemTable { UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena, const SliceTransform* prefix_extractor); + // Returns an iterator that wraps a MemTableIterator and logically strips the + // user-defined timestamp of each key. This API is only used by flush when + // the user-defined timestamps in MemTable only feature is enabled. + InternalIterator* NewTimestampStrippingIterator( + const ReadOptions& read_options, + UnownedPtr<const SeqnoToTimeMapping> seqno_to_time_mapping, Arena* arena, + const SliceTransform* prefix_extractor, size_t ts_sz); + // Returns an iterator that yields the range tombstones of the memtable. // The caller must ensure that the underlying MemTable remains live // while the returned iterator is live.
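For context, these timestamp-stripping iterators are only exercised when a column family enables user-defined timestamps but opts out of persisting them to SST files. A minimal configuration sketch using the public API:

```cpp
#include "rocksdb/comparator.h"
#include "rocksdb/options.h"

// With a timestamp-aware comparator and persistence disabled, flush goes
// through NewTimestampStrippingIterator and
// NewTimestampStrippingRangeTombstoneIterator, so the SST files it writes
// contain no user-defined timestamps even though the memtables do.
rocksdb::ColumnFamilyOptions cf_options;
cf_options.comparator = rocksdb::BytewiseComparatorWithU64Ts();
cf_options.persist_user_defined_timestamps = false;
```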
@@ -227,6 +235,13 @@ class MemTable { const ReadOptions& read_options, SequenceNumber read_seq, bool immutable_memtable); + // Returns an iterator that yields the range tombstones of the memtable and + // logically strips the user-defined timestamp of each key (including the + // start key and the end key). This API is only used by flush when the + // user-defined timestamps in MemTable only feature is enabled. + FragmentedRangeTombstoneIterator* NewTimestampStrippingRangeTombstoneIterator( + const ReadOptions& read_options, SequenceNumber read_seq, size_t ts_sz); + Status VerifyEncodedEntry(Slice encoded, const ProtectionInfoKVOS64& kv_prot_info); @@ -704,6 +719,12 @@ class MemTable { std::unique_ptr<FragmentedRangeTombstoneList> fragmented_range_tombstone_list_; + // The fragmented range tombstones of this memtable with all keys' + // user-defined timestamps logically stripped. This is constructed and used + // by flush when the user-defined timestamps in memtable only feature is + // enabled. + std::unique_ptr<FragmentedRangeTombstoneList> + timestamp_stripping_fragmented_range_tombstone_list_; + // makes sure there is a single range tombstone writer to invalidate cache std::mutex range_del_mutex_; CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>> diff --git a/db/memtable_list.cc b/db/memtable_list.cc index c81c096b5..8ad4efcc2 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -359,11 +359,15 @@ bool MemTableListVersion::MemtableLimitExceeded(size_t usage) { } } +bool MemTableListVersion::HistoryShouldBeTrimmed(size_t usage) { + return MemtableLimitExceeded(usage) && !memlist_history_.empty(); +} + // Make sure we don't use up too much space in history bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) { bool ret = false; - while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) { + while (HistoryShouldBeTrimmed(usage)) { MemTable* x = memlist_history_.back(); memlist_history_.pop_back(); @@ -661,8 +665,16 @@ void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) { } bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) { + // Check if history trim is needed first, so that we can avoid installing a + // new MemTableListVersion without installing a SuperVersion (the latter is + // installed based on the return value of this function). + if (!current_->HistoryShouldBeTrimmed(usage)) { + ResetTrimHistoryNeeded(); + return false; + } InstallNewVersion(); bool ret = current_->TrimHistory(to_delete, usage); + assert(ret); UpdateCachedValuesFromMemTableListVersion(); ResetTrimHistoryNeeded(); return ret; @@ -714,6 +726,7 @@ void MemTableList::InstallNewVersion() { // somebody else holds the current version, we need to create new one MemTableListVersion* version = current_; current_ = new MemTableListVersion(&current_memory_usage_, *version); + current_->SetID(++last_memtable_list_version_id_); current_->Ref(); version->Unref(); } diff --git a/db/memtable_list.h b/db/memtable_list.h index 390b4137d..75afb5018 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -141,6 +141,11 @@ class MemTableListVersion { // Return kMaxSequenceNumber if the list is empty. SequenceNumber GetFirstSequenceNumber() const; + // REQUIRES: db_mutex held. + void SetID(uint64_t id) { id_ = id; } + + uint64_t GetID() const { return id_; } + private: friend class MemTableList; @@ -161,7 +166,11 @@ class MemTableListVersion { // REQUIRE: m is an immutable memtable void Remove(MemTable* m, autovector<MemTable*>* to_delete); - // Return true if memtable is trimmed + // Return true if the memtable list should be trimmed to get memory usage + // under budget. + bool HistoryShouldBeTrimmed(size_t usage); +
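The point of factoring `HistoryShouldBeTrimmed()` out of `TrimHistory()` is that the check is side-effect free, so `MemTableList::TrimHistory()` above can return before installing a new version. A self-contained toy model of that shape (simplified names and bookkeeping, not RocksDB code):

```cpp
#include <cstddef>
#include <deque>

// Toy model: ShouldTrim() is a pure predicate, Trim() does the work. A
// caller that must pair version installation with a follow-up update can
// now skip both when ShouldTrim() is false, instead of discovering after
// the fact that nothing was trimmed.
struct HistoryList {
  std::deque<int> history;  // stand-in for the trimmable history entries
  std::size_t budget = 0;

  bool ShouldTrim(std::size_t usage) const {
    return usage > budget && !history.empty();
  }

  bool Trim(std::size_t& usage) {
    bool trimmed = false;
    while (ShouldTrim(usage)) {
      usage -= 1;  // pretend each trimmed entry frees one unit
      history.pop_back();
      trimmed = true;
    }
    return trimmed;
  }
};
```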
+ // Trim history. Return true if any memtable is trimmed. bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage); bool GetFromList(std::list<MemTable*>* list, const LookupKey& key, @@ -205,6 +214,9 @@ class MemTableListVersion { int refs_ = 0; size_t* parent_memtable_list_memory_usage_; + + // MemtableListVersion id to track for flush results checking. + uint64_t id_ = 0; }; // This class stores references to all the immutable memtables. @@ -235,7 +247,8 @@ class MemTableList { flush_requested_(false), current_memory_usage_(0), current_memory_allocted_bytes_excluding_last_(0), - current_has_history_(false) { + current_has_history_(false), + last_memtable_list_version_id_(0) { current_->Ref(); } @@ -500,6 +513,10 @@ class MemTableList { // Cached value of current_->HasHistory(). std::atomic<bool> current_has_history_; + + // Last memtable list version id, increased by 1 each time a new + // MemtableListVersion is installed. + uint64_t last_memtable_list_version_id_; }; // Installs memtable atomic flush results. diff --git a/db/multi_cf_iterator_impl.h b/db/multi_cf_iterator_impl.h index 46f397cc4..4bea05a5b 100644 --- a/db/multi_cf_iterator_impl.h +++ b/db/multi_cf_iterator_impl.h @@ -21,15 +21,15 @@ struct MultiCfIteratorInfo { int order; }; +template <typename ResetFunc, typename PopulateFunc> class MultiCfIteratorImpl { public: - MultiCfIteratorImpl( - const Comparator* comparator, - const std::vector<ColumnFamilyHandle*>& column_families, - const std::vector<Iterator*>& child_iterators, - std::function<void()> reset_func, - std::function<void(autovector<MultiCfIteratorInfo>&)> populate_func) + MultiCfIteratorImpl(const Comparator* comparator, bool allow_unprepared_value, + const std::vector<ColumnFamilyHandle*>& column_families, + const std::vector<Iterator*>& child_iterators, + ResetFunc reset_func, PopulateFunc populate_func) : comparator_(comparator), + allow_unprepared_value_(allow_unprepared_value), heap_(MultiCfMinHeap( MultiCfHeapItemComparator<std::greater<int>>(comparator_))), reset_func_(std::move(reset_func)), @@ -101,6 +101,41 @@ class MultiCfIteratorImpl { AdvanceIterator(max_heap, [](Iterator* iter) { iter->Prev(); }); } + bool PrepareValue() { + assert(Valid()); + + if (!allow_unprepared_value_) { + return true; + } + + auto prepare_value_func = [this](auto& heap, Iterator* iterator) { + assert(iterator); + assert(iterator->Valid()); + assert(iterator->status().ok()); + + if (!iterator->PrepareValue()) { + assert(!iterator->Valid()); + assert(!iterator->status().ok()); + + considerStatus(iterator->status()); + assert(!status_.ok()); + + heap.clear(); + return false; + } + + return true; + }; + + if (std::holds_alternative<MultiCfMaxHeap>(heap_)) { + return PopulateIterator(std::get<MultiCfMaxHeap>(heap_), + prepare_value_func); + } + + return PopulateIterator(std::get<MultiCfMinHeap>(heap_), + prepare_value_func); + } + private: std::vector<std::pair<ColumnFamilyHandle*, std::unique_ptr<Iterator>>> cfh_iter_pairs_; @@ -125,7 +160,10 @@ private: const Comparator* comparator_; }; + const Comparator* comparator_; + bool allow_unprepared_value_; + using MultiCfMinHeap = BinaryHeap<MultiCfIteratorInfo, MultiCfHeapItemComparator<std::greater<int>>>; @@ -136,8 +174,8 @@ MultiCfIterHeap heap_; - std::function<void()> reset_func_; - std::function<void(autovector<MultiCfIteratorInfo>)> populate_func_; + ResetFunc reset_func_; + PopulateFunc populate_func_; Iterator* current() const { if (std::holds_alternative<MultiCfMaxHeap>(heap_)) { @@ -163,11 +201,11 @@ } void InitMinHeap() { - heap_.emplace<MultiCfMinHeap>( + heap_.template emplace<MultiCfMinHeap>( MultiCfHeapItemComparator<std::greater<int>>(comparator_)); } void InitMaxHeap() { - heap_.emplace<MultiCfMaxHeap>( + heap_.template emplace<MultiCfMaxHeap>( MultiCfHeapItemComparator<std::greater<int>>(comparator_)); }
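The `allow_unprepared_value_` machinery above is what backs the user-visible deferred-value pattern for multi-column-family iterators. A sketch against the public API, where `db`, `column_families`, `KeyPassesFilter()`, and `Use()` are assumed placeholders:

```cpp
// Defer loading of large (e.g. blob) values until the key passes a cheap
// filter. PrepareValue() returns false on failure, in which case the
// iterator is invalidated and carries a non-OK status.
rocksdb::ReadOptions ro;
ro.allow_unprepared_value = true;
std::unique_ptr<rocksdb::Iterator> iter =
    db->NewCoalescingIterator(ro, column_families);
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  if (!KeyPassesFilter(iter->key())) {
    continue;  // value is never loaded for skipped keys
  }
  if (!iter->PrepareValue()) {
    break;  // inspect iter->status() for the error
  }
  Use(iter->value());
}
```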
@@ -186,13 +224,16 @@ if (!status_.ok()) { // Non-OK status from the iterator. Bail out early heap.clear(); - break; + return; } } ++i; } - if (!heap.empty()) { - PopulateIterator(heap); + if (!allow_unprepared_value_ && !heap.empty()) { + [[maybe_unused]] const bool result = PopulateIterator( + heap, + [](auto& /* heap */, Iterator* /* iterator */) { return true; }); + assert(result); } } @@ -212,32 +253,41 @@ // 2. Make sure all others have iterated past the top iterator key slice // 3. Advance the top iterator, and add it back to the heap if valid auto top = heap.top(); + assert(top.iterator); + assert(top.iterator->Valid()); + assert(top.iterator->status().ok()); + heap.pop(); - if (!heap.empty()) { + + while (!heap.empty()) { auto current = heap.top(); assert(current.iterator); - while (current.iterator->Valid() && - comparator_->Compare(top.iterator->key(), - current.iterator->key()) == 0) { + assert(current.iterator->Valid()); + assert(current.iterator->status().ok()); + + if (comparator_->Compare(current.iterator->key(), top.iterator->key()) != + 0) { + break; + } + + advance_func(current.iterator); + + if (current.iterator->Valid()) { assert(current.iterator->status().ok()); - advance_func(current.iterator); - if (current.iterator->Valid()) { - heap.replace_top(heap.top()); + heap.replace_top(current); + } else { + considerStatus(current.iterator->status()); + if (!status_.ok()) { + heap.clear(); + return; } else { - considerStatus(current.iterator->status()); - if (!status_.ok()) { - heap.clear(); - return; - } else { - heap.pop(); - } - } - if (!heap.empty()) { - current = heap.top(); + heap.pop(); } } } + advance_func(top.iterator); + if (top.iterator->Valid()) { assert(top.iterator->status().ok()); heap.push(top); @@ -249,13 +299,16 @@ } } - if (!heap.empty()) { - PopulateIterator(heap); + if (!allow_unprepared_value_ && !heap.empty()) { + [[maybe_unused]] const bool result = PopulateIterator( + heap, + [](auto& /* heap */, Iterator* /* iterator */) { return true; }); + assert(result); } } - template <typename BinaryHeap> - void PopulateIterator(BinaryHeap& heap) { + template <typename BinaryHeap, typename PrepareValueFunc> + bool PopulateIterator(BinaryHeap& heap, PrepareValueFunc prepare_value_func) { // 1. Keep the top iterator (by popping it from the heap) and add it to list // to populate // 2.
For all non-top iterators having the same key as top iter popped @@ -265,31 +318,48 @@ class MultiCfIteratorImpl { // populate the value/columns and attribute_groups from the list // collected in step 1 and 2 and add all the iters back to the heap assert(!heap.empty()); + auto top = heap.top(); - heap.pop(); + assert(top.iterator); + assert(top.iterator->Valid()); + assert(top.iterator->status().ok()); + + if (!prepare_value_func(heap, top.iterator)) { + return false; + } + autovector to_populate; + to_populate.push_back(top); - if (!heap.empty()) { + heap.pop(); + + while (!heap.empty()) { auto current = heap.top(); assert(current.iterator); - while (current.iterator->Valid() && - comparator_->Compare(top.iterator->key(), - current.iterator->key()) == 0) { - assert(current.iterator->status().ok()); - to_populate.push_back(current); - heap.pop(); - if (!heap.empty()) { - current = heap.top(); - } else { - break; - } + assert(current.iterator->Valid()); + assert(current.iterator->status().ok()); + + if (comparator_->Compare(current.iterator->key(), top.iterator->key()) != + 0) { + break; } + + if (!prepare_value_func(heap, current.iterator)) { + return false; + } + + to_populate.push_back(current); + heap.pop(); } + // Add the items back to the heap for (auto& item : to_populate) { heap.push(item); } + populate_func_(to_populate); + + return true; } }; diff --git a/db/multi_cf_iterator_test.cc b/db/multi_cf_iterator_test.cc index f3094f358..0cdf5fae9 100644 --- a/db/multi_cf_iterator_test.cc +++ b/db/multi_cf_iterator_test.cc @@ -15,53 +15,74 @@ class CoalescingIteratorTest : public DBTestBase { // Verify Iteration of CoalescingIterator // by SeekToFirst() + Next() and SeekToLast() + Prev() - void verifyCoalescingIterator(const std::vector& cfhs, + void VerifyCoalescingIterator(const std::vector& cfhs, const std::vector& expected_keys, const std::vector& expected_values, const std::optional>& expected_wide_columns = std::nullopt, const Slice* lower_bound = nullptr, - const Slice* upper_bound = nullptr) { - int i = 0; + const Slice* upper_bound = nullptr, + bool allow_unprepared_value = false) { + const size_t num_keys = expected_keys.size(); + ReadOptions read_options; read_options.iterate_lower_bound = lower_bound; read_options.iterate_upper_bound = upper_bound; + read_options.allow_unprepared_value = allow_unprepared_value; + std::unique_ptr iter = db_->NewCoalescingIterator(read_options, cfhs); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_EQ(expected_keys[i], iter->key()); - ASSERT_EQ(expected_values[i], iter->value()); + + auto check_iter_entry = [&](size_t idx) { + ASSERT_EQ(iter->key(), expected_keys[idx]); + + if (allow_unprepared_value) { + ASSERT_TRUE(iter->value().empty()); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_TRUE(iter->Valid()); + } + + ASSERT_EQ(iter->value(), expected_values[idx]); if (expected_wide_columns.has_value()) { - ASSERT_EQ(expected_wide_columns.value()[i], iter->columns()); + ASSERT_EQ(iter->columns(), expected_wide_columns.value()[idx]); } - ++i; + }; + + { + size_t i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + check_iter_entry(i); + ++i; + } + + ASSERT_EQ(num_keys, i); + ASSERT_OK(iter->status()); } - ASSERT_EQ(expected_keys.size(), i); - ASSERT_OK(iter->status()); - int rev_i = i - 1; - for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - ASSERT_EQ(expected_keys[rev_i], iter->key()); - ASSERT_EQ(expected_values[rev_i], iter->value()); - if (expected_wide_columns.has_value()) { - 
ASSERT_EQ(expected_wide_columns.value()[rev_i], iter->columns()); + { + size_t i = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + check_iter_entry(num_keys - 1 - i); + ++i; } - rev_i--; + + ASSERT_EQ(num_keys, i); + ASSERT_OK(iter->status()); } - ASSERT_OK(iter->status()); } - void verifyExpectedKeys(ColumnFamilyHandle* cfh, + void VerifyExpectedKeys(ColumnFamilyHandle* cfh, const std::vector& expected_keys) { - int i = 0; - Iterator* iter = db_->NewIterator(ReadOptions(), cfh); + std::unique_ptr iter(db_->NewIterator(ReadOptions(), cfh)); + + size_t i = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_EQ(expected_keys[i], iter->key()); + ASSERT_EQ(iter->key(), expected_keys[i]); ++i; } - ASSERT_EQ(expected_keys.size(), i); + + ASSERT_EQ(i, expected_keys.size()); ASSERT_OK(iter->status()); - delete iter; } }; @@ -96,7 +117,7 @@ TEST_F(CoalescingIteratorTest, SimpleValues) { // Test for iteration over CF default->1->2->3 std::vector cfhs_order_0_1_2_3 = { handles_[0], handles_[1], handles_[2], handles_[3]}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values); // Test for iteration over CF 3->1->default_cf->2 @@ -104,7 +125,7 @@ TEST_F(CoalescingIteratorTest, SimpleValues) { handles_[3], handles_[1], handles_[0], handles_[2]}; // Iteration order and the return values should be the same since keys are // unique per CF - verifyCoalescingIterator(cfhs_order_3_1_0_2, expected_keys, + VerifyCoalescingIterator(cfhs_order_3_1_0_2, expected_keys, expected_values); // Verify Seek() @@ -163,14 +184,14 @@ TEST_F(CoalescingIteratorTest, SimpleValues) { handles_[0], handles_[1], handles_[2], handles_[3]}; std::vector expected_values = {"key_1_cf_3_val", "key_2_cf_2_val", "key_3_cf_3_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values); // Test for iteration over CFs 3->2->default_cf->1 std::vector cfhs_order_3_2_0_1 = { handles_[3], handles_[2], handles_[0], handles_[1]}; expected_values = {"key_1_cf_0_val", "key_2_cf_1_val", "key_3_cf_1_val"}; - verifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values); // Verify Seek() @@ -227,7 +248,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { std::vector expected_keys = {"key_2", "key_3", "key_4"}; std::vector expected_values = {"key_2_cf_1_val", "key_3_cf_2_val", "key_4_cf_3_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, &lb); } // with upper_bound @@ -236,7 +257,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_3"); std::vector expected_keys = {"key_1", "key_2"}; std::vector expected_values = {"key_1_cf_0_val", "key_2_cf_1_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, nullptr, &ub); } // with lower and upper bound @@ -245,7 +266,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_4"); std::vector expected_keys = {"key_2", "key_3"}; std::vector expected_values = {"key_2_cf_1_val", "key_3_cf_2_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, &lb, &ub); } @@ 
-312,7 +333,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice lb = Slice("key_2"); std::vector expected_keys = {"key_2", "key_3"}; std::vector expected_values = {"key_2_cf_2_val", "key_3_cf_3_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, &lb); } // with upper_bound @@ -321,7 +342,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_3"); std::vector expected_keys = {"key_1", "key_2"}; std::vector expected_values = {"key_1_cf_3_val", "key_2_cf_2_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, nullptr, &ub); } // with lower and upper bound @@ -330,7 +351,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_3"); std::vector expected_keys = {"key_2"}; std::vector expected_values = {"key_2_cf_2_val"}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, std::nullopt, &lb, &ub); } @@ -342,7 +363,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice lb = Slice("key_2"); std::vector expected_keys = {"key_2", "key_3"}; std::vector expected_values = {"key_2_cf_1_val", "key_3_cf_1_val"}; - verifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, std::nullopt, &lb); } // with upper_bound @@ -351,7 +372,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_3"); std::vector expected_keys = {"key_1", "key_2"}; std::vector expected_values = {"key_1_cf_0_val", "key_2_cf_1_val"}; - verifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, std::nullopt, nullptr, &ub); } // with lower and upper bound @@ -360,7 +381,7 @@ TEST_F(CoalescingIteratorTest, LowerAndUpperBounds) { Slice ub = Slice("key_3"); std::vector expected_keys = {"key_2"}; std::vector expected_values = {"key_2_cf_1_val"}; - verifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, std::nullopt, &lb, &ub); } { @@ -662,7 +683,7 @@ TEST_F(CoalescingIteratorTest, WideColumns) { key_2_expected_columns_cfh_order_1_2, key_3_expected_columns, key_4_expected_columns}; - verifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, + VerifyCoalescingIterator(cfhs_order_0_1_2_3, expected_keys, expected_values, expected_wide_columns_0_1_2_3); } @@ -677,7 +698,7 @@ TEST_F(CoalescingIteratorTest, WideColumns) { key_2_expected_columns_cfh_order_2_1, key_3_expected_columns, key_4_expected_columns}; - verifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, expected_wide_columns_3_2_0_1); } } @@ -700,8 +721,8 @@ TEST_F(CoalescingIteratorTest, DifferentComparatorsInMultiCFs) { ASSERT_OK(Put(1, "key_2", "value_2")); ASSERT_OK(Put(1, "key_3", "value_3")); - verifyExpectedKeys(handles_[0], {"key_1", "key_2", "key_3"}); - verifyExpectedKeys(handles_[1], {"key_3", "key_2", "key_1"}); + VerifyExpectedKeys(handles_[0], {"key_1", "key_2", "key_3"}); + VerifyExpectedKeys(handles_[1], {"key_3", "key_2", "key_1"}); std::unique_ptr iter = db_->NewCoalescingIterator(ReadOptions(), handles_); @@ -742,10 
+763,10 @@ TEST_F(CoalescingIteratorTest, CustomComparatorsInMultiCFs) { ASSERT_OK(Put(1, "key_003_005", "value_1_5")); ASSERT_OK(Put(1, "key_003_006", "value_1_4")); - verifyExpectedKeys( + VerifyExpectedKeys( handles_[0], {"key_001_003", "key_001_002", "key_001_001", "key_002_003", "key_002_002", "key_002_001"}); - verifyExpectedKeys( + VerifyExpectedKeys( handles_[1], {"key_001_003", "key_001_002", "key_001_001", "key_003_006", "key_003_005", "key_003_004"}); @@ -755,66 +776,205 @@ TEST_F(CoalescingIteratorTest, CustomComparatorsInMultiCFs) { std::vector expected_values = {"value_1_1", "value_1_2", "value_1_3", "value_0_4", "value_0_5", "value_0_6", "value_1_4", "value_1_5", "value_1_6"}; - int i = 0; std::unique_ptr iter = db_->NewCoalescingIterator(ReadOptions(), handles_); + + size_t i = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(expected_keys[i], iter->key()); ASSERT_EQ(expected_values[i], iter->value()); ++i; } + + ASSERT_EQ(expected_keys.size(), i); ASSERT_OK(iter->status()); } +TEST_F(CoalescingIteratorTest, AllowUnpreparedValue) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + ASSERT_OK(Put(0, "key_1", "key_1_cf_0_val")); + ASSERT_OK(Put(3, "key_1", "key_1_cf_3_val")); + ASSERT_OK(Put(1, "key_2", "key_2_cf_1_val")); + ASSERT_OK(Put(2, "key_2", "key_2_cf_2_val")); + ASSERT_OK(Put(0, "key_3", "key_3_cf_0_val")); + ASSERT_OK(Put(1, "key_3", "key_3_cf_1_val")); + ASSERT_OK(Put(3, "key_3", "key_3_cf_3_val")); + + ASSERT_OK(Flush()); + + std::vector cfhs_order_3_2_0_1{handles_[3], handles_[2], + handles_[0], handles_[1]}; + std::vector expected_keys{"key_1", "key_2", "key_3"}; + std::vector expected_values{"key_1_cf_0_val", "key_2_cf_1_val", + "key_3_cf_1_val"}; + + VerifyCoalescingIterator(cfhs_order_3_2_0_1, expected_keys, expected_values, + /* expected_wide_columns */ std::nullopt, + /* lower_bound */ nullptr, /* upper_bound */ nullptr, + /* allow_unprepared_value */ true); + + ReadOptions read_options; + read_options.allow_unprepared_value = true; + + { + std::unique_ptr iter = + db_->NewCoalescingIterator(read_options, cfhs_order_3_2_0_1); + iter->Seek(""); + ASSERT_EQ(IterStatus(iter.get()), "key_1->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + + iter->Seek("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + + iter->Seek("key_2"); + ASSERT_EQ(IterStatus(iter.get()), "key_2->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_1_val"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_3->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_1_val"); + + iter->Seek("key_x"); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } + + { + std::unique_ptr iter = + db_->NewCoalescingIterator(read_options, cfhs_order_3_2_0_1); + iter->SeekForPrev(""); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + + iter->SeekForPrev("key_1"); + ASSERT_EQ(IterStatus(iter.get()), "key_1->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_1->key_1_cf_0_val"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "key_2->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_2->key_2_cf_1_val"); + + iter->SeekForPrev("key_x"); + ASSERT_EQ(IterStatus(iter.get()), 
"key_3->"); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_EQ(IterStatus(iter.get()), "key_3->key_3_cf_1_val"); + + iter->Next(); + ASSERT_EQ(IterStatus(iter.get()), "(invalid)"); + } +} + +TEST_F(CoalescingIteratorTest, AllowUnpreparedValue_Corruption) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + ASSERT_OK(Put(0, "key_1", "key_1_cf_0_val")); + ASSERT_OK(Put(3, "key_1", "key_1_cf_3_val")); + ASSERT_OK(Put(1, "key_2", "key_2_cf_1_val")); + ASSERT_OK(Put(2, "key_2", "key_2_cf_2_val")); + ASSERT_OK(Put(0, "key_3", "key_3_cf_0_val")); + ASSERT_OK(Put(1, "key_3", "key_3_cf_1_val")); + ASSERT_OK(Put(3, "key_3", "key_3_cf_3_val")); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + read_options.allow_unprepared_value = true; + + std::vector cfhs_order_3_2_0_1{handles_[3], handles_[2], + handles_[0], handles_[1]}; + + std::unique_ptr iter = + db_->NewCoalescingIterator(read_options, cfhs_order_3_2_0_1); + iter->SeekToFirst(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), "key_1"); + ASSERT_TRUE(iter->value().empty()); + + SyncPoint::GetInstance()->SetCallBack( + "BlobFileReader::GetBlob:TamperWithResult", [](void* arg) { + Slice* const blob_index = static_cast(arg); + assert(blob_index); + assert(!blob_index->empty()); + blob_index->remove_prefix(1); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_FALSE(iter->PrepareValue()); + ASSERT_FALSE(iter->Valid()); + ASSERT_TRUE(iter->status().IsCorruption()); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + class AttributeGroupIteratorTest : public DBTestBase { public: AttributeGroupIteratorTest() : DBTestBase("attribute_group_iterator_test", /*env_do_fsync=*/true) {} - void verifyAttributeGroupIterator( + void VerifyAttributeGroupIterator( const std::vector& cfhs, const std::vector& expected_keys, - const std::vector& expected_attribute_groups, - const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr) { - int i = 0; + const std::vector& expected_attribute_groups, + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, + bool allow_unprepared_value = false) { + const size_t num_keys = expected_keys.size(); + ReadOptions read_options; read_options.iterate_lower_bound = lower_bound; read_options.iterate_upper_bound = upper_bound; + read_options.allow_unprepared_value = allow_unprepared_value; + std::unique_ptr iter = db_->NewAttributeGroupIterator(read_options, cfhs); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_EQ(expected_keys[i], iter->key()); - auto iterator_attribute_groups = iter->attribute_groups(); - ASSERT_EQ(expected_attribute_groups[i].size(), - iterator_attribute_groups.size()); - for (size_t cfh_i = 0; cfh_i < iterator_attribute_groups.size(); - cfh_i++) { - ASSERT_EQ(expected_attribute_groups[i][cfh_i].column_family(), - iterator_attribute_groups[cfh_i].column_family()); - ASSERT_EQ(expected_attribute_groups[i][cfh_i].columns(), - iterator_attribute_groups[cfh_i].columns()); + + auto check_iter_entry = [&](size_t idx) { + ASSERT_EQ(iter->key(), expected_keys[idx]); + + if (allow_unprepared_value) { + ASSERT_TRUE(iter->attribute_groups().empty()); + ASSERT_TRUE(iter->PrepareValue()); + ASSERT_TRUE(iter->Valid()); } - ++i; + + ASSERT_EQ(iter->attribute_groups(), expected_attribute_groups[idx]); + }; + + { + size_t i = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) 
{ + check_iter_entry(i); + ++i; + } + + ASSERT_EQ(i, num_keys); + ASSERT_OK(iter->status()); } - ASSERT_EQ(expected_keys.size(), i); - ASSERT_OK(iter->status()); - int rev_i = i - 1; - for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - ASSERT_EQ(expected_keys[rev_i], iter->key()); - auto iterator_attribute_groups = iter->attribute_groups(); - ASSERT_EQ(expected_attribute_groups[rev_i].size(), - iterator_attribute_groups.size()); - for (size_t cfh_i = 0; cfh_i < iterator_attribute_groups.size(); - cfh_i++) { - ASSERT_EQ(expected_attribute_groups[rev_i][cfh_i].column_family(), - iterator_attribute_groups[cfh_i].column_family()); - ASSERT_EQ(expected_attribute_groups[rev_i][cfh_i].columns(), - iterator_attribute_groups[cfh_i].columns()); + { + size_t i = 0; + for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { + check_iter_entry(num_keys - 1 - i); + ++i; } - rev_i--; + + ASSERT_EQ(i, num_keys); + ASSERT_OK(iter->status()); } - ASSERT_OK(iter->status()); } }; @@ -870,45 +1030,140 @@ TEST_F(AttributeGroupIteratorTest, IterateAttributeGroups) { ASSERT_OK(db_->PutEntity(WriteOptions(), key_3, key_3_attribute_groups)); ASSERT_OK(db_->PutEntity(WriteOptions(), key_4, key_4_attribute_groups)); + IteratorAttributeGroups key_1_expected_attribute_groups{ + IteratorAttributeGroup(key_1_attribute_groups[0]), + IteratorAttributeGroup(key_1_attribute_groups[1])}; + IteratorAttributeGroups key_2_expected_attribute_groups{ + IteratorAttributeGroup(key_2_attribute_groups[0]), + IteratorAttributeGroup(key_2_attribute_groups[1])}; + IteratorAttributeGroups key_3_expected_attribute_groups{ + IteratorAttributeGroup(key_3_attribute_groups[0]), + IteratorAttributeGroup(key_3_attribute_groups[1])}; + IteratorAttributeGroups key_4_expected_attribute_groups{ + IteratorAttributeGroup(key_4_attribute_groups[0]), + IteratorAttributeGroup(key_4_attribute_groups[1])}; + // Test for iteration over CF default->1->2->3 std::vector cfhs_order_0_1_2_3 = { handles_[0], handles_[1], handles_[2], handles_[3]}; { std::vector expected_keys = {key_1, key_2, key_3, key_4}; - std::vector expected_attribute_groups = { - key_1_attribute_groups, key_2_attribute_groups, key_3_attribute_groups, - key_4_attribute_groups}; - verifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, + std::vector expected_attribute_groups{ + key_1_expected_attribute_groups, key_2_expected_attribute_groups, + key_3_expected_attribute_groups, key_4_expected_attribute_groups}; + VerifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, expected_attribute_groups); } + Slice lb = Slice("key_2"); Slice ub = Slice("key_4"); + // Test for lower bound only { std::vector expected_keys = {key_2, key_3, key_4}; - std::vector expected_attribute_groups = { - key_2_attribute_groups, key_3_attribute_groups, key_4_attribute_groups}; - verifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, + std::vector expected_attribute_groups{ + key_2_expected_attribute_groups, key_3_expected_attribute_groups, + key_4_expected_attribute_groups}; + VerifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, expected_attribute_groups, &lb); } + // Test for upper bound only { std::vector expected_keys = {key_1, key_2, key_3}; - std::vector expected_attribute_groups = { - key_1_attribute_groups, key_2_attribute_groups, key_3_attribute_groups}; - verifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, + std::vector expected_attribute_groups{ + key_1_expected_attribute_groups, key_2_expected_attribute_groups, + key_3_expected_attribute_groups}; + 
VerifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, expected_attribute_groups, nullptr, &ub); } + // Test for lower and upper bound { std::vector expected_keys = {key_2, key_3}; - std::vector expected_attribute_groups = { - key_2_attribute_groups, key_3_attribute_groups}; - verifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, + std::vector expected_attribute_groups{ + key_2_expected_attribute_groups, key_3_expected_attribute_groups}; + VerifyAttributeGroupIterator(cfhs_order_0_1_2_3, expected_keys, expected_attribute_groups, &lb, &ub); } } +TEST_F(AttributeGroupIteratorTest, AllowUnpreparedValue) { + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"cf_1", "cf_2", "cf_3"}, options); + + constexpr char key_1[] = "key_1"; + WideColumns key_1_columns_in_cf_2{ + {kDefaultWideColumnName, "cf_2_col_val_0_key_1"}, + {"cf_2_col_name_1", "cf_2_col_val_1_key_1"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_1"}}; + WideColumns key_1_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_1"}, + {"cf_3_col_name_2", "cf_3_col_val_2_key_1"}, + {"cf_3_col_name_3", "cf_3_col_val_3_key_1"}}; + + constexpr char key_2[] = "key_2"; + WideColumns key_2_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_2"}}; + WideColumns key_2_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_2"}, + {"cf_2_col_name_2", "cf_2_col_val_2_key_2"}}; + + constexpr char key_3[] = "key_3"; + WideColumns key_3_columns_in_cf_1{ + {"cf_1_col_name_1", "cf_1_col_val_1_key_3"}}; + WideColumns key_3_columns_in_cf_3{ + {"cf_3_col_name_1", "cf_3_col_val_1_key_3"}}; + + constexpr char key_4[] = "key_4"; + WideColumns key_4_columns_in_cf_0{ + {"cf_0_col_name_1", "cf_0_col_val_1_key_4"}}; + WideColumns key_4_columns_in_cf_2{ + {"cf_2_col_name_1", "cf_2_col_val_1_key_4"}}; + + AttributeGroups key_1_attribute_groups{ + AttributeGroup(handles_[2], key_1_columns_in_cf_2), + AttributeGroup(handles_[3], key_1_columns_in_cf_3)}; + AttributeGroups key_2_attribute_groups{ + AttributeGroup(handles_[1], key_2_columns_in_cf_1), + AttributeGroup(handles_[2], key_2_columns_in_cf_2)}; + AttributeGroups key_3_attribute_groups{ + AttributeGroup(handles_[1], key_3_columns_in_cf_1), + AttributeGroup(handles_[3], key_3_columns_in_cf_3)}; + AttributeGroups key_4_attribute_groups{ + AttributeGroup(handles_[0], key_4_columns_in_cf_0), + AttributeGroup(handles_[2], key_4_columns_in_cf_2)}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), key_1, key_1_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_2, key_2_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_3, key_3_attribute_groups)); + ASSERT_OK(db_->PutEntity(WriteOptions(), key_4, key_4_attribute_groups)); + + IteratorAttributeGroups key_1_expected_attribute_groups{ + IteratorAttributeGroup(key_1_attribute_groups[0]), + IteratorAttributeGroup(key_1_attribute_groups[1])}; + IteratorAttributeGroups key_2_expected_attribute_groups{ + IteratorAttributeGroup(key_2_attribute_groups[0]), + IteratorAttributeGroup(key_2_attribute_groups[1])}; + IteratorAttributeGroups key_3_expected_attribute_groups{ + IteratorAttributeGroup(key_3_attribute_groups[0]), + IteratorAttributeGroup(key_3_attribute_groups[1])}; + IteratorAttributeGroups key_4_expected_attribute_groups{ + IteratorAttributeGroup(key_4_attribute_groups[0]), + IteratorAttributeGroup(key_4_attribute_groups[1])}; + + std::vector cfhs_order_0_1_2_3{handles_[0], handles_[1], + handles_[2], handles_[3]}; + std::vector expected_keys{key_1, key_2, key_3, key_4}; + std::vector 
expected_attribute_groups{ + key_1_expected_attribute_groups, key_2_expected_attribute_groups, + key_3_expected_attribute_groups, key_4_expected_attribute_groups}; + VerifyAttributeGroupIterator( + cfhs_order_0_1_2_3, expected_keys, expected_attribute_groups, + /* lower_bound */ nullptr, /* upper_bound */ nullptr, + /* allow_unprepared_value */ true); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index 9c644cb68..1fb68bd42 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -197,11 +197,10 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { pinned_seq_pos_ = tombstones_->seq_end(); } - RangeTombstone Tombstone(bool logical_strip_timestamp = false) const { + RangeTombstone Tombstone() const { assert(Valid()); if (icmp_->user_comparator()->timestamp_size()) { - return RangeTombstone(start_key(), end_key(), seq(), timestamp(), - logical_strip_timestamp); + return RangeTombstone(start_key(), end_key(), seq(), timestamp()); } return RangeTombstone(start_key(), end_key(), seq()); } diff --git a/db/repair.cc b/db/repair.cc index f7f4fbafb..1d2b81adc 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -105,6 +105,7 @@ class Repairer { SanitizeOptions(immutable_db_options_, default_cf_opts)), default_iopts_( ImmutableOptions(immutable_db_options_, default_cf_opts_)), + default_mopts_(MutableCFOptions(default_cf_opts_)), unknown_cf_opts_( SanitizeOptions(immutable_db_options_, unknown_cf_opts)), create_unknown_cfs_(create_unknown_cfs), @@ -261,6 +262,7 @@ class Repairer { const InternalKeyComparator icmp_; const ColumnFamilyOptions default_cf_opts_; const ImmutableOptions default_iopts_; // table_cache_ holds reference + const MutableCFOptions default_mopts_; const ColumnFamilyOptions unknown_cf_opts_; const bool create_unknown_cfs_; std::shared_ptr raw_table_cache_; @@ -537,8 +539,7 @@ class Repairer { // TODO: plumb Env::IOActivity, Env::IOPriority const ReadOptions read_options; status = table_cache_->GetTableProperties( - file_options_, read_options, icmp_, t->meta, &props, - 0 /* block_protection_bytes_per_key */); + file_options_, read_options, icmp_, t->meta, &props, default_mopts_); } if (status.ok()) { auto s = @@ -602,15 +603,13 @@ class Repairer { ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( ropts, file_options_, cfd->internal_comparator(), t->meta, - nullptr /* range_del_agg */, - cfd->GetLatestMutableCFOptions()->prefix_extractor, + nullptr /* range_del_agg */, *cfd->GetLatestMutableCFOptions(), /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false, - cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key); + /*allow_unprepared_value=*/false); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); @@ -651,8 +650,7 @@ class Repairer { std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( ropts, cfd->internal_comparator(), t->meta, - cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key, - &r_iter); + *cfd->GetLatestMutableCFOptions(), &r_iter); if (r_iter) { r_iter->SeekToFirst(); diff --git a/db/snapshot_checker.h b/db/snapshot_checker.h index 4a6a71162..e221914d7 100644 --- 
a/db/snapshot_checker.h +++ b/db/snapshot_checker.h @@ -55,4 +55,11 @@ class WritePreparedSnapshotChecker : public SnapshotChecker { const WritePreparedTxnDB* const txn_db_; }; +bool DataIsDefinitelyInSnapshot(SequenceNumber seqno, SequenceNumber snapshot, + const SnapshotChecker* snapshot_checker); + +bool DataIsDefinitelyNotInSnapshot(SequenceNumber seqno, + SequenceNumber snapshot, + const SnapshotChecker* snapshot_checker); + } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_cache.cc b/db/table_cache.cc index 71fc29c32..1e1a94716 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -91,10 +91,9 @@ Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, - uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, - const std::shared_ptr& prefix_extractor, - bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const MutableCFOptions& mutable_cf_options, bool skip_filters, int level, + bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { std::string fname = TableFileName( ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId()); @@ -146,13 +145,14 @@ Status TableCache::GetTableReader( } else { expected_unique_id = kNullUniqueId64x2; // null ID == no verification } - s = ioptions_.table_factory->NewTableReader( + s = mutable_cf_options.table_factory->NewTableReader( ro, TableReaderOptions( - ioptions_, prefix_extractor, file_options, internal_comparator, - block_protection_bytes_per_key, skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, block_cache_tracer_, - max_file_size_for_l0_meta_pin, db_session_id_, + ioptions_, mutable_cf_options.prefix_extractor, file_options, + internal_comparator, + mutable_cf_options.block_protection_bytes_per_key, skip_filters, + immortal_tables_, false /* force_direct_prefetch */, level, + block_cache_tracer_, max_file_size_for_l0_meta_pin, db_session_id_, file_meta.fd.GetNumber(), expected_unique_id, file_meta.fd.largest_seqno, file_meta.tail_size, file_meta.user_defined_timestamps_persisted), @@ -172,10 +172,9 @@ Status TableCache::FindTable( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle** handle, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor, - const bool no_io, HistogramImpl* file_read_hist, bool skip_filters, - int level, bool prefetch_index_and_filter_in_cache, + const MutableCFOptions& mutable_cf_options, const bool no_io, + HistogramImpl* file_read_hist, bool skip_filters, int level, + bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = file_meta.fd.GetNumber(); @@ -197,9 +196,8 @@ Status TableCache::FindTable( std::unique_ptr table_reader; Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, - false /* sequential mode */, - block_protection_bytes_per_key, file_read_hist, - &table_reader, prefix_extractor, skip_filters, + false /* sequential mode */, file_read_hist, + &table_reader, mutable_cf_options, skip_filters, level, prefetch_index_and_filter_in_cache, 
max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { @@ -223,13 +221,12 @@ InternalIterator* TableCache::NewIterator( const ReadOptions& options, const FileOptions& file_options, const InternalKeyComparator& icomparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const std::shared_ptr& prefix_extractor, - TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - TableReaderCaller caller, Arena* arena, bool skip_filters, int level, - size_t max_file_size_for_l0_meta_pin, + const MutableCFOptions& mutable_cf_options, TableReader** table_reader_ptr, + HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, + bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, - uint8_t block_protection_bytes_per_key, const SequenceNumber* read_seqno, + const SequenceNumber* read_seqno, std::unique_ptr* range_del_iter) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -244,7 +241,7 @@ InternalIterator* TableCache::NewIterator( table_reader = fd.table_reader; if (table_reader == nullptr) { s = FindTable(options, file_options, icomparator, file_meta, &handle, - block_protection_bytes_per_key, prefix_extractor, + mutable_cf_options, options.read_tier == kBlockCacheTier /* no_io */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, @@ -260,8 +257,9 @@ InternalIterator* TableCache::NewIterator( result = NewEmptyInternalIterator(arena); } else { result = table_reader->NewIterator( - options, prefix_extractor.get(), arena, skip_filters, caller, - file_options.compaction_readahead_size, allow_unprepared_value); + options, mutable_cf_options.prefix_extractor.get(), arena, + skip_filters, caller, file_options.compaction_readahead_size, + allow_unprepared_value); } if (handle != nullptr) { cache_.RegisterReleaseAsCleanup(handle, *result); @@ -328,7 +326,7 @@ InternalIterator* TableCache::NewIterator( Status TableCache::GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, std::unique_ptr* out_iter) { assert(out_iter); const FileDescriptor& fd = file_meta.fd; @@ -337,7 +335,7 @@ Status TableCache::GetRangeTombstoneIterator( TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, block_protection_bytes_per_key); + &handle, mutable_cf_options); if (s.ok()) { t = cache_.Value(handle); } @@ -429,14 +427,13 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, return found; } -Status TableCache::Get( - const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, int level, - size_t max_file_size_for_l0_meta_pin) { +Status TableCache::Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const MutableCFOptions& mutable_cf_options, + HistogramImpl* file_read_hist, bool skip_filters, + int level, size_t max_file_size_for_l0_meta_pin) { auto& fd = file_meta.fd; std::string* 
row_cache_entry = nullptr; bool done = false; @@ -461,7 +458,7 @@ Status TableCache::Get( if (s.ok() && !done) { if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, block_protection_bytes_per_key, prefix_extractor, + &handle, mutable_cf_options, options.read_tier == kBlockCacheTier /* no_io */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, @@ -490,7 +487,8 @@ Status TableCache::Get( } if (s.ok()) { get_context->SetReplayLog(row_cache_entry); // nullptr if no cache. - s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters); + s = t->Get(options, k, get_context, + mutable_cf_options.prefix_extractor.get(), skip_filters); get_context->SetReplayLog(nullptr); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find table in cache and couldn't open it because of no_io. @@ -543,11 +541,9 @@ void TableCache::UpdateRangeTombstoneSeqnums( Status TableCache::MultiGetFilter( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const std::shared_ptr& prefix_extractor, + const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle, - uint8_t block_protection_bytes_per_key) { + MultiGetContext::Range* mget_range, TypedHandle** table_handle) { auto& fd = file_meta.fd; IterKey row_cache_key; std::string row_cache_entry_buffer; @@ -566,7 +562,7 @@ Status TableCache::MultiGetFilter( mget_range->end()); if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, block_protection_bytes_per_key, prefix_extractor, + &handle, mutable_cf_options, options.read_tier == kBlockCacheTier /* no_io */, file_read_hist, /*skip_filters=*/false, level, @@ -578,7 +574,8 @@ Status TableCache::MultiGetFilter( *table_handle = handle; } if (s.ok()) { - s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range); + s = t->MultiGetFilter(options, mutable_cf_options.prefix_extractor.get(), + mget_range); } if (s.ok() && !options.ignore_range_deletions) { // Update the range tombstone sequence numbers for the keys here @@ -599,8 +596,7 @@ Status TableCache::GetTableProperties( const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor, bool no_io) { + const MutableCFOptions& mutable_cf_options, bool no_io) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? 
if (table_reader) { @@ -611,8 +607,7 @@ Status TableCache::GetTableProperties( TypedHandle* table_handle = nullptr; Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, block_protection_bytes_per_key, - prefix_extractor, no_io); + file_meta, &table_handle, mutable_cf_options, no_io); if (!s.ok()) { return s; } @@ -625,14 +620,15 @@ Status TableCache::GetTableProperties( Status TableCache::ApproximateKeyAnchors( const ReadOptions& ro, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, + std::vector& anchors) { Status s; TableReader* t = file_meta.fd.table_reader; TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, - block_protection_bytes_per_key); + mutable_cf_options); if (s.ok()) { t = cache_.Value(handle); } @@ -649,8 +645,7 @@ Status TableCache::ApproximateKeyAnchors( size_t TableCache::GetMemoryUsageByTableReader( const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor) { + const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? if (table_reader) { @@ -658,9 +653,9 @@ size_t TableCache::GetMemoryUsageByTableReader( } TypedHandle* table_handle = nullptr; - Status s = FindTable(read_options, file_options, internal_comparator, - file_meta, &table_handle, block_protection_bytes_per_key, - prefix_extractor, true /* no_io */); + Status s = + FindTable(read_options, file_options, internal_comparator, file_meta, + &table_handle, mutable_cf_options, true /* no_io */); if (!s.ok()) { return 0; } @@ -679,16 +674,14 @@ uint64_t TableCache::ApproximateOffsetOf( const ReadOptions& read_options, const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor) { + const MutableCFOptions& mutable_cf_options) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { Status s = FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, block_protection_bytes_per_key, - prefix_extractor, false /* no_io */); + &table_handle, mutable_cf_options, false /* no_io */); if (s.ok()) { table_reader = cache_.Value(table_handle); } @@ -708,16 +701,14 @@ uint64_t TableCache::ApproximateSize( const ReadOptions& read_options, const Slice& start, const Slice& end, const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor) { + const MutableCFOptions& mutable_cf_options) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { Status s = FindTable(read_options, file_options_, internal_comparator, file_meta, - &table_handle, block_protection_bytes_per_key, - prefix_extractor, false /* no_io */); + &table_handle, mutable_cf_options, false /* no_io */); if (s.ok()) { table_reader = cache_.Value(table_handle); } diff 
--git a/db/table_cache.h b/db/table_cache.h index f77d74bbe..5fd0123bc 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -92,13 +92,12 @@ class TableCache { const ReadOptions& options, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, - const std::shared_ptr& prefix_extractor, + const MutableCFOptions& mutable_cf_options, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, TableReaderCaller caller, Arena* arena, bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, - uint8_t protection_bytes_per_key, const SequenceNumber* range_del_read_seqno = nullptr, std::unique_ptr* range_del_iter = nullptr); @@ -112,21 +111,20 @@ class TableCache { // recorded // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - Status Get( - const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, GetContext* get_context, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr, - HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, - int level = -1, size_t max_file_size_for_l0_meta_pin = 0); + Status Get(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, const Slice& k, + GetContext* get_context, + const MutableCFOptions& mutable_cf_options, + HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, + int level = -1, size_t max_file_size_for_l0_meta_pin = 0); // Return the range delete tombstone iterator of the file specified by // `file_meta`. Status GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + const FileMetaData& file_meta, const MutableCFOptions& mutable_cf_options, std::unique_ptr* out_iter); // Call table reader's MultiGetFilter to use the bloom filter to filter out @@ -134,14 +132,13 @@ class TableCache { // If the table cache is looked up to get the table reader, the cache handle // is returned in table_handle. This handle should be passed back to // MultiGet() so it can be released. 
- Status MultiGetFilter( - const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - const std::shared_ptr& prefix_extractor, - HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle, - uint8_t block_protection_bytes_per_key); + Status MultiGetFilter(const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + const MutableCFOptions& mutable_cf_options, + HistogramImpl* file_read_hist, int level, + MultiGetContext::Range* mget_range, + TypedHandle** table_handle); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -152,15 +149,15 @@ class TableCache { // in the embedded GetContext // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" - DECLARE_SYNC_AND_ASYNC( - Status, MultiGet, const ReadOptions& options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr, - HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, - bool skip_range_deletions = false, int level = -1, - TypedHandle* table_handle = nullptr); + DECLARE_SYNC_AND_ASYNC(Status, MultiGet, const ReadOptions& options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + const MultiGetContext::Range* mget_range, + const MutableCFOptions& mutable_cf_options, + HistogramImpl* file_read_hist = nullptr, + bool skip_filters = false, + bool skip_range_deletions = false, int level = -1, + TypedHandle* table_handle = nullptr); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); @@ -176,17 +173,16 @@ class TableCache { // Find table reader // @param skip_filters Disables loading/accessing the filter block // @param level == -1 means not specified - Status FindTable( - const ReadOptions& ro, const FileOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, TypedHandle**, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr, - const bool no_io = false, HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0, - Temperature file_temperature = Temperature::kUnknown); + Status FindTable(const ReadOptions& ro, const FileOptions& toptions, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, TypedHandle**, + const MutableCFOptions& mutable_cf_options, + const bool no_io = false, + HistogramImpl* file_read_hist = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Get the table properties of a given table. // @no_io: indicates if we should load table to the cache if it is not present @@ -194,19 +190,18 @@ class TableCache { // @returns: `properties` will be reset on success. Please note that we will // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. 
- Status GetTableProperties( - const FileOptions& toptions, const ReadOptions& read_options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, - std::shared_ptr* properties, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr, - bool no_io = false); + Status GetTableProperties(const FileOptions& toptions, + const ReadOptions& read_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, + std::shared_ptr* properties, + const MutableCFOptions& mutable_cf_options, + bool no_io = false); Status ApproximateKeyAnchors(const ReadOptions& ro, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, - uint8_t block_protection_bytes_per_key, + const MutableCFOptions& mutable_cf_options, std::vector& anchors); // Return total memory usage of the table reader of the file. @@ -214,25 +209,23 @@ class TableCache { size_t GetMemoryUsageByTableReader( const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr); + const FileMetaData& file_meta, + const MutableCFOptions& mutable_cf_options); // Returns approximated offset of a key in a file represented by fd. - uint64_t ApproximateOffsetOf( - const ReadOptions& read_options, const Slice& key, - const FileMetaData& file_meta, TableReaderCaller caller, - const InternalKeyComparator& internal_comparator, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr); + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, const FileMetaData& file_meta, + TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const MutableCFOptions& mutable_cf_options); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). 
- uint64_t ApproximateSize( - const ReadOptions& read_options, const Slice& start, const Slice& end, - const FileMetaData& file_meta, TableReaderCaller caller, - const InternalKeyComparator& internal_comparator, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor = nullptr); + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, const FileMetaData& file_meta, + TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + const MutableCFOptions& mutable_cf_options); CacheInterface& get_cache() { return cache_; } @@ -250,17 +243,16 @@ class TableCache { private: // Build a table reader - Status GetTableReader( - const ReadOptions& ro, const FileOptions& file_options, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, bool sequential_mode, - uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, - std::unique_ptr* table_reader, - const std::shared_ptr& prefix_extractor = nullptr, - bool skip_filters = false, int level = -1, - bool prefetch_index_and_filter_in_cache = true, - size_t max_file_size_for_l0_meta_pin = 0, - Temperature file_temperature = Temperature::kUnknown); + Status GetTableReader(const ReadOptions& ro, const FileOptions& file_options, + const InternalKeyComparator& internal_comparator, + const FileMetaData& file_meta, bool sequential_mode, + HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, + const MutableCFOptions& mutable_cf_options, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, + size_t max_file_size_for_l0_meta_pin = 0, + Temperature file_temperature = Temperature::kUnknown); // Update the max_covering_tombstone_seq in the GetContext for each key based // on the range deletions in the table diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index f069c8b80..68228485c 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -17,10 +17,8 @@ namespace ROCKSDB_NAMESPACE { DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) (const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, - uint8_t block_protection_bytes_per_key, - const std::shared_ptr& prefix_extractor, - HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, - int level, TypedHandle* handle) { + const MutableCFOptions& mutable_cf_options, HistogramImpl* file_read_hist, + bool skip_filters, bool skip_range_deletions, int level, TypedHandle* handle) { auto& fd = file_meta.fd; Status s; TableReader* t = fd.table_reader; @@ -72,7 +70,7 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) if (t == nullptr) { assert(handle == nullptr); s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, block_protection_bytes_per_key, prefix_extractor, + &handle, mutable_cf_options, options.read_tier == kBlockCacheTier /* no_io */, file_read_hist, skip_filters, level, true /* prefetch_index_and_filter_in_cache */, @@ -88,7 +86,8 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) } if (s.ok()) { CO_AWAIT(t->MultiGet) - (options, &table_range, prefix_extractor.get(), skip_filters); + (options, &table_range, mutable_cf_options.prefix_extractor.get(), + skip_filters); } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) { Status* status = 
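The recurring edit across db/table_cache.{h,cc} collapses the loose per-field parameters (block_protection_bytes_per_key, prefix_extractor) into a single const MutableCFOptions& argument. A toy sketch of the parameter-object pattern being applied; the types and function names below are illustrative stand-ins, not RocksDB declarations:

#include <cstdint>
#include <memory>
#include <string>

struct ToyMutableCFOptions {
  uint8_t block_protection_bytes_per_key = 0;
  std::shared_ptr<const std::string> prefix_extractor;  // placeholder type
};

// Before: every mutable field travels as its own argument, so making another
// field mutable (e.g. table_factory) means touching every call site.
void FindTableBefore(uint8_t block_protection_bytes_per_key,
                     const std::shared_ptr<const std::string>& prefix_extractor);

// After: one object carries a consistent snapshot of the mutable options.
void FindTableAfter(const ToyMutableCFOptions& mutable_cf_options);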
iter->s; diff --git a/db/version_builder.cc b/db/version_builder.cc index ed8ab8214..1343d113e 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1636,12 +1636,12 @@ class VersionBuilder::Rep { return s; } - Status LoadTableHandlers( - InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, - uint8_t block_protection_bytes_per_key) { + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const MutableCFOptions& mutable_cf_options, + size_t max_file_size_for_l0_meta_pin, + const ReadOptions& read_options) { assert(table_cache_ != nullptr); assert(!track_found_and_missing_files_ || valid_version_available_); @@ -1716,7 +1716,7 @@ class VersionBuilder::Rep { statuses[file_idx] = table_cache_->FindTable( read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, - block_protection_bytes_per_key, prefix_extractor, false /*no_io */, + mutable_cf_options, false /*no_io */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, file_meta->temperature); @@ -1777,13 +1777,12 @@ Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const { Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, - uint8_t block_protection_bytes_per_key) { - return rep_->LoadTableHandlers( - internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, - read_options, block_protection_bytes_per_key); + const MutableCFOptions& mutable_cf_options, + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { + return rep_->LoadTableHandlers(internal_stats, max_threads, + prefetch_index_and_filter_in_cache, + is_initial_load, mutable_cf_options, + max_file_size_for_l0_meta_pin, read_options); } void VersionBuilder::CreateOrReplaceSavePoint() { @@ -1814,16 +1813,15 @@ Status VersionBuilder::SaveSavePointTo(VersionStorageInfo* vstorage) const { Status VersionBuilder::LoadSavePointTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, - uint8_t block_protection_bytes_per_key) { + const MutableCFOptions& mutable_cf_options, + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { if (!savepoint_ || !savepoint_->ValidVersionAvailable()) { return Status::InvalidArgument(); } return savepoint_->LoadTableHandlers( internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, - read_options, block_protection_bytes_per_key); + is_initial_load, mutable_cf_options, max_file_size_for_l0_meta_pin, + read_options); } void VersionBuilder::ClearSavePoint() { savepoint_.reset(nullptr); } diff --git a/db/version_builder.h b/db/version_builder.h index 7e2a0253c..55520b430 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -54,12 +54,12 @@ class VersionBuilder { Status 
SaveTo(VersionStorageInfo* vstorage) const; // Load all the table handlers for the current Version in the builder. - Status LoadTableHandlers( - InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const std::shared_ptr<const SliceTransform>& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, - uint8_t block_protection_bytes_per_key); + Status LoadTableHandlers(InternalStats* internal_stats, int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const MutableCFOptions& mutable_cf_options, + size_t max_file_size_for_l0_meta_pin, + const ReadOptions& read_options); //============APIs only used by VersionEditHandlerPointInTime ============// @@ -99,12 +99,13 @@ class VersionBuilder { // Load all the table handlers for the Version in the save point. // Non-OK status will be returned if there is not a valid save point. - Status LoadSavePointTableHandlers( - InternalStats* internal_stats, int max_threads, - bool prefetch_index_and_filter_in_cache, bool is_initial_load, - const std::shared_ptr<const SliceTransform>& prefix_extractor, - size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, - uint8_t block_protection_bytes_per_key); + Status LoadSavePointTableHandlers(InternalStats* internal_stats, + int max_threads, + bool prefetch_index_and_filter_in_cache, + bool is_initial_load, + const MutableCFOptions& mutable_cf_options, + size_t max_file_size_for_l0_meta_pin, + const ReadOptions& read_options); void ClearSavePoint(); diff --git a/db/version_edit.h index d6c0e6bde..2b247e457 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -350,6 +350,14 @@ struct FileMetaData { file_checksum_func_name.size(); return usage; } + + // Returns whether this file is one with just one range tombstone. This type + of file should always be marked for compaction. 
+ bool FileIsStandAloneRangeTombstone() const { + bool res = num_range_deletions == 1 && num_entries == num_range_deletions; + assert(!res || fd.smallest_seqno == fd.largest_seqno); + return res; + } }; // A compressed copy of file meta data that just contain minimum data needed diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 4784c9096..ab21a0558 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -545,9 +545,8 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, Status s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, - prefetch_index_and_filter_in_cache, is_initial_load, - moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions), - read_options_, moptions->block_protection_bytes_per_key); + prefetch_index_and_filter_in_cache, is_initial_load, *moptions, + MaxFileSizeForL0MetaPin(*moptions), read_options_); if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } @@ -870,8 +869,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersionBeforeApplyEdit( s = builder->LoadSavePointTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, - cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), - read_options_, cf_opts_ptr->block_protection_bytes_per_key); + *cf_opts_ptr, MaxFileSizeForL0MetaPin(*cf_opts_ptr), read_options_); if (!s.ok()) { delete version; if (s.IsCorruption()) { diff --git a/db/version_set.cc b/db/version_set.cc index d28b2e2d9..2f4892fbf 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -964,15 +964,15 @@ namespace { class LevelIterator final : public InternalIterator { public: - // @param read_options Must outlive this iterator. + // NOTE: many of the const& parameters are saved in this object (so + // must outlive this object) LevelIterator( TableCache* table_cache, const ReadOptions& read_options, const FileOptions& file_options, const InternalKeyComparator& icomparator, - const LevelFilesBrief* flevel, - const std::shared_ptr& prefix_extractor, + const LevelFilesBrief* flevel, const MutableCFOptions& mutable_cf_options, bool should_sample, HistogramImpl* file_read_hist, TableReaderCaller caller, bool skip_filters, int level, - uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg, + RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr, bool allow_unprepared_value = false, @@ -984,7 +984,8 @@ class LevelIterator final : public InternalIterator { icomparator_(icomparator), user_comparator_(icomparator.user_comparator()), flevel_(flevel), - prefix_extractor_(prefix_extractor), + mutable_cf_options_(mutable_cf_options), + prefix_extractor_(mutable_cf_options.prefix_extractor.get()), file_read_hist_(file_read_hist), caller_(caller), file_index_(flevel_->num_files), @@ -996,7 +997,6 @@ class LevelIterator final : public InternalIterator { ? read_options.snapshot->GetSequenceNumber() : kMaxSequenceNumber), level_(level), - block_protection_bytes_per_key_(block_protection_bytes_per_key), should_sample_(should_sample), skip_filters_(skip_filters), allow_unprepared_value_(allow_unprepared_value), @@ -1050,6 +1050,11 @@ class LevelIterator final : public InternalIterator { return file_iter_.value(); } + uint64_t write_unix_time() const override { + assert(Valid()); + return file_iter_.write_unix_time(); + } + Status status() const override { return file_iter_.iter() ? 
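One way a file satisfying FileIsStandAloneRangeTombstone() arises in practice is bulk loading a file whose only entry is a range deletion. A hedged sketch using the public SstFileWriter and DB::IngestExternalFile APIs; the path and key bounds are made up:

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/sst_file_writer.h>

using namespace ROCKSDB_NAMESPACE;

Status IngestStandaloneRangeDel(DB* db, const Options& options) {
  SstFileWriter writer(EnvOptions(), options);
  Status s = writer.Open("/tmp/standalone_rangedel.sst");
  if (s.ok()) {
    s = writer.DeleteRange("key_000", "key_999");  // the file's only entry
  }
  if (s.ok()) {
    s = writer.Finish();
  }
  if (s.ok()) {
    // Such a file has num_range_deletions == 1 and num_entries ==
    // num_range_deletions, so the predicate above holds and the file is
    // marked for compaction once ingested.
    s = db->IngestExternalFile({"/tmp/standalone_rangedel.sst"},
                               IngestExternalFileOptions());
  }
  return s;
}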
file_iter_.status() : Status::OK(); } @@ -1142,12 +1147,12 @@ class LevelIterator final : public InternalIterator { ClearRangeTombstoneIter(); return table_cache_->NewIterator( read_options_, file_options_, icomparator_, *file_meta.file_metadata, - range_del_agg_, prefix_extractor_, + range_del_agg_, mutable_cf_options_, nullptr /* don't need reference to table */, file_read_hist_, caller_, /*arena=*/nullptr, skip_filters_, level_, /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, - largest_compaction_key, allow_unprepared_value_, - block_protection_bytes_per_key_, &read_seq_, range_tombstone_iter_); + largest_compaction_key, allow_unprepared_value_, &read_seq_, + range_tombstone_iter_); } // Check if current file being fully within iterate_lower_bound. @@ -1171,10 +1176,8 @@ class LevelIterator final : public InternalIterator { const UserComparatorWrapper user_comparator_; const LevelFilesBrief* flevel_; mutable FileDescriptor current_value_; - // `prefix_extractor_` may be non-null even for total order seek. Checking - // this variable is not the right way to identify whether prefix iterator - // is used. - const std::shared_ptr& prefix_extractor_; + const MutableCFOptions& mutable_cf_options_; + const SliceTransform* prefix_extractor_; HistogramImpl* file_read_hist_; TableReaderCaller caller_; @@ -1208,7 +1211,6 @@ class LevelIterator final : public InternalIterator { SequenceNumber read_seq_; int level_; - uint8_t block_protection_bytes_per_key_; bool should_sample_; bool skip_filters_; bool allow_unprepared_value_; @@ -1575,8 +1577,7 @@ Status Version::GetTableProperties(const ReadOptions& read_options, auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, - mutable_cf_options_.block_protection_bytes_per_key, - mutable_cf_options_.prefix_extractor, true /* no io */); + mutable_cf_options_, true /* no io */); if (s.ok()) { return s; } @@ -1662,8 +1663,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, Status s = table_cache->GetRangeTombstoneIterator( read_options, cfd_->internal_comparator(), *file_meta, - cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key, - &tombstone_iter); + mutable_cf_options_, &tombstone_iter); if (!s.ok()) { return s; } @@ -1780,9 +1780,7 @@ size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( file_options_, read_options, cfd_->internal_comparator(), - *file_level.files[i].file_metadata, - mutable_cf_options_.block_protection_bytes_per_key, - mutable_cf_options_.prefix_extractor); + *file_level.files[i].file_metadata, mutable_cf_options_); } } return total_usage; @@ -1931,10 +1929,9 @@ InternalIterator* Version::TEST_GetLevelIterator( auto level_iter = new (mem) LevelIterator( cfd_->table_cache(), read_options, file_options_, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor, should_sample_file_read(), + mutable_cf_options_, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - mutable_cf_options_.block_protection_bytes_per_key, nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, allow_unprepared_value, &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { @@ -1968,9 +1965,16 @@ uint64_t 
VersionStorageInfo::GetEstimatedActiveKeys() const { } if (current_num_samples_ < file_count) { - // casting to avoid overflowing - return static_cast( - (est * static_cast(file_count) / current_num_samples_)); + assert(current_num_samples_ != 0); + assert(est != 0); + double multiplier = static_cast(file_count) / current_num_samples_; + double maximum_multiplier = + static_cast(std::numeric_limits::max()) / est; + // If it can overflow, we return the maximum unsigned long. + if (multiplier >= maximum_multiplier) { + return std::numeric_limits::max(); + } + return static_cast(est * multiplier); } else { return est; } @@ -2032,14 +2036,12 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, const auto& file = storage_info_.LevelFilesBrief(0).files[i]; auto table_iter = cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), - *file.file_metadata, /*range_del_agg=*/nullptr, - mutable_cf_options_.prefix_extractor, nullptr, - cfd_->internal_stats()->GetFileReadHist(0), + *file.file_metadata, /*range_del_agg=*/nullptr, mutable_cf_options_, + nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, arena, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, allow_unprepared_value, - mutable_cf_options_.block_protection_bytes_per_key, /*range_del_read_seqno=*/nullptr, &tombstone_iter); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(table_iter); @@ -2066,10 +2068,9 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, auto level_iter = new (mem) LevelIterator( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor, should_sample_file_read(), + mutable_cf_options_, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - mutable_cf_options_.block_protection_bytes_per_key, /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr, allow_unprepared_value, &tombstone_iter_ptr); @@ -2108,15 +2109,13 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, } ScopedArenaPtr iter(cfd_->table_cache()->NewIterator( read_options, file_options, cfd_->internal_comparator(), - *file->file_metadata, &range_del_agg, - mutable_cf_options_.prefix_extractor, nullptr, + *file->file_metadata, &range_del_agg, mutable_cf_options_, nullptr, cfd_->internal_stats()->GetFileReadHist(0), TableReaderCaller::kUserIterator, &arena, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false, - mutable_cf_options_.block_protection_bytes_per_key)); + /*allow_unprepared_value=*/false)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -2128,11 +2127,10 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, ScopedArenaPtr iter(new (mem) LevelIterator( cfd_->table_cache(), read_options, file_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), - mutable_cf_options_.prefix_extractor, should_sample_file_read(), + mutable_cf_options_, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), TableReaderCaller::kUserIterator, IsFilterSkipped(level), level, - 
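The GetEstimatedActiveKeys() change above is a saturating scale: converting a double larger than the maximum uint64_t back to an integer is undefined behavior, and est * multiplier exceeds that maximum exactly when multiplier >= max / est. The same guard as a self-contained function (ScaleEstimate is a hypothetical name for illustration):

#include <cassert>
#include <cstdint>
#include <limits>

// Scales `est` by file_count / samples, clamping instead of overflowing.
uint64_t ScaleEstimate(uint64_t est, uint64_t file_count, uint64_t samples) {
  assert(samples > 0 && est > 0);
  double multiplier = static_cast<double>(file_count) / samples;
  double maximum_multiplier =
      static_cast<double>(std::numeric_limits<uint64_t>::max()) / est;
  if (multiplier >= maximum_multiplier) {
    return std::numeric_limits<uint64_t>::max();  // result would not fit
  }
  return static_cast<uint64_t>(est * multiplier);
}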
mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg, - nullptr, false)); + &range_del_agg, nullptr, false)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); } @@ -2445,8 +2443,7 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.block_protection_bytes_per_key, - mutable_cf_options_.prefix_extractor, + &get_context, mutable_cf_options_, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), fp.IsHitFileLastInLevel()), @@ -2681,10 +2678,9 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, if (!skip_filters) { Status status = table_cache_->MultiGetFilter( read_options, *internal_comparator(), *f->file_metadata, - mutable_cf_options_.prefix_extractor, + mutable_cf_options_, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle, - mutable_cf_options_.block_protection_bytes_per_key); + fp.GetHitFileLevel(), &file_range, &table_handle); skip_range_deletions = true; if (status.ok()) { skip_filters = true; @@ -2868,10 +2864,9 @@ Status Version::ProcessBatch( if (!skip_filters) { Status status = table_cache_->MultiGetFilter( read_options, *internal_comparator(), *f->file_metadata, - mutable_cf_options_.prefix_extractor, + mutable_cf_options_, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle, - mutable_cf_options_.block_protection_bytes_per_key); + fp.GetHitFileLevel(), &file_range, &table_handle); if (status.ok()) { skip_filters = true; skip_range_deletions = true; @@ -3635,6 +3630,7 @@ void VersionStorageInfo::ComputeCompactionScore( void VersionStorageInfo::ComputeFilesMarkedForCompaction(int last_level) { files_marked_for_compaction_.clear(); int last_qualify_level = 0; + standalone_range_tombstone_files_mark_threshold_ = kMaxSequenceNumber; // Do not include files from the last level with data // If table properties collector suggests a file on the last level, @@ -3650,6 +3646,11 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction(int last_level) { for (auto* f : files_[level]) { if (!f->being_compacted && f->marked_for_compaction) { files_marked_for_compaction_.emplace_back(level, f); + if (f->FileIsStandAloneRangeTombstone()) { + standalone_range_tombstone_files_mark_threshold_ = + std::min(standalone_range_tombstone_files_mark_threshold_, + f->fd.smallest_seqno); + } } } } @@ -5533,10 +5534,8 @@ Status VersionSet::ProcessManifestWrites( s = builder_guards[i]->version_builder()->LoadTableHandlers( cfd->internal_stats(), 1 /* max_threads */, true /* prefetch_index_and_filter_in_cache */, - false /* is_initial_load */, - mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options, - mutable_cf_options_ptrs[i]->block_protection_bytes_per_key); + false /* is_initial_load */, *mutable_cf_options_ptrs[i], + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -6923,8 +6922,7 @@ uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - 
read_options, key, *f.file_metadata, caller, icmp, - cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); + read_options, key, *f.file_metadata, caller, icmp, cf_opts); } } return result; @@ -6965,9 +6963,8 @@ uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, return 0; } const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); - return table_cache->ApproximateSize( - read_options, start, end, *f.file_metadata, caller, icmp, - cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); + return table_cache->ApproximateSize(read_options, start, end, + *f.file_metadata, caller, icmp, cf_opts); } void VersionSet::RemoveLiveFiles( @@ -7093,10 +7090,12 @@ InternalIterator* VersionSet::MakeInputIterator( std::unique_ptr**>> range_tombstones; size_t num = 0; + [[maybe_unused]] size_t num_input_files = 0; for (size_t which = 0; which < c->num_input_levels(); which++) { - if (c->input_levels(which)->num_files != 0) { + const LevelFilesBrief* flevel = c->input_levels(which); + num_input_files += flevel->num_files; + if (flevel->num_files != 0) { if (c->level(which) == 0) { - const LevelFilesBrief* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { const FileMetaData& fmd = *flevel->files[i].file_metadata; if (start.has_value() && @@ -7117,7 +7116,7 @@ InternalIterator* VersionSet::MakeInputIterator( list[num++] = cfd->table_cache()->NewIterator( read_options, file_options_compactions, cfd->internal_comparator(), fmd, range_del_agg, - c->mutable_cf_options()->prefix_extractor, + *c->mutable_cf_options(), /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction, /*arena=*/nullptr, @@ -7127,7 +7126,6 @@ InternalIterator* VersionSet::MakeInputIterator( /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, /*allow_unprepared_value=*/false, - c->mutable_cf_options()->block_protection_bytes_per_key, /*range_del_read_seqno=*/nullptr, /*range_del_iter=*/&range_tombstone_iter); range_tombstones.emplace_back(std::move(range_tombstone_iter), @@ -7139,18 +7137,19 @@ InternalIterator* VersionSet::MakeInputIterator( nullptr; list[num++] = new LevelIterator( cfd->table_cache(), read_options, file_options_compactions, - cfd->internal_comparator(), c->input_levels(which), - c->mutable_cf_options()->prefix_extractor, + cfd->internal_comparator(), flevel, *c->mutable_cf_options(), /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(c->level(which)), - c->mutable_cf_options()->block_protection_bytes_per_key, - range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr); + /*level=*/static_cast(c->level(which)), range_del_agg, + c->boundaries(which), false, &tombstone_iter_ptr); range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); } } } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::MakeInputIterator:NewCompactionMergingIterator", + &num_input_files); assert(num <= space); InternalIterator* result = NewCompactionMergingIterator( &c->column_family_data()->internal_comparator(), list, @@ -7391,7 +7390,6 @@ Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions(); assert(cf_opts); - std::shared_ptr pe = cf_opts->prefix_extractor; size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts); const FileOptions& file_opts = file_options(); @@ -7407,8 +7405,7 @@ Status VersionSet::VerifyFileMetadata(const 
ReadOptions& read_options, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - read_options, file_opts, *icmp, meta_copy, &handle, - cf_opts->block_protection_bytes_per_key, pe, + read_options, file_opts, *icmp, meta_copy, &handle, *cf_opts, /*no_io=*/false, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, meta_copy.temperature); diff --git a/db/version_set.h b/db/version_set.h index 9336782b1..f117b6082 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -616,6 +616,10 @@ class VersionStorageInfo { return bottommost_files_mark_threshold_; } + SequenceNumber standalone_range_tombstone_files_mark_threshold() const { + return standalone_range_tombstone_files_mark_threshold_; + } + // Returns whether any key in [`smallest_key`, `largest_key`] could appear in // an older L0 file than `last_l0_idx` or in a greater level than `last_level` // @@ -628,6 +632,8 @@ class VersionStorageInfo { Env::WriteLifeTimeHint CalculateSSTWriteHint(int level) const; + const Comparator* user_comparator() const { return user_comparator_; } + private: void ComputeCompensatedSizes(); void UpdateNumNonEmptyLevels(); @@ -732,6 +738,12 @@ class VersionStorageInfo { // seqnums of unmarked bottommost files. SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber; + // The minimum sequence number among all the standalone range tombstone files + // that are marked for compaction. A standalone range tombstone file is one + // with just one range tombstone. + SequenceNumber standalone_range_tombstone_files_mark_threshold_ = + kMaxSequenceNumber; + // Monotonically increases as we release old snapshots. Zero indicates no // snapshots have been released yet. 
When no snapshots remain we set it to the // current seqnum, which needs to be protected as a snapshot can still be diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 75776b620..46b504fc3 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -25,8 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = CO_AWAIT(table_cache_->MultiGet)( read_options, *internal_comparator(), *f->file_metadata, &file_range, - mutable_cf_options_.block_protection_bytes_per_key, - mutable_cf_options_.prefix_extractor, + mutable_cf_options_, cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters, skip_range_deletions, hit_file_level, table_handle); // TODO: examine the behavior for corrupted key diff --git a/db/version_set_test.cc b/db/version_set_test.cc index a483ccf0e..9264345c5 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1182,6 +1182,9 @@ class VersionSetTestBase { immutable_options_.fs = fs_; immutable_options_.clock = env_->GetSystemClock().get(); + cf_options_.table_factory = table_factory_; + mutable_cf_options_.table_factory = table_factory_; + versions_.reset(new VersionSet( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, diff --git a/db/write_batch.cc b/db/write_batch.cc index 3820dccd0..7cb8f6d11 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -502,7 +502,9 @@ Status ReadRecordFromWriteBatch(Slice* input, char* tag, break; } default: - return Status::Corruption("unknown WriteBatch tag"); + return Status::Corruption( + "unknown WriteBatch tag", + std::to_string(static_cast(*tag))); } return Status::OK(); } @@ -750,7 +752,9 @@ Status WriteBatchInternal::Iterate(const WriteBatch* wb, } break; default: - return Status::Corruption("unknown WriteBatch tag"); + return Status::Corruption( + "unknown WriteBatch tag", + std::to_string(static_cast(tag))); } } if (!s.ok()) { @@ -2620,9 +2624,8 @@ class MemTableInserter : public WriteBatch::Handler { // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`. 
ret_status.PermitUncheckedError(); return Status::NotSupported( - std::string("DeleteRange not supported for table type ") + - cfd->ioptions()->table_factory->Name() + " in CF " + - cfd->GetName()); + std::string("CF " + cfd->GetName() + + " reports it does not support DeleteRange")); } int cmp = cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index 9f165cf97..a2632dfa3 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -1031,8 +1031,9 @@ DEFINE_int32(continuous_verification_interval, 1000, "disables continuous verification."); DEFINE_int32(approximate_size_one_in, 64, - "If non-zero, DB::GetApproximateSizes() will be called against" - " random key ranges."); + "If non-zero, DB::GetApproximateSizes() and " + "DB::GetApproximateMemTableStats() will be called against " + "random key ranges."); DEFINE_int32(read_fault_one_in, 1000, "On non-zero, enables fault injection on read"); diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index 2b6db414f..9b7ebb85d 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -274,6 +274,8 @@ bool StressTest::BuildOptionsTable() { return true; } + bool keepRibbonFilterPolicyOnly = FLAGS_bloom_before_level != INT_MAX; + std::unordered_map> options_tbl = { {"write_buffer_size", {std::to_string(options_.write_buffer_size), @@ -339,6 +341,17 @@ bool StressTest::BuildOptionsTable() { "2", }}, {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, + {"block_based_table_factory", + { + keepRibbonFilterPolicyOnly ? "{filter_policy=ribbonfilter:2.35}" + : "{filter_policy=bloomfilter:2.34}", + "{filter_policy=ribbonfilter:5.67:-1}", + keepRibbonFilterPolicyOnly ? 
"{filter_policy=ribbonfilter:8.9:3}" + : "{filter_policy=nullptr}", + "{block_size=" + std::to_string(FLAGS_block_size) + "}", + "{block_size=" + + std::to_string(FLAGS_block_size + (FLAGS_seed & 0xFFFU)) + "}", + }}, }; if (FLAGS_compaction_style == kCompactionStyleUniversal && FLAGS_universal_max_read_amp > 0) { @@ -393,7 +406,7 @@ bool StressTest::BuildOptionsTable() { std::vector{"kDisable", "kFlushOnly"}); } - if (FLAGS_bloom_before_level != INT_MAX) { + if (keepRibbonFilterPolicyOnly) { // Can modify RibbonFilterPolicy field options_tbl.emplace("table_factory.filter_policy.bloom_before_level", std::vector{"-1", "0", "1", "2", @@ -2427,22 +2440,31 @@ Status StressTest::TestApproximateSize( std::string key1_str = Key(key1); std::string key2_str = Key(key2); Range range{Slice(key1_str), Slice(key2_str)}; - SizeApproximationOptions sao; - sao.include_memtables = thread->rand.OneIn(2); - if (sao.include_memtables) { - sao.include_files = thread->rand.OneIn(2); - } - if (thread->rand.OneIn(2)) { + if (thread->rand.OneIn(3)) { + // Call GetApproximateMemTableStats instead + uint64_t count, size; + db_->GetApproximateMemTableStats(column_families_[rand_column_families[0]], + range, &count, &size); + return Status::OK(); + } else { + // Call GetApproximateSizes + SizeApproximationOptions sao; + sao.include_memtables = thread->rand.OneIn(2); + if (sao.include_memtables) { + sao.include_files = thread->rand.OneIn(2); + } if (thread->rand.OneIn(2)) { - sao.files_size_error_margin = 0.0; - } else { - sao.files_size_error_margin = - static_cast(thread->rand.Uniform(3)); + if (thread->rand.OneIn(2)) { + sao.files_size_error_margin = 0.0; + } else { + sao.files_size_error_margin = + static_cast(thread->rand.Uniform(3)); + } } + uint64_t result; + return db_->GetApproximateSizes( + sao, column_families_[rand_column_families[0]], &range, 1, &result); } - uint64_t result; - return db_->GetApproximateSizes( - sao, column_families_[rand_column_families[0]], &range, 1, &result); } Status StressTest::TestCheckpoint(ThreadState* thread, diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 1e628d7d2..a5a61d28f 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -568,9 +568,11 @@ class NonBatchedOpsStressTest : public StressTest { post_read_expected_value)) { thread->shared->SetVerificationFailure(); fprintf(stderr, - "error : inconsistent values for key %s: Get returns %s, " + "error : inconsistent values for key %s (%" PRIi64 + "): Get returns %s, " "but expected state is \"deleted\".\n", - key.ToString(true).c_str(), StringToHex(from_db).c_str()); + key.ToString(true).c_str(), rand_keys[0], + StringToHex(from_db).c_str()); } Slice from_db_slice(from_db); uint32_t value_base_from_db = GetValueBase(from_db_slice); @@ -579,11 +581,12 @@ class NonBatchedOpsStressTest : public StressTest { post_read_expected_value)) { thread->shared->SetVerificationFailure(); fprintf(stderr, - "error : inconsistent values for key %s: Get returns %s with " + "error : inconsistent values for key %s (%" PRIi64 + "): Get returns %s with " "value base %d that falls out of expected state's value base " "range.\n", - key.ToString(true).c_str(), StringToHex(from_db).c_str(), - value_base_from_db); + key.ToString(true).c_str(), rand_keys[0], + StringToHex(from_db).c_str(), value_base_from_db); } } } else if (s.IsNotFound()) { @@ -594,15 +597,16 @@ class NonBatchedOpsStressTest : public StressTest { post_read_expected_value)) { 
thread->shared->SetVerificationFailure(); fprintf(stderr, - "error : inconsistent values for key %s: expected state has " + "error : inconsistent values for key %s (%" PRIi64 + "): expected state has " "the key, Get() returns NotFound.\n", - key.ToString(true).c_str()); + key.ToString(true).c_str(), rand_keys[0]); } } } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) { thread->shared->SetVerificationFailure(); - fprintf(stderr, "error : Get() returns %s for key: %s.\n", - s.ToString().c_str(), key.ToString(true).c_str()); + fprintf(stderr, "error : Get() returns %s for key: %s (%" PRIi64 ").\n", + s.ToString().c_str(), key.ToString(true).c_str(), rand_keys[0]); } return s; } @@ -1031,17 +1035,18 @@ class NonBatchedOpsStressTest : public StressTest { shared->SetVerificationFailure(); fprintf(stderr, "error : inconsistent columns returned by GetEntity for key " - "%s: %s\n", - StringToHex(key_str).c_str(), + "%s (%" PRIi64 "): %s\n", + StringToHex(key_str).c_str(), rand_keys[0], WideColumnsToHex(columns).c_str()); } else if (ExpectedValueHelper::MustHaveNotExisted( pre_read_expected_value, post_read_expected_value)) { shared->SetVerificationFailure(); - fprintf( - stderr, - "error : inconsistent values for key %s: GetEntity returns %s, " - "expected state does not have the key.\n", - StringToHex(key_str).c_str(), WideColumnsToHex(columns).c_str()); + fprintf(stderr, + "error : inconsistent values for key %s (%" PRIi64 + "): GetEntity returns %s, " + "expected state does not have the key.\n", + StringToHex(key_str).c_str(), rand_keys[0], + WideColumnsToHex(columns).c_str()); } else { const uint32_t value_base_from_db = GetValueBase(WideColumnsHelper::GetDefaultColumn(columns)); @@ -1051,11 +1056,12 @@ class NonBatchedOpsStressTest : public StressTest { shared->SetVerificationFailure(); fprintf( stderr, - "error : inconsistent values for key %s: GetEntity returns %s " + "error : inconsistent values for key %s (%" PRIi64 + "): GetEntity returns %s " "with value base %d that falls out of expected state's value " "base range.\n", - StringToHex(key_str).c_str(), WideColumnsToHex(columns).c_str(), - value_base_from_db); + StringToHex(key_str).c_str(), rand_keys[0], + WideColumnsToHex(columns).c_str(), value_base_from_db); } } } @@ -1067,14 +1073,16 @@ class NonBatchedOpsStressTest : public StressTest { post_read_expected_value)) { shared->SetVerificationFailure(); fprintf(stderr, - "error : inconsistent values for key %s: expected state has " + "error : inconsistent values for key %s (%" PRIi64 + "): expected state has " "the key, GetEntity returns NotFound.\n", - StringToHex(key_str).c_str()); + StringToHex(key_str).c_str(), rand_keys[0]); } } } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) { - fprintf(stderr, "error : GetEntity() returns %s for key: %s.\n", - s.ToString().c_str(), StringToHex(key_str).c_str()); + fprintf(stderr, + "error : GetEntity() returns %s for key: %s (%" PRIi64 ").\n", + s.ToString().c_str(), StringToHex(key_str).c_str(), rand_keys[0]); thread->shared->SetVerificationFailure(); } } @@ -1450,8 +1458,10 @@ class NonBatchedOpsStressTest : public StressTest { Slice ub_slice; ReadOptions ro_copy = read_opts; - // Get the next prefix first and then see if we want to set upper bound. - // We'll use the next prefix in an assertion later on + // Randomly test with `iterate_upper_bound` and `prefix_same_as_start` + // + // Get the next prefix first and then see if we want to set it to be the + // upper bound. 
+    // upper bound. We'll use the next prefix in an assertion later on
     if (GetNextPrefix(prefix, &upper_bound) && thread->rand.OneIn(2)) {
       // For half of the time, set the upper bound to the next prefix
       ub_slice = Slice(upper_bound);
@@ -1460,6 +1470,8 @@ class NonBatchedOpsStressTest : public StressTest {
         ro_copy.table_filter =
             sqfc_factory_->GetTableFilterForRangeQuery(prefix, ub_slice);
       }
+    } else if (options_.prefix_extractor && thread->rand.OneIn(2)) {
+      ro_copy.prefix_same_as_start = true;
     }
 
     std::string read_ts_str;
@@ -1480,8 +1492,16 @@ class NonBatchedOpsStressTest : public StressTest {
     uint64_t count = 0;
     Status s;
 
-    for (iter->Seek(prefix); iter->Valid() && iter->key().starts_with(prefix);
-         iter->Next()) {
+    for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+      // If an upper bound or prefix bound is specified, only keys within the
+      // target prefix should show up. Otherwise, we need to manually exit the
+      // loop when we see the first key outside the target prefix.
+      if (ro_copy.iterate_upper_bound != nullptr ||
+          ro_copy.prefix_same_as_start) {
+        assert(iter->key().starts_with(prefix));
+      } else if (!iter->key().starts_with(prefix)) {
+        break;
+      }
       ++count;
 
       // When iter_start_ts is set, iterator exposes internal keys, including
@@ -1535,7 +1555,14 @@ class NonBatchedOpsStressTest : public StressTest {
     if (s.ok()) {
       thread->stats.AddPrefixes(1, count);
     } else if (injected_error_count == 0 || !IsErrorInjectedAndRetryable(s)) {
-      fprintf(stderr, "TestPrefixScan error: %s\n", s.ToString().c_str());
+      fprintf(stderr,
+              "TestPrefixScan error: %s with ReadOptions::iterate_upper_bound: "
+              "%s, prefix_same_as_start: %s\n",
+              s.ToString().c_str(),
+              ro_copy.iterate_upper_bound
+                  ? ro_copy.iterate_upper_bound->ToString(true).c_str()
+                  : "nullptr",
+              ro_copy.prefix_same_as_start ? "true" : "false");
       thread->shared->SetVerificationFailure();
     }
 
@@ -1963,8 +1990,24 @@ class NonBatchedOpsStressTest : public StressTest {
   void TestIngestExternalFile(ThreadState* thread,
                               const std::vector<int>& rand_column_families,
                               const std::vector<int64_t>& rand_keys) override {
+    // When true, we create two SST files: the first one with regular puts for
+    // a contiguous range of keys, the second one with a standalone range
+    // deletion covering all of those keys. This is to exercise the standalone
+    // range deletion file's compaction input optimization.
+    // TODO(yuzhangyu): make this an option.
+    bool test_standalone_range_deletion =
+        thread->rand.OneInOpt(10) && FLAGS_delrangepercent > 0;
+    std::vector<std::string> external_files;
     const std::string sst_filename =
         FLAGS_db + "/." + std::to_string(thread->tid) + ".sst";
+    external_files.push_back(sst_filename);
+    std::string standalone_rangedel_filename;
+    if (test_standalone_range_deletion) {
+      standalone_rangedel_filename = FLAGS_db + "/."
+                                     + std::to_string(thread->tid) +
+                                     "_standalone_rangedel.sst";
+      external_files.push_back(standalone_rangedel_filename);
+    }
     Status s;
     std::ostringstream ingest_options_oss;
@@ -1976,10 +2019,15 @@ class NonBatchedOpsStressTest : public StressTest {
           FaultInjectionIOType::kMetadataWrite);
     }
 
-    if (db_stress_env->FileExists(sst_filename).ok()) {
-      // Maybe we terminated abnormally before, so cleanup to give this file
-      // ingestion a clean slate
-      s = db_stress_env->DeleteFile(sst_filename);
+    for (const auto& filename : external_files) {
+      if (db_stress_env->FileExists(filename).ok()) {
+        // Maybe we terminated abnormally before, so clean up to give this
+        // file ingestion a clean slate
+        s = db_stress_env->DeleteFile(filename);
+      }
+      if (!s.ok()) {
+        return;
+      }
     }
 
     if (fault_fs_guard) {
@@ -1990,9 +2038,19 @@ class NonBatchedOpsStressTest : public StressTest {
     }
 
     SstFileWriter sst_file_writer(EnvOptions(options_), options_);
+    SstFileWriter standalone_rangedel_sst_file_writer(EnvOptions(options_),
+                                                      options_);
     if (s.ok()) {
       s = sst_file_writer.Open(sst_filename);
     }
+    if (s.ok() && test_standalone_range_deletion) {
+      s = standalone_rangedel_sst_file_writer.Open(
+          standalone_rangedel_filename);
+    }
+    if (!s.ok()) {
+      return;
+    }
+
     int64_t key_base = rand_keys[0];
     int column_family = rand_column_families[0];
     std::vector<std::unique_ptr<MutexLock>> range_locks;
@@ -2005,51 +2063,93 @@ class NonBatchedOpsStressTest : public StressTest {
     pending_expected_values.reserve(FLAGS_ingest_external_file_width);
     SharedState* shared = thread->shared;
 
+    // Grab locks, add keys
     assert(FLAGS_nooverwritepercent < 100);
-    // Grab locks, set pending state on expected values, and add keys
     for (int64_t key = key_base;
-         s.ok() && key < shared->GetMaxKey() &&
-         static_cast<int>(keys.size()) < FLAGS_ingest_external_file_width;
+         key < shared->GetMaxKey() &&
+         key < key_base + FLAGS_ingest_external_file_width;
         ++key) {
      if (key == key_base ||
          (key & ((1 << FLAGS_log2_keys_per_lock) - 1)) == 0) {
        range_locks.emplace_back(
            new MutexLock(shared->GetMutexForKey(column_family, key)));
      }
-      if (!shared->AllowsOverwrite(key)) {
-        // We could alternatively include `key` that is deleted.
-        continue;
+      if (test_standalone_range_deletion) {
+        // Testing standalone range deletion needs a contiguous range of keys.
+        if (shared->AllowsOverwrite(key)) {
+          if (keys.empty() || keys.back() == key - 1) {
+            keys.push_back(key);
+          } else {
+            keys.clear();
+            keys.push_back(key);
+          }
+        } else {
+          if (keys.size() > 0) {
+            break;
+          } else {
+            continue;
+          }
+        }
+      } else {
+        if (!shared->AllowsOverwrite(key)) {
+          // We could alternatively include `key` that is deleted.
+          continue;
+        }
+        keys.push_back(key);
      }
-      keys.push_back(key);
-
-      PendingExpectedValue pending_expected_value =
-          shared->PreparePut(column_family, key);
+    }
 
-      const uint32_t value_base = pending_expected_value.GetFinalValueBase();
-      values.push_back(value_base);
-      pending_expected_values.push_back(pending_expected_value);
+    if (s.ok() && keys.empty()) {
+      return;
+    }
+    // Set pending state on expected values, then create and ingest the files.
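+    // For reference, an illustrative sketch (hypothetical slices k0/k1, not
+    // part of the test) of what the standalone range deletion file boils
+    // down to; a single DeleteRange entry is its only content:
+    //
+    //   SstFileWriter rangedel_writer(EnvOptions(options_), options_);
+    //   Status st = rangedel_writer.Open("/tmp/standalone_rangedel.sst");
+    //   if (st.ok()) {
+    //     st = rangedel_writer.DeleteRange(k0, k1);  // sole entry
+    //   }
+    //   if (st.ok()) {
+    //     st = rangedel_writer.Finish();
+    //   }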
+ size_t total_keys = keys.size(); + for (size_t i = 0; s.ok() && i < total_keys; i++) { + int64_t key = keys.at(i); char value[100]; auto key_str = Key(key); - const size_t value_len = GenerateValue(value_base, value, sizeof(value)); const Slice k(key_str); - const Slice v(value, value_len); - - if (FLAGS_use_put_entity_one_in > 0 && - (value_base % FLAGS_use_put_entity_one_in) == 0) { - WideColumns columns = GenerateWideColumns(value_base, v); - s = sst_file_writer.PutEntity(k, columns); - } else { + Slice v; + if (test_standalone_range_deletion) { + assert(i == 0 || keys.at(i - 1) == key - 1); s = sst_file_writer.Put(k, v); + } else { + PendingExpectedValue pending_expected_value = + shared->PreparePut(column_family, key); + const uint32_t value_base = pending_expected_value.GetFinalValueBase(); + const size_t value_len = + GenerateValue(value_base, value, sizeof(value)); + v = Slice(value, value_len); + values.push_back(value_base); + pending_expected_values.push_back(pending_expected_value); + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { + WideColumns columns = GenerateWideColumns(values.back(), v); + s = sst_file_writer.PutEntity(k, columns); + } else { + s = sst_file_writer.Put(k, v); + } } } - - if (s.ok() && keys.empty()) { - return; + if (s.ok() && !keys.empty()) { + s = sst_file_writer.Finish(); } - if (s.ok()) { - s = sst_file_writer.Finish(); + if (s.ok() && total_keys != 0 && test_standalone_range_deletion) { + int64_t start_key = keys.at(0); + int64_t end_key = keys.back() + 1; + pending_expected_values = + shared->PrepareDeleteRange(column_family, start_key, end_key); + auto start_key_str = Key(start_key); + const Slice start_key_slice(start_key_str); + auto end_key_str = Key(end_key); + const Slice end_key_slice(end_key_str); + s = standalone_rangedel_sst_file_writer.DeleteRange(start_key_slice, + end_key_slice); + if (s.ok()) { + s = standalone_rangedel_sst_file_writer.Finish(); + } } if (s.ok()) { IngestExternalFileOptions ingest_options; @@ -2057,13 +2157,17 @@ class NonBatchedOpsStressTest : public StressTest { ingest_options.verify_checksums_before_ingest = thread->rand.OneInOpt(2); ingest_options.verify_checksums_readahead_size = thread->rand.OneInOpt(2) ? 
1024 * 1024 : 0; + ingest_options.fill_cache = thread->rand.OneInOpt(4); ingest_options_oss << "move_files: " << ingest_options.move_files << ", verify_checksums_before_ingest: " << ingest_options.verify_checksums_before_ingest << ", verify_checksums_readahead_size: " - << ingest_options.verify_checksums_readahead_size; + << ingest_options.verify_checksums_readahead_size + << ", fill_cache: " << ingest_options.fill_cache + << ", test_standalone_range_deletion: " + << test_standalone_range_deletion; s = db_->IngestExternalFile(column_families_[column_family], - {sst_filename}, ingest_options); + external_files, ingest_options); } if (!s.ok()) { for (PendingExpectedValue& pending_expected_value : @@ -2775,4 +2879,4 @@ StressTest* CreateNonBatchedOpsStressTest() { } } // namespace ROCKSDB_NAMESPACE -#endif // GFLAGS +#endif // GFLAGS \ No newline at end of file diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 20d85a785..62d44be54 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -11,6 +11,7 @@ #ifdef GFLAGS #include "tools/io_tracer_parser_tool.h" #endif +#include "rocksdb/flush_block_policy.h" #include "util/random.h" namespace { @@ -121,6 +122,81 @@ class PrefetchTest table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } + + void VerifyScan(ReadOptions& iter_ro, ReadOptions& cmp_iter_ro, + const Slice* seek_key, const Slice* iterate_upper_bound, + bool prefix_same_as_start) const { + assert(!(seek_key == nullptr)); + iter_ro.iterate_upper_bound = cmp_iter_ro.iterate_upper_bound = + iterate_upper_bound; + iter_ro.prefix_same_as_start = cmp_iter_ro.prefix_same_as_start = + prefix_same_as_start; + + auto iter = std::unique_ptr(db_->NewIterator(iter_ro)); + auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_iter_ro)); + + iter->Seek(*seek_key); + cmp_iter->Seek(*seek_key); + + while (iter->Valid() && cmp_iter->Valid()) { + if (iter->key() != cmp_iter->key()) { + // Error + ASSERT_TRUE(false); + } + iter->Next(); + cmp_iter->Next(); + } + + ASSERT_TRUE(!cmp_iter->Valid() && !iter->Valid()); + ASSERT_TRUE(cmp_iter->status().ok() && iter->status().ok()); + } + + void VerifySeekPrevSeek(ReadOptions& iter_ro, ReadOptions& cmp_iter_ro, + const Slice* seek_key, + const Slice* iterate_upper_bound, + bool prefix_same_as_start) { + assert(!(seek_key == nullptr)); + iter_ro.iterate_upper_bound = cmp_iter_ro.iterate_upper_bound = + iterate_upper_bound; + iter_ro.prefix_same_as_start = cmp_iter_ro.prefix_same_as_start = + prefix_same_as_start; + + auto iter = std::unique_ptr(db_->NewIterator(iter_ro)); + auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_iter_ro)); + + // Seek + cmp_iter->Seek(*seek_key); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Seek(*seek_key); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + ASSERT_EQ(iter->key(), cmp_iter->key()); + + // Prev op should pass + cmp_iter->Prev(); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + ASSERT_EQ(iter->key(), cmp_iter->key()); + + // Reseek would follow as usual + cmp_iter->Seek(*seek_key); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Seek(*seek_key); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + ASSERT_EQ(iter->key(), cmp_iter->key()); + } }; INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, @@ -599,6 +675,8 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { default: 
assert(false); } + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); // Update to latest mutable options for (int i = 0; i < num_keys_per_level; ++i) { iter->Seek(Key(key_count++)); @@ -726,6 +804,8 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { default: assert(false); } + ASSERT_OK(iter->status()); + ASSERT_OK(iter->Refresh()); // Update to latest mutable options for (int i = 0; i < num_keys_per_level; ++i) { iter->Seek(Key(key_count++)); @@ -1262,6 +1342,8 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { Options options; SetGenericOptions(env.get(), /*use_direct_io=*/false, options); options.statistics = CreateDBStatistics(); + const std::string prefix = "my_key_"; + options.prefix_extractor.reset(NewFixedPrefixTransform(prefix.size())); BlockBasedTableOptions table_options; SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1272,8 +1354,9 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { Random rnd(309); WriteBatch batch; + // Create the DB with keys from "my_key_aaaaaaaaaa" to "my_key_zzzzzzzzzz" for (int i = 0; i < 26; i++) { - std::string key = "my_key_"; + std::string key = prefix; for (int j = 0; j < 10; j++) { key += char('a' + i); @@ -1282,9 +1365,9 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { } ASSERT_OK(db_->Write(WriteOptions(), &batch)); - std::string start_key = "my_key_a"; + std::string start_key = prefix + "a"; - std::string end_key = "my_key_"; + std::string end_key = prefix; for (int j = 0; j < 10; j++) { end_key += char('a' + 25); } @@ -1309,32 +1392,30 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { { auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); - iter->Seek("my_key_bbb"); + iter->Seek(prefix + "bbb"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_ccccccccc"); + iter->Seek(prefix + "ccccccccc"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_ddd"); + iter->Seek(prefix + "ddd"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_ddddddd"); + iter->Seek(prefix + "ddddddd"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_e"); + iter->Seek(prefix + "e"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_eeeee"); + iter->Seek(prefix + "eeeee"); ASSERT_TRUE(iter->Valid()); - iter->Seek("my_key_eeeeeeeee"); + iter->Seek(prefix + "eeeeeeeee"); ASSERT_TRUE(iter->Valid()); } ReadOptions ropts; - ropts.auto_readahead_size = true; ReadOptions cmp_ro; - cmp_ro.auto_readahead_size = false; if (std::get<0>(GetParam())) { ropts.readahead_size = cmp_ro.readahead_size = 32768; @@ -1345,61 +1426,31 @@ TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { } // With and without tuning readahead_size. - { - ASSERT_OK(options.statistics->Reset()); - // Seek. - { - Slice ub = Slice("my_key_uuu"); - Slice* ub_ptr = &ub; - cmp_ro.iterate_upper_bound = ub_ptr; - ropts.iterate_upper_bound = ub_ptr; - - auto iter = std::unique_ptr(db_->NewIterator(ropts)); - auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_ro)); - - Slice seek_key = Slice("my_key_aaa"); - iter->Seek(seek_key); - cmp_iter->Seek(seek_key); - - while (iter->Valid() && cmp_iter->Valid()) { - if (iter->key() != cmp_iter->key()) { - // Error - ASSERT_TRUE(false); - } - iter->Next(); - cmp_iter->Next(); - } - - ASSERT_OK(cmp_iter->status()); - ASSERT_OK(iter->status()); - } - - // Reseek with new upper_bound_iterator. 
-    {
-      Slice ub = Slice("my_key_y");
-      ropts.iterate_upper_bound = &ub;
-      cmp_ro.iterate_upper_bound = &ub;
-
-      auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
-      auto cmp_iter = std::unique_ptr<Iterator>(db_->NewIterator(cmp_ro));
-
-      Slice reseek_key = Slice("my_key_v");
-      iter->Seek(reseek_key);
-      cmp_iter->Seek(reseek_key);
-
-      while (iter->Valid() && cmp_iter->Valid()) {
-        if (iter->key() != cmp_iter->key()) {
-          // Error
-          ASSERT_TRUE(false);
-        }
-        iter->Next();
-        cmp_iter->Next();
-      }
-
-      ASSERT_OK(cmp_iter->status());
-      ASSERT_OK(iter->status());
-    }
-  }
+    ropts.auto_readahead_size = true;
+    cmp_ro.auto_readahead_size = false;
+    ASSERT_OK(options.statistics->Reset());
+    // Seek with an upper bound
+    const std::string seek_key_str = prefix + "aaa";
+    const Slice seek_key(seek_key_str);
+    const std::string ub_str = prefix + "uuu";
+    const Slice ub(ub_str);
+    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
+               &seek_key /* seek_key */, &ub /* iterate_upper_bound */,
+               false /* prefix_same_as_start */);
+
+    // Seek with a new seek key and upper bound
+    const std::string seek_key_new_str = prefix + "v";
+    const Slice seek_key_new(seek_key_new_str);
+    const std::string ub_new_str = prefix + "y";
+    const Slice ub_new(ub_new_str);
+    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
+               &seek_key_new /* seek_key */, &ub_new /* iterate_upper_bound */,
+               false /* prefix_same_as_start */);
+
+    // Seek with no upper bound, prefix_same_as_start = true
+    VerifyScan(ropts /* iter_ro */, cmp_ro /* cmp_iter_ro */,
+               &seek_key /* seek_key */, nullptr /* iterate_upper_bound */,
+               true /* prefix_same_as_start */);
 
     Close();
   }
 }
@@ -1418,6 +1469,8 @@ TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) {
   Options options;
   SetGenericOptions(env.get(), /*use_direct_io=*/false, options);
   options.statistics = CreateDBStatistics();
+  const std::string prefix = "my_key_";
+  options.prefix_extractor.reset(NewFixedPrefixTransform(prefix.size()));
   BlockBasedTableOptions table_options;
   SetBlockBasedTableOptions(table_options);
   std::shared_ptr<Cache> cache = NewLRUCache(1024 * 1024, 2);
@@ -1432,7 +1485,7 @@ TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) {
 
   WriteBatch batch;
   for (int i = 0; i < 26; i++) {
-    std::string key = "my_key_";
+    std::string key = prefix;
 
     for (int j = 0; j < 10; j++) {
       key += char('a' + i);
@@ -1441,9 +1494,9 @@ TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) {
   }
   ASSERT_OK(db_->Write(WriteOptions(), &batch));
 
-  std::string start_key = "my_key_a";
+  std::string start_key = prefix + "a";
 
-  std::string end_key = "my_key_";
+  std::string end_key = prefix;
   for (int j = 0; j < 10; j++) {
     end_key += char('a' + 25);
   }
@@ -1455,58 +1508,147 @@ TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) {
 
   ReadOptions ropts;
   ropts.auto_readahead_size = true;
+  ReadOptions cmp_readopts = ropts;
+  cmp_readopts.auto_readahead_size = false;
+
+  const std::string seek_key_str = prefix + "bbb";
+  const Slice seek_key(seek_key_str);
+  const std::string ub_key = prefix + "uuu";
+  const Slice ub(ub_key);
+
+  VerifySeekPrevSeek(ropts /* iter_ro */, cmp_readopts /* cmp_iter_ro */,
+                     &seek_key /* seek_key */, &ub /* iterate_upper_bound */,
+                     false /* prefix_same_as_start */);
+
+  VerifySeekPrevSeek(ropts /* iter_ro */, cmp_readopts /* cmp_iter_ro */,
+                     &seek_key /* seek_key */,
+                     nullptr /* iterate_upper_bound */,
+                     true /* prefix_same_as_start */);
+  Close();
+}
 
-  {
-    // Seek.
-    Slice ub = Slice("my_key_uuu");
-    Slice* ub_ptr = &ub;
-    ropts.iterate_upper_bound = ub_ptr;
-    ropts.auto_readahead_size = true;
+class PrefetchTrimReadaheadTestParam
+    : public DBTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<BlockBasedTableOptions::IndexShorteningMode, bool>> {
+ public:
+  const std::string kPrefix = "a_prefix_";
+  Random rnd = Random(309);
-    ReadOptions cmp_readopts = ropts;
-    cmp_readopts.auto_readahead_size = false;
+  PrefetchTrimReadaheadTestParam()
+      : DBTestBase("prefetch_trim_readahead_test_param", true) {}
+  virtual void SetGenericOptions(Env* env, Options& options) {
+    options = CurrentOptions();
+    options.env = env;
+    options.create_if_missing = true;
+    options.disable_auto_compactions = true;
+    options.statistics = CreateDBStatistics();
-    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
-    auto cmp_iter = std::unique_ptr<Iterator>(db_->NewIterator(cmp_readopts));
+    // To make all the data blocks fit in one file for testing purposes
+    options.write_buffer_size = 1024 * 1024 * 1024;
+    options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefix.size()));
+  }
-    Slice seek_key = Slice("my_key_bbb");
-    {
-      cmp_iter->Seek(seek_key);
-      ASSERT_TRUE(cmp_iter->Valid());
-      ASSERT_OK(cmp_iter->status());
+  void SetBlockBasedTableOptions(BlockBasedTableOptions& table_options) {
+    table_options.no_block_cache = false;
+    table_options.index_shortening = std::get<0>(GetParam());
-      iter->Seek(seek_key);
-      ASSERT_TRUE(iter->Valid());
-      ASSERT_OK(iter->status());
+    // To force keys with different prefixes into different data blocks of the
+    // file for testing purposes
+    table_options.block_size = 1;
+    table_options.flush_block_policy_factory.reset(
+        new FlushBlockBySizePolicyFactory());
+  }
+};
-      ASSERT_EQ(iter->key(), cmp_iter->key());
-    }
+INSTANTIATE_TEST_CASE_P(
+    PrefetchTrimReadaheadTestParam, PrefetchTrimReadaheadTestParam,
+    ::testing::Combine(
+        // Params are as follows:
+        // Param 0 - TableOptions::index_shortening
+        // Param 1 - ReadOptions::auto_readahead_size
+        ::testing::Values(
+            BlockBasedTableOptions::IndexShorteningMode::kNoShortening,
+            BlockBasedTableOptions::IndexShorteningMode::kShortenSeparators,
+            BlockBasedTableOptions::IndexShorteningMode::
+                kShortenSeparatorsAndSuccessor),
+        ::testing::Bool()));
+
+TEST_P(PrefetchTrimReadaheadTestParam, PrefixSameAsStart) {
+  if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+    return;
+  }
+  const bool auto_readahead_size = std::get<1>(GetParam());
-    // Prev op should pass with auto tuning of readahead_size.
-    {
-      cmp_iter->Prev();
-      ASSERT_TRUE(cmp_iter->Valid());
-      ASSERT_OK(cmp_iter->status());
+  std::shared_ptr<MockFS> fs = std::make_shared<MockFS>(
+      FileSystem::Default(), false /* support_prefetch */,
+      true /* small_buffer_alignment */);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+  Options options;
+  SetGenericOptions(env.get(), options);
+  BlockBasedTableOptions table_options;
+  SetBlockBasedTableOptions(table_options);
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-      iter->Prev();
-      ASSERT_OK(iter->status());
-      ASSERT_TRUE(iter->Valid());
+  Status s = TryReopen(options);
+  ASSERT_OK(s);
-      ASSERT_EQ(iter->key(), cmp_iter->key());
-    }
+  // To create a DB with data block layout (denoted as "[...]" below) as the
+  // following:
+  // ["a_prefix_0": random value]
+  // ["a_prefix_1": random value]
+  // ...
+  // ["a_prefix_9": random value]
+  // ["c_prefix_0": random value]
+  // ["d_prefix_1": random value]
+  // ...
+ // ["l_prefix_9": random value] + // + // We want to verify keys not with prefix "a_prefix_" are not prefetched due + // to trimming + WriteBatch prefix_batch; + for (int i = 0; i < 10; i++) { + std::string key = kPrefix + std::to_string(i); + ASSERT_OK(prefix_batch.Put(key, rnd.RandomString(100))); + } + ASSERT_OK(db_->Write(WriteOptions(), &prefix_batch)); + + WriteBatch diff_prefix_batch; + for (int i = 0; i < 10; i++) { + std::string diff_prefix = std::string(1, char('c' + i)) + kPrefix.substr(1); + std::string key = diff_prefix + std::to_string(i); + ASSERT_OK(diff_prefix_batch.Put(key, rnd.RandomString(100))); + } + ASSERT_OK(db_->Write(WriteOptions(), &diff_prefix_batch)); - // Reseek would follow as usual. - { - cmp_iter->Seek(seek_key); - ASSERT_TRUE(cmp_iter->Valid()); - ASSERT_OK(cmp_iter->status()); + ASSERT_OK(db_->Flush(FlushOptions())); - iter->Seek(seek_key); - ASSERT_OK(iter->status()); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key(), cmp_iter->key()); + // To verify readahead is trimmed based on prefix by checking the counter + // READAHEAD_TRIMMED + ReadOptions ro; + ro.prefix_same_as_start = true; + ro.auto_readahead_size = auto_readahead_size; + // Set a large readahead size to introduce readahead waste when without + // trimming based on prefix + ro.readahead_size = 1024 * 1024 * 1024; + + ASSERT_OK(options.statistics->Reset()); + { + auto iter = std::unique_ptr(db_->NewIterator(ro)); + for (iter->Seek(kPrefix); iter->status().ok() && iter->Valid(); + iter->Next()) { } } + + auto readahead_trimmed = + options.statistics->getTickerCount(READAHEAD_TRIMMED); + + if (auto_readahead_size) { + ASSERT_GT(readahead_trimmed, 0); + } else { + ASSERT_EQ(readahead_trimmed, 0); + } Close(); } diff --git a/include/rocksdb/attribute_groups.h b/include/rocksdb/attribute_groups.h index bbc621703..c7944eb50 100644 --- a/include/rocksdb/attribute_groups.h +++ b/include/rocksdb/attribute_groups.h @@ -35,6 +35,10 @@ inline bool operator==(const AttributeGroup& lhs, const AttributeGroup& rhs) { lhs.columns() == rhs.columns(); } +inline bool operator!=(const AttributeGroup& lhs, const AttributeGroup& rhs) { + return !(lhs == rhs); +} + // A collection of Attribute Groups. 
 using AttributeGroups = std::vector<AttributeGroup>;
 
@@ -84,6 +88,11 @@ class IteratorAttributeGroup {
   explicit IteratorAttributeGroup(ColumnFamilyHandle* column_family,
                                   const WideColumns* columns)
       : column_family_(column_family), columns_(columns) {}
+
+  explicit IteratorAttributeGroup(const AttributeGroup& attribute_group)
+      : IteratorAttributeGroup(attribute_group.column_family(),
+                               &attribute_group.columns()) {}
+
   ColumnFamilyHandle* column_family() const { return column_family_; }
 
   const WideColumns& columns() const { return *columns_; }
@@ -92,6 +101,17 @@ class IteratorAttributeGroup {
   const WideColumns* columns_;
 };
 
+inline bool operator==(const IteratorAttributeGroup& lhs,
+                       const IteratorAttributeGroup& rhs) {
+  return lhs.column_family() == rhs.column_family() &&
+         lhs.columns() == rhs.columns();
+}
+
+inline bool operator!=(const IteratorAttributeGroup& lhs,
+                       const IteratorAttributeGroup& rhs) {
+  return !(lhs == rhs);
+}
+
 using IteratorAttributeGroups = std::vector<IteratorAttributeGroup>;
 
 extern const IteratorAttributeGroups kNoIteratorAttributeGroups;
diff --git a/include/rocksdb/configurable.h b/include/rocksdb/configurable.h
index a200d7e86..9dab3409f 100644
--- a/include/rocksdb/configurable.h
+++ b/include/rocksdb/configurable.h
@@ -47,8 +47,10 @@ class Configurable {
   struct RegisteredOptions {
     // The name of the options being registered
     std::string name;
-    // Pointer to the object being registered
-    void* opt_ptr;
+    // Pointer to the object being registered, relative to `this` so that
+    // RegisteredOptions are copyable from one Configurable to another of the
+    // same type, assuming the option is a member of `this`.
+    intptr_t opt_offset;
     // The map of options being registered
     const std::unordered_map<std::string, OptionTypeInfo>* type_map;
   };
@@ -79,6 +81,8 @@ class Configurable {
   }
   template <typename T>
   T* GetOptions(const std::string& name) {
+    // FIXME: Is this sometimes reading a raw pointer from a shared_ptr,
+    // unsafely relying on the object layout?
     return reinterpret_cast<T*>(const_cast<void*>(GetOptionsPtr(name)));
   }
@@ -382,9 +386,9 @@ class Configurable {
   inline bool HasRegisteredOptions() const { return !options_.empty(); }
 
  private:
-  // Contains the collection of options (name, opt_ptr, opt_map) associated with
-  // this object. This collection is typically set in the constructor of the
-  // Configurable option via
+  // Contains the collection of options (name, opt_offset, opt_map) associated
+  // with this object. This collection is typically set in the constructor of
+  // the specific Configurable via RegisterOptions().
   std::vector<RegisteredOptions> options_;
 };
 }  // namespace ROCKSDB_NAMESPACE
diff --git a/include/rocksdb/iterator_base.h b/include/rocksdb/iterator_base.h
index 335c41df7..dc172ffed 100644
--- a/include/rocksdb/iterator_base.h
+++ b/include/rocksdb/iterator_base.h
@@ -74,6 +74,27 @@ class IteratorBase : public Cleanable {
     return Status::NotSupported("Refresh() is not supported");
   }
 
+  // When ReadOptions::allow_unprepared_value is set, the iterator may defer
+  // loading and/or preparing the value when moving to a different entry (i.e.
+  // during SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev operations). This
+  // can be used to save on I/O and/or CPU when the values associated with
+  // certain keys may not be used by the application. When
+  // allow_unprepared_value is true, the application is expected to call this
+  // method before accessing the value to ensure it is prepared (for all
+  // entries whose values are actually needed). Note: it is safe to call this
+  // method for entries whose values are already prepared.
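+  //
+  // For illustration, a sketch of the intended calling pattern (where
+  // KeyIsRelevant and UseValue stand in for hypothetical application logic):
+  //
+  //   ReadOptions ro;
+  //   ro.allow_unprepared_value = true;
+  //   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+  //   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+  //     if (!KeyIsRelevant(it->key())) {
+  //       continue;  // value is never prepared, potentially saving I/O
+  //     }
+  //     if (!it->PrepareValue()) {
+  //       break;  // iterator is no longer Valid(); inspect it->status()
+  //     }
+  //     UseValue(it->value());
+  //   }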
+  //
+  // Returns true on success. Returns false and sets Valid() to false and
+  // status() to non-OK if there is an error while loading or preparing the
+  // value.
+  //
+  // Note: this currently only applies to 1) large values stored in blob
+  // files using BlobDB and 2) multi-column-family iterators (CoalescingIterator
+  // and AttributeGroupIterator). Otherwise, it has no effect.
+  //
+  // REQUIRES: Valid()
+  virtual bool PrepareValue() { return true; }
+
   // Return the key for the current entry. The underlying storage for
   // the returned slice is valid only until the next modification of the
   // iterator (i.e. the next SeekToFirst/SeekToLast/Seek/SeekForPrev/Next/Prev
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 9700e25af..9160494ef 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -388,6 +388,8 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions {
   // block cache entries (shared among copies) are obsolete. Such a scenario
   // is the best case for uncache_aggressiveness = 0.
   //
+  // When using allow_mmap_reads=true, this option is ignored (no un-caching).
+  //
   // Once validated in production, the default will likely change to something
   // around 300.
   uint32_t uncache_aggressiveness = 0;
@@ -1910,10 +1912,25 @@ struct ReadOptions {
   std::function<bool(const TableProperties&)> table_filter;
 
   // If auto_readahead_size is set to true, it will auto tune the readahead_size
-  // during scans internally.
-  // For this feature to enabled, iterate_upper_bound must also be specified.
-  //
-  // NOTE: - Recommended for forward Scans only.
+  // during scans internally, based on block cache data when the block cache is
+  // enabled, on the iteration upper bound when `iterate_upper_bound !=
+  // nullptr`, and on the prefix when `prefix_same_as_start == true`.
+  //
+  // Besides an enabled block cache, this option requires
+  // `iterate_upper_bound != nullptr` or `prefix_same_as_start == true` to take
+  // effect.
+  //
+  // To be specific, it does the following:
+  // (1) When `iterate_upper_bound` is specified, trim the readahead so that
+  // it does not exceed the iteration upper bound.
+  // (2) When `prefix_same_as_start` is set to true, trim the readahead so
+  // that data blocks containing keys that are not in the same prefix as the
+  // seek key in `Seek()` are not prefetched.
+  // - Limitation: `Seek(key)` instead of `SeekToFirst()` needs to be called
+  // in order for this trimming to take effect.
+  //
+  // NOTE: - Used for forward scans only.
   //       - If there is a backward scan, this option will be
   //         disabled internally and won't be enabled again if the forward scan
   //         is issued again.
@@ -1921,6 +1938,19 @@ struct ReadOptions {
   // Default: true
   bool auto_readahead_size = true;
 
+  // When set, the iterator may defer loading and/or preparing the value when
+  // moving to a different entry (i.e. during SeekToFirst/SeekToLast/Seek/
+  // SeekForPrev/Next/Prev operations). This can be used to save on I/O and/or
+  // CPU when the values associated with certain keys may not be used by the
+  // application. See also IteratorBase::PrepareValue().
+  //
+  // Note: this option currently only applies to 1) large values stored in blob
+  // files using BlobDB and 2) multi-column-family iterators (CoalescingIterator
+  // and AttributeGroupIterator). Otherwise, it has no effect.
+  //
+  // Default: false
+  bool allow_unprepared_value = false;
+
   // *** END options only relevant to iterators or scans ***
 
   // *** BEGIN options for RocksDB internal use only ***
@@ -2037,6 +2067,7 @@ struct FlushOptions {
   // is performed by someone else (foreground call or background thread).
   // Default: false
   bool allow_write_stall;
+
   FlushOptions() : wait(true), allow_write_stall(false) {}
 };
 
@@ -2256,6 +2287,14 @@ struct IngestExternalFileOptions {
   // RepairDB() may not recover these files correctly, potentially leading to
   // data loss.
   bool allow_db_generated_files = false;
+
+  // Controls whether data and metadata blocks (e.g. index, filter) read during
+  // file ingestion will be added to the block cache.
+  // Users may wish to set this to false when bulk loading into a CF that is
+  // not available for reads yet.
+  // When ingesting into multiple column families, this option should be the
+  // same across all the ingestion options.
+  bool fill_cache = true;
 };
 
 enum TraceFilterType : uint64_t {
diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h
index 976a41400..c11139918 100644
--- a/include/rocksdb/sst_file_writer.h
+++ b/include/rocksdb/sst_file_writer.h
@@ -195,6 +195,9 @@ class SstFileWriter {
   // Return the current file size.
   uint64_t FileSize();
 
+  // Check whether a file with the given table properties was created by
+  // SstFileWriter.
+  static bool CreatedBySstFileWriter(const TableProperties&);
+
  private:
   void InvalidatePageCache(bool closing);
   struct Rep;
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index c7fe503ff..e1f76fcd4 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -125,7 +125,25 @@ struct CacheUsageOptions {
   std::map<CacheEntryRole, CacheEntryRoleOptions> options_overrides;
 };
 
-// For advanced user only
+// Configures how SST files using the block-based table format (standard)
+// are written and read.
+//
+// Except as specifically noted, all options here are "mutable" using
+// SetOptions(), with the caveat that only new table builders and new table
+// readers will pick up new options. This takes effect nearly immediately for
+// SST building, but in the worst case, options affecting reads only take
+// effect for new files. (Unless the DB is closed and re-opened, table readers
+// can live as long as the SST file itself.)
+//
+// Examples (DB* db):
+// db->SetOptions({{"block_based_table_factory",
+//                  "{detect_filter_construct_corruption=true;}"}});
+// db->SetOptions({{"block_based_table_factory",
+//                  "{max_auto_readahead_size=0;block_size=8192;}"}});
+// db->SetOptions({{"block_based_table_factory",
+//                  "{prepopulate_block_cache=kFlushOnly;}"}});
+// db->SetOptions({{"block_based_table_factory",
+//                  "{filter_policy=ribbonfilter:10;}"}});
 struct BlockBasedTableOptions {
   static const char* kName() { return "BlockTableOptions"; }
   // @flush_block_policy_factory creates the instances of flush block policy.
@@ -256,13 +274,20 @@ struct BlockBasedTableOptions {
   // even though they have different checksum type.
   ChecksumType checksum = kXXH3;
 
-  // Disable block cache. If this is set to true,
-  // then no block cache should be used, and the block_cache should
-  // point to a nullptr object.
+  // Disable block cache. If this is set to true, then no block cache
+  // will be configured (block_cache reset to nullptr).
+  //
+  // This option should not be used with SetOptions.
   bool no_block_cache = false;
 
-  // If non-NULL use the specified cache for blocks.
-  // If NULL, rocksdb will automatically create and use a 32MB internal cache.
+  // If non-nullptr and no_block_cache == false, use the specified cache for
+  // blocks. If nullptr and no_block_cache == false, a 32MB internal cache
+  // will be created and used.
+  //
+  // This option should not be used with SetOptions, because (a) the code
+  // to make it safe is incomplete, and (b) it is not clear when/if the
+  // old block cache would go away. For now, dynamic changes to block cache
+  // should be through the Cache object, e.g. Cache::SetCapacity().
   std::shared_ptr<Cache> block_cache = nullptr;
 
   // If non-NULL use the specified cache for pages read from device
@@ -468,10 +493,6 @@ struct BlockBasedTableOptions {
   // useful in detecting software bugs or CPU+memory malfunction.
   // Turning on this feature increases filter construction time by 30%.
   //
-  // This parameter can be changed dynamically by
-  // DB::SetOptions({{"block_based_table_factory",
-  // "{detect_filter_construct_corruption=true;}"}});
-  //
   // TODO: optimize this performance
   bool detect_filter_construct_corruption = false;
@@ -602,13 +623,6 @@ struct BlockBasedTableOptions {
   // Found that 256 KB readahead size provides the best performance, based on
   // experiments, for auto readahead. Experiment data is in PR #3282.
   //
-  // This parameter can be changed dynamically by
-  // DB::SetOptions({{"block_based_table_factory",
-  // "{max_auto_readahead_size=0;}"}}));
-  //
-  // Changing the value dynamically will only affect files opened after the
-  // change.
-  //
   // Default: 256 KB (256 * 1024).
   size_t max_auto_readahead_size = 256 * 1024;
@@ -620,10 +634,6 @@ struct BlockBasedTableOptions {
   // further helps if the workload exhibits high temporal locality, where most
   // of the reads go to recently written data. This also helps in case of
   // Distributed FileSystem.
-  //
-  // This parameter can be changed dynamically by
-  // DB::SetOptions({{"block_based_table_factory",
-  // "{prepopulate_block_cache=kFlushOnly;}"}}));
   enum class PrepopulateBlockCache : char {
     // Disable prepopulate block cache.
     kDisable,
@@ -653,13 +663,6 @@ struct BlockBasedTableOptions {
   // Value should be provided along with KB i.e. 8 * 1024 as it will prefetch
   // the blocks.
   //
-  // This parameter can be changed dynamically by
-  // DB::SetOptions({{"block_based_table_factory",
-  // "{initial_auto_readahead_size=0;}"}}));
-  //
-  // Changing the value dynamically will only affect files opened after the
-  // change.
-  //
   // Default: 8 KB (8 * 1024).
   size_t initial_auto_readahead_size = 8 * 1024;
@@ -934,6 +937,11 @@ class TableFactory : public Customizable {
       const TableBuilderOptions& table_builder_options,
       WritableFileWriter* file) const = 0;
 
+  // Clone this TableFactory with the same options, ideally a "shallow" clone
+  // in which shared_ptr members and hidden state are (safely) shared between
+  // this original and the returned clone.
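+  //
+  // For illustration, a sketch of how a clone can be used to apply a change
+  // without disturbing readers of the original factory (hypothetical code,
+  // not the actual SetOptions implementation):
+  //
+  //   std::unique_ptr<TableFactory> clone = base_factory->Clone();
+  //   Status s = clone->ConfigureOption(config_options, "block_size", "8192");
+  //   if (s.ok()) {
+  //     // atomically publish `clone` in place of the old factory
+  //   }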
+ virtual std::unique_ptr Clone() const = 0; + // Return is delete range supported virtual bool IsDeleteRangeSupported() const { return false; } }; diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index c3a12b694..55d5663be 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -74,6 +74,7 @@ class LDBCommand { static const std::string ARG_DECODE_BLOB_INDEX; static const std::string ARG_DUMP_UNCOMPRESSED_BLOBS; static const std::string ARG_READ_TIMESTAMP; + static const std::string ARG_GET_WRITE_UNIX_TIME; struct ParsedParams { std::string cmd; diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index aea24526c..9feb61fa9 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -58,6 +58,7 @@ enum class OptionType { kEncodedString, kTemperature, kArray, + kStringMap, // Map of kUnknown, }; @@ -241,42 +242,13 @@ using ValidateFunc = std::function map; + Status s; + for (size_t start = 0, end = 0; + s.ok() && start < value.size() && end != std::string::npos; + start = end + 1) { + std::string token; + s = OptionTypeInfo::NextToken(value, item_separator, start, &end, + &token); + if (s.ok() && !token.empty()) { + size_t pos = token.find(kv_separator); + assert(pos != std::string::npos); + std::string k = token.substr(0, pos); + std::string v = token.substr(pos + 1); + std::string decoded_key; + std::string decoded_value; + (Slice(k)).DecodeHex(&decoded_key); + (Slice(v)).DecodeHex(&decoded_value); + map.emplace(std::move(decoded_key), std::move(decoded_value)); + } + } + if (s.ok()) { + *(static_cast*>(addr)) = map; + } + return s; + }); + info.SetSerializeFunc( + [kv_separator, item_separator](const ConfigOptions&, const std::string&, + const void* addr, std::string* value) { + const auto map = + static_cast*>(addr); + value->append("{"); + for (const auto& entry : *map) { + value->append(Slice(entry.first).ToString(true)); + *value += kv_separator; + value->append(Slice(entry.second).ToString(true)); + *value += item_separator; + } + value->append("}"); + return Status::OK(); + }); + info.SetEqualsFunc([](const ConfigOptions&, const std::string&, + const void* addr1, const void* addr2, std::string*) { + return (*static_cast*>(addr1) == + *static_cast*>(addr2)); + }); + return info; + } + // Create a new std::shared_ptr OptionTypeInfo // This function will call the T::CreateFromString method to create a new // std::shared_ptr object. 
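  //
  // For illustration (hypothetical usage, not from this header): a type map
  // entry for a std::shared_ptr<TableFactory> member could be declared as
  //   {"table_factory",
  //    OptionTypeInfo::AsCustomSharedPtr<TableFactory>(
  //        offsetof(struct MyOptions, table_factory),
  //        OptionVerificationType::kByName, OptionTypeFlags::kNone)},
  // where MyOptions is a hypothetical options struct with that member.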
// // @param offset The offset for the Customizable from the base pointer // @param ovt How to verify this option - // @param flags, Extra flags specifying the behavior of this option - // @param _sfunc Optional function for serializing this option - // @param _efunc Optional function for comparing this option + // @param flags Extra flags specifying the behavior of this option + // @param serialize_func Optional function for serializing this option + // @param equals_func Optional function for comparing this option template - static OptionTypeInfo AsCustomSharedPtr(int offset, - OptionVerificationType ovt, - OptionTypeFlags flags) { - OptionTypeInfo info(offset, OptionType::kCustomizable, ovt, - flags | OptionTypeFlags::kShared); - return info.SetParseFunc([](const ConfigOptions& opts, - const std::string& name, - const std::string& value, void* addr) { + static OptionTypeInfo AsCustomSharedPtr( + int offset, OptionVerificationType ovt, + OptionTypeFlags flags = OptionTypeFlags::kNone, + const SerializeFunc& serialize_func = {}, + const EqualsFunc& equals_func = {}) { + auto parse_func = [](const ConfigOptions& opts, const std::string& name, + const std::string& value, void* addr) { auto* shared = static_cast*>(addr); if (name == kIdPropName() && value.empty()) { shared->reset(); @@ -495,19 +523,10 @@ class OptionTypeInfo { } else { return T::CreateFromString(opts, value, shared); } - }); - } - - template - static OptionTypeInfo AsCustomSharedPtr(int offset, - OptionVerificationType ovt, - OptionTypeFlags flags, - const SerializeFunc& serialize_func, - const EqualsFunc& equals_func) { - OptionTypeInfo info(AsCustomSharedPtr(offset, ovt, flags)); - info.SetSerializeFunc(serialize_func); - info.SetEqualsFunc(equals_func); - return info; + }; + return OptionTypeInfo(offset, OptionType::kCustomizable, ovt, + flags | OptionTypeFlags::kShared, parse_func, + serialize_func, equals_func); } // Create a new std::unique_ptr OptionTypeInfo @@ -612,6 +631,9 @@ class OptionTypeInfo { return *this; } + OptionTypeFlags GetFlags() const { return flags_; } + void SetFlags(OptionTypeFlags flags) { flags_ = flags; } + bool IsEnabled(OptionTypeFlags otf) const { return (flags_ & otf) == otf; } bool IsEditable(const ConfigOptions& opts) const { @@ -714,6 +736,8 @@ class OptionTypeInfo { bool IsCustomizable() const { return (type_ == OptionType::kCustomizable); } + OptionType GetType() const { return type_; } + inline const void* GetOffset(const void* base) const { return static_cast(base) + offset_; } diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 2a9b1aff3..d414e2fd5 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -12,7 +12,7 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. #define ROCKSDB_MAJOR 9 -#define ROCKSDB_MINOR 8 +#define ROCKSDB_MINOR 9 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 06ef0397a..9fdf618fa 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -141,8 +141,9 @@ class InlineSkipList { // Returns true iff an entry that compares equal to key is in the list. bool Contains(const char* key) const; - // Return estimated number of entries smaller than `key`. - uint64_t EstimateCount(const char* key) const; + // Return estimated number of entries from `start_ikey` to `end_ikey`. 
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) const;
 
   // Validate correctness of the skip-list.
   void TEST_Validate() const;
@@ -673,31 +674,88 @@ InlineSkipList<Comparator>::FindRandomEntry() const {
 }
 
 template <class Comparator>
-uint64_t InlineSkipList<Comparator>::EstimateCount(const char* key) const {
+uint64_t InlineSkipList<Comparator>::ApproximateNumEntries(
+    const Slice& start_ikey, const Slice& end_ikey) const {
+  // The number of entries at a given level for the given range, in terms of
+  // the actual number of entries in that range (level 0), follows a binomial
+  // distribution, which is very well approximated by the Poisson distribution.
+  // That has stddev sqrt(x) where x is the expected number of entries (mean)
+  // at this level, and the best predictor of x is the number of observed
+  // entries (at this level). To predict the number of entries on level 0 we
+  // use x * kBranching ^ level. From the standard deviation, the P99+ relative
+  // error is roughly 3 * sqrt(x) / x. Thus, a reasonable approach would be to
+  // find the smallest level with at least some moderate constant number of
+  // entries in range. E.g. with at least ~40 entries, we expect P99+ relative
+  // error (approximation accuracy) of ~ 50% = 3 * sqrt(40) / 40; P95 error of
+  // ~30%; P75 error of < 20%.
+  //
+  // However, there are two issues with this approach, and an observation:
+  // * Pointer chasing on the larger (bottom) levels is much slower because of
+  // cache hierarchy effects, so when the result is smaller, getting the result
+  // will be substantially slower, despite traversing a similar number of
+  // entries. (We could be clever about pipelining our pointer chasing but
+  // that's complicated.)
+  // * The larger (bottom) levels also have lower variance because there's a
+  // chance (or certainty) that we reach level 0 and return the exact answer.
+  // * For applications in query planning, we can also tolerate more variance
+  // on small results because the impact of misestimating is likely smaller.
+  //
+  // These factors point us to an approach in which we have a higher minimum
+  // threshold number of samples for higher levels and lower for lower levels
+  // (see sufficient_samples below). This seems to yield roughly consistent
+  // relative error (stddev around 20%, less for large results) and roughly
+  // consistent query time around the time of two memtable point queries.
+  //
+  // Engineering observation: it is tempting to think that taking into account
+  // what we already found in how many entries occur on higher levels, not just
+  // the first iterated level with a sufficient number of samples, would yield
+  // a more accurate estimate. But that doesn't work because of the particular
+  // correlations and independences of the data: each level higher is just an
+  // independently probabilistic filtering of the level below it. That
+  // filtering from level l to l+1 has no more information about levels
+  // 0 .. l-1 than we can get from level l. The structure of RandomHeight() is
+  // a clue to these correlations and independences.
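+  //
+  // Worked example (illustrative numbers, not from the original comment):
+  // suppose a level observes x = 40 entries in range and kBranching is 4.
+  // The Poisson stddev is sqrt(40) ~= 6.3, so the P99+ relative error is
+  // about 3 * 6.3 / 40 ~= 47%, matching the ~50% figure above. With the
+  // thresholds below (sufficient_samples = level * kBranching + 10), level 3
+  // can stop counting once it has seen at least 3 * 4 + 10 = 22 entries, and
+  // each remaining level then scales the count by kBranching, e.g.
+  // 22 * 4^3 = 1408 estimated entries at level 0.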
+
+  Node* lb = head_;
+  Node* ub = nullptr;
   uint64_t count = 0;
-
-  Node* x = head_;
-  int level = GetMaxHeight() - 1;
-  const DecodedKey key_decoded = compare_.decode_key(key);
-  while (true) {
-    assert(x == head_ || compare_(x->Key(), key_decoded) < 0);
-    Node* next = x->Next(level);
-    if (next != nullptr) {
-      PREFETCH(next->Next(level), 0, 1);
+  for (int level = GetMaxHeight() - 1; level >= 0; level--) {
+    auto sufficient_samples = static_cast<uint64_t>(level) * kBranching_ + 10U;
+    if (count >= sufficient_samples) {
+      // No more counting; apply powers of kBranching and avoid floating point
+      count *= kBranching_;
+      continue;
     }
-    if (next == nullptr || compare_(next->Key(), key_decoded) >= 0) {
-      if (level == 0) {
-        return count;
-      } else {
-        // Switch to next list
-        count *= kBranching_;
-        level--;
+    count = 0;
+    Node* next;
+    // Get a more precise lower bound (for start key)
+    for (;;) {
+      next = lb->Next(level);
+      if (next == ub) {
+        break;
+      }
+      assert(next != nullptr);
+      if (compare_(next->Key(), start_ikey) >= 0) {
+        break;
+      }
+      lb = next;
+    }
+    // Count entries on this level until upper bound (for end key)
+    for (;;) {
+      if (next == ub) {
+        break;
+      }
+      assert(next != nullptr);
+      if (compare_(next->Key(), end_ikey) >= 0) {
+        // Save refined upper bound to potentially save key comparison
+        ub = next;
+        break;
       }
-    } else {
-      x = next;
       count++;
+      next = next->Next(level);
     }
   }
+  return count;
 }
 
 template <class Comparator>
diff --git a/memtable/skiplist.h b/memtable/skiplist.h
index e3cecd30c..f2e2a829d 100644
--- a/memtable/skiplist.h
+++ b/memtable/skiplist.h
@@ -64,8 +64,9 @@ class SkipList {
   // Returns true iff an entry that compares equal to key is in the list.
   bool Contains(const Key& key) const;
 
-  // Return estimated number of entries smaller than `key`.
-  uint64_t EstimateCount(const Key& key) const;
+  // Return estimated number of entries from `start_ikey` to `end_ikey`.
+  uint64_t ApproximateNumEntries(const Slice& start_ikey,
+                                 const Slice& end_ikey) const;
 
   // Iteration over the contents of a skip list
   class Iterator {
@@ -383,27 +384,49 @@ typename SkipList<Key, Comparator>::Node* SkipList<Key, Comparator>::FindLast()
 }
 
 template <typename Key, class Comparator>
-uint64_t SkipList<Key, Comparator>::EstimateCount(const Key& key) const {
+uint64_t SkipList<Key, Comparator>::ApproximateNumEntries(
+    const Slice& start_ikey, const Slice& end_ikey) const {
+  // See InlineSkipList<Comparator>::ApproximateNumEntries() (copy-paste)
+  Node* lb = head_;
+  Node* ub = nullptr;
   uint64_t count = 0;
-
-  Node* x = head_;
-  int level = GetMaxHeight() - 1;
-  while (true) {
-    assert(x == head_ || compare_(x->key, key) < 0);
-    Node* next = x->Next(level);
-    if (next == nullptr || compare_(next->key, key) >= 0) {
-      if (level == 0) {
-        return count;
-      } else {
-        // Switch to next list
-        count *= kBranching_;
-        level--;
+  for (int level = GetMaxHeight() - 1; level >= 0; level--) {
+    auto sufficient_samples = static_cast<uint64_t>(level) * kBranching_ + 10U;
+    if (count >= sufficient_samples) {
+      // No more counting; apply powers of kBranching and avoid floating point
+      count *= kBranching_;
+      continue;
+    }
+    count = 0;
+    Node* next;
+    // Get a more precise lower bound (for start key)
+    for (;;) {
+      next = lb->Next(level);
+      if (next == ub) {
+        break;
+      }
+      assert(next != nullptr);
+      if (compare_(next->key, start_ikey) >= 0) {
+        break;
+      }
+      lb = next;
+    }
+    // Count entries on this level until upper bound (for end key)
+    for (;;) {
+      if (next == ub) {
+        break;
+      }
+      assert(next != nullptr);
+      if (compare_(next->key, end_ikey) >= 0) {
+        // Save refined upper bound to potentially save key comparison
+        ub = next;
+        break;
       }
-    } else {
-      x = next;
       count++;
+      next = next->Next(level);
     }
   }
+  return count;
 }
 
 template <typename Key, class Comparator>
diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc
index 3b2f3f4d8..73bb64d18 100644
--- a/memtable/skiplistrep.cc
+++ b/memtable/skiplistrep.cc
@@ -108,11 +108,7 @@ class SkipListRep : public MemTableRep {
 
   uint64_t ApproximateNumEntries(const Slice& start_ikey,
                                  const Slice& end_ikey) override {
-    std::string tmp;
-    uint64_t start_count =
-        skip_list_.EstimateCount(EncodeKey(&tmp, start_ikey));
-    uint64_t end_count = skip_list_.EstimateCount(EncodeKey(&tmp, end_ikey));
-    return (end_count >= start_count) ? (end_count - start_count) : 0;
+    return skip_list_.ApproximateNumEntries(start_ikey, end_ikey);
   }
 
   void UniqueRandomSample(const uint64_t num_entries,
diff --git a/options/cf_options.cc b/options/cf_options.cc
index 7f2cd0313..8c5751cee 100644
--- a/options/cf_options.cc
+++ b/options/cf_options.cc
@@ -132,6 +132,92 @@ static Status ParseCompressionOptions(const std::string& value,
   return Status::OK();
 }
 
+static Status TableFactoryParseFn(const ConfigOptions& opts,
+                                  const std::string& name,
+                                  const std::string& value, void* addr) {
+  assert(addr);
+  auto table_factory = static_cast<std::shared_ptr<TableFactory>*>(addr);
+
+  // The general approach to mutating a table factory is to clone it, then
+  // mutate and save the clone. This avoids race conditions between SetOptions
+  // and consumers of table_factory/table options by leveraging
+  // MutableCFOptions infrastructure to track the table_factory pointer.
+
+  // However, in the atypical case of setting an option that is safely mutable
+  // under something pointed to by the table factory, we should avoid cloning.
+  // The simple way to detect that case is to try with "mutable_options_only"
+  // and see if it works. If it does, we are finished. If not, we proceed to
+  // cloning etc.
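+  //
+  // For illustration (hypothetical SetOptions calls, not part of this
+  // change): something like
+  //   db->SetOptions({{"table_factory.filter_policy.bloom_before_level",
+  //                    "0"}});
+  // can take the in-place fast path above, while
+  //   db->SetOptions({{"block_based_table_factory", "{block_size=16384;}"}});
+  // requires the clone-configure-swap path below.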
+ // + // The canonical example of what is handled here is + // table_factory.filter_policy.bloom_before_level for RibbonFilterPolicy. + if (table_factory->get() != nullptr && !EndsWith(name, "table_factory")) { + ConfigOptions opts_mutable_only{opts}; + opts_mutable_only.mutable_options_only = true; + Status s = + table_factory->get()->ConfigureOption(opts_mutable_only, name, value); + if (s.ok()) { + return s; + } + s.PermitUncheckedError(); + } + + std::shared_ptr new_factory; + Status s; + if (name == "block_based_table_factory") { + if (table_factory->get() != nullptr) { + std::string factory_name = table_factory->get()->Name(); + if (factory_name == TableFactory::kBlockBasedTableName()) { + new_factory = table_factory->get()->Clone(); + } else { + s = Status::InvalidArgument("Cannot modify " + factory_name + " as " + + name); + return s; + } + } else { + new_factory.reset(NewBlockBasedTableFactory()); + } + // Passing an object string to configure/instantiate a table factory + s = new_factory->ConfigureFromString(opts, value); + } else if (name == "plain_table_factory") { + if (table_factory->get() != nullptr) { + std::string factory_name = table_factory->get()->Name(); + if (factory_name == TableFactory::kPlainTableName()) { + new_factory = table_factory->get()->Clone(); + } else { + s = Status::InvalidArgument("Cannot modify " + factory_name + " as " + + name); + return s; + } + } else { + new_factory.reset(NewPlainTableFactory()); + } + // Passing an object string to configure/instantiate a table factory + s = new_factory->ConfigureFromString(opts, value); + } else if (name == "table_factory" || name == OptionTypeInfo::kIdPropName()) { + // Related to OptionTypeInfo::AsCustomSharedPtr + if (value.empty()) { + new_factory = nullptr; + } else { + s = TableFactory::CreateFromString(opts, value, &new_factory); + } + } else if (table_factory->get() != nullptr) { + new_factory = table_factory->get()->Clone(); + // Presumably passing a value for a specific field of the table factory + s = new_factory->ConfigureOption(opts, name, value); + } else { + s = Status::NotFound("Unable to instantiate a table factory from option: ", + name); + return s; + } + + // Only keep the modified clone if everything went OK + if (s.ok()) { + *table_factory = std::move(new_factory); + } + return s; +} + const std::string kOptNameBMCompOpts = "bottommost_compression_opts"; const std::string kOptNameCompOpts = "compression_opts"; @@ -266,6 +352,25 @@ static std::unordered_map {offsetof(struct MutableCFOptions, disable_auto_compactions), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"table_factory", + {offsetof(struct MutableCFOptions, table_factory), + OptionType::kCustomizable, OptionVerificationType::kByName, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kStringNameOnly | OptionTypeFlags::kDontPrepare | + OptionTypeFlags::kMutable, + TableFactoryParseFn}}, + {"block_based_table_factory", + {offsetof(struct MutableCFOptions, table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kMutable, + TableFactoryParseFn}}, + {"plain_table_factory", + {offsetof(struct MutableCFOptions, table_factory), + OptionType::kCustomizable, OptionVerificationType::kAlias, + OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose | + OptionTypeFlags::kMutable, + TableFactoryParseFn}}, {"filter_deletes", {0, OptionType::kBoolean, 
OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, @@ -713,76 +818,6 @@ static std::unordered_map MemTableRepFactory::CreateFromString(opts, value, shared); return s; }}}, - {"table_factory", - OptionTypeInfo::AsCustomSharedPtr( - offsetof(struct ImmutableCFOptions, table_factory), - OptionVerificationType::kByName, - (OptionTypeFlags::kCompareLoose | - OptionTypeFlags::kStringNameOnly | - OptionTypeFlags::kDontPrepare))}, - {"block_based_table_factory", - {offsetof(struct ImmutableCFOptions, table_factory), - OptionType::kCustomizable, OptionVerificationType::kAlias, - OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, - // Parses the input value and creates a BlockBasedTableFactory - [](const ConfigOptions& opts, const std::string& name, - const std::string& value, void* addr) { - BlockBasedTableOptions* old_opts = nullptr; - auto table_factory = - static_cast*>(addr); - if (table_factory->get() != nullptr) { - old_opts = - table_factory->get()->GetOptions(); - } - if (name == "block_based_table_factory") { - std::unique_ptr new_factory; - if (old_opts != nullptr) { - new_factory.reset(NewBlockBasedTableFactory(*old_opts)); - } else { - new_factory.reset(NewBlockBasedTableFactory()); - } - Status s = new_factory->ConfigureFromString(opts, value); - if (s.ok()) { - table_factory->reset(new_factory.release()); - } - return s; - } else if (old_opts != nullptr) { - return table_factory->get()->ConfigureOption(opts, name, value); - } else { - return Status::NotFound("Mismatched table option: ", name); - } - }}}, - {"plain_table_factory", - {offsetof(struct ImmutableCFOptions, table_factory), - OptionType::kCustomizable, OptionVerificationType::kAlias, - OptionTypeFlags::kShared | OptionTypeFlags::kCompareLoose, - // Parses the input value and creates a PlainTableFactory - [](const ConfigOptions& opts, const std::string& name, - const std::string& value, void* addr) { - PlainTableOptions* old_opts = nullptr; - auto table_factory = - static_cast*>(addr); - if (table_factory->get() != nullptr) { - old_opts = table_factory->get()->GetOptions(); - } - if (name == "plain_table_factory") { - std::unique_ptr new_factory; - if (old_opts != nullptr) { - new_factory.reset(NewPlainTableFactory(*old_opts)); - } else { - new_factory.reset(NewPlainTableFactory()); - } - Status s = new_factory->ConfigureFromString(opts, value); - if (s.ok()) { - table_factory->reset(new_factory.release()); - } - return s; - } else if (old_opts != nullptr) { - return table_factory->get()->ConfigureOption(opts, name, value); - } else { - return Status::NotFound("Mismatched table option: ", name); - } - }}}, {"table_properties_collectors", OptionTypeInfo::Vector< std::shared_ptr>( @@ -954,7 +989,6 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) inplace_update_support(cf_options.inplace_update_support), inplace_callback(cf_options.inplace_callback), memtable_factory(cf_options.memtable_factory), - table_factory(cf_options.table_factory), table_properties_collector_factories( cf_options.table_properties_collector_factories), bloom_locality(cf_options.bloom_locality), diff --git a/options/cf_options.h b/options/cf_options.h index 3a0c3b09a..5cc46712c 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -53,8 +53,6 @@ struct ImmutableCFOptions { std::shared_ptr memtable_factory; - std::shared_ptr table_factory; - Options::TablePropertiesCollectorFactories table_properties_collector_factories; @@ -124,6 +122,7 @@ struct MutableCFOptions { 
experimental_mempurge_threshold( options.experimental_mempurge_threshold), disable_auto_compactions(options.disable_auto_compactions), + table_factory(options.table_factory), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( @@ -258,6 +257,9 @@ struct MutableCFOptions { size_t max_successive_merges; bool strict_max_successive_merges; size_t inplace_update_num_locks; + // NOTE: if too many shared_ptr make their way into MutableCFOptions, the + // copy performance might suffer enough to warrant aggregating them in an + // immutable+copy-on-write sub-object managed through a single shared_ptr. std::shared_ptr prefix_extractor; // [experimental] // Used to activate or deactive the Mempurge feature (memtable garbage @@ -278,6 +280,7 @@ struct MutableCFOptions { // Compaction related options bool disable_auto_compactions; + std::shared_ptr table_factory; uint64_t soft_pending_compaction_bytes_limit; uint64_t hard_pending_compaction_bytes_limit; int level0_file_num_compaction_trigger; diff --git a/options/configurable.cc b/options/configurable.cc index 134de99a2..d396349b1 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -17,13 +17,25 @@ namespace ROCKSDB_NAMESPACE { +namespace { +intptr_t GetOffset(const Configurable* holder, void* field) { + return reinterpret_cast(field) - + reinterpret_cast(static_cast(holder)); +} + +void* ApplyOffset(const Configurable* holder, intptr_t offset) { + return reinterpret_cast( + reinterpret_cast(static_cast(holder)) + offset); +} +} // namespace + void Configurable::RegisterOptions( const std::string& name, void* opt_ptr, const std::unordered_map* type_map) { RegisteredOptions opts; opts.name = name; opts.type_map = type_map; - opts.opt_ptr = opt_ptr; + opts.opt_offset = GetOffset(this, opt_ptr); options_.emplace_back(opts); } @@ -42,7 +54,8 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { for (const auto& map_iter : *(opt_iter.type_map)) { auto& opt_info = map_iter.second; if (opt_info.ShouldPrepare()) { - status = opt_info.Prepare(opts, map_iter.first, opt_iter.opt_ptr); + status = opt_info.Prepare(opts, map_iter.first, + ApplyOffset(this, opt_iter.opt_offset)); if (!status.ok()) { return status; } @@ -62,7 +75,7 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, auto& opt_info = map_iter.second; if (opt_info.ShouldValidate()) { status = opt_info.Validate(db_opts, cf_opts, map_iter.first, - opt_iter.opt_ptr); + ApplyOffset(this, opt_iter.opt_offset)); if (!status.ok()) { return status; } @@ -82,7 +95,7 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, const void* Configurable::GetOptionsPtr(const std::string& name) const { for (const auto& o : options_) { if (o.name == name) { - return o.opt_ptr; + return ApplyOffset(this, o.opt_offset); } } return nullptr; @@ -93,14 +106,14 @@ std::string Configurable::GetOptionName(const std::string& opt_name) const { } const OptionTypeInfo* ConfigurableHelper::FindOption( - const std::vector& options, - const std::string& short_name, std::string* opt_name, void** opt_ptr) { - for (const auto& iter : options) { + const Configurable& configurable, const std::string& short_name, + std::string* opt_name, void** opt_ptr) { + for (const auto& iter : configurable.options_) { if (iter.type_map != nullptr) { const auto opt_info = OptionTypeInfo::Find(short_name, *(iter.type_map), opt_name); if (opt_info != nullptr) { - *opt_ptr = iter.opt_ptr; + *opt_ptr = ApplyOffset(&configurable, 
iter.opt_offset); return opt_info; } } @@ -244,7 +257,8 @@ Status ConfigurableHelper::ConfigureOptions( for (const auto& iter : configurable.options_) { if (iter.type_map != nullptr) { s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map), - &remaining, iter.opt_ptr); + &remaining, + ApplyOffset(&configurable, iter.opt_offset)); if (remaining.empty()) { // Are there more options left? break; } else if (!s.ok()) { @@ -354,7 +368,7 @@ Status ConfigurableHelper::ConfigureSingleOption( std::string elem_name; void* opt_ptr = nullptr; const auto opt_info = - FindOption(configurable.options_, opt_name, &elem_name, &opt_ptr); + FindOption(configurable, opt_name, &elem_name, &opt_ptr); if (opt_info == nullptr) { return Status::NotFound("Could not find option: ", name); } else { @@ -507,7 +521,7 @@ Status ConfigurableHelper::GetOption(const ConfigOptions& config_options, std::string opt_name; void* opt_ptr = nullptr; const auto opt_info = - FindOption(configurable.options_, short_name, &opt_name, &opt_ptr); + FindOption(configurable, short_name, &opt_name, &opt_ptr); if (opt_info != nullptr) { ConfigOptions embedded = config_options; embedded.delimiter = ";"; @@ -538,22 +552,22 @@ Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, if (opt_info.ShouldSerialize()) { std::string value; Status s; + void* opt_ptr = ApplyOffset(&configurable, opt_iter.opt_offset); if (!config_options.mutable_options_only) { - s = opt_info.Serialize(config_options, prefix + opt_name, - opt_iter.opt_ptr, &value); + s = opt_info.Serialize(config_options, prefix + opt_name, opt_ptr, + &value); } else if (opt_info.IsMutable()) { ConfigOptions copy = config_options; copy.mutable_options_only = false; - s = opt_info.Serialize(copy, prefix + opt_name, opt_iter.opt_ptr, - &value); + s = opt_info.Serialize(copy, prefix + opt_name, opt_ptr, &value); } else if (opt_info.IsConfigurable()) { // If it is a Configurable and we are either printing all of the // details or not printing only the name, this option should be // included in the list if (config_options.IsDetailed() || !opt_info.IsEnabled(OptionTypeFlags::kStringNameOnly)) { - s = opt_info.Serialize(config_options, prefix + opt_name, - opt_iter.opt_ptr, &value); + s = opt_info.Serialize(config_options, prefix + opt_name, opt_ptr, + &value); } } if (!s.ok()) { diff --git a/options/configurable_helper.h b/options/configurable_helper.h index 5d409f82a..627bbe188 100644 --- a/options/configurable_helper.h +++ b/options/configurable_helper.h @@ -160,9 +160,9 @@ class ConfigurableHelper { std::string* mismatch); private: - // Looks for the option specified by name in the RegisteredOptions. - // This method traverses the types in the input options vector. If an entry - // matching name is found, that entry, opt_name, and pointer are returned. + // Looks for the option specified by name in the RegisteredOptions of a + // configurable. If an entry matching name is found, that entry, opt_name, + // and pointer are returned. 
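// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch: why RegisteredOptions now stores
// an offset from `this` rather than a raw pointer (see GetOffset/ApplyOffset
// above). With an offset, a copied or cloned Configurable resolves its
// registered options against the copy; a stored raw pointer would keep
// referring to the original object. `Holder` is a hypothetical stand-in.
#include <cstdint>

struct Holder {
  int field = 0;
};

inline std::intptr_t OffsetOf(const Holder* h, const void* member) {
  return reinterpret_cast<const char*>(member) -
         reinterpret_cast<const char*>(h);
}

inline void* Resolve(Holder* h, std::intptr_t offset) {
  return reinterpret_cast<char*>(h) + offset;
}

// Usage: given `Holder a; auto off = OffsetOf(&a, &a.field);`, a copy
// `Holder b = a;` reuses the same offset: Resolve(&b, off) points into b,
// whereas a stored raw pointer would still point into a.
// ---------------------------------------------------------------------------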
// @param options The vector of options to search through // @param name The name of the option to search for in the OptionType map // @param opt_name If the name was found, this value is set to the option name @@ -172,9 +172,10 @@ class ConfigurableHelper { // in the RegisteredOptions vector associated with this entry // @return A pointer to the OptionTypeInfo from the options if found, // nullptr if the name was not found in the input options - static const OptionTypeInfo* FindOption( - const std::vector& options, - const std::string& name, std::string* opt_name, void** opt_ptr); + static const OptionTypeInfo* FindOption(const Configurable& configurable, + const std::string& name, + std::string* opt_name, + void** opt_ptr); static Status ConfigureCustomizableOption( const ConfigOptions& config_options, Configurable& configurable, diff --git a/options/options_helper.cc b/options/options_helper.cc index 232b3f3bd..007aaeaa1 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -239,6 +239,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, // Compaction related options cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; + cf_opts->table_factory = moptions.table_factory; cf_opts->soft_pending_compaction_bytes_limit = moptions.soft_pending_compaction_bytes_limit; cf_opts->hard_pending_compaction_bytes_limit = @@ -315,7 +316,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->inplace_update_support = ioptions.inplace_update_support; cf_opts->inplace_callback = ioptions.inplace_callback; cf_opts->memtable_factory = ioptions.memtable_factory; - cf_opts->table_factory = ioptions.table_factory; cf_opts->table_properties_collector_factories = ioptions.table_properties_collector_factories; cf_opts->bloom_locality = ioptions.bloom_locality; diff --git a/options/options_test.cc b/options/options_test.cc index fa2970465..bcb04d741 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -61,6 +61,9 @@ class UnregisteredTableFactory : public TableFactory { WritableFileWriter*) const override { return nullptr; } + std::unique_ptr Clone() const override { + return std::make_unique(); + } }; TEST_F(OptionsTest, GetOptionsFromMapTest) { @@ -1662,29 +1665,38 @@ TEST_F(OptionsTest, MutableTableOptions) { bbtf.reset(NewBlockBasedTableFactory()); auto bbto = bbtf->GetOptions(); ASSERT_NE(bbto, nullptr); - ASSERT_OK(bbtf->ConfigureOption(config_options, "block_align", "true")); + ASSERT_OK(bbtf->ConfigureOption(config_options, "no_block_cache", "true")); ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); - ASSERT_EQ(bbto->block_align, true); + ASSERT_EQ(bbto->no_block_cache, true); ASSERT_EQ(bbto->block_size, 1024); ASSERT_OK(bbtf->PrepareOptions(config_options)); config_options.mutable_options_only = true; - ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "1024")); - ASSERT_EQ(bbto->block_align, true); - ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_align", "false")); - ASSERT_OK(bbtf->ConfigureOption(config_options, "block_size", "2048")); - ASSERT_EQ(bbto->block_align, true); - ASSERT_EQ(bbto->block_size, 2048); + // Options on BlockBasedTableOptions/Factory are no longer directly mutable + // but have to be mutated on a live DB with SetOptions replacing the + // table_factory with a copy using the new options. 
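// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch: what "mutated on a live DB with
// SetOptions" looks like from application code. A sketch only; error handling
// is elided, and the option spelling mirrors the tests below.
#include <cassert>
#include <unordered_map>
#include "rocksdb/db.h"

void ExampleMutateBlockSize(rocksdb::DB* db,
                            rocksdb::ColumnFamilyHandle* cf) {
  // Internally this now clones the installed table factory, applies the new
  // block_size to the clone, and swaps the clone into the CF's options.
  rocksdb::Status s =
      db->SetOptions(cf, {{"block_based_table_factory.block_size", "16384"}});
  assert(s.ok());
}
// ---------------------------------------------------------------------------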
+ ASSERT_NOK(bbtf->ConfigureOption(config_options, "no_block_cache", "false")); + ASSERT_NOK(bbtf->ConfigureOption(config_options, "block_size", "2048")); + ASSERT_EQ(bbto->no_block_cache, true); + ASSERT_EQ(bbto->block_size, 1024); ColumnFamilyOptions cf_opts; cf_opts.table_factory = bbtf; + // FIXME: find a way to make this fail again + /* ASSERT_NOK(GetColumnFamilyOptionsFromString( - config_options, cf_opts, "block_based_table_factory.block_align=false", + config_options, cf_opts, "block_based_table_factory.no_block_cache=false", &cf_opts)); + */ ASSERT_OK(GetColumnFamilyOptionsFromString( config_options, cf_opts, "block_based_table_factory.block_size=8192", &cf_opts)); - ASSERT_EQ(bbto->block_align, true); - ASSERT_EQ(bbto->block_size, 8192); + const auto new_bbto = + cf_opts.table_factory->GetOptions(); + ASSERT_NE(new_bbto, nullptr); + ASSERT_NE(new_bbto, bbto); + ASSERT_EQ(new_bbto->no_block_cache, true); + ASSERT_EQ(new_bbto->block_size, 8192); + ASSERT_EQ(bbto->block_size, 1024); } TEST_F(OptionsTest, MutableCFOptions) { @@ -1698,7 +1710,7 @@ TEST_F(OptionsTest, MutableCFOptions) { &cf_opts)); ASSERT_TRUE(cf_opts.paranoid_file_checks); ASSERT_NE(cf_opts.table_factory.get(), nullptr); - const auto bbto = cf_opts.table_factory->GetOptions(); + auto* bbto = cf_opts.table_factory->GetOptions(); ASSERT_NE(bbto, nullptr); ASSERT_EQ(bbto->block_size, 8192); ASSERT_EQ(bbto->block_align, false); @@ -1707,10 +1719,11 @@ TEST_F(OptionsTest, MutableCFOptions) { config_options, cf_opts, {{"paranoid_file_checks", "false"}}, &cf_opts)); ASSERT_EQ(cf_opts.paranoid_file_checks, false); + // Should replace the factory with the new setting ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"block_based_table_factory.block_size", "16384"}}, &cf_opts)); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + bbto = cf_opts.table_factory->GetOptions(); ASSERT_EQ(bbto->block_size, 16384); config_options.mutable_options_only = true; @@ -1719,45 +1732,103 @@ TEST_F(OptionsTest, MutableCFOptions) { config_options, cf_opts, {{"force_consistency_checks", "true"}}, &cf_opts)); - // Attempt to change the table. It is not mutable, so this should fail and - // leave the original intact - ASSERT_NOK(GetColumnFamilyOptionsFromMap( + // Attempt to change the table factory kind. This was previously disallowed + // and is a dubious operation but is tricky to disallow without breaking + // other things (FIXME?) 
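// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch: the factory-kind change that the
// comment above calls dubious, expressed through the options-from-map API and
// mirroring the assertions below. The helper function itself is hypothetical.
#include <unordered_map>
#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

rocksdb::Status SwitchToPlainTable(const rocksdb::ConfigOptions& cfg,
                                   rocksdb::ColumnFamilyOptions* cf_opts) {
  // Replaces the installed factory wholesale; any settings carried by the
  // previous (e.g. block-based) factory are discarded along with it.
  return rocksdb::GetColumnFamilyOptionsFromMap(
      cfg, *cf_opts, {{"table_factory", "PlainTable"}}, cf_opts);
}
// ---------------------------------------------------------------------------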
+ ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"table_factory", "PlainTable"}}, &cf_opts)); - ASSERT_NOK(GetColumnFamilyOptionsFromMap( + ASSERT_STREQ(cf_opts.table_factory->Name(), TableFactory::kPlainTableName()); + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory", "BlockBasedTable"}}, + &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), + TableFactory::kBlockBasedTableName()); + ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"table_factory.id", "PlainTable"}}, &cf_opts)); - ASSERT_NE(cf_opts.table_factory.get(), nullptr); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + ASSERT_STREQ(cf_opts.table_factory->Name(), TableFactory::kPlainTableName()); + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, {{"table_factory.id", "BlockBasedTable"}}, + &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), + TableFactory::kBlockBasedTableName()); + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"table_factory", "{id=PlainTable;bloom_bits_per_key=42}"}}, &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), TableFactory::kPlainTableName()); + + // Should at least be allowed to instantiate in place of nullptr, for + // initialization purposes. + cf_opts.table_factory = nullptr; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"table_factory", "{id=BlockBasedTable;block_size=12345}"}}, &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), + TableFactory::kBlockBasedTableName()); + bbto = cf_opts.table_factory->GetOptions(); + ASSERT_EQ(bbto->block_size, 12345); + + // Accessing through the wrong factory alias fails gracefully + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"plain_table_factory", "{bloom_bits_per_key=42}"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"plain_table_factory.bloom_bits_per_key", "42"}}, &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), + TableFactory::kBlockBasedTableName()); - // Change the block size. Should update the value in the current table + // Change the block size. ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + bbto = cf_opts.table_factory->GetOptions(); ASSERT_EQ(bbto->block_size, 8192); // Attempt to turn off block cache fails, as this option is not mutable + // FIXME: find a way to make this fail again + /* ASSERT_NOK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"block_based_table_factory.no_block_cache", "true"}}, &cf_opts)); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + */ - // Attempt to change the block size via a config string/map. Should update - // the current value + // Attempt to change the block size via a config string/map. ASSERT_OK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + bbto = cf_opts.table_factory->GetOptions(); ASSERT_EQ(bbto->block_size, 32768); // Attempt to change the block size and no cache through the map. 
Should // fail, leaving the old values intact + // FIXME: find a way to make this fail again + /* ASSERT_NOK(GetColumnFamilyOptionsFromMap( config_options, cf_opts, {{"block_based_table_factory", "{block_size=16384; no_block_cache=true}"}}, &cf_opts)); - ASSERT_EQ(bbto, cf_opts.table_factory->GetOptions()); + */ ASSERT_EQ(bbto->block_size, 32768); + + // Switch to plain table for some tests + cf_opts.table_factory = nullptr; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"table_factory", "{id=PlainTable;bloom_bits_per_key=42}"}}, &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), TableFactory::kPlainTableName()); + auto* pto = cf_opts.table_factory->GetOptions(); + ASSERT_EQ(pto->bloom_bits_per_key, 42); + + // Accessing through the wrong factory alias fails gracefully + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory.block_size", "8192"}}, &cf_opts)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + config_options, cf_opts, + {{"block_based_table_factory", "{block_size=32768}"}}, &cf_opts)); + ASSERT_STREQ(cf_opts.table_factory->Name(), TableFactory::kPlainTableName()); + ASSERT_EQ(pto, cf_opts.table_factory->GetOptions()); } diff --git a/table/adaptive/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h index 55c8bca1f..fe6d4ece4 100644 --- a/table/adaptive/adaptive_table_factory.h +++ b/table/adaptive/adaptive_table_factory.h @@ -46,6 +46,10 @@ class AdaptiveTableFactory : public TableFactory { std::string GetPrintableOptions() const override; + std::unique_ptr Clone() const override { + return std::make_unique(*this); + } + private: std::shared_ptr table_factory_to_write_; std::shared_ptr block_based_table_factory_; diff --git a/table/block_based/block.h b/table/block_based/block.h index 439598ba5..2cd2918a8 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -575,13 +575,7 @@ class BlockIter : public InternalIteratorBase { void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { if (pad_min_timestamp_) { - std::string buf; - if (raw_key_.IsUserKey()) { - AppendKeyWithMinTimestamp(&buf, key, ts_sz_); - } else { - PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_); - } - raw_key_.SetKey(buf, true /* copy */); + raw_key_.SetKeyWithPaddedMinTimestamp(key, ts_sz_); } else { raw_key_.SetKey(key, false /* copy */); } diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index ec24721b7..ec4a695b6 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -624,6 +624,7 @@ struct BlockBasedTableBuilder::Rep { props.db_id = tbo.db_id; props.db_session_id = tbo.db_session_id; props.db_host_id = ioptions.db_host_id; + props.format_version = table_options.format_version; if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); } diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 5382db097..d461dc526 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -224,10 +224,20 @@ static std::unordered_map - block_based_table_type_info = { - /* currently not supported - std::shared_ptr block_cache = nullptr; +static struct BlockBasedTableTypeInfo { + std::unordered_map info; + + BlockBasedTableTypeInfo() { + info = { + // NOTE: Below the list, most of these options are 
marked as mutable.
+        // In theory, there should be no danger in mutability, as table
+        // builders and readers work from copies of BlockBasedTableOptions.
+        // However, there is currently an unresolved read-write race
+        // affecting SetOptions on BBTO fields. This should be generally
+        // acceptable for non-pointer options of 64 bits or less, but a fix
+        // is needed to make mutability fully general here. See
+        // https://github.com/facebook/rocksdb/issues/10079
+        /* currently not supported:
+          CacheUsageOptions cache_usage_options;
+         */
         {"flush_block_policy_factory",
@@ -238,24 +248,20 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"cache_index_and_filter_blocks",
          {offsetof(struct BlockBasedTableOptions,
                    cache_index_and_filter_blocks),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
         {"cache_index_and_filter_blocks_with_high_priority",
          {offsetof(struct BlockBasedTableOptions,
                    cache_index_and_filter_blocks_with_high_priority),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
         {"pin_l0_filter_and_index_blocks_in_cache",
          {offsetof(struct BlockBasedTableOptions,
                    pin_l0_filter_and_index_blocks_in_cache),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
         {"index_type",
          OptionTypeInfo::Enum<BlockBasedTableOptions::IndexType>(
              offsetof(struct BlockBasedTableOptions, index_type),
              &block_base_table_index_type_string_map)},
         {"hash_index_allow_collision",
-         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated,
-          OptionTypeFlags::kNone}},
+         {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}},
         {"data_block_index_type",
          OptionTypeInfo::Enum<BlockBasedTableOptions::DataBlockIndexType>(
              offsetof(struct BlockBasedTableOptions, data_block_index_type),
@@ -267,86 +273,65 @@ static std::unordered_map<std::string, OptionTypeInfo>
         {"data_block_hash_table_util_ratio",
          {offsetof(struct BlockBasedTableOptions,
                    data_block_hash_table_util_ratio),
-          OptionType::kDouble, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kDouble, OptionVerificationType::kNormal}},
         {"checksum",
          {offsetof(struct BlockBasedTableOptions, checksum),
-          OptionType::kChecksumType, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kChecksumType, OptionVerificationType::kNormal}},
         {"no_block_cache",
          {offsetof(struct BlockBasedTableOptions, no_block_cache),
-          OptionType::kBoolean, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kBoolean, OptionVerificationType::kNormal}},
         {"block_size",
          {offsetof(struct BlockBasedTableOptions, block_size),
-          OptionType::kSizeT, OptionVerificationType::kNormal,
-          OptionTypeFlags::kMutable}},
+          OptionType::kSizeT, OptionVerificationType::kNormal}},
         {"block_size_deviation",
          {offsetof(struct BlockBasedTableOptions, block_size_deviation),
-          OptionType::kInt, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kInt, OptionVerificationType::kNormal}},
         {"block_restart_interval",
          {offsetof(struct BlockBasedTableOptions, block_restart_interval),
-          OptionType::kInt, OptionVerificationType::kNormal,
-          OptionTypeFlags::kMutable}},
+          OptionType::kInt, OptionVerificationType::kNormal}},
         {"index_block_restart_interval",
          {offsetof(struct BlockBasedTableOptions,
                    index_block_restart_interval),
-          OptionType::kInt, OptionVerificationType::kNormal,
-          OptionTypeFlags::kNone}},
+          OptionType::kInt, OptionVerificationType::kNormal}},
         {"index_per_partition",
-         {0,
OptionType::kUInt64T, OptionVerificationType::kDeprecated, - OptionTypeFlags::kNone}}, + {0, OptionType::kUInt64T, OptionVerificationType::kDeprecated}}, {"metadata_block_size", {offsetof(struct BlockBasedTableOptions, metadata_block_size), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kUInt64T, OptionVerificationType::kNormal}}, {"partition_filters", {offsetof(struct BlockBasedTableOptions, partition_filters), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"decouple_partitioned_filters", {offsetof(struct BlockBasedTableOptions, decouple_partitioned_filters), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"optimize_filters_for_memory", {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"use_delta_encoding", {offsetof(struct BlockBasedTableOptions, use_delta_encoding), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"filter_policy", OptionTypeInfo::AsCustomSharedPtr( offsetof(struct BlockBasedTableOptions, filter_policy), - OptionVerificationType::kByNameAllowFromNull, - OptionTypeFlags::kNone)}, + OptionVerificationType::kByNameAllowFromNull)}, {"whole_key_filtering", {offsetof(struct BlockBasedTableOptions, whole_key_filtering), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"detect_filter_construct_corruption", {offsetof(struct BlockBasedTableOptions, detect_filter_construct_corruption), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"reserve_table_builder_memory", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, - OptionTypeFlags::kNone}}, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}}, {"reserve_table_reader_memory", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, - OptionTypeFlags::kNone}}, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}}, {"skip_table_builder_flush", - {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, - OptionTypeFlags::kNone}}, + {0, OptionType::kBoolean, OptionVerificationType::kDeprecated}}, {"format_version", {offsetof(struct BlockBasedTableOptions, format_version), - OptionType::kUInt32T, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kUInt32T, OptionVerificationType::kNormal}}, {"verify_compression", {offsetof(struct BlockBasedTableOptions, verify_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"read_amp_bytes_per_bit", {offsetof(struct BlockBasedTableOptions, read_amp_bytes_per_bit), OptionType::kUInt32T, OptionVerificationType::kNormal, @@ -369,17 +354,14 @@ static std::unordered_map }}}, {"enable_index_compression", {offsetof(struct BlockBasedTableOptions, enable_index_compression), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"block_align", 
{offsetof(struct BlockBasedTableOptions, block_align), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {"pin_top_level_index_and_filter", {offsetof(struct BlockBasedTableOptions, pin_top_level_index_and_filter), - OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kNone}}, + OptionType::kBoolean, OptionVerificationType::kNormal}}, {kOptNameMetadataCacheOpts, OptionTypeInfo::Struct( kOptNameMetadataCacheOpts, &metadata_cache_options_type_info, @@ -396,36 +378,33 @@ static std::unordered_map return Cache::CreateFromString(opts, value, cache); }}}, {"block_cache_compressed", - {0, OptionType::kUnknown, OptionVerificationType::kDeprecated, - OptionTypeFlags::kNone}}, + {0, OptionType::kUnknown, OptionVerificationType::kDeprecated}}, {"max_auto_readahead_size", {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}}, + OptionType::kSizeT, OptionVerificationType::kNormal}}, {"prepopulate_block_cache", OptionTypeInfo::Enum( offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), - &block_base_table_prepopulate_block_cache_string_map, - OptionTypeFlags::kMutable)}, + &block_base_table_prepopulate_block_cache_string_map)}, {"initial_auto_readahead_size", {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size), - OptionType::kSizeT, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}}, + OptionType::kSizeT, OptionVerificationType::kNormal}}, {"num_file_reads_for_auto_readahead", {offsetof(struct BlockBasedTableOptions, num_file_reads_for_auto_readahead), - OptionType::kUInt64T, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}}, - -}; + OptionType::kUInt64T, OptionVerificationType::kNormal}}, + }; + } +} block_based_table_type_info; // TODO(myabandeh): We should return an error instead of silently changing the // options BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) - : table_options_(_table_options) { + : table_options_(_table_options), + shared_state_(std::make_shared()) { InitializeOptions(); - RegisterOptions(&table_options_, &block_based_table_type_info); + RegisterOptions(&table_options_, &block_based_table_type_info.info); const auto table_reader_charged = table_options_.cache_usage_options.options_overrides @@ -433,10 +412,11 @@ BlockBasedTableFactory::BlockBasedTableFactory( .charged; if (table_options_.block_cache && table_reader_charged == CacheEntryRoleOptions::Decision::kEnabled) { - table_reader_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( - std::make_shared>( - table_options_.block_cache))); + shared_state_->table_reader_cache_res_mgr = + std::make_shared( + std::make_shared>( + table_options_.block_cache)); } } @@ -574,11 +554,13 @@ Status BlockBasedTableFactory::NewTableReader( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader_options.block_protection_bytes_per_key, - table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_, + table_reader, table_reader_options.tail_size, + shared_state_->table_reader_cache_res_mgr, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, - 
table_reader_options.force_direct_prefetch, &tail_prefetch_stats_, + table_reader_options.force_direct_prefetch, + &shared_state_->tail_prefetch_stats, table_reader_options.block_cache_tracer, table_reader_options.max_file_size_for_l0_meta_pin, table_reader_options.cur_db_session_id, table_reader_options.cur_file_num, diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index 1f7876977..b05b45660 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -79,7 +79,13 @@ class BlockBasedTableFactory : public TableFactory { bool IsDeleteRangeSupported() const override { return true; } - TailPrefetchStats* tail_prefetch_stats() { return &tail_prefetch_stats_; } + std::unique_ptr Clone() const override { + return std::make_unique(*this); + } + + TailPrefetchStats* tail_prefetch_stats() { + return &shared_state_->tail_prefetch_stats; + } protected: const void* GetOptionsPtr(const std::string& name) const override; @@ -91,8 +97,12 @@ class BlockBasedTableFactory : public TableFactory { private: BlockBasedTableOptions table_options_; - std::shared_ptr table_reader_cache_res_mgr_; - mutable TailPrefetchStats tail_prefetch_stats_; + // Share some state among cloned instances + struct SharedState { + std::shared_ptr table_reader_cache_res_mgr; + TailPrefetchStats tail_prefetch_stats; + }; + std::shared_ptr shared_state_; }; extern const std::string kHashIndexPrefixesBlock; diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc index 14db24d9b..3f55f82a7 100644 --- a/table/block_based/block_based_table_iterator.cc +++ b/table/block_based/block_based_table_iterator.cc @@ -35,6 +35,17 @@ void BlockBasedTableIterator::SeekSecondPass(const Slice* target) { void BlockBasedTableIterator::SeekImpl(const Slice* target, bool async_prefetch) { + // TODO(hx235): set `seek_key_prefix_for_readahead_trimming_` + // even when `target == nullptr` that is when `SeekToFirst()` is called + if (target != nullptr && prefix_extractor_ && + read_options_.prefix_same_as_start) { + const Slice& seek_user_key = ExtractUserKey(*target); + seek_key_prefix_for_readahead_trimming_ = + prefix_extractor_->InDomain(seek_user_key) + ? prefix_extractor_->Transform(seek_user_key).ToString() + : ""; + } + bool is_first_pass = !async_read_in_progress_; if (!is_first_pass) { @@ -44,9 +55,9 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, ResetBlockCacheLookupVar(); - bool autotune_readaheadsize = is_first_pass && - read_options_.auto_readahead_size && - read_options_.iterate_upper_bound; + bool autotune_readaheadsize = + is_first_pass && read_options_.auto_readahead_size && + (read_options_.iterate_upper_bound || read_options_.prefix_same_as_start); if (autotune_readaheadsize && table_->get_rep()->table_options.block_cache.get() && @@ -778,7 +789,7 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); if (read_curr_block && !DoesContainBlockHandles() && - IsNextBlockOutOfBound()) { + IsNextBlockOutOfReadaheadBound()) { end_offset = index_iter_->value().handle.offset() + footer + index_iter_->value().handle.size(); return; @@ -850,7 +861,7 @@ void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( // If curr block's index key >= iterate_upper_bound, it // means all the keys in next block or above are out of // bound. 
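// ---------------------------------------------------------------------------
// Illustrative aside, not part of the patch: the readahead trimming added in
// SeekImpl() above only engages under the following read options (and a
// column family configured with a prefix extractor). Sketch only.
#include "rocksdb/options.h"

rocksdb::ReadOptions MakePrefixScanReadOptions() {
  rocksdb::ReadOptions ro;
  ro.auto_readahead_size = true;   // default; enables readahead size tuning
  ro.prefix_same_as_start = true;  // scan stays within the seek key's prefix
  return ro;
}
// ---------------------------------------------------------------------------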
-        if (IsNextBlockOutOfBound()) {
+        if (IsNextBlockOutOfReadaheadBound()) {
           is_index_out_of_bound_ = true;
           break;
         }
diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h
index 2b562ef06..d49224de4 100644
--- a/table/block_based/block_based_table_iterator.h
+++ b/table/block_based/block_based_table_iterator.h
@@ -353,6 +353,11 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
   // is used to disable the lookup.
   IterDirection direction_ = IterDirection::kForward;
 
+  // The prefix of the seek key most recently passed to SeekImpl().
+  // This is for readahead trimming, so that no data blocks containing keys
+  // of a different prefix are prefetched
+  std::string seek_key_prefix_for_readahead_trimming_ = "";
+
   void SeekSecondPass(const Slice* target);
 
   // If `target` is null, seek to first.
@@ -408,15 +413,41 @@ class BlockBasedTableIterator : public InternalIteratorBase<Slice> {
     ClearBlockHandles();
   }
 
-  bool IsNextBlockOutOfBound() {
+  bool IsNextBlockOutOfReadaheadBound() {
+    const Slice& index_iter_user_key = index_iter_->user_key();
     // If curr block's index key >= iterate_upper_bound, it means all the keys
     // in next block or above are out of bound.
-    return (user_comparator_.CompareWithoutTimestamp(
-                index_iter_->user_key(),
-                /*a_has_ts=*/true, *read_options_.iterate_upper_bound,
-                /*b_has_ts=*/false) >= 0
-                ? true
-                : false);
+    bool out_of_upper_bound =
+        read_options_.iterate_upper_bound != nullptr &&
+        (user_comparator_.CompareWithoutTimestamp(
+             index_iter_user_key,
+             /*a_has_ts=*/true, *read_options_.iterate_upper_bound,
+             /*b_has_ts=*/false) >= 0
+             ? true
+             : false);
+    if (out_of_upper_bound) {
+      return true;
+    }
+
+    // If curr block's index key has a different prefix from the seek key's,
+    // it means all the keys in the next block or above have a different
+    // prefix from the seek key's.
+    bool out_of_prefix_bound =
+        (read_options_.prefix_same_as_start &&
+         !seek_key_prefix_for_readahead_trimming_.empty() &&
+         (prefix_extractor_->InDomain(index_iter_user_key)
+              ? (prefix_extractor_->Transform(index_iter_user_key)
+                     .compare(seek_key_prefix_for_readahead_trimming_) != 0)
+              : user_comparator_.CompareWithoutTimestamp(
+                    index_iter_user_key,
+                    /*a_has_ts=*/true,
+                    seek_key_prefix_for_readahead_trimming_,
+                    /*b_has_ts=*/false) > 0));
+
+    if (out_of_prefix_bound) {
+      return true;
+    }
+
+    return false;
   }
 
   void ClearBlockHandles() {
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 0dfe3e38a..35d387ae9 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -137,7 +137,13 @@ extern const std::string kHashIndexPrefixesMetadataBlock;
 
 BlockBasedTable::~BlockBasedTable() {
   auto ua = rep_->uncache_aggressiveness.LoadRelaxed();
-  if (ua > 0 && rep_->table_options.block_cache) {
+  // NOTE: there is an undiagnosed incompatibility with mmap reads,
+  // where attempting to read the index below can result in a bus error.
+  // In theory the mmap should remain in place until destruction of
+  // rep_, so even a page fault should be satisfiable. But also, combining
+  // mmap reads with block cache is weird, so it's not a concerning loss.
+ if (ua > 0 && rep_->table_options.block_cache && + !rep_->ioptions.allow_mmap_reads) { if (rep_->filter) { rep_->filter->EraseFromCacheBeforeDestruction(ua); } @@ -647,6 +653,7 @@ Status BlockBasedTable::Open( ro.rate_limiter_priority = read_options.rate_limiter_priority; ro.verify_checksums = read_options.verify_checksums; ro.io_activity = read_options.io_activity; + ro.fill_cache = read_options.fill_cache; // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 1aadc62e0..34519f257 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -500,6 +500,8 @@ class BlockBasedTable : public TableReader { InternalIterator* meta_iter, const InternalKeyComparator& internal_comparator, BlockCacheLookupContext* lookup_context); + // If index and filter blocks do not need to be pinned, `prefetch_all` + // determines whether they will be read and add to cache. Status PrefetchIndexAndFilterBlocks( const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, InternalIterator* meta_iter, BlockBasedTable* new_table, diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index d7bee1675..9936a34fd 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -555,7 +555,7 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, std::string column_family_name; const ReadOptions read_options; const WriteOptions write_options; - builder.reset(ioptions.table_factory->NewTableBuilder( + builder.reset(moptions.table_factory->NewTableBuilder( TableBuilderOptions( ioptions, moptions, read_options, write_options, internal_comparator, &internal_tbl_prop_coll_factories, options.compression, @@ -581,7 +581,7 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, file_reader.reset(new RandomAccessFileReader(std::move(file), "test")); const bool kSkipFilters = true; const bool kImmortal = true; - ASSERT_OK(ioptions.table_factory->NewTableReader( + ASSERT_OK(moptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, internal_comparator, 0 /* block_protection_bytes_per_key */, !kSkipFilters, diff --git a/table/cuckoo/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h index 7132cec65..64077217d 100644 --- a/table/cuckoo/cuckoo_table_factory.h +++ b/table/cuckoo/cuckoo_table_factory.h @@ -73,6 +73,10 @@ class CuckooTableFactory : public TableFactory { std::string GetPrintableOptions() const override; + std::unique_ptr Clone() const override { + return std::make_unique(*this); + } + private: CuckooTableOptions table_options_; }; diff --git a/table/iterator.cc b/table/iterator.cc index 8306f5a04..999522575 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -74,6 +74,10 @@ class EmptyInternalIterator : public InternalIteratorBase { assert(false); return TValue(); } + uint64_t write_unix_time() const override { + assert(false); + return std::numeric_limits::max(); + } Status status() const override { return status_; } private: diff --git a/table/mock_table.h b/table/mock_table.h index 737360c23..af90740a2 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -76,6 +76,9 @@ class MockTableFactory : public TableFactory { // contents are equal to file_contents void 
AssertSingleFile(const KVVector& file_contents); void AssertLatestFiles(const std::vector& files_contents); + std::unique_ptr Clone() const override { + return nullptr; // Not implemented + } private: Status GetAndWriteNextID(WritableFileWriter* file, uint32_t* id) const; diff --git a/table/plain/plain_table_factory.h b/table/plain/plain_table_factory.h index a47418af6..d6055ccbd 100644 --- a/table/plain/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -173,6 +173,10 @@ class PlainTableFactory : public TableFactory { std::string GetPrintableOptions() const override; static const char kValueTypeSeqId0 = char(~0); + std::unique_ptr Clone() const override { + return std::make_unique(*this); + } + private: PlainTableOptions table_options_; }; diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 76de317d7..806da18da 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -416,7 +416,7 @@ Status SstFileWriter::Open(const std::string& file_path, Temperature temp) { // TODO(tec) : If table_factory is using compressed block cache, we will // be adding the external sst file blocks into it, which is wasteful. - r->builder.reset(r->ioptions.table_factory->NewTableBuilder( + r->builder.reset(r->mutable_cf_options.table_factory->NewTableBuilder( table_builder_options, r->file_writer.get())); r->file_info = ExternalSstFileInfo(); @@ -533,4 +533,9 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; } +bool SstFileWriter::CreatedBySstFileWriter(const TableProperties& tp) { + const auto& uprops = tp.user_collected_properties; + return uprops.find(ExternalSstFilePropertyNames::kVersion) != uprops.end(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_test.cc b/table/table_test.cc index bb0b70222..f51ddf69c 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -379,7 +379,7 @@ class TableConstructor : public Constructor { std::string column_family_name; const ReadOptions read_options; const WriteOptions write_options; - builder.reset(ioptions.table_factory->NewTableBuilder( + builder.reset(moptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, internal_comparator, &internal_tbl_prop_coll_factories, @@ -440,7 +440,7 @@ class TableConstructor : public Constructor { TEST_GetSink()->contents(), file_num_, ioptions.allow_mmap_reads)); file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); - return ioptions.table_factory->NewTableReader( + return moptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, *last_internal_comparator_, 0 /* block_protection_bytes_per_key */, @@ -4460,7 +4460,7 @@ TEST_P(BlockBasedTableTest, NoFileChecksum) { std::unique_ptr builder; const ReadOptions read_options; const WriteOptions write_options; - builder.reset(ioptions.table_factory->NewTableBuilder( + builder.reset(moptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, *comparator, &internal_tbl_prop_coll_factories, options.compression, options.compression_opts, @@ -4498,7 +4498,7 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { std::unique_ptr builder; const ReadOptions read_options; const WriteOptions write_options; - builder.reset(ioptions.table_factory->NewTableBuilder( + builder.reset(moptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, read_options, write_options, 
*comparator, &internal_tbl_prop_coll_factories, options.compression, options.compression_opts, @@ -5491,7 +5491,7 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { ImmutableOptions ioptions2(options2); const MutableCFOptions moptions2(options2); - ASSERT_OK(ioptions.table_factory->NewTableReader( + ASSERT_OK(moptions.table_factory->NewTableReader( TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), GetPlainInternalComparator(options2.comparator), 0 /* block_protection_bytes_per_key */), diff --git a/tools/advisor/advisor/bench_runner.py b/tools/advisor/advisor/bench_runner.py index 45d6c8313..702ccd8c0 100644 --- a/tools/advisor/advisor/bench_runner.py +++ b/tools/advisor/advisor/bench_runner.py @@ -30,7 +30,7 @@ def get_info_log_file_name(log_dir, db_path): # refer GetInfoLogPrefix() in rocksdb/util/filename.cc # example db_path: /dev/shm/dbbench file_name = db_path[1:] # to ignore the leading '/' character - to_be_replaced = re.compile("[^0-9a-zA-Z\-_\.]") # noqa + to_be_replaced = re.compile(r"[^0-9a-zA-Z\-_\.]") # noqa for character in to_be_replaced.findall(db_path): file_name = file_name.replace(character, "_") if not file_name.endswith("_"): diff --git a/tools/advisor/advisor/db_bench_runner.py b/tools/advisor/advisor/db_bench_runner.py index f5802ed15..c249e9074 100644 --- a/tools/advisor/advisor/db_bench_runner.py +++ b/tools/advisor/advisor/db_bench_runner.py @@ -65,7 +65,7 @@ def _parse_output(self, get_perf_context=False): """ output = {self.THROUGHPUT: None, self.DB_PATH: None, self.PERF_CON: None} perf_context_begins = False - with open(self.OUTPUT_FILE, "r") as fp: + with open(self.OUTPUT_FILE) as fp: for line in fp: if line.startswith(self.benchmark): # line from sample output: @@ -159,7 +159,7 @@ def _setup_db_before_experiment(self, curr_options, db_path): except OSError as e: print("Error: rmdir " + e.filename + " " + e.strerror) # setup database with a million keys using the fillrandom benchmark - command = "%s --benchmarks=fillrandom --db=%s --num=1000000" % ( + command = "{} --benchmarks=fillrandom --db={} --num=1000000".format( self.db_bench_binary, db_path, ) @@ -168,7 +168,7 @@ def _setup_db_before_experiment(self, curr_options, db_path): self._run_command(command) def _build_experiment_command(self, curr_options, db_path): - command = "%s --benchmarks=%s --statistics --perf_level=3 --db=%s" % ( + command = "{} --benchmarks={} --statistics --perf_level=3 --db={}".format( self.db_bench_binary, self.benchmark, db_path, diff --git a/tools/advisor/advisor/db_log_parser.py b/tools/advisor/advisor/db_log_parser.py index 9ba541fc3..14662b2ca 100644 --- a/tools/advisor/advisor/db_log_parser.py +++ b/tools/advisor/advisor/db_log_parser.py @@ -33,7 +33,7 @@ class Log: def is_new_log(log_line): # The assumption is that a new log will start with a date printed in # the below regex format. 
- date_regex = "\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}" # noqa + date_regex = r"\d{4}/\d{2}/\d{2}-\d{2}:\d{2}:\d{2}\.\d{6}" # noqa return re.match(date_regex, log_line) def __init__(self, log_line, column_families): @@ -46,7 +46,7 @@ def __init__(self, log_line, column_families): # "2018/07/25-17:29:05.176080 7f969de68700 [db/compaction_job.cc:1634] # [default] [JOB 3] Compacting 24@0 + 16@1 files to L1, score 6.00\n" for col_fam in column_families: - search_for_str = "\[" + col_fam + "\]" # noqa + search_for_str = r"\[" + col_fam + r"\]" # noqa if re.search(search_for_str, self.message): self.column_family = col_fam break @@ -119,7 +119,7 @@ def check_and_trigger_conditions(self, conditions): # 'old' and were not deleted for some reason if re.search("old", file_name, re.IGNORECASE): continue - with open(file_name, "r") as db_logs: + with open(file_name) as db_logs: new_log = None for line in db_logs: if Log.is_new_log(line): diff --git a/tools/advisor/advisor/db_options_parser.py b/tools/advisor/advisor/db_options_parser.py index 062aeeec4..4587efb50 100644 --- a/tools/advisor/advisor/db_options_parser.py +++ b/tools/advisor/advisor/db_options_parser.py @@ -143,7 +143,7 @@ def setup_misc_options(self, misc_options): def load_from_source(self, options_path): self.options_dict = {} - with open(options_path, "r") as db_options: + with open(options_path) as db_options: for line in db_options: line = OptionsSpecParser.remove_trailing_comment(line) if not line: diff --git a/tools/advisor/advisor/db_stats_fetcher.py b/tools/advisor/advisor/db_stats_fetcher.py index 30d1ad8b3..b8d9970da 100755 --- a/tools/advisor/advisor/db_stats_fetcher.py +++ b/tools/advisor/advisor/db_stats_fetcher.py @@ -99,7 +99,7 @@ def fetch_timeseries(self, reqd_stats): # directory if re.search("old", file_name, re.IGNORECASE): continue - with open(file_name, "r") as db_logs: + with open(file_name) as db_logs: new_log = None for line in db_logs: if Log.is_new_log(line): @@ -215,7 +215,7 @@ def parse_rapido_output(self): # \t\t[[ts, value], [ts, value], ...] 
# ts = timestamp; value = value of key_name in entity_name at time ts self.keys_ts = {} - with open(self.OUTPUT_FILE, "r") as fp: + with open(self.OUTPUT_FILE) as fp: for line in fp: token_list = line.strip().split("\t") entity = token_list[0] @@ -236,7 +236,7 @@ def parse_ods_output(self): # \t\t\t # there is one line per (entity_name, key_name, timestamp) self.keys_ts = {} - with open(self.OUTPUT_FILE, "r") as fp: + with open(self.OUTPUT_FILE) as fp: for line in fp: token_list = line.split() entity = token_list[0] @@ -301,8 +301,8 @@ def get_keys_from_conditions(self, conditions): def fetch_rate_url( self, - entities: List[str], - keys: List[str], + entities: list[str], + keys: list[str], window_len: str, percent: str, display: bool, @@ -341,6 +341,6 @@ def fetch_rate_url( ) self.execute_script(command) url = "" - with open(self.OUTPUT_FILE, "r") as fp: + with open(self.OUTPUT_FILE) as fp: url = fp.readline() return url diff --git a/tools/advisor/advisor/db_timeseries_parser.py b/tools/advisor/advisor/db_timeseries_parser.py index 5840d7b90..92977532c 100644 --- a/tools/advisor/advisor/db_timeseries_parser.py +++ b/tools/advisor/advisor/db_timeseries_parser.py @@ -51,7 +51,7 @@ def fetch_burst_epochs( window_sec: float, threshold: bool, percent: bool, - ) -> Dict[str, Dict[int, float]]: + ) -> dict[str, dict[int, float]]: # this method calculates the (percent) rate change in the 'statistic' # for each entity (over 'window_sec' seconds) and returns the epochs # where this rate change is greater than or equal to the 'threshold' diff --git a/tools/advisor/advisor/rule_parser.py b/tools/advisor/advisor/rule_parser.py index 169a55363..e2ba450cc 100644 --- a/tools/advisor/advisor/rule_parser.py +++ b/tools/advisor/advisor/rule_parser.py @@ -67,10 +67,10 @@ def perform_checks(self): + ": rule must be associated with 2 conditions\ in order to check for a time dependency between them" ) - time_format = "^\d+[s|m|h|d]$" # noqa + time_format = r"^\d+[s|m|h|d]$" # noqa if not re.match(time_format, self.overlap_time_seconds, re.IGNORECASE): raise ValueError( - self.name + ": overlap_time_seconds format: \d+[s|m|h|d]" + self.name + r": overlap_time_seconds format: \d+[s|m|h|d]" ) else: # convert to seconds in_seconds = int(self.overlap_time_seconds[:-1]) @@ -428,7 +428,7 @@ def perform_section_checks(self): def load_rules_from_spec(self): self.initialise_fields() - with open(self.file_path, "r") as db_rules: + with open(self.file_path) as db_rules: curr_section = None for line in db_rules: line = IniParser.remove_trailing_comment(line) diff --git a/tools/advisor/test/test_db_stats_fetcher.py b/tools/advisor/test/test_db_stats_fetcher.py index e2c29ab74..534d669f8 100644 --- a/tools/advisor/test/test_db_stats_fetcher.py +++ b/tools/advisor/test/test_db_stats_fetcher.py @@ -19,7 +19,7 @@ def setUp(self): stats_file = os.path.join(this_path, "input_files/log_stats_parser_keys_ts") # populate the keys_ts dictionary of LogStatsParser self.stats_dict = {NO_ENTITY: {}} - with open(stats_file, "r") as fp: + with open(stats_file) as fp: for line in fp: stat_name = line.split(":")[0].strip() self.stats_dict[NO_ENTITY][stat_name] = {} diff --git a/tools/benchmark_ci.py b/tools/benchmark_ci.py index de9f69cf9..c50cb0fb5 100755 --- a/tools/benchmark_ci.py +++ b/tools/benchmark_ci.py @@ -56,7 +56,7 @@ def read_version(config): majorRegex = re.compile(r"#define ROCKSDB_MAJOR\s([0-9]+)") minorRegex = re.compile(r"#define ROCKSDB_MINOR\s([0-9]+)") patchRegex = re.compile(r"#define ROCKSDB_PATCH\s([0-9]+)") - with 
open(config.version_file, "r") as reader: + with open(config.version_file) as reader: major = None minor = None patch = None diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py index 3962f37eb..7a542edad 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.py +++ b/tools/block_cache_analyzer/block_cache_pysim.py @@ -120,18 +120,18 @@ def __repr__(self): def cost_class(self, cost_class_label): if cost_class_label == "table_bt": - return "{}-{}".format(self.table_id, self.block_type) + return f"{self.table_id}-{self.block_type}" elif cost_class_label == "table": - return "{}".format(self.table_id) + return f"{self.table_id}" elif cost_class_label == "bt": - return "{}".format(self.block_type) + return f"{self.block_type}" elif cost_class_label == "cf": - return "{}".format(self.cf_id) + return f"{self.cf_id}" elif cost_class_label == "cf_bt": - return "{}-{}".format(self.cf_id, self.block_type) + return f"{self.cf_id}-{self.block_type}" elif cost_class_label == "table_level_bt": - return "{}-{}-{}".format(self.table_id, self.level, self.block_type) - assert False, "Unknown cost class label {}".format(cost_class_label) + return f"{self.table_id}-{self.level}-{self.block_type}" + assert False, f"Unknown cost class label {cost_class_label}" return None @@ -144,7 +144,7 @@ def __init__(self, key, hash, value): self.value = value def __repr__(self): - return "k={},h={},v=[{}]".format(self.key, self.hash, self.value) + return f"k={self.key},h={self.hash},v=[{self.value}]" class HashTable: @@ -190,7 +190,7 @@ def __repr__(self): for j in range(len(self.table[i])): if self.table[i][j] is not None: all_entries.append(self.table[i][j]) - return "{}".format(all_entries) + return f"{all_entries}" def values(self): all_values = [] @@ -366,15 +366,15 @@ def write_miss_timeline( with open(header_file_path, "w+") as header_file: header = "time" for trace_time in range(start, end): - header += ",{}".format(trace_time) + header += f",{trace_time}" header_file.write(header + "\n") file_path = "{}/data-ml-miss-timeline-{}-{}-{}-{}".format( result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: - row = "{}".format(cache_type) + row = f"{cache_type}" for trace_time in range(start, end): - row += ",{}".format(self.time_misses.get(trace_time, 0)) + row += f",{self.time_misses.get(trace_time, 0)}" file.write(row + "\n") def write_miss_ratio_timeline( @@ -389,13 +389,13 @@ def write_miss_ratio_timeline( with open(header_file_path, "w+") as header_file: header = "time" for trace_time in range(start, end): - header += ",{}".format(trace_time) + header += f",{trace_time}" header_file.write(header + "\n") file_path = "{}/data-ml-miss-ratio-timeline-{}-{}-{}-{}".format( result_dir, self.time_unit, cache_type, cache_size, target_cf_name ) with open(file_path, "w+") as file: - row = "{}".format(cache_type) + row = f"{cache_type}" for trace_time in range(start, end): naccesses = self.time_accesses.get(trace_time, 0) miss_ratio = 0 @@ -403,7 +403,7 @@ def write_miss_ratio_timeline( miss_ratio = float( self.time_misses.get(trace_time, 0) * 100.0 ) / float(naccesses) - row += ",{0:.2f}".format(miss_ratio) + row += f",{miss_ratio:.2f}" file.write(row + "\n") @@ -440,7 +440,7 @@ def write_policy_timeline( with open(header_file_path, "w+") as header_file: header = "time" for trace_time in range(start, end): - header += ",{}".format(trace_time) + header += f",{trace_time}" header_file.write(header + "\n") file_path = 
"{}/data-ml-policy-timeline-{}-{}-{}-{}".format( result_dir, self.time_unit, cache_type, cache_size, target_cf_name @@ -448,7 +448,7 @@ def write_policy_timeline( with open(file_path, "w+") as file: for policy in self.policy_names: policy_name = self.policy_names[policy] - row = "{}-{}".format(cache_type, policy_name) + row = f"{cache_type}-{policy_name}" for trace_time in range(start, end): row += ",{}".format( self.time_selected_polices.get(trace_time, {}).get( @@ -469,7 +469,7 @@ def write_policy_ratio_timeline( with open(header_file_path, "w+") as header_file: header = "time" for trace_time in range(start, end): - header += ",{}".format(trace_time) + header += f",{trace_time}" header_file.write(header + "\n") file_path = "{}/data-ml-policy-ratio-timeline-{}-{}-{}-{}".format( result_dir, self.time_unit, cache_type, cache_size, target_cf_name @@ -477,7 +477,7 @@ def write_policy_ratio_timeline( with open(file_path, "w+") as file: for policy in self.policy_names: policy_name = self.policy_names[policy] - row = "{}-{}".format(cache_type, policy_name) + row = f"{cache_type}-{policy_name}" for trace_time in range(start, end): naccesses = self.time_accesses.get(trace_time, 0) ratio = 0 @@ -488,7 +488,7 @@ def write_policy_ratio_timeline( ) * 100.0 ) / float(naccesses) - row += ",{0:.2f}".format(ratio) + row += f",{ratio:.2f}" file.write(row + "\n") @@ -674,10 +674,10 @@ def __init__(self, cache_size, enable_cache_row_key): self.retain_get_id_range = 100000 def block_key(self, trace_record): - return "b{}".format(trace_record.block_id) + return f"b{trace_record.block_id}" def row_key(self, trace_record): - return "g{}-{}".format(trace_record.fd, trace_record.key_id) + return f"g{trace_record.fd}-{trace_record.key_id}" def _lookup(self, trace_record, key, hash): """ @@ -893,7 +893,7 @@ class MLCache(Cache): """ def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): - super(MLCache, self).__init__(cache_size, enable_cache_row_key) + super().__init__(cache_size, enable_cache_row_key) self.table = HashTable() self.policy_stats = PolicyStats(kSecondsInMinute, policies) self.per_hour_policy_stats = PolicyStats(kSecondsInHour, policies) @@ -1015,7 +1015,7 @@ def __init__( init_a=1, init_b=1, ): - super(ThompsonSamplingCache, self).__init__( + super().__init__( cache_size, enable_cache_row_key, policies, cost_class_label ) self._as = {} @@ -1042,7 +1042,7 @@ def cache_name(self): return "Hybrid ThompsonSampling with cost class {} (ts_hybrid)".format( self.cost_class_label ) - return "ThompsonSampling with cost class {} (ts)".format(self.cost_class_label) + return f"ThompsonSampling with cost class {self.cost_class_label} (ts)" class LinUCBCache(MLCache): @@ -1057,7 +1057,7 @@ class LinUCBCache(MLCache): """ def __init__(self, cache_size, enable_cache_row_key, policies, cost_class_label): - super(LinUCBCache, self).__init__( + super().__init__( cache_size, enable_cache_row_key, policies, cost_class_label ) self.nfeatures = 4 # Block type, level, cf. 
@@ -1101,7 +1101,7 @@ def cache_name(self): return "Hybrid LinUCB with cost class {} (linucb_hybrid)".format( self.cost_class_label ) - return "LinUCB with cost class {} (linucb)".format(self.cost_class_label) + return f"LinUCB with cost class {self.cost_class_label} (linucb)" class OPTCacheEntry: @@ -1198,7 +1198,7 @@ class OPTCache(Cache): """ def __init__(self, cache_size): - super(OPTCache, self).__init__(cache_size, enable_cache_row_key=0) + super().__init__(cache_size, enable_cache_row_key=0) self.table = PQTable() def _lookup(self, trace_record, key, hash): @@ -1271,7 +1271,7 @@ class GDSizeCache(Cache): """ def __init__(self, cache_size, enable_cache_row_key): - super(GDSizeCache, self).__init__(cache_size, enable_cache_row_key) + super().__init__(cache_size, enable_cache_row_key) self.table = PQTable() self.L = 0.0 @@ -1340,7 +1340,7 @@ def __iter__(self): return reversed(self.od) def __repr__(self): - return "Deque(%r)" % (list(self),) + return "Deque({!r})".format(list(self)) class ARCCache(Cache): @@ -1361,7 +1361,7 @@ class ARCCache(Cache): """ def __init__(self, cache_size, enable_cache_row_key): - super(ARCCache, self).__init__(cache_size, enable_cache_row_key) + super().__init__(cache_size, enable_cache_row_key) self.table = {} self.c = cache_size / 16 * 1024 # Number of elements in the cache. self.p = 0 # Target size for the list T1 @@ -1459,7 +1459,7 @@ class LRUCache(Cache): """ def __init__(self, cache_size, enable_cache_row_key): - super(LRUCache, self).__init__(cache_size, enable_cache_row_key) + super().__init__(cache_size, enable_cache_row_key) self.table = {} self.lru = Deque() @@ -1505,7 +1505,7 @@ class TraceCache(Cache): """ def __init__(self, cache_size): - super(TraceCache, self).__init__(cache_size, enable_cache_row_key=0) + super().__init__(cache_size, enable_cache_row_key=0) def _lookup(self, trace_record, key, hash): return trace_record.is_hit @@ -1629,7 +1629,7 @@ def create_cache(cache_type, cache_size, downsample_size): elif cache_type == "gdsize": return GDSizeCache(cache_size, enable_cache_row_key) else: - print("Unknown cache type {}".format(cache_type)) + print(f"Unknown cache type {cache_type}") assert False return None @@ -1692,7 +1692,7 @@ def run( # can use this information to evict the cached key which next access is # the furthest in the future. print("Preprocessing block traces.") - with open(trace_file_path, "r") as trace_file: + with open(trace_file_path) as trace_file: for line in trace_file: if ( max_accesses_to_process != -1 @@ -1735,9 +1735,9 @@ def run( ) time_interval += 1 print( - "Trace contains {0} blocks, {1}({2:.2f}%) blocks with no size." - "{3} accesses, {4}({5:.2f}%) accesses with no_insert," - "{6}({7:.2f}%) accesses that want to insert but block size is 0.".format( + "Trace contains {} blocks, {}({:.2f}%) blocks with no size." 
+ "{} accesses, {}({:.2f}%) accesses with no_insert," + "{}({:.2f}%) accesses that want to insert but block size is 0.".format( len(block_access_timelines), num_blocks_with_no_size, percent(num_blocks_with_no_size, len(block_access_timelines)), @@ -1754,8 +1754,8 @@ def run( start_time = time.time() trace_start_time = 0 trace_duration = 0 - print("Running simulated {} cache on block traces.".format(cache.cache_name())) - with open(trace_file_path, "r") as trace_file: + print(f"Running simulated {cache.cache_name()} cache on block traces.") + with open(trace_file_path) as trace_file: for line in trace_file: if ( max_accesses_to_process != -1 @@ -1871,8 +1871,8 @@ def report_stats( trace_start_time, trace_end_time, ): - cache_label = "{}-{}-{}".format(cache_type, cache_size, target_cf_name) - with open("{}/data-ml-mrc-{}".format(result_dir, cache_label), "w+") as mrc_file: + cache_label = f"{cache_type}-{cache_size}-{target_cf_name}" + with open(f"{result_dir}/data-ml-mrc-{cache_label}", "w+") as mrc_file: mrc_file.write( "{},0,0,{},{},{}\n".format( cache_type, @@ -1897,7 +1897,7 @@ def report_stats( "w+", ) as mb_file: mb_file.write( - "{},0,0,{},{}\n".format(cache_type, cache_size, avg_miss_bytes) + f"{cache_type},0,0,{cache_size},{avg_miss_bytes}\n" ) with open( @@ -1907,7 +1907,7 @@ def report_stats( "w+", ) as mb_file: mb_file.write( - "{},0,0,{},{}\n".format(cache_type, cache_size, p95_miss_bytes) + f"{cache_type},0,0,{cache_size},{p95_miss_bytes}\n" ) cache_stats[i].write_miss_timeline( @@ -1970,7 +1970,7 @@ def report_stats( "it will run against all accesses.)" ) exit(1) - print("Arguments: {}".format(sys.argv)) + print(f"Arguments: {sys.argv}") cache_type = sys.argv[1] cache_size = parse_cache_size(sys.argv[2]) downsample_size = int(sys.argv[3]) diff --git a/tools/block_cache_analyzer/block_cache_pysim_test.py b/tools/block_cache_analyzer/block_cache_pysim_test.py index eed1b94af..68fcd462e 100644 --- a/tools/block_cache_analyzer/block_cache_pysim_test.py +++ b/tools/block_cache_analyzer/block_cache_pysim_test.py @@ -33,13 +33,13 @@ def test_hash_table(): table = HashTable() data_size = 10000 for i in range(data_size): - table.insert("k{}".format(i), i, "v{}".format(i)) + table.insert(f"k{i}", i, f"v{i}") for i in range(data_size): - assert table.lookup("k{}".format(i), i) is not None + assert table.lookup(f"k{i}", i) is not None for i in range(data_size): - table.delete("k{}".format(i), i) + table.delete(f"k{i}", i) for i in range(data_size): - assert table.lookup("k{}".format(i), i) is None + assert table.lookup(f"k{i}", i) is None truth_map = {} n = 1000000 @@ -47,7 +47,7 @@ def test_hash_table(): for i in range(n): key_id = random.randint(0, records) v = random.randint(0, records) - key = "k{}".format(key_id) + key = f"k{key_id}" value = CacheEntry(v, v, v, v, v, v, v) action = random.randint(0, 10) assert len(truth_map) == table.elements, "{} {} {}".format( @@ -104,18 +104,18 @@ def assert_metrics(cache, expected_value, expected_value_size=1, custom_hashtabl ) for expeceted_k in expected_value[3]: if custom_hashtable: - val = cache.table.lookup("b{}".format(expeceted_k), expeceted_k) + val = cache.table.lookup(f"b{expeceted_k}", expeceted_k) else: - val = cache.table["b{}".format(expeceted_k)] + val = cache.table[f"b{expeceted_k}"] assert val is not None, "Expected {} Actual: Not Exist {}, Table: {}".format( expeceted_k, expected_value, cache.table ) assert val.value_size == expected_value_size for expeceted_k in expected_value[4]: if custom_hashtable: - val = 
cache.table.lookup("g0-{}".format(expeceted_k), expeceted_k) + val = cache.table.lookup(f"g0-{expeceted_k}", expeceted_k) else: - val = cache.table["g0-{}".format(expeceted_k)] + val = cache.table[f"g0-{expeceted_k}"] assert val is not None assert val.value_size == expected_value_size @@ -288,7 +288,7 @@ def test_lfu_cache(): def test_mix(cache): - print("Test Mix {} cache".format(cache.cache_name())) + print(f"Test Mix {cache.cache_name()} cache") n = 100000 records = 100 block_size_table = {} @@ -343,7 +343,7 @@ def test_mix(cache): assert cached_size == cache.used_size, "Expeced {} Actual {}".format( cache.used_size, cached_size ) - print("Test Mix {} cache: Success".format(cache.cache_name())) + print(f"Test Mix {cache.cache_name()} cache: Success") def test_end_to_end(): @@ -366,27 +366,27 @@ def test_end_to_end(): fd = random.randint(0, nfds) now = i * kMicrosInSecond access_record = "" - access_record += "{},".format(now) - access_record += "{},".format(key_id) - access_record += "{},".format(9) # block type - access_record += "{},".format(block_size) # block size - access_record += "{},".format(cf_id) - access_record += "cf_{},".format(cf_id) - access_record += "{},".format(level) - access_record += "{},".format(fd) - access_record += "{},".format(key_id % 3) # caller - access_record += "{},".format(0) # no insert - access_record += "{},".format(i) # get_id - access_record += "{},".format(i) # key_id - access_record += "{},".format(100) # kv_size - access_record += "{},".format(1) # is_hit - access_record += "{},".format(1) # referenced_key_exist_in_block - access_record += "{},".format(10) # num_keys_in_block - access_record += "{},".format(1) # table_id - access_record += "{},".format(0) # seq_number - access_record += "{},".format(10) # block key size - access_record += "{},".format(20) # key size - access_record += "{},".format(0) # block offset + access_record += f"{now}," + access_record += f"{key_id}," + access_record += f"{9}," # block type + access_record += f"{block_size}," # block size + access_record += f"{cf_id}," + access_record += f"cf_{cf_id}," + access_record += f"{level}," + access_record += f"{fd}," + access_record += f"{key_id % 3}," # caller + access_record += f"{0}," # no insert + access_record += f"{i}," # get_id + access_record += f"{i}," # key_id + access_record += f"{100}," # kv_size + access_record += f"{1}," # is_hit + access_record += f"{1}," # referenced_key_exist_in_block + access_record += f"{10}," # num_keys_in_block + access_record += f"{1}," # table_id + access_record += f"{0}," # seq_number + access_record += f"{10}," # block key size + access_record += f"{20}," # key size + access_record += f"{0}," # block offset access_record = access_record[:-1] access_records += access_record + "\n" trace_file.write(access_records) @@ -424,14 +424,14 @@ def test_end_to_end(): assert cached_size == cache.used_size, "Expeced {} Actual {}".format( cache.used_size, cached_size ) - print("Test All {}: Success".format(cache.cache_name())) + print(f"Test All {cache.cache_name()}: Success") os.remove(trace_file_path) print("Test All: Success") def test_hybrid(cache): - print("Test {} cache".format(cache.cache_name())) + print(f"Test {cache.cache_name()} cache") k = TraceRecord( access_time=0, block_id=1, @@ -530,7 +530,7 @@ def test_hybrid(cache): assert_metrics( cache, [kSampleSize, 103, 99, [i for i in range(101 - kSampleSize, 101)], []] ) - print("Test {} cache: Success".format(cache.cache_name())) + print(f"Test {cache.cache_name()} cache: Success") def 
test_opt_cache(): diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py index 37166bcb4..6521ef286 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_plot.py @@ -43,9 +43,9 @@ def get_cmap(n, name="hsv"): def num_to_gb(n): one_gb = 1024 * 1024 * 1024 if float(n) % one_gb == 0: - return "{}".format(n / one_gb) + return f"{n / one_gb}" # Keep two decimal points. - return "{0:.2f}".format(float(n) / one_gb) + return f"{float(n) / one_gb:.2f}" def plot_miss_stats_graphs( @@ -57,9 +57,9 @@ def plot_miss_stats_graphs( continue if not file.endswith(file_suffix): continue - print("Processing file {}/{}".format(csv_result_dir, file)) + print(f"Processing file {csv_result_dir}/{file}") mrc_file_path = csv_result_dir + "/" + file - with open(mrc_file_path, "r") as csvfile: + with open(mrc_file_path) as csvfile: rows = csv.reader(csvfile, delimiter=",") for row in rows: cache_name = row[0] @@ -67,7 +67,7 @@ def plot_miss_stats_graphs( ghost_capacity = int(row[2]) capacity = int(row[3]) miss_ratio = float(row[4]) - config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + config = f"{cache_name}-{num_shard_bits}-{ghost_capacity}" if config not in miss_ratios: miss_ratios[config] = {} miss_ratios[config]["x"] = [] @@ -83,10 +83,10 @@ def plot_miss_stats_graphs( plt.ylabel(ylabel) plt.xscale("log", basex=2) plt.ylim(ymin=0) - plt.title("{}".format(file)) + plt.title(f"{file}") plt.legend() fig.savefig( - output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + output_result_dir + f"/{pdf_file_name}.pdf", bbox_inches="tight" ) @@ -99,9 +99,9 @@ def plot_miss_stats_diff_lru_graphs( continue if not file.endswith(file_suffix): continue - print("Processing file {}/{}".format(csv_result_dir, file)) + print(f"Processing file {csv_result_dir}/{file}") mrc_file_path = csv_result_dir + "/" + file - with open(mrc_file_path, "r") as csvfile: + with open(mrc_file_path) as csvfile: rows = csv.reader(csvfile, delimiter=",") for row in rows: cache_name = row[0] @@ -109,7 +109,7 @@ def plot_miss_stats_diff_lru_graphs( ghost_capacity = int(row[2]) capacity = int(row[3]) miss_ratio = float(row[4]) - config = "{}-{}-{}".format(cache_name, num_shard_bits, ghost_capacity) + config = f"{cache_name}-{num_shard_bits}-{ghost_capacity}" if config not in miss_ratios: miss_ratios[config] = {} miss_ratios[config]["x"] = [] @@ -132,10 +132,10 @@ def plot_miss_stats_diff_lru_graphs( plt.xlabel("Cache capacity") plt.ylabel(ylabel) plt.xscale("log", basex=2) - plt.title("{}".format(file)) + plt.title(f"{file}") plt.legend() fig.savefig( - output_result_dir + "/{}.pdf".format(pdf_file_name), bbox_inches="tight" + output_result_dir + f"/{pdf_file_name}.pdf", bbox_inches="tight" ) @@ -226,8 +226,8 @@ def plot_line_charts( continue if not file.startswith(filename_prefix): continue - print("Processing file {}/{}".format(csv_result_dir, file)) - with open(csv_result_dir + "/" + file, "r") as csvfile: + print(f"Processing file {csv_result_dir}/{file}") + with open(csv_result_dir + "/" + file) as csvfile: x, labels, label_stats = read_data_for_plot(csvfile, vertical) if len(x) == 0 or len(labels) == 0: continue @@ -247,11 +247,11 @@ def plot_line_charts( # Translate time unit into x labels. 
if "_60" in file: - plt.xlabel("{} (Minute)".format(xlabel)) + plt.xlabel(f"{xlabel} (Minute)") if "_3600" in file: - plt.xlabel("{} (Hour)".format(xlabel)) + plt.xlabel(f"{xlabel} (Hour)") plt.ylabel(ylabel) - plt.title("{} {}".format(title, file)) + plt.title(f"{title} {file}") if legend: plt.legend() pdf.savefig(fig) @@ -271,13 +271,13 @@ def plot_stacked_bar_charts( ): global color_index, bar_color_maps, colors pdf = matplotlib.backends.backend_pdf.PdfPages( - "{}/{}".format(output_result_dir, pdf_name) + f"{output_result_dir}/{pdf_name}" ) for file in os.listdir(csv_result_dir): if not file.endswith(filename_suffix): continue - with open(csv_result_dir + "/" + file, "r") as csvfile: - print("Processing file {}/{}".format(csv_result_dir, file)) + with open(csv_result_dir + "/" + file) as csvfile: + print(f"Processing file {csv_result_dir}/{file}") x, labels, label_stats = read_data_for_plot(csvfile, vertical) if len(x) == 0 or len(label_stats) == 0: continue @@ -310,25 +310,25 @@ def plot_stacked_bar_charts( ind, [x_prefix + x[i] for i in range(len(x))], rotation=20, fontsize=8 ) plt.legend(bars, labels) - plt.title("{} filename:{}".format(title, file)) + plt.title(f"{title} filename:{file}") pdf.savefig(fig) pdf.close() def plot_heatmap(csv_result_dir, output_result_dir, filename_suffix, pdf_name, title): pdf = matplotlib.backends.backend_pdf.PdfPages( - "{}/{}".format(output_result_dir, pdf_name) + f"{output_result_dir}/{pdf_name}" ) for file in os.listdir(csv_result_dir): if not file.endswith(filename_suffix): continue - csv_file_name = "{}/{}".format(csv_result_dir, file) - print("Processing file {}/{}".format(csv_result_dir, file)) + csv_file_name = f"{csv_result_dir}/{file}" + print(f"Processing file {csv_result_dir}/{file}") corr_table = pd.read_csv(csv_file_name) corr_table = corr_table.pivot("label", "corr", "value") fig = plt.figure() sns.heatmap(corr_table, annot=True, linewidths=0.5, fmt=".2") - plt.title("{} filename:{}".format(title, file)) + plt.title(f"{title} filename:{file}") pdf.savefig(fig) pdf.close() @@ -360,16 +360,16 @@ def plot_correlation(csv_result_dir, output_result_dir): for file in os.listdir(csv_result_dir): if not file.endswith("correlation_input"): continue - csv_file_name = "{}/{}".format(csv_result_dir, file) - print("Processing file {}/{}".format(csv_result_dir, file)) + csv_file_name = f"{csv_result_dir}/{file}" + print(f"Processing file {csv_result_dir}/{file}") corr_table = pd.read_csv(csv_file_name) label_str = file.split("_")[0] label = file[len(label_str) + 1 :] label = label[: len(label) - len("_correlation_input")] - output_file = "{}/{}_correlation_output".format(csv_result_dir, label_str) + output_file = f"{csv_result_dir}/{label_str}_correlation_output" if output_file not in label_str_file: - f = open("{}/{}_correlation_output".format(csv_result_dir, label_str), "w+") + f = open(f"{csv_result_dir}/{label_str}_correlation_output", "w+") label_str_file[output_file] = f f.write("label,corr,value\n") f = label_str_file[output_file] @@ -666,9 +666,9 @@ def plot_miss_ratio_timeline(csv_result_dir, output_result_dir): csv_abs_dir = csv_result_dir + "/" + csv_relative_dir result_dir = output_result_dir + "/" + csv_relative_dir if not os.path.isdir(csv_abs_dir): - print("{} is not a directory".format(csv_abs_dir)) + print(f"{csv_abs_dir} is not a directory") continue - print("Processing experiment dir: {}".format(csv_relative_dir)) + print(f"Processing experiment dir: {csv_relative_dir}") if not os.path.exists(result_dir): os.makedirs(result_dir) 
plot_access_count_summary(csv_abs_dir, result_dir) @@ -698,32 +698,32 @@ def plot_miss_ratio_timeline(csv_result_dir, output_result_dir): plot_miss_stats_graphs( csv_abs_dir, result_dir, - file_prefix="ml_{}_".format(time_unit), + file_prefix=f"ml_{time_unit}_", file_suffix="p95mb", - ylabel="p95 number of byte miss per {} seconds".format(time_unit), - pdf_file_name="p95mb_per{}_seconds".format(time_unit), + ylabel=f"p95 number of byte miss per {time_unit} seconds", + pdf_file_name=f"p95mb_per{time_unit}_seconds", ) plot_miss_stats_graphs( csv_abs_dir, result_dir, - file_prefix="ml_{}_".format(time_unit), + file_prefix=f"ml_{time_unit}_", file_suffix="avgmb", - ylabel="Average number of byte miss per {} seconds".format(time_unit), - pdf_file_name="avgmb_per{}_seconds".format(time_unit), + ylabel=f"Average number of byte miss per {time_unit} seconds", + pdf_file_name=f"avgmb_per{time_unit}_seconds", ) plot_miss_stats_diff_lru_graphs( csv_abs_dir, result_dir, - file_prefix="ml_{}_".format(time_unit), + file_prefix=f"ml_{time_unit}_", file_suffix="p95mb", - ylabel="p95 number of byte miss per {} seconds".format(time_unit), - pdf_file_name="p95mb_per{}_seconds_diff_lru".format(time_unit), + ylabel=f"p95 number of byte miss per {time_unit} seconds", + pdf_file_name=f"p95mb_per{time_unit}_seconds_diff_lru", ) plot_miss_stats_diff_lru_graphs( csv_abs_dir, result_dir, - file_prefix="ml_{}_".format(time_unit), + file_prefix=f"ml_{time_unit}_", file_suffix="avgmb", - ylabel="Average number of byte miss per {} seconds".format(time_unit), - pdf_file_name="avgmb_per{}_seconds_diff_lru".format(time_unit), + ylabel=f"Average number of byte miss per {time_unit} seconds", + pdf_file_name=f"avgmb_per{time_unit}_seconds_diff_lru", ) diff --git a/tools/check_all_python.py b/tools/check_all_python.py index 708339a67..567e370c4 100755 --- a/tools/check_all_python.py +++ b/tools/check_all_python.py @@ -15,8 +15,8 @@ filenames += glob.glob(base + "/" + suff + ".py") for filename in filenames: - source = open(filename, "r").read() + "\n" + source = open(filename).read() + "\n" # Parses and syntax checks the file, throwing on error. (No pyc written.) 
_ = compile(source, filename, "exec") -print("No syntax errors in {0} .py files".format(len(filenames))) +print(f"No syntax errors in {len(filenames)} .py files") diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index 16309f18c..131e32faa 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb") +declare -a db_forward_with_options_refs=("8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb" "8.10.fb" "8.11.fb" "9.0.fb" "9.1.fb" "9.2.fb" "9.3.fb" "9.4.fb" "9.5.fb" "9.6.fb" "9.7.fb" "9.8.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 713aaaa41..1baa76cbd 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -153,10 +153,11 @@ DEFINE_string( "randomtransaction," "randomreplacekeys," "timeseries," - "getmergeoperands,", + "getmergeoperands," "readrandomoperands," "backup," - "restore" + "restore," + "approximatememtablestats", "Comma-separated list of operations to run in the specified" " order. Available benchmarks:\n" @@ -243,9 +244,14 @@ DEFINE_string( "operation includes a rare but possible retry in case it got " "`Status::Incomplete()`. This happens upon encountering more keys than " "have ever been seen by the thread (or eight initially)\n" - "\tbackup -- Create a backup of the current DB and verify that a new backup is corrected. " + "\tbackup -- Create a backup of the current DB and verify that a new " + "backup is corrected. 
" "Rate limit can be specified through --backup_rate_limit\n" - "\trestore -- Restore the DB from the latest backup available, rate limit can be specified through --restore_rate_limit\n"); + "\trestore -- Restore the DB from the latest backup available, rate limit " + "can be specified through --restore_rate_limit\n" + "\tapproximatememtablestats -- Tests accuracy of " + "GetApproximateMemTableStats, ideally\n" + "after fillrandom, where actual answer is batch_size"); DEFINE_int64(num, 1000000, "Number of key/values to place in database"); @@ -3621,6 +3627,8 @@ class Benchmark { fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", entries_per_batch_); method = &Benchmark::ApproximateSizeRandom; + } else if (name == "approximatememtablestats") { + method = &Benchmark::ApproximateMemtableStats; } else if (name == "mixgraph") { method = &Benchmark::MixGraph; } else if (name == "readmissing") { @@ -6298,6 +6306,35 @@ class Benchmark { thread->stats.AddMessage(msg); } + void ApproximateMemtableStats(ThreadState* thread) { + const size_t batch_size = entries_per_batch_; + std::unique_ptr skey_guard; + Slice skey = AllocateKey(&skey_guard); + std::unique_ptr ekey_guard; + Slice ekey = AllocateKey(&ekey_guard); + Duration duration(FLAGS_duration, reads_); + if (FLAGS_num < static_cast(batch_size)) { + std::terminate(); + } + uint64_t range = static_cast(FLAGS_num) - batch_size; + auto count_hist = std::make_shared(); + while (!duration.Done(1)) { + DB* db = SelectDB(thread); + uint64_t start_key = thread->rand.Uniform(range); + GenerateKeyFromInt(start_key, FLAGS_num, &skey); + uint64_t end_key = start_key + batch_size; + GenerateKeyFromInt(end_key, FLAGS_num, &ekey); + uint64_t count = UINT64_MAX; + uint64_t size = UINT64_MAX; + db->GetApproximateMemTableStats({skey, ekey}, &count, &size); + count_hist->Add(count); + thread->stats.FinishedOps(nullptr, db, 1, kOthers); + } + thread->stats.AddMessage("\nReported entry count stats (expected " + + std::to_string(batch_size) + "):"); + thread->stats.AddMessage("\n" + count_hist->ToString()); + } + // Calls ApproximateSize over random key ranges. void ApproximateSizeRandom(ThreadState* thread) { int64_t size_sum = 0; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index f88c0be19..ccfa878d2 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
-from __future__ import absolute_import, division, print_function, unicode_literals import argparse import math @@ -48,7 +47,7 @@ "charge_filter_construction": lambda: random.choice([0, 1]), "charge_table_reader": lambda: random.choice([0, 1]), "charge_file_metadata": lambda: random.choice([0, 1]), - "checkpoint_one_in": lambda: random.choice([10000, 1000000]), + "checkpoint_one_in": lambda: random.choice([0, 0, 10000, 1000000]), "compression_type": lambda: random.choice( ["none", "snappy", "zlib", "lz4", "lz4hc", "xpress", "zstd"] ), @@ -444,10 +443,12 @@ def is_direct_io_supported(dbname): "duration": 6000, # time for one db_stress instance to run "interval": 120, + # time for the final verification step + "verify_timeout": 1200, # since we will be killing anyway, use large value for ops_per_thread "ops_per_thread": 100000000, "reopen": 0, - "set_options_one_in": 10000, + "set_options_one_in": 2000, } whitebox_default_params = { @@ -841,7 +842,7 @@ def finalize_and_sanitize(src_params): # WriteCommitted only dest_params["use_put_entity_one_in"] = 0 # MultiCfIterator is currently only compatible with write committed policy - dest_params["use_multi_cf_iterator"] = 0 + dest_params["use_multi_cf_iterator"] = 0 # TODO(hx235): enable test_multi_ops_txns with fault injection after stabilizing the CI if dest_params.get("test_multi_ops_txns") == 1: dest_params["write_fault_one_in"] = 0 @@ -966,6 +967,10 @@ def finalize_and_sanitize(src_params): ): # At least one must be true dest_params["write_dbid_to_manifest"] = 1 + # Checkpoint creation skips flush if the WAL is locked, so enabling lock_wal_one_in + # can cause checkpoint verification to fail. So make the two mutually exclusive. + if dest_params.get("checkpoint_one_in") != 0: + dest_params["lock_wal_one_in"] = 0 return dest_params @@ -1023,7 +1028,7 @@ def gen_cmd(params, unknown_params): cmd = ( [stress_cmd] + [ - "--{0}={1}".format(k, v) + f"--{k}={v}" for k, v in [(k, finalzied_params[k]) for k in sorted(finalzied_params)] if k not in { @@ -1044,6 +1049,7 @@ def gen_cmd(params, unknown_params): "cleanup_cmd", "skip_tmpdir_check", "print_stderr_separately", + "verify_timeout", } and v is not None ] @@ -1052,9 +1058,10 @@ def gen_cmd(params, unknown_params): return cmd -def execute_cmd(cmd, timeout=None): +def execute_cmd(cmd, timeout=None, timeout_pstack=False): child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd))) + pid = child.pid try: outs, errs = child.communicate(timeout=timeout) @@ -1062,6 +1069,8 @@ def execute_cmd(cmd, timeout=None): print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode) except subprocess.TimeoutExpired: hit_timeout = True + if timeout_pstack: + os.system("pstack %d" % pid) child.kill() print("KILLED %d\n" % child.pid) outs, errs = child.communicate() @@ -1136,7 +1145,7 @@ def blackbox_crash_main(args, unknown_args): cmd = gen_cmd( dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args ) - hit_timeout, retcode, outs, errs = execute_cmd(cmd) + hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["verify_timeout"], True) # For the final run print_output_and_exit_on_error(outs, errs, args.print_stderr_separately) @@ -1278,7 +1287,7 @@ def whitebox_crash_main(args, unknown_args): hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd( cmd, exit_time - time.time() + 900 ) - msg = "check_mode={0}, kill option={1}, exitcode={2}\n".format( + msg = "check_mode={}, 
kill option={}, exitcode={}\n".format(
             check_mode, additional_opts["kill_random_test"], retncode
         )
diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc
index e954e7007..3ce811972 100644
--- a/tools/ldb_cmd.cc
+++ b/tools/ldb_cmd.cc
@@ -27,6 +27,7 @@
 #include "db/write_batch_internal.h"
 #include "file/filename.h"
 #include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/experimental.h"
 #include "rocksdb/file_checksum.h"
 #include "rocksdb/filter_policy.h"
@@ -110,6 +111,7 @@
 const std::string LDBCommand::ARG_DECODE_BLOB_INDEX = "decode_blob_index";
 const std::string LDBCommand::ARG_DUMP_UNCOMPRESSED_BLOBS =
     "dump_uncompressed_blobs";
 const std::string LDBCommand::ARG_READ_TIMESTAMP = "read_timestamp";
+const std::string LDBCommand::ARG_GET_WRITE_UNIX_TIME = "get_write_unix_time";

 const char* LDBCommand::DELIM = " ==> ";

@@ -3622,11 +3624,12 @@ void BatchPutCommand::OverrideBaseOptions() {
 ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
                          const std::map<std::string, std::string>& options,
                          const std::vector<std::string>& flags)
-    : LDBCommand(options, flags, true,
-                 BuildCmdLineOptions(
-                     {ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX, ARG_TO,
-                      ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, ARG_MAX_KEYS,
-                      ARG_TTL_START, ARG_TTL_END, ARG_READ_TIMESTAMP})),
+    : LDBCommand(
+          options, flags, true,
+          BuildCmdLineOptions({ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX,
+                               ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP,
+                               ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END,
+                               ARG_READ_TIMESTAMP, ARG_GET_WRITE_UNIX_TIME})),
       start_key_specified_(false),
       end_key_specified_(false),
       max_keys_scanned_(-1),
@@ -3670,6 +3673,7 @@ ScanCommand::ScanCommand(const std::vector<std::string>& /*params*/,
                              ARG_MAX_KEYS + " has a value out-of-range");
     }
   }
+  get_write_unix_time_ = IsFlagPresent(flags_, ARG_GET_WRITE_UNIX_TIME);
 }

 void ScanCommand::Help(std::string& ret) {
@@ -3683,6 +3687,7 @@ void ScanCommand::Help(std::string& ret) {
   ret.append(" [--" + ARG_TTL_END + "=<N>:- is exclusive]");
   ret.append(" [--" + ARG_NO_VALUE + "]");
   ret.append(" [--" + ARG_READ_TIMESTAMP + "=] ");
+  ret.append(" [--" + ARG_GET_WRITE_UNIX_TIME + "]");
   ret.append("\n");
 }

@@ -3765,6 +3770,22 @@ void ScanCommand::DoCommand() {
       fprintf(stdout, "%s\n", str.c_str());
     }

+    if (get_write_unix_time_) {
+      std::string write_unix_time;
+      uint64_t write_time_int = std::numeric_limits<uint64_t>::max();
+      Status s =
+          it->GetProperty("rocksdb.iterator.write-time", &write_unix_time);
+      if (s.ok()) {
+        s = DecodeU64Ts(write_unix_time, &write_time_int);
+      }
+      if (!s.ok()) {
+        fprintf(stdout, " Failed to get write unix time: %s\n",
+                s.ToString().c_str());
+      } else {
+        fprintf(stdout, " write unix time: %s\n",
+                std::to_string(write_time_int).c_str());
+      }
+    }
     num_keys_scanned++;
     if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) {
       break;
diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h
index 73130401e..3f7273dd5 100644
--- a/tools/ldb_cmd_impl.h
+++ b/tools/ldb_cmd_impl.h
@@ -511,6 +511,7 @@ class ScanCommand : public LDBCommand {
   bool end_key_specified_;
   int max_keys_scanned_;
   bool no_value_;
+  bool get_write_unix_time_;
 };

 class DeleteCommand : public LDBCommand {
diff --git a/tools/ldb_test.py b/tools/ldb_test.py
index 09ab9b799..a8956f160 100644
--- a/tools/ldb_test.py
+++ b/tools/ldb_test.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
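# [Editor's note] A hypothetical command-line use of the --get_write_unix_time
# flag wired up in ldb_cmd.cc above; the db path is illustrative and the
# output shape follows the fprintf calls in ScanCommand::DoCommand.
import subprocess

out = subprocess.run(
    ["./ldb", "--db=/tmp/write_time_db", "scan", "--get_write_unix_time"],
    capture_output=True,
    text=True,
    check=True,
).stdout
# Each record is followed by " write unix time: <secs>", or by
# " Failed to get write unix time: ..." when the iterator property is absent.
print(out)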
-from __future__ import absolute_import, division, print_function, unicode_literals import glob @@ -100,14 +99,14 @@ def assertRunOK(self, params, expectedOutput, unexpected=False): Uses the default test db. """ self.assertRunOKFull( - "%s %s" % (self.dbParam(self.DB_NAME), params), expectedOutput, unexpected + "{} {}".format(self.dbParam(self.DB_NAME), params), expectedOutput, unexpected ) def assertRunFAIL(self, params): """ Uses the default test db. """ - self.assertRunFAILFull("%s %s" % (self.dbParam(self.DB_NAME), params)) + self.assertRunFAILFull("{} {}".format(self.dbParam(self.DB_NAME), params)) def testSimpleStringPutGet(self): print("Running testSimpleStringPutGet...") @@ -180,18 +179,18 @@ def testSimpleStringPutGet(self): self.assertRunOK("checkconsistency", "OK") def dumpDb(self, params, dumpFile): - return 0 == run_err_null("./ldb dump %s > %s" % (params, dumpFile)) + return 0 == run_err_null("./ldb dump {} > {}".format(params, dumpFile)) def loadDb(self, params, dumpFile): - return 0 == run_err_null("cat %s | ./ldb load %s" % (dumpFile, params)) + return 0 == run_err_null("cat {} | ./ldb load {}".format(dumpFile, params)) def writeExternSst(self, params, inputDumpFile, outputSst): return 0 == run_err_null( - "cat %s | ./ldb write_extern_sst %s %s" % (inputDumpFile, outputSst, params) + "cat {} | ./ldb write_extern_sst {} {}".format(inputDumpFile, outputSst, params) ) def ingestExternSst(self, params, inputSst): - return 0 == run_err_null("./ldb ingest_extern_sst %s %s" % (inputSst, params)) + return 0 == run_err_null("./ldb ingest_extern_sst {} {}".format(inputSst, params)) def testStringBatchPut(self): print("Running testStringBatchPut...") @@ -444,11 +443,11 @@ def testDumpLoad(self): dumpFilePath = os.path.join(self.TMP_DIR, "dump6") loadedDbPath = os.path.join(self.TMP_DIR, "loaded_from_dump6") self.assertTrue( - self.dumpDb("--db=%s %s" % (origDbPath, extraParams), dumpFilePath) + self.dumpDb("--db={} {}".format(origDbPath, extraParams), dumpFilePath) ) self.assertTrue( self.loadDb( - "--db=%s %s --create_if_missing" % (loadedDbPath, extraParams), + "--db={} {} --create_if_missing".format(loadedDbPath, extraParams), dumpFilePath, ) ) @@ -503,7 +502,7 @@ def testIDumpBasics(self): "'b' seq:2, type:1 => val\nInternal keys in range: 2", ) self.assertRunOK( - "idump --input_key_hex --from=%s --to=%s" % (hex(ord("a")), hex(ord("b"))), + "idump --input_key_hex --from={} --to={}".format(hex(ord("a")), hex(ord("b"))), "'a' seq:1, type:1 => val\nInternal keys in range: 1", ) @@ -513,7 +512,7 @@ def testIDumpDecodeBlobIndex(self): self.assertRunOK("put b val --enable_blob_files", "OK") # Pattern to expect from dump with decode_blob_index flag enabled. - regex = ".*\[blob ref\].*" + regex = r".*\[blob ref\].*" expected_pattern = re.compile(regex) cmd = "idump %s --decode_blob_index" self.assertRunOKFull( @@ -589,7 +588,7 @@ def testCheckConsistency(self): self.assertRunFAIL("checkconsistency") def dumpLiveFiles(self, params, dumpFile): - return 0 == run_err_null("./ldb dump_live_files %s > %s" % (params, dumpFile)) + return 0 == run_err_null("./ldb dump_live_files {} > {}".format(params, dumpFile)) def testDumpLiveFiles(self): print("Running testDumpLiveFiles...") @@ -620,7 +619,7 @@ def testDumpLiveFiles(self): ) # Investigate the output - with open(dumpFilePath, "r") as tmp: + with open(dumpFilePath) as tmp: data = tmp.read() # Check that all the SST filenames have a correct full path (no multiple '/'). 
@@ -651,7 +650,7 @@ def testDumpLiveFiles(self): def listLiveFilesMetadata(self, params, dumpFile): return 0 == run_err_null( - "./ldb list_live_files_metadata %s > %s" % (params, dumpFile) + "./ldb list_live_files_metadata {} > {}".format(params, dumpFile) ) def testListLiveFilesMetadata(self): @@ -673,13 +672,13 @@ def testListLiveFilesMetadata(self): ) # Collect SST filename and level from dump_live_files - with open(dumpFilePath1, "r") as tmp: + with open(dumpFilePath1) as tmp: data = tmp.read() filename1 = re.findall(r".*\d+\.sst", data)[0] level1 = re.findall(r"level:\d+", data)[0].split(":")[1] # Collect SST filename and level from list_live_files_metadata - with open(dumpFilePath2, "r") as tmp: + with open(dumpFilePath2) as tmp: data = tmp.read() filename2 = re.findall(r".*\d+\.sst", data)[0] level2 = re.findall(r"level \d+", data)[0].split(" ")[1] @@ -712,7 +711,7 @@ def testListLiveFilesMetadata(self): # parse the output and create a map: # [key: sstFilename]->[value:[LSM level, Column Family Name]] referenceMap = {} - with open(dumpFilePath3, "r") as tmp: + with open(dumpFilePath3) as tmp: data = tmp.read() # Note: the following regex are contingent on what the # dump_live_files outputs. @@ -730,7 +729,7 @@ def testListLiveFilesMetadata(self): # parse the output and create a map: # [key: sstFilename]->[value:[LSM level, Column Family Name]] testMap = {} - with open(dumpFilePath4, "r") as tmp: + with open(dumpFilePath4) as tmp: data = tmp.read() # Since for each SST file, all the information is contained # on one line, the parsing is easy to perform and relies on @@ -771,7 +770,7 @@ def testManifestDump(self): num = "[0-9]+" st = ".*" subpat = st + " seq:" + num + ", type:" + num - regex = num + ":" + num + "\[" + subpat + ".." + subpat + "\]" + regex = num + ":" + num + r"\[" + subpat + ".." + subpat + r"\]" expected_pattern = re.compile(regex) cmd = "manifest_dump --db=%s" manifest_files = self.getManifests(dbPath) @@ -859,7 +858,7 @@ def testSSTDump(self): self.assertRunOK("get sst1", "sst1_val") # Pattern to expect from SST dump. - regex = ".*Sst file format:.*\n.*\[blob ref\].*" + regex = ".*Sst file format:.*\n.*\\[blob ref\\].*" expected_pattern = re.compile(regex) sst_files = self.getSSTFiles(dbPath) @@ -878,7 +877,7 @@ def testBlobDump(self): ) # Pattern to expect from blob file dump. - regex = ".*Blob log header[\s\S]*Blob log footer[\s\S]*Read record[\s\S]*Summary" # noqa + regex = r".*Blob log header[\s\S]*Blob log footer[\s\S]*Read record[\s\S]*Summary" # noqa expected_pattern = re.compile(regex) blob_files = self.getBlobFiles(dbPath) self.assertTrue(len(blob_files) >= 1) @@ -896,7 +895,7 @@ def testWALDump(self): self.assertRunOK("get wal1", "wal1_val") # Pattern to expect from WAL dump. - regex = "^Sequence,Count,ByteSize,Physical Offset,Key\(s\).*" + regex = r"^Sequence,Count,ByteSize,Physical Offset,Key\(s\).*" expected_pattern = re.compile(regex) wal_files = self.getWALFiles(dbPath) diff --git a/tools/write_stress_runner.py b/tools/write_stress_runner.py index f39f79cd4..515a1789d 100644 --- a/tools/write_stress_runner.py +++ b/tools/write_stress_runner.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
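# [Editor's note] A recurring change in this PR is deleting
# "from __future__ import ..." lines, and the advisor files also move
# typing.List/typing.Dict hints to PEP 585 built-in generics. Both assume
# Python 3: the __future__ flags below became mandatory in 3.0 (so the
# imports are inert and safe to drop), and list[str]/dict[...] annotations
# need 3.9+. A small self-check of those assumptions:
import __future__

for feature in ("absolute_import", "division", "print_function", "unicode_literals"):
    assert getattr(__future__, feature).getMandatoryRelease()[:2] == (3, 0), feature

def lengths(entities: list[str]) -> dict[str, int]:  # PEP 585 generics, 3.9+
    return {e: len(e) for e in entities}

assert lengths(["memtable"]) == {"memtable": 8}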
-from __future__ import absolute_import, division, print_function, unicode_literals

 import argparse
 import random
diff --git a/unreleased_history/bug_fixes/build_db_options.md b/unreleased_history/bug_fixes/build_db_options.md
deleted file mode 100644
index 6994ea719..000000000
--- a/unreleased_history/bug_fixes/build_db_options.md
+++ /dev/null
@@ -1 +0,0 @@
-* Several DB option settings could be lost through `GetOptionsFromString()`, possibly elsewhere as well. Affected options, now fixed:`background_close_inactive_wals`, `write_dbid_to_manifest`, `write_identity_file`, `prefix_seek_opt_in_only`
diff --git a/unreleased_history/bug_fixes/compressed_secondary_cache_account.md b/unreleased_history/bug_fixes/compressed_secondary_cache_account.md
deleted file mode 100644
index 07c73b85e..000000000
--- a/unreleased_history/bug_fixes/compressed_secondary_cache_account.md
+++ /dev/null
@@ -1 +0,0 @@
-Fix under counting of allocated memory in the compressed secondary cache due to looking at the compressed block size rather than the actual memory allocated, which could be larger due to internal fragmentation.
diff --git a/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md b/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md
deleted file mode 100644
index 7dcbe099f..000000000
--- a/unreleased_history/bug_fixes/skip_insertion_tiered_sec_cache.md
+++ /dev/null
@@ -1 +0,0 @@
-Skip insertion of compressed blocks in the secondary cache if the lowest_used_cache_tier DB option is kVolatileTier.
diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc
index 37e39987e..11ab63b62 100644
--- a/util/compaction_job_stats_impl.cc
+++ b/util/compaction_job_stats_impl.cc
@@ -89,6 +89,8 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) {
   num_single_del_fallthru += stats.num_single_del_fallthru;
   num_single_del_mismatch += stats.num_single_del_mismatch;
+
+  is_remote_compaction |= stats.is_remote_compaction;
 }
diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc
index 3a64fc3fa..781d5a60a 100644
--- a/utilities/memory/memory_test.cc
+++ b/utilities/memory/memory_test.cc
@@ -34,15 +34,6 @@ class MemoryTest : public testing::Test {
     }
   }

-  void GetCachePointersFromTableFactory(
-      const TableFactory* factory,
-      std::unordered_set<const Cache*>* cache_set) {
-    const auto bbto = factory->GetOptions<BlockBasedTableOptions>();
-    if (bbto != nullptr) {
-      cache_set->insert(bbto->block_cache.get());
-    }
-  }
-
   void GetCachePointers(const std::vector<DB*>& dbs,
                         std::unordered_set<const Cache*>* cache_set) {
     cache_set->clear();
@@ -61,13 +52,8 @@
       cache_set->insert(db->GetDBOptions().row_cache.get());

       // Cache from table factories
-      std::unordered_map<std::string, const ImmutableOptions*> iopts_map;
       if (db_impl != nullptr) {
-        ASSERT_OK(db_impl->TEST_GetAllImmutableCFOptions(&iopts_map));
-      }
-      for (const auto& pair : iopts_map) {
-        GetCachePointersFromTableFactory(pair.second->table_factory.get(),
-                                         cache_set);
+        db_impl->TEST_GetAllBlockCaches(cache_set);
       }
     }
   }
@@ -266,4 +252,3 @@ int main(int argc, char** argv) {
   return 0;
 #endif
 }
-
diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc
index 5c4530e61..c6de3c2e0 100644
--- a/utilities/options/options_util_test.cc
+++ b/utilities/options/options_util_test.cc
@@ -179,6 +179,8 @@ class DummyTableFactory : public TableFactory {
   }

   std::string GetPrintableOptions() const override { return ""; }
+
+  std::unique_ptr<TableFactory> Clone() const override { return nullptr; }
 };

 class DummyMergeOperator : public MergeOperator
{ diff --git a/utilities/transactions/snapshot_checker.cc b/utilities/transactions/snapshot_checker.cc index fa94502cb..6b7a7a1a1 100644 --- a/utilities/transactions/snapshot_checker.cc +++ b/utilities/transactions/snapshot_checker.cc @@ -34,4 +34,21 @@ DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() { STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance); return &instance; } + +bool DataIsDefinitelyInSnapshot(SequenceNumber seqno, SequenceNumber snapshot, + const SnapshotChecker* snapshot_checker) { + return ((seqno) <= (snapshot) && + (snapshot_checker == nullptr || + LIKELY(snapshot_checker->CheckInSnapshot((seqno), (snapshot)) == + SnapshotCheckerResult::kInSnapshot))); +} + +bool DataIsDefinitelyNotInSnapshot(SequenceNumber seqno, + SequenceNumber snapshot, + const SnapshotChecker* snapshot_checker) { + return ((seqno) > (snapshot) || + (snapshot_checker != nullptr && + UNLIKELY(snapshot_checker->CheckInSnapshot((seqno), (snapshot)) == + SnapshotCheckerResult::kNotInSnapshot))); +} } // namespace ROCKSDB_NAMESPACE
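[Editor's note] The two helpers added to snapshot_checker.cc fold the checker's three-way answer into conservative predicates: a write at seqno is definitely in a snapshot only when seqno <= snapshot and the checker (if any) confirms it, and definitely not in only when seqno > snapshot or the checker denies it. They are not negations of each other; when the checker cannot decide (e.g. kSnapshotReleased), both return false. A Python sketch of that truth table, illustrative rather than the C++ API:

from enum import Enum

class CheckResult(Enum):
    IN_SNAPSHOT = 1
    NOT_IN_SNAPSHOT = 2
    SNAPSHOT_RELEASED = 3  # checker cannot decide

def definitely_in(seqno, snapshot, checker=None):
    return seqno <= snapshot and (
        checker is None or checker(seqno, snapshot) == CheckResult.IN_SNAPSHOT
    )

def definitely_not_in(seqno, snapshot, checker=None):
    return seqno > snapshot or (
        checker is not None
        and checker(seqno, snapshot) == CheckResult.NOT_IN_SNAPSHOT
    )

# When the checker cannot decide, neither predicate holds:
undecided = lambda seqno, snapshot: CheckResult.SNAPSHOT_RELEASED
assert not definitely_in(5, 10, undecided)
assert not definitely_not_in(5, 10, undecided)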