From 8522d9c74dd3daaacc2040067fbfd236de5adb24 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 11 Dec 2018 11:44:24 -0800 Subject: [PATCH 01/57] Prepare FragmentedRangeTombstoneIterator for use in compaction (#4740) Summary: To support the flush/compaction use cases of RangeDelAggregator in v2, FragmentedRangeTombstoneIterator now supports dropping tombstones that cannot be read in the compaction output file. Furthermore, FragmentedRangeTombstoneIterator supports the "snapshot striping" use case by allowing an iterator to be split by a list of snapshots. RangeDelAggregatorV2 will use these changes in a follow-up change. In the process of making these changes, other miscellaneous cleanups were also done in these files. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4740 Differential Revision: D13287382 Pulled By: abhimadan fbshipit-source-id: f5aeb03e1b3058049b80c02a558ee48f723fa48c --- db/memtable.cc | 2 +- db/range_del_aggregator_bench.cc | 4 +- db/range_del_aggregator_v2_test.cc | 34 ++-- db/range_tombstone_fragmenter.cc | 122 ++++++++++--- db/range_tombstone_fragmenter.h | 48 +++-- db/range_tombstone_fragmenter_test.cc | 247 ++++++++++++++++++++------ table/block_based_table_reader.cc | 2 +- 7 files changed, 341 insertions(+), 118 deletions(-) diff --git a/db/memtable.cc b/db/memtable.cc index c0166bb40..51b54d636 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -428,7 +428,7 @@ FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator( comparator_.comparator); auto* fragmented_iter = new FragmentedRangeTombstoneIterator( - fragmented_tombstone_list, read_seq, comparator_.comparator); + fragmented_tombstone_list, comparator_.comparator, read_seq); return fragmented_iter; } diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 01974702b..9fdcefc39 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -220,8 +220,8 @@ int main(int argc, char** argv) { 
std::unique_ptr fragmented_range_del_iter( new rocksdb::FragmentedRangeTombstoneIterator( - fragmented_range_tombstone_lists.back().get(), - rocksdb::kMaxSequenceNumber, icmp)); + fragmented_range_tombstone_lists.back().get(), icmp, + rocksdb::kMaxSequenceNumber)); if (FLAGS_use_v2_aggregator) { rocksdb::StopWatchNano stop_watch_add_tombstones( diff --git a/db/range_del_aggregator_v2_test.cc b/db/range_del_aggregator_v2_test.cc index 576d3339e..79cb548b1 100644 --- a/db/range_del_aggregator_v2_test.cc +++ b/db/range_del_aggregator_v2_test.cc @@ -173,8 +173,8 @@ TEST_F(RangeDelAggregatorV2Test, EmptyTruncatedIter) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber, - bytewise_icmp)); + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, nullptr); @@ -192,8 +192,8 @@ TEST_F(RangeDelAggregatorV2Test, UntruncatedIter) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber, - bytewise_icmp)); + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, nullptr); @@ -226,8 +226,8 @@ TEST_F(RangeDelAggregatorV2Test, UntruncatedIterWithSnapshot) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, 9 /* snapshot */, - bytewise_icmp)); + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + 9 /* snapshot */)); TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, nullptr); @@ -259,8 +259,8 @@ 
TEST_F(RangeDelAggregatorV2Test, TruncatedIter) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber, - bytewise_icmp)); + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); InternalKey smallest("d", 7, kTypeValue); InternalKey largest("m", 9, kTypeValue); @@ -294,8 +294,8 @@ TEST_F(RangeDelAggregatorV2Test, SingleIterInAggregator) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, kMaxSequenceNumber, - bytewise_icmp)); + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); range_del_agg.AddTombstones(std::move(input_iter)); @@ -321,8 +321,8 @@ TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregator) { RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); for (const auto& fragment_list : fragment_lists) { std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator( - fragment_list.get(), kMaxSequenceNumber, bytewise_icmp)); + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); range_del_agg.AddTombstones(std::move(input_iter)); } @@ -353,8 +353,8 @@ TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregatorWithUpperBound) { RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); for (const auto& fragment_list : fragment_lists) { std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), - 19 /* snapshot */, bytewise_icmp)); + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); range_del_agg.AddTombstones(std::move(input_iter)); } @@ -392,8 +392,8 @@ TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregator) { 
const auto& fragment_list = fragment_lists[i]; const auto& bounds = iter_bounds[i]; std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), - 19 /* snapshot */, bytewise_icmp)); + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); range_del_agg.AddTombstones(std::move(input_iter), &bounds.first, &bounds.second); } @@ -432,7 +432,7 @@ TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregatorSameLevel) { auto add_iter_to_agg = [&](size_t i) { std::unique_ptr input_iter( new FragmentedRangeTombstoneIterator(fragment_lists[i].get(), - 19 /* snapshot */, bytewise_icmp)); + bytewise_icmp, 19 /* snapshot */)); range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first, &iter_bounds[i].second); }; diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 4137f25cf..1748c5430 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -20,7 +20,8 @@ namespace rocksdb { FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( std::unique_ptr unfragmented_tombstones, - const InternalKeyComparator& icmp) { + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots) { if (unfragmented_tombstones == nullptr) { return; } @@ -43,7 +44,8 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( } } if (is_sorted) { - FragmentTombstones(std::move(unfragmented_tombstones), icmp); + FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction, + snapshots); return; } @@ -61,12 +63,13 @@ FragmentedRangeTombstoneList::FragmentedRangeTombstoneList( // VectorIterator implicitly sorts by key during construction. 
auto iter = std::unique_ptr( new VectorIterator(std::move(keys), std::move(values), &icmp)); - FragmentTombstones(std::move(iter), icmp); + FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots); } void FragmentedRangeTombstoneList::FragmentTombstones( std::unique_ptr unfragmented_tombstones, - const InternalKeyComparator& icmp) { + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots) { Slice cur_start_key(nullptr, 0); auto cmp = ParsedInternalKeyComparator(&icmp); @@ -117,10 +120,38 @@ void FragmentedRangeTombstoneList::FragmentTombstones( } std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(), std::greater()); + size_t start_idx = tombstone_seqs_.size(); size_t end_idx = start_idx + seqnums_to_flush.size(); - tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(), - seqnums_to_flush.end()); + + if (for_compaction) { + // Drop all tombstone seqnums that are not preserved by a snapshot. + SequenceNumber next_snapshot = kMaxSequenceNumber; + for (auto seq : seqnums_to_flush) { + if (seq <= next_snapshot) { + // This seqnum is visible by a lower snapshot. + tombstone_seqs_.push_back(seq); + seq_set_.insert(seq); + auto upper_bound_it = + std::lower_bound(snapshots.begin(), snapshots.end(), seq); + if (upper_bound_it == snapshots.begin()) { + // This seqnum is the topmost one visible by the earliest + // snapshot. None of the seqnums below it will be visible, so we + // can skip them. + break; + } + next_snapshot = *std::prev(upper_bound_it); + } + } + end_idx = tombstone_seqs_.size(); + } else { + // The fragmentation is being done for reads, so preserve all seqnums. 
+ tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(), + seqnums_to_flush.end()); + seq_set_.insert(seqnums_to_flush.begin(), seqnums_to_flush.end()); + } + + assert(start_idx < end_idx); tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx, end_idx); cur_start_key = cur_end_key; @@ -178,33 +209,41 @@ void FragmentedRangeTombstoneList::FragmentTombstones( } } +bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower, + SequenceNumber upper) const { + auto seq_it = seq_set_.lower_bound(lower); + return seq_it != seq_set_.end() && *seq_it <= upper; +} + FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( - const FragmentedRangeTombstoneList* tombstones, SequenceNumber snapshot, - const InternalKeyComparator& icmp) + const FragmentedRangeTombstoneList* tombstones, + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + SequenceNumber _lower_bound) : tombstone_start_cmp_(icmp.user_comparator()), tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), ucmp_(icmp.user_comparator()), tombstones_(tombstones), - snapshot_(snapshot) { + upper_bound_(_upper_bound), + lower_bound_(_lower_bound) { assert(tombstones_ != nullptr); - pos_ = tombstones_->end(); - pinned_pos_ = tombstones_->end(); + Invalidate(); } FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator( const std::shared_ptr& tombstones, - SequenceNumber snapshot, const InternalKeyComparator& icmp) + const InternalKeyComparator& icmp, SequenceNumber _upper_bound, + SequenceNumber _lower_bound) : tombstone_start_cmp_(icmp.user_comparator()), tombstone_end_cmp_(icmp.user_comparator()), + icmp_(&icmp), ucmp_(icmp.user_comparator()), tombstones_ref_(tombstones), tombstones_(tombstones_ref_.get()), - snapshot_(snapshot) { + upper_bound_(_upper_bound), + lower_bound_(_lower_bound) { assert(tombstones_ != nullptr); - pos_ = tombstones_->end(); - seq_pos_ = tombstones_->seq_end(); - pinned_pos_ = tombstones_->end(); - pinned_seq_pos_ = 
tombstones_->seq_end(); + Invalidate(); } void FragmentedRangeTombstoneIterator::SeekToFirst() { @@ -220,7 +259,7 @@ void FragmentedRangeTombstoneIterator::SeekToTopFirst() { pos_ = tombstones_->begin(); seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); ScanForwardToVisibleTombstone(); } @@ -237,7 +276,7 @@ void FragmentedRangeTombstoneIterator::SeekToTopLast() { pos_ = std::prev(tombstones_->end()); seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); ScanBackwardToVisibleTombstone(); } @@ -270,7 +309,7 @@ void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone( } seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); } void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone( @@ -289,25 +328,28 @@ void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone( --pos_; seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); } void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() { while (pos_ != tombstones_->end() && - seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) { + (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) || + *seq_pos_ < lower_bound_)) { ++pos_; if (pos_ == tombstones_->end()) { + Invalidate(); return; } seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); } } void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() { while (pos_ != tombstones_->end() && - seq_pos_ == 
tombstones_->seq_iter(pos_->seq_end_idx)) { + (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) || + *seq_pos_ < lower_bound_)) { if (pos_ == tombstones_->begin()) { Invalidate(); return; @@ -315,7 +357,7 @@ void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() { --pos_; seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); } } @@ -333,14 +375,13 @@ void FragmentedRangeTombstoneIterator::TopNext() { } seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); ScanForwardToVisibleTombstone(); } void FragmentedRangeTombstoneIterator::Prev() { if (seq_pos_ == tombstones_->seq_begin()) { - pos_ = tombstones_->end(); - seq_pos_ = tombstones_->seq_end(); + Invalidate(); return; } --seq_pos_; @@ -358,7 +399,7 @@ void FragmentedRangeTombstoneIterator::TopPrev() { --pos_; seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx), tombstones_->seq_iter(pos_->seq_end_idx), - snapshot_, std::greater()); + upper_bound_, std::greater()); ScanBackwardToVisibleTombstone(); } @@ -372,4 +413,27 @@ SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum( return ValidPos() && ucmp_->Compare(start_key(), user_key) <= 0 ? 
seq() : 0; } +std::map> +FragmentedRangeTombstoneIterator::SplitBySnapshot( + const std::vector& snapshots) { + std::map> + splits; + SequenceNumber lower = 0; + SequenceNumber upper; + for (size_t i = 0; i <= snapshots.size(); i++) { + if (i >= snapshots.size()) { + upper = kMaxSequenceNumber; + } else { + upper = snapshots[i]; + } + if (tombstones_->ContainsRange(lower, upper)) { + splits.emplace(upper, std::unique_ptr( + new FragmentedRangeTombstoneIterator( + tombstones_, *icmp_, upper, lower))); + } + lower = upper + 1; + } + return splits; +} + } // namespace rocksdb diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index 2ad346af1..306a0347b 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -38,7 +39,8 @@ struct FragmentedRangeTombstoneList { }; FragmentedRangeTombstoneList( std::unique_ptr unfragmented_tombstones, - const InternalKeyComparator& icmp); + const InternalKeyComparator& icmp, bool for_compaction = false, + const std::vector& snapshots = {}); std::vector::const_iterator begin() const { return tombstones_.begin(); @@ -60,7 +62,11 @@ struct FragmentedRangeTombstoneList { return tombstone_seqs_.end(); } - bool empty() const { return tombstones_.size() == 0; } + bool empty() const { return tombstones_.empty(); } + + // Returns true if the stored tombstones contain one with a sequence + // number in [lower, upper]. + bool ContainsRange(SequenceNumber lower, SequenceNumber upper) const; private: // Given an ordered range tombstone iterator unfragmented_tombstones, @@ -68,10 +74,12 @@ struct FragmentedRangeTombstoneList { // tombstones_ and tombstone_seqs_. 
void FragmentTombstones( std::unique_ptr unfragmented_tombstones, - const InternalKeyComparator& icmp); + const InternalKeyComparator& icmp, bool for_compaction, + const std::vector& snapshots); std::vector tombstones_; std::vector tombstone_seqs_; + std::set seq_set_; std::list pinned_slices_; PinnedIteratorsManager pinned_iters_mgr_; }; @@ -88,11 +96,13 @@ struct FragmentedRangeTombstoneList { class FragmentedRangeTombstoneIterator : public InternalIterator { public: FragmentedRangeTombstoneIterator( - const FragmentedRangeTombstoneList* tombstones, SequenceNumber snapshot, - const InternalKeyComparator& icmp); + const FragmentedRangeTombstoneList* tombstones, + const InternalKeyComparator& icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound = 0); FragmentedRangeTombstoneIterator( const std::shared_ptr& tombstones, - SequenceNumber snapshot, const InternalKeyComparator& icmp); + const InternalKeyComparator& icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound = 0); void SeekToFirst() override; void SeekToLast() override; @@ -136,10 +146,6 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { seq_pos_ = tombstones_->seq_end(); } - // TODO: implement properly - RangeTombstone tombstone() const { - return RangeTombstone(start_key(), end_key(), seq()); - } Slice start_key() const { return pos_->start_key; } Slice end_key() const { return pos_->end_key; } SequenceNumber seq() const { return *seq_pos_; } @@ -151,12 +157,24 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber, kTypeRangeDeletion); } - ParsedInternalKey internal_key() const { - return ParsedInternalKey(pos_->start_key, *seq_pos_, kTypeRangeDeletion); - } SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key); + // Splits the iterator into n+1 iterators (where n is the number of + // snapshots), each providing a view over a "stripe" of sequence numbers. 
The + // iterators are keyed by the upper bound of their ranges (the provided + // snapshots + kMaxSequenceNumber). + // + // NOTE: the iterators in the returned map are no longer valid if their + // parent iterator is deleted, since they do not modify the refcount of the + // underlying tombstone list. Therefore, this map should be deleted before + // the parent iterator. + std::map> + SplitBySnapshot(const std::vector& snapshots); + + SequenceNumber upper_bound() const { return upper_bound_; } + SequenceNumber lower_bound() const { return lower_bound_; } + private: using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack; @@ -217,10 +235,12 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { const RangeTombstoneStackStartComparator tombstone_start_cmp_; const RangeTombstoneStackEndComparator tombstone_end_cmp_; + const InternalKeyComparator* icmp_; const Comparator* ucmp_; std::shared_ptr tombstones_ref_; const FragmentedRangeTombstoneList* tombstones_; - SequenceNumber snapshot_; + SequenceNumber upper_bound_; + SequenceNumber lower_bound_; std::vector::const_iterator pos_; std::vector::const_iterator seq_pos_; mutable std::vector::const_iterator pinned_pos_; diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index fc6eddc29..ddd3f7741 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -29,15 +29,26 @@ std::unique_ptr MakeRangeDelIter( new test::VectorIterator(keys, values)); } +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); + + // Test FragmentedRangeTombstoneIterator interface. 
+ EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); +} + void VerifyFragmentedRangeDels( FragmentedRangeTombstoneIterator* iter, const std::vector& expected_tombstones) { iter->SeekToFirst(); - for (size_t i = 0; i < expected_tombstones.size() && iter->Valid(); - i++, iter->Next()) { - EXPECT_EQ(iter->start_key(), expected_tombstones[i].start_key_); - EXPECT_EQ(iter->value(), expected_tombstones[i].end_key_); - EXPECT_EQ(iter->seq(), expected_tombstones[i].seq_); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); } EXPECT_FALSE(iter->Valid()); } @@ -46,11 +57,9 @@ void VerifyVisibleTombstones( FragmentedRangeTombstoneIterator* iter, const std::vector& expected_tombstones) { iter->SeekToTopFirst(); - for (size_t i = 0; i < expected_tombstones.size() && iter->Valid(); - i++, iter->TopNext()) { - EXPECT_EQ(iter->start_key(), expected_tombstones[i].start_key_); - EXPECT_EQ(iter->value(), expected_tombstones[i].end_key_); - EXPECT_EQ(iter->seq(), expected_tombstones[i].seq_); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); } EXPECT_FALSE(iter->Valid()); } @@ -69,9 +78,7 @@ void VerifySeek(FragmentedRangeTombstoneIterator* iter, ASSERT_FALSE(iter->Valid()); } else { ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(testcase.expected_position.start_key_, iter->start_key()); - EXPECT_EQ(testcase.expected_position.end_key_, iter->value()); - EXPECT_EQ(testcase.expected_position.seq_, iter->seq()); + CheckIterPosition(testcase.expected_position, iter); } } } @@ -84,9 +91,7 @@ void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter, ASSERT_FALSE(iter->Valid()); } else { ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(testcase.expected_position.start_key_, 
iter->start_key()); - EXPECT_EQ(testcase.expected_position.end_key_, iter->value()); - EXPECT_EQ(testcase.expected_position.seq_, iter->seq()); + CheckIterPosition(testcase.expected_position, iter); } } } @@ -112,8 +117,10 @@ TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}}); VerifyMaxCoveringTombstoneSeqnum(&iter, {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}}); @@ -124,8 +131,10 @@ TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels( &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}}); VerifyMaxCoveringTombstoneSeqnum(&iter, @@ -138,8 +147,10 @@ TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels( &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}}); VerifyMaxCoveringTombstoneSeqnum(&iter, @@ -152,8 +163,10 @@ 
TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}}); VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}}); @@ -165,8 +178,10 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}, @@ -186,8 +201,10 @@ TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound()); VerifyFragmentedRangeDels(&iter, {{"a", "c", 30}, {"a", "c", 20}, {"a", "c", 10}, @@ -211,16 +228,16 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); - FragmentedRangeTombstoneIterator iter2(&fragment_list, 9 /* snapshot */, 
- bytewise_icmp); - FragmentedRangeTombstoneIterator iter3(&fragment_list, 7 /* snapshot */, - bytewise_icmp); - FragmentedRangeTombstoneIterator iter4(&fragment_list, 5 /* snapshot */, - bytewise_icmp); - FragmentedRangeTombstoneIterator iter5(&fragment_list, 3 /* snapshot */, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp, + 7 /* upper_bound */); + FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp, + 5 /* upper_bound */); + FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) { VerifyFragmentedRangeDels(iter, {{"a", "c", 10}, {"c", "e", 10}, @@ -234,6 +251,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { {"l", "n", 4}}); } + ASSERT_EQ(0, iter1.lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound()); VerifyVisibleTombstones(&iter1, {{"a", "c", 10}, {"c", "e", 10}, {"e", "g", 8}, @@ -243,6 +262,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { VerifyMaxCoveringTombstoneSeqnum( &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + ASSERT_EQ(0, iter2.lower_bound()); + ASSERT_EQ(9, iter2.upper_bound()); VerifyVisibleTombstones(&iter2, {{"c", "e", 8}, {"e", "g", 8}, {"g", "i", 6}, @@ -251,6 +272,8 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { VerifyMaxCoveringTombstoneSeqnum( &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); + ASSERT_EQ(0, iter3.lower_bound()); + ASSERT_EQ(7, iter3.upper_bound()); VerifyVisibleTombstones(&iter3, {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}, @@ -259,10 +282,14 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) { VerifyMaxCoveringTombstoneSeqnum( 
&iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}}); + ASSERT_EQ(0, iter4.lower_bound()); + ASSERT_EQ(5, iter4.upper_bound()); VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}}); VerifyMaxCoveringTombstoneSeqnum( &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}}); + ASSERT_EQ(0, iter5.lower_bound()); + ASSERT_EQ(3, iter5.upper_bound()); VerifyVisibleTombstones(&iter5, {{"j", "l", 2}}); VerifyMaxCoveringTombstoneSeqnum( &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}}); @@ -277,8 +304,10 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, 9 /* snapshot */, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + 9 /* upper_bound */); + ASSERT_EQ(0, iter.lower_bound()); + ASSERT_EQ(9, iter.upper_bound()); VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, @@ -293,6 +322,116 @@ TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) { &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}}); } +TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {} /* snapshots */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, + OverlapAndRepeatedStartKeyForCompactionWithSnapshot) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + 
{"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list( + std::move(range_del_iter), bytewise_icmp, true /* for_compaction */, + {20, 9} /* upper_bounds */); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + VerifyFragmentedRangeDels(&iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({} /* snapshots */); + ASSERT_EQ(1, split_iters.size()); + + auto* split_iter = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(0, split_iter->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound()); + VerifyVisibleTombstones(split_iter, {{"a", "c", 10}, + {"c", "e", 10}, + {"e", "g", 8}, + {"g", "i", 6}, + {"j", "l", 4}, + {"l", "n", 4}}); +} + +TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, + {"j", "n", 4}, + {"c", "i", 6}, + {"c", "g", 8}, + {"j", "l", 2}}); + + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber /* upper_bound */); + + auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */); + ASSERT_EQ(5, split_iters.size()); + + auto* split_iter1 = split_iters[3].get(); + ASSERT_EQ(0, split_iter1->lower_bound()); + ASSERT_EQ(3, split_iter1->upper_bound()); + VerifyVisibleTombstones(split_iter1, {{"j", "l", 
2}}); + + auto* split_iter2 = split_iters[5].get(); + ASSERT_EQ(4, split_iter2->lower_bound()); + ASSERT_EQ(5, split_iter2->upper_bound()); + VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}}); + + auto* split_iter3 = split_iters[7].get(); + ASSERT_EQ(6, split_iter3->lower_bound()); + ASSERT_EQ(7, split_iter3->upper_bound()); + VerifyVisibleTombstones(split_iter3, + {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}}); + + auto* split_iter4 = split_iters[9].get(); + ASSERT_EQ(8, split_iter4->lower_bound()); + ASSERT_EQ(9, split_iter4->upper_bound()); + VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}}); + + auto* split_iter5 = split_iters[kMaxSequenceNumber].get(); + ASSERT_EQ(10, split_iter5->lower_bound()); + ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound()); + VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}}); +} + TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { // Same tombstones as OverlapAndRepeatedStartKey. auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, @@ -304,8 +443,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); VerifySeek( &iter1, {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); @@ -313,8 +452,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) { &iter1, {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}}); - FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); VerifySeek(&iter2, {{"a", {"j", "l", 2}}, {"e", {"j", "l", 2}}, {"l", {}, true /* out of range */}}); @@ -334,8 +473,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekCovered) 
{ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); VerifySeek( &iter1, {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); @@ -343,8 +482,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekCovered) { &iter1, {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}}); - FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); VerifySeek(&iter2, {{"b", {"j", "l", 2}}, {"f", {"j", "l", 2}}, {"m", {}, true /* out of range */}}); @@ -364,8 +503,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter1(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); VerifySeek(&iter1, {{"c", {"c", "e", 10}}, {"g", {"g", "i", 6}}, {"i", {"j", "l", 4}}, @@ -375,8 +514,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) { {"i", {"g", "i", 6}}, {"n", {"l", "n", 4}}}); - FragmentedRangeTombstoneIterator iter2(&fragment_list, 3 /* snapshot */, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp, + 3 /* upper_bound */); VerifySeek(&iter2, {{"c", {"j", "l", 2}}, {"g", {"j", "l", 2}}, {"i", {"j", "l", 2}}, @@ -398,8 +537,8 @@ TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) { FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), bytewise_icmp); - FragmentedRangeTombstoneIterator iter(&fragment_list, kMaxSequenceNumber, - bytewise_icmp); + FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp, + kMaxSequenceNumber); 
VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}}); VerifySeekForPrev(&iter, {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}}); diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index fbc9af4ba..a126de88c 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -2348,7 +2348,7 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( snapshot = read_options.snapshot->GetSequenceNumber(); } return new FragmentedRangeTombstoneIterator( - rep_->fragmented_range_dels, snapshot, rep_->internal_comparator); + rep_->fragmented_range_dels, rep_->internal_comparator, snapshot); } InternalIterator* BlockBasedTable::NewUnfragmentedRangeTombstoneIterator( From 96de211f4cadd547a3662f4ac1595888d3d05b0c Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 17 Dec 2018 13:12:22 -0800 Subject: [PATCH 02/57] Add compaction logic to RangeDelAggregatorV2 (#4758) Summary: RangeDelAggregatorV2 now supports ShouldDelete calls on snapshot stripes and creation of range tombstone compaction iterators. RangeDelAggregator is no longer used on any non-test code path, and will be removed in a future commit. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4758 Differential Revision: D13439254 Pulled By: abhimadan fbshipit-source-id: fe105bcf8e3d4a2df37a622d5510843cd71b0401 --- db/builder.cc | 22 +- db/builder.h | 6 +- db/column_family.cc | 2 +- db/compaction_iterator.cc | 4 +- db/compaction_iterator.h | 8 +- db/compaction_iterator_test.cc | 14 +- db/compaction_job.cc | 39 ++-- db/compaction_job.h | 6 +- db/db_compaction_filter_test.cc | 12 +- db/db_impl_open.cc | 54 ++--- db/db_iter.cc | 6 +- db/db_iter.h | 2 +- db/db_test_util.cc | 12 +- db/flush_job.cc | 16 +- db/forward_iterator.cc | 12 +- db/merge_helper.cc | 5 +- db/merge_helper.h | 4 +- db/range_del_aggregator_bench.cc | 2 +- db/range_del_aggregator_v2.cc | 311 +++++++++++++++++++++++------ db/range_del_aggregator_v2.h | 229 +++++++++++++++++---- db/range_del_aggregator_v2_test.cc | 254 ++++++++++++++++++++++- db/range_tombstone_fragmenter.cc | 13 +- db/range_tombstone_fragmenter.h | 3 + db/repair.cc | 11 +- db/version_set.cc | 4 +- util/heap.h | 4 +- utilities/debug.cc | 4 +- 27 files changed, 829 insertions(+), 230 deletions(-) diff --git a/db/builder.cc b/db/builder.cc index 0d896846f..60067c425 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -18,6 +18,7 @@ #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" +#include "db/range_del_aggregator_v2.h" #include "db/table_cache.h" #include "db/version_edit.h" #include "monitoring/iostats_context_imp.h" @@ -65,8 +66,9 @@ Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& ioptions, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, TableCache* table_cache, InternalIterator* iter, - std::unique_ptr range_del_iter, FileMetaData* meta, - const InternalKeyComparator& internal_comparator, + std::vector> + range_del_iters, + FileMetaData* meta, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t 
column_family_id, const std::string& column_family_name, @@ -86,12 +88,10 @@ Status BuildTable( Status s; meta->fd.file_size = 0; iter->SeekToFirst(); - std::unique_ptr range_del_agg( - new RangeDelAggregator(internal_comparator, snapshots)); - s = range_del_agg->AddTombstones(std::move(range_del_iter)); - if (!s.ok()) { - // may be non-ok if a range tombstone key is unparsable - return s; + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregatorV2(&internal_comparator, snapshots)); + for (auto& range_del_iter : range_del_iters) { + range_del_agg->AddTombstones(std::move(range_del_iter)); } std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(), @@ -158,8 +158,10 @@ Status BuildTable( } } - for (auto it = range_del_agg->NewIterator(); it->Valid(); it->Next()) { - auto tombstone = it->Tombstone(); + auto range_del_it = range_del_agg->NewIterator(); + for (range_del_it->SeekToFirst(); range_del_it->Valid(); + range_del_it->Next()) { + auto tombstone = range_del_it->Tombstone(); auto kv = tombstone.Serialize(); builder->Add(kv.first.Encode(), kv.second); meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(), diff --git a/db/builder.h b/db/builder.h index 9995723df..b81355703 100644 --- a/db/builder.h +++ b/db/builder.h @@ -9,6 +9,7 @@ #include #include #include +#include "db/range_tombstone_fragmenter.h" #include "db/table_properties_collector.h" #include "options/cf_options.h" #include "rocksdb/comparator.h" @@ -65,8 +66,9 @@ extern Status BuildTable( const std::string& dbname, Env* env, const ImmutableCFOptions& options, const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, TableCache* table_cache, InternalIterator* iter, - std::unique_ptr range_del_iter, FileMetaData* meta, - const InternalKeyComparator& internal_comparator, + std::vector> + range_del_iters, + FileMetaData* meta, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, uint32_t 
column_family_id, const std::string& column_family_name, diff --git a/db/column_family.cc b/db/column_family.cc index 29298e62a..c1a85a341 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -945,7 +945,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); auto read_seq = super_version->current->version_set()->LastSequence(); - RangeDelAggregatorV2 range_del_agg(&internal_comparator_, read_seq); + ReadRangeDelAggregatorV2 range_del_agg(&internal_comparator_, read_seq); auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); range_del_agg.AddTombstones( diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index d81b630f3..ad45602cc 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -18,7 +18,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, const Compaction* compaction, + CompactionRangeDelAggregatorV2* range_del_agg, const Compaction* compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber preserve_deletes_seqnum) @@ -36,7 +36,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index 71359169c..1f6a135b8 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -13,7 +13,7 @@ #include "db/compaction_iteration_stats.h" #include "db/merge_helper.h" #include 
"db/pinned_iterators_manager.h" -#include "db/range_del_aggregator.h" +#include "db/range_del_aggregator_v2.h" #include "db/snapshot_checker.h" #include "options/cf_options.h" #include "rocksdb/compaction_filter.h" @@ -64,7 +64,7 @@ class CompactionIterator { SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, @@ -77,7 +77,7 @@ class CompactionIterator { SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, @@ -141,7 +141,7 @@ class CompactionIterator { Env* env_; bool report_detailed_time_; bool expect_valid_internal_key_; - RangeDelAggregator* range_del_agg_; + CompactionRangeDelAggregatorV2* range_del_agg_; std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index 03c5a9c62..a81efafaa 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -221,10 +221,16 @@ class CompactionIteratorTest : public testing::TestWithParam { MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr, bool bottommost_level = false, SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber) { - std::unique_ptr range_del_iter( + std::unique_ptr unfragmented_range_del_iter( new test::VectorIterator(range_del_ks, range_del_vs)); - range_del_agg_.reset(new 
RangeDelAggregator(icmp_, snapshots_)); - ASSERT_OK(range_del_agg_->AddTombstones(std::move(range_del_iter))); + auto tombstone_list = std::make_shared( + std::move(unfragmented_range_del_iter), icmp_); + std::unique_ptr range_del_iter( + new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, + kMaxSequenceNumber)); + range_del_agg_.reset( + new CompactionRangeDelAggregatorV2(&icmp_, snapshots_)); + range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; if (filter || bottommost_level) { @@ -292,7 +298,7 @@ class CompactionIteratorTest : public testing::TestWithParam { std::unique_ptr merge_helper_; std::unique_ptr iter_; std::unique_ptr c_iter_; - std::unique_ptr range_del_agg_; + std::unique_ptr range_del_agg_; std::unique_ptr snapshot_checker_; std::atomic shutting_down_{false}; FakeCompaction* compaction_proxy_; diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 8a878fe72..17be3156b 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -805,15 +805,13 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact != nullptr); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - RangeDelAggregatorV2 range_del_agg_v2(&cfd->internal_comparator(), - kMaxSequenceNumber /* upper_bound */); - auto* range_del_agg = - range_del_agg_v2.DelegateToRangeDelAggregator(existing_snapshots_); + CompactionRangeDelAggregatorV2 range_del_agg(&cfd->internal_comparator(), + existing_snapshots_); // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. 
std::unique_ptr input(versions_->MakeInputIterator( - sub_compact->compaction, &range_del_agg_v2, env_optiosn_for_read_)); + sub_compact->compaction, &range_del_agg, env_optiosn_for_read_)); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_PROCESS_KV); @@ -902,8 +900,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(), &existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_), false, - range_del_agg, sub_compact->compaction, compaction_filter, shutting_down_, - preserve_deletes_seqnum_)); + &range_del_agg, sub_compact->compaction, compaction_filter, + shutting_down_, preserve_deletes_seqnum_)); auto c_iter = sub_compact->c_iter.get(); c_iter->SeekToFirst(); if (c_iter->Valid() && sub_compact->compaction->output_level() != 0) { @@ -1041,7 +1039,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } CompactionIterationStats range_del_out_stats; status = - FinishCompactionOutputFile(input_status, sub_compact, range_del_agg, + FinishCompactionOutputFile(input_status, sub_compact, &range_del_agg, &range_del_out_stats, next_key); RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); @@ -1092,8 +1090,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } if (status.ok() && sub_compact->builder == nullptr && - sub_compact->outputs.size() == 0 && - !range_del_agg->IsEmpty()) { + sub_compact->outputs.size() == 0 && !range_del_agg.IsEmpty()) { // handle subcompaction containing only range deletions status = OpenCompactionOutputFile(sub_compact); } @@ -1102,7 +1099,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // close the output file. 
if (sub_compact->builder != nullptr) { CompactionIterationStats range_del_out_stats; - Status s = FinishCompactionOutputFile(status, sub_compact, range_del_agg, + Status s = FinishCompactionOutputFile(status, sub_compact, &range_del_agg, &range_del_out_stats); if (status.ok()) { status = s; @@ -1168,7 +1165,7 @@ void CompactionJob::RecordDroppedKeys( Status CompactionJob::FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key /* = nullptr */) { AutoThreadOperationStageUpdater stage_updater( @@ -1220,11 +1217,6 @@ Status CompactionJob::FinishCompactionOutputFile( if (existing_snapshots_.size() > 0) { earliest_snapshot = existing_snapshots_[0]; } - auto it = range_del_agg->NewIterator(); - if (lower_bound != nullptr) { - it->Seek(*lower_bound); - } - bool has_overlapping_endpoints; if (upper_bound != nullptr && meta->largest.size() > 0) { has_overlapping_endpoints = @@ -1232,6 +1224,17 @@ Status CompactionJob::FinishCompactionOutputFile( } else { has_overlapping_endpoints = false; } + + auto it = range_del_agg->NewIterator(lower_bound, upper_bound, + has_overlapping_endpoints); + // Position the range tombstone output iterator. There may be tombstone + // fragments that are entirely out of range, so make sure that we do not + // include those. 
+ if (lower_bound != nullptr) { + it->Seek(*lower_bound); + } else { + it->SeekToFirst(); + } for (; it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); if (upper_bound != nullptr) { @@ -1257,6 +1260,8 @@ Status CompactionJob::FinishCompactionOutputFile( } auto kv = tombstone.Serialize(); + assert(lower_bound == nullptr || + ucmp->Compare(*lower_bound, kv.second) < 0); sub_compact->builder->Add(kv.first.Encode(), kv.second); InternalKey smallest_candidate = std::move(kv.first); if (lower_bound != nullptr && diff --git a/db/compaction_job.h b/db/compaction_job.h index a31e8c142..86d97e1db 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -25,12 +25,12 @@ #include "db/job_context.h" #include "db/log_writer.h" #include "db/memtable_list.h" -#include "db/range_del_aggregator.h" +#include "db/range_del_aggregator_v2.h" #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" -#include "options/db_options.h" #include "options/cf_options.h" +#include "options/db_options.h" #include "port/port.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_job_stats.h" @@ -104,7 +104,7 @@ class CompactionJob { Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 8dc2ce32c..63d2829d5 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -340,8 +340,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { Arena arena; { InternalKeyComparator icmp(options.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg( + &icmp, 
kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); @@ -430,8 +430,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count = 0; { InternalKeyComparator icmp(options.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg( + &icmp, kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); @@ -648,8 +648,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { int total = 0; Arena arena; InternalKeyComparator icmp(options.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* snapshots */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* snapshots */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber)); iter->SeekToFirst(); diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 24e649973..5ea8c61b5 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -23,8 +23,7 @@ #include "util/sync_point.h" namespace rocksdb { -Options SanitizeOptions(const std::string& dbname, - const Options& src) { +Options SanitizeOptions(const std::string& dbname, const Options& src) { auto db_options = SanitizeOptions(dbname, DBOptions(src)); ImmutableDBOptions immutable_db_options(db_options); auto cf_options = @@ -56,10 +55,9 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.write_buffer_manager.reset( new WriteBufferManager(result.db_write_buffer_size)); } - auto bg_job_limits = DBImpl::GetBGJobLimits(result.max_background_flushes, - result.max_background_compactions, - result.max_background_jobs, - true /* parallelize_compactions */); + auto bg_job_limits = DBImpl::GetBGJobLimits( + 
result.max_background_flushes, result.max_background_compactions, + result.max_background_jobs, true /* parallelize_compactions */); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions, Env::Priority::LOW); result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes, @@ -107,14 +105,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.db_paths.emplace_back(dbname, std::numeric_limits::max()); } - if (result.use_direct_reads && - result.compaction_readahead_size == 0) { + if (result.use_direct_reads && result.compaction_readahead_size == 0) { TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr); result.compaction_readahead_size = 1024 * 1024 * 2; } - if (result.compaction_readahead_size > 0 || - result.use_direct_reads) { + if (result.compaction_readahead_size > 0 || result.use_direct_reads) { result.new_table_reader_for_compaction_inputs = true; } @@ -218,7 +214,7 @@ static Status ValidateOptions( return Status::OK(); } -} // namespace +} // namespace Status DBImpl::NewDB() { VersionEdit new_db; new_db.SetLogNumber(0); @@ -258,9 +254,8 @@ Status DBImpl::NewDB() { return s; } -Status DBImpl::CreateAndNewDirectory( - Env* env, const std::string& dirname, - std::unique_ptr* directory) { +Status DBImpl::CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) { // We call CreateDirIfMissing() as the directory may already exist (if we // are reopening a DB), when this happens we don't want creating the // directory to cause an error. 
However, we need to check if creating the @@ -341,8 +336,8 @@ Status DBImpl::Recover( } } else if (s.ok()) { if (immutable_db_options_.error_if_exists) { - return Status::InvalidArgument( - dbname_, "exists (error_if_exists is true)"); + return Status::InvalidArgument(dbname_, + "exists (error_if_exists is true)"); } } else { // Unexpected error reading file @@ -527,10 +522,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, std::map cf_name_id_map; std::map cf_lognumber_map; for (auto cfd : *versions_->GetColumnFamilySet()) { - cf_name_id_map.insert( - std::make_pair(cfd->GetName(), cfd->GetID())); + cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID())); cf_lognumber_map.insert( - std::make_pair(cfd->GetID(), cfd->GetLogNumber())); + std::make_pair(cfd->GetID(), cfd->GetLogNumber())); } immutable_db_options_.wal_filter->ColumnFamilyLogNumberMap(cf_lognumber_map, @@ -880,8 +874,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // VersionSet::next_file_number_ always to be strictly greater than any // log number versions_->MarkFileNumberUsed(max_log_number + 1); - status = versions_->LogAndApply( - cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + edit, &mutex_); if (!status.ok()) { // Recovery failed break; @@ -994,12 +988,17 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, if (use_custom_gc_ && snapshot_checker == nullptr) { snapshot_checker = DisableGCSnapshotChecker::Instance(); } + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } s = BuildTable( dbname_, env_, *cfd->ioptions(), mutable_cf_options, env_options_for_compaction_, cfd->table_cache(), iter.get(), - std::unique_ptr( - mem->NewRangeTombstoneIterator(ro, versions_->LastSequence())), - &meta, 
cfd->internal_comparator(), + std::move(range_del_iters), &meta, cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), @@ -1033,8 +1032,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, stats.bytes_written = meta.fd.GetFileSize(); stats.num_output_files = 1; cfd->internal_stats()->AddCompactionStats(level, stats); - cfd->internal_stats()->AddCFStats( - InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); + cfd->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); return s; } @@ -1227,7 +1226,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " - "its options.merge_operator is non-null", cfd->GetName().c_str()); + "its options.merge_operator is non-null", + cfd->GetName().c_str()); } if (!s.ok()) { break; diff --git a/db/db_iter.cc b/db/db_iter.cc index 78a6bf47c..cc4a0d5f4 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -171,7 +171,7 @@ class DBIter final: public Iterator { iter_ = iter; iter_->SetPinnedItersMgr(&pinned_iters_mgr_); } - virtual RangeDelAggregatorV2* GetRangeDelAggregator() { + virtual ReadRangeDelAggregatorV2* GetRangeDelAggregator() { return &range_del_agg_; } @@ -341,7 +341,7 @@ class DBIter final: public Iterator { const bool total_order_seek_; // List of operands for merge operator. 
MergeContext merge_context_; - RangeDelAggregatorV2 range_del_agg_; + ReadRangeDelAggregatorV2 range_del_agg_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; ReadCallback* read_callback_; @@ -1479,7 +1479,7 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } -RangeDelAggregatorV2* ArenaWrappedDBIter::GetRangeDelAggregator() { +ReadRangeDelAggregatorV2* ArenaWrappedDBIter::GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } diff --git a/db/db_iter.h b/db/db_iter.h index 3d359bbb1..6ee869135 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -48,7 +48,7 @@ class ArenaWrappedDBIter : public Iterator { // Get the arena to be used to allocate memory for DBIter to be wrapped, // as well as child iterators in it. virtual Arena* GetArena() { return &arena_; } - virtual RangeDelAggregatorV2* GetRangeDelAggregator(); + virtual ReadRangeDelAggregatorV2* GetRangeDelAggregator(); // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. 
diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 50092653b..eeff7be51 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -814,8 +814,8 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { Arena arena; auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter; if (cf == 0) { iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, @@ -1227,8 +1227,8 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { Arena arena; auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); // This should be defined after range_del_agg so that it destructs the // assigned iterator before it range_del_agg is already destructed. 
ScopedArenaIterator iter; @@ -1437,8 +1437,8 @@ void DBTestBase::VerifyDBInternal( std::vector> true_data) { Arena arena; InternalKeyComparator icmp(last_options_.comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); auto iter = dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); iter->SeekToFirst(); diff --git a/db/flush_job.cc b/db/flush_job.cc index 17ec22ed9..8769c849e 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -24,14 +24,15 @@ #include "db/event_helpers.h" #include "db/log_reader.h" #include "db/log_writer.h" +#include "db/memtable.h" #include "db/memtable_list.h" #include "db/merge_context.h" +#include "db/range_tombstone_fragmenter.h" #include "db/version_set.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" #include "monitoring/thread_status_util.h" #include "port/port.h" -#include "db/memtable.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/statistics.h" @@ -295,7 +296,8 @@ Status FlushJob::WriteLevel0Table() { // memtable and its associated range deletion memtable, respectively, at // corresponding indexes. 
std::vector memtables; - std::vector range_del_iters; + std::vector> + range_del_iters; ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -308,9 +310,9 @@ Status FlushJob::WriteLevel0Table() { cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber()); memtables.push_back(m->NewIterator(ro, &arena)); auto* range_del_iter = - m->NewRangeTombstoneIterator(ro, versions_->LastSequence()); + m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); if (range_del_iter != nullptr) { - range_del_iters.push_back(range_del_iter); + range_del_iters.emplace_back(range_del_iter); } total_num_entries += m->num_entries(); total_num_deletes += m->num_deletes(); @@ -329,10 +331,6 @@ Status FlushJob::WriteLevel0Table() { ScopedArenaIterator iter( NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], static_cast(memtables.size()), &arena)); - std::unique_ptr range_del_iter(NewMergingIterator( - &cfd_->internal_comparator(), - range_del_iters.empty() ? nullptr : &range_del_iters[0], - static_cast(range_del_iters.size()))); ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started", cfd_->GetName().c_str(), job_context_->job_id, @@ -358,7 +356,7 @@ Status FlushJob::WriteLevel0Table() { s = BuildTable( dbname_, db_options_.env, *cfd_->ioptions(), mutable_cf_options_, env_options_, cfd_->table_cache(), iter.get(), - std::move(range_del_iter), &meta_, cfd_->internal_comparator(), + std::move(range_del_iters), &meta_, cfd_->internal_comparator(), cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), cfd_->GetName(), existing_snapshots_, earliest_write_conflict_snapshot_, snapshot_checker_, diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index cdf9a07f6..226d56d5f 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -73,8 +73,8 @@ class ForwardLevelIterator : public InternalIterator { delete file_iter_; } - RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), - kMaxSequenceNumber /* 
upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg( + &cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); file_iter_ = cfd_->table_cache()->NewIterator( read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], @@ -610,8 +610,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); } - RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { @@ -669,8 +669,8 @@ void ForwardIterator::RenewIterators() { mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); - RangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( svnew->mem->NewRangeTombstoneIterator( diff --git a/db/merge_helper.cc b/db/merge_helper.cc index dc6baa963..6f7e760ec 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -110,8 +110,11 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, // keys_ stores the list of keys encountered while merging. // operands_ stores the list of merge operands encountered while merging. // keys_[i] corresponds to operands_[i] for each i. +// +// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator +// and just pass the StripeRep corresponding to the stripe being merged. 
Status MergeHelper::MergeUntil(InternalIterator* iter, - RangeDelAggregator* range_del_agg, + CompactionRangeDelAggregatorV2* range_del_agg, const SequenceNumber stop_before, const bool at_bottom) { // Get a copy of the internal key, before it's invalidated by iter->Next() diff --git a/db/merge_helper.h b/db/merge_helper.h index 993bbe3e9..1c92a3492 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -11,7 +11,7 @@ #include "db/dbformat.h" #include "db/merge_context.h" -#include "db/range_del_aggregator.h" +#include "db/range_del_aggregator_v2.h" #include "db/snapshot_checker.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -78,7 +78,7 @@ class MergeHelper { // // REQUIRED: The first key in the input is not corrupted. Status MergeUntil(InternalIterator* iter, - RangeDelAggregator* range_del_agg = nullptr, + CompactionRangeDelAggregatorV2* range_del_agg = nullptr, const SequenceNumber stop_before = 0, const bool at_bottom = false); diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 9fdcefc39..0b8260960 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -194,7 +194,7 @@ int main(int argc, char** argv) { for (int i = 0; i < FLAGS_num_runs; i++) { rocksdb::RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, FLAGS_use_collapsed); - rocksdb::RangeDelAggregatorV2 range_del_agg_v2( + rocksdb::ReadRangeDelAggregatorV2 range_del_agg_v2( &icmp, rocksdb::kMaxSequenceNumber /* upper_bound */); std::vector > diff --git a/db/range_del_aggregator_v2.cc b/db/range_del_aggregator_v2.cc index a2ae4f7a3..b0667f6fd 100644 --- a/db/range_del_aggregator_v2.cc +++ b/db/range_del_aggregator_v2.cc @@ -26,7 +26,10 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( std::unique_ptr iter, const InternalKeyComparator* icmp, const InternalKey* smallest, const InternalKey* largest) - : iter_(std::move(iter)), icmp_(icmp) { + : iter_(std::move(iter)), + icmp_(icmp), + 
smallest_ikey_(smallest), + largest_ikey_(largest) { if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); @@ -78,6 +81,8 @@ void TruncatedRangeDelIterator::Next() { iter_->TopNext(); } void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); } +void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); } + // NOTE: target is a user key void TruncatedRangeDelIterator::Seek(const Slice& target) { if (largest_ != nullptr && @@ -86,6 +91,11 @@ void TruncatedRangeDelIterator::Seek(const Slice& target) { iter_->Invalidate(); return; } + if (smallest_ != nullptr && + icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { + iter_->Seek(smallest_->user_key); + return; + } iter_->Seek(target); } @@ -97,12 +107,51 @@ void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { iter_->Invalidate(); return; } + if (largest_ != nullptr && + icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { + iter_->SeekForPrev(largest_->user_key); + return; + } iter_->SeekForPrev(target); } -void TruncatedRangeDelIterator::SeekToFirst() { iter_->SeekToTopFirst(); } +void TruncatedRangeDelIterator::SeekToFirst() { + if (smallest_ != nullptr) { + iter_->Seek(smallest_->user_key); + return; + } + iter_->SeekToTopFirst(); +} -void TruncatedRangeDelIterator::SeekToLast() { iter_->SeekToTopLast(); } +void TruncatedRangeDelIterator::SeekToLast() { + if (largest_ != nullptr) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekToTopLast(); +} + +std::map> +TruncatedRangeDelIterator::SplitBySnapshot( + const std::vector& snapshots) { + using FragmentedIterPair = + std::pair>; + + auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); + std::map> + split_truncated_iters; + std::for_each( + split_untruncated_iters.begin(), split_untruncated_iters.end(), + [&](FragmentedIterPair& iter_pair) { + std::unique_ptr truncated_iter( + new 
TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_, + smallest_ikey_, largest_ikey_)); + split_truncated_iters.emplace(iter_pair.first, + std::move(truncated_iter)); + }); + return split_truncated_iters; +} ForwardRangeDelIterator::ForwardRangeDelIterator( const InternalKeyComparator* icmp, @@ -116,15 +165,6 @@ ForwardRangeDelIterator::ForwardRangeDelIterator( bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { assert(iters_ != nullptr); - // Pick up previously unseen iterators. - for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end(); - ++it, ++unused_idx_) { - auto& iter = *it; - iter->Seek(parsed.user_key); - PushIter(iter.get(), parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - // Move active iterators that end before parsed. while (!active_iters_.empty() && icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { @@ -171,15 +211,6 @@ ReverseRangeDelIterator::ReverseRangeDelIterator( bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { assert(iters_ != nullptr); - // Pick up previously unseen iterators. - for (auto it = std::next(iters_->begin(), unused_idx_); it != iters_->end(); - ++it, ++unused_idx_) { - auto& iter = *it; - iter->SeekForPrev(parsed.user_key); - PushIter(iter.get(), parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - // Move active iterators that start after parsed. 
while (!active_iters_.empty() && icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { @@ -214,38 +245,33 @@ void ReverseRangeDelIterator::Invalidate() { inactive_iters_.clear(); } -RangeDelAggregatorV2::RangeDelAggregatorV2(const InternalKeyComparator* icmp, - SequenceNumber /* upper_bound */) - : icmp_(icmp), forward_iter_(icmp, &iters_), reverse_iter_(icmp, &iters_) {} - -void RangeDelAggregatorV2::AddTombstones( - std::unique_ptr input_iter, - const InternalKey* smallest, const InternalKey* largest) { - if (input_iter == nullptr || input_iter->empty()) { - return; - } - if (wrapped_range_del_agg != nullptr) { - wrapped_range_del_agg->AddTombstones(std::move(input_iter), smallest, - largest); - // TODO: this eats the status of the wrapped call; may want to propagate it - return; - } - iters_.emplace_back(new TruncatedRangeDelIterator(std::move(input_iter), - icmp_, smallest, largest)); -} - -bool RangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) { - if (wrapped_range_del_agg != nullptr) { - return wrapped_range_del_agg->ShouldDelete(parsed, mode); +bool RangeDelAggregatorV2::StripeRep::ShouldDelete( + const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + if (!InStripe(parsed.sequence) || IsEmpty()) { + return false; } - switch (mode) { case RangeDelPositioningMode::kForwardTraversal: - reverse_iter_.Invalidate(); + InvalidateReverseIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); + it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { + auto& iter = *it; + forward_iter_.AddNewIter(iter.get(), parsed); + } + return forward_iter_.ShouldDelete(parsed); case RangeDelPositioningMode::kBackwardTraversal: - forward_iter_.Invalidate(); + InvalidateForwardIter(); + + // Pick up previously unseen iterators. 
+ for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx()); + it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) { + auto& iter = *it; + reverse_iter_.AddNewIter(iter.get(), parsed); + } + return reverse_iter_.ShouldDelete(parsed); default: assert(false); @@ -253,14 +279,13 @@ bool RangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed, } } -bool RangeDelAggregatorV2::IsRangeOverlapped(const Slice& start, - const Slice& end) { - assert(wrapped_range_del_agg == nullptr); - InvalidateRangeDelMapPositions(); +bool RangeDelAggregatorV2::StripeRep::IsRangeOverlapped(const Slice& start, + const Slice& end) { + Invalidate(); // Set the internal start/end keys so that: - // - if start_ikey has the same user key and sequence number as the current - // end key, start_ikey will be considered greater; and + // - if start_ikey has the same user key and sequence number as the + // current end key, start_ikey will be considered greater; and // - if end_ikey has the same user key and sequence number as the current // start key, end_ikey will be considered greater. ParsedInternalKey start_ikey(start, kMaxSequenceNumber, @@ -279,9 +304,9 @@ bool RangeDelAggregatorV2::IsRangeOverlapped(const Slice& start, } if (!checked_candidate_tombstones) { - // Do an additional check for when the end of the range is the begin key - // of a tombstone, which we missed earlier since SeekForPrev'ing to the - // start was invalid. + // Do an additional check for when the end of the range is the begin + // key of a tombstone, which we missed earlier since SeekForPrev'ing + // to the start was invalid. 
iter->SeekForPrev(end); if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 && icmp_->Compare(iter->start_key(), end_ikey) <= 0) { @@ -292,4 +317,176 @@ bool RangeDelAggregatorV2::IsRangeOverlapped(const Slice& start, return false; } +void ReadRangeDelAggregatorV2::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { + return; + } + rep_.AddTombstones( + std::unique_ptr(new TruncatedRangeDelIterator( + std::move(input_iter), icmp_, smallest, largest))); +} + +bool ReadRangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { + return rep_.ShouldDelete(parsed, mode); +} + +bool ReadRangeDelAggregatorV2::IsRangeOverlapped(const Slice& start, + const Slice& end) { + InvalidateRangeDelMapPositions(); + return rep_.IsRangeOverlapped(start, end); +} + +void CompactionRangeDelAggregatorV2::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { + return; + } + assert(input_iter->lower_bound() == 0); + assert(input_iter->upper_bound() == kMaxSequenceNumber); + parent_iters_.emplace_back(new TruncatedRangeDelIterator( + std::move(input_iter), icmp_, smallest, largest)); + + auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_); + for (auto& split_iter : split_iters) { + auto it = reps_.find(split_iter.first); + if (it == reps_.end()) { + bool inserted; + SequenceNumber upper_bound = split_iter.second->upper_bound(); + SequenceNumber lower_bound = split_iter.second->lower_bound(); + std::tie(it, inserted) = reps_.emplace( + split_iter.first, StripeRep(icmp_, upper_bound, lower_bound)); + assert(inserted); + } + assert(it != reps_.end()); + it->second.AddTombstones(std::move(split_iter.second)); + } +} + +bool CompactionRangeDelAggregatorV2::ShouldDelete( + const ParsedInternalKey& parsed, 
RangeDelPositioningMode mode) { + auto it = reps_.lower_bound(parsed.sequence); + if (it == reps_.end()) { + return false; + } + return it->second.ShouldDelete(parsed, mode); +} + +namespace { + +class TruncatedRangeDelMergingIter : public InternalIterator { + public: + TruncatedRangeDelMergingIter( + const InternalKeyComparator* icmp, const Slice* lower_bound, + const Slice* upper_bound, bool upper_bound_inclusive, + const std::vector>& children) + : icmp_(icmp), + lower_bound_(lower_bound), + upper_bound_(upper_bound), + upper_bound_inclusive_(upper_bound_inclusive), + heap_(StartKeyMinComparator(icmp)) { + for (auto& child : children) { + if (child != nullptr) { + assert(child->lower_bound() == 0); + assert(child->upper_bound() == kMaxSequenceNumber); + children_.push_back(child.get()); + } + } + } + + bool Valid() const override { + return !heap_.empty() && BeforeEndKey(heap_.top()); + } + Status status() const override { return Status::OK(); } + + void SeekToFirst() override { + heap_.clear(); + for (auto& child : children_) { + if (lower_bound_ != nullptr) { + child->Seek(*lower_bound_); + } else { + child->SeekToFirst(); + } + if (child->Valid()) { + heap_.push(child); + } + } + } + + void Next() override { + auto* top = heap_.top(); + top->InternalNext(); + if (top->Valid()) { + heap_.replace_top(top); + } else { + heap_.pop(); + } + } + + Slice key() const override { + auto* top = heap_.top(); + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion); + return cur_start_key_.Encode(); + } + + Slice value() const override { + auto* top = heap_.top(); + assert(top->end_key().sequence == kMaxSequenceNumber); + return top->end_key().user_key; + } + + // Unused InternalIterator methods + void Prev() override { assert(false); } + void Seek(const Slice& /* target */) override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } + + private: + bool 
BeforeEndKey(const TruncatedRangeDelIterator* iter) const { + if (upper_bound_ == nullptr) { + return true; + } + int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key, + *upper_bound_); + return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0; + } + + const InternalKeyComparator* icmp_; + const Slice* lower_bound_; + const Slice* upper_bound_; + bool upper_bound_inclusive_; + BinaryHeap heap_; + std::vector children_; + + mutable InternalKey cur_start_key_; +}; + +} // namespace + +std::unique_ptr +CompactionRangeDelAggregatorV2::NewIterator(const Slice* lower_bound, + const Slice* upper_bound, + bool upper_bound_inclusive) { + InvalidateRangeDelMapPositions(); + std::unique_ptr merging_iter( + new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound, + upper_bound_inclusive, parent_iters_)); + + // TODO: add tests where tombstone fragments can be outside of upper and lower + // bound range + auto fragmented_tombstone_list = + std::make_shared( + std::move(merging_iter), *icmp_, true /* for_compaction */, + *snapshots_); + + return std::unique_ptr( + new FragmentedRangeTombstoneIterator( + fragmented_tombstone_list, *icmp_, + kMaxSequenceNumber /* upper_bound */)); +} + } // namespace rocksdb diff --git a/db/range_del_aggregator_v2.h b/db/range_del_aggregator_v2.h index 8413bb9cb..306dbf249 100644 --- a/db/range_del_aggregator_v2.h +++ b/db/range_del_aggregator_v2.h @@ -5,6 +5,8 @@ #pragma once +#include +#include #include #include #include @@ -27,8 +29,6 @@ namespace rocksdb { -class RangeDelAggregatorV2; - class TruncatedRangeDelIterator { public: TruncatedRangeDelIterator( @@ -41,6 +41,8 @@ class TruncatedRangeDelIterator { void Next(); void Prev(); + void InternalNext(); + // Seeks to the tombstone with the highest viisble sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the earliest tombstone that ends after target. 
@@ -70,12 +72,22 @@ class TruncatedRangeDelIterator { SequenceNumber seq() const { return iter_->seq(); } + std::map> + SplitBySnapshot(const std::vector& snapshots); + + SequenceNumber upper_bound() const { return iter_->upper_bound(); } + + SequenceNumber lower_bound() const { return iter_->lower_bound(); } + private: std::unique_ptr iter_; const InternalKeyComparator* icmp_; const ParsedInternalKey* smallest_ = nullptr; const ParsedInternalKey* largest_ = nullptr; std::list pinned_bounds_; + + const InternalKey* smallest_ikey_; + const InternalKey* largest_ikey_; }; struct SeqMaxComparator { @@ -85,6 +97,17 @@ struct SeqMaxComparator { } }; +struct StartKeyMinComparator { + explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return icmp->Compare(a->start_key(), b->start_key()) > 0; + } + + const InternalKeyComparator* icmp; +}; + class ForwardRangeDelIterator { public: ForwardRangeDelIterator( @@ -94,20 +117,20 @@ class ForwardRangeDelIterator { bool ShouldDelete(const ParsedInternalKey& parsed); void Invalidate(); + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->Seek(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + private: using ActiveSeqSet = std::multiset; - struct StartKeyMinComparator { - explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} - - bool operator()(const TruncatedRangeDelIterator* a, - const TruncatedRangeDelIterator* b) const { - return icmp->Compare(a->start_key(), b->start_key()) > 0; - } - - const InternalKeyComparator* icmp; - }; struct EndKeyMinComparator { explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} @@ -124,7 +147,10 @@ class ForwardRangeDelIterator { if 
(!iter->Valid()) { // The iterator has been fully consumed, so we don't need to add it to // either of the heaps. - } else if (icmp_->Compare(parsed, iter->start_key()) < 0) { + return; + } + int cmp = icmp_->Compare(parsed, iter->start_key()); + if (cmp < 0) { PushInactiveIter(iter); } else { PushActiveIter(iter); @@ -171,6 +197,16 @@ class ReverseRangeDelIterator { bool ShouldDelete(const ParsedInternalKey& parsed); void Invalidate(); + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->SeekForPrev(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + private: using ActiveSeqSet = std::multiset; @@ -241,55 +277,160 @@ class ReverseRangeDelIterator { class RangeDelAggregatorV2 { public: - RangeDelAggregatorV2(const InternalKeyComparator* icmp, - SequenceNumber upper_bound); + explicit RangeDelAggregatorV2(const InternalKeyComparator* icmp) + : icmp_(icmp) {} + virtual ~RangeDelAggregatorV2() {} - void AddTombstones( + virtual void AddTombstones( std::unique_ptr input_iter, const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr); + const InternalKey* largest = nullptr) = 0; - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode); + bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { + ParsedInternalKey parsed; + if (!ParseInternalKey(key, &parsed)) { + return false; + } + return ShouldDelete(parsed, mode); + } + virtual bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) = 0; - bool IsRangeOverlapped(const Slice& start, const Slice& end); + virtual void InvalidateRangeDelMapPositions() = 0; - void InvalidateRangeDelMapPositions() { - forward_iter_.Invalidate(); - reverse_iter_.Invalidate(); - } + virtual bool IsEmpty() const = 0; - bool IsEmpty() const { return iters_.empty(); } 
bool AddFile(uint64_t file_number) { return files_seen_.insert(file_number).second; } - // Adaptor method to pass calls through to an old-style RangeDelAggregator. - // Will be removed once this new version supports an iterator that can be used - // during flush/compaction. - RangeDelAggregator* DelegateToRangeDelAggregator( - const std::vector& snapshots) { - wrapped_range_del_agg.reset(new RangeDelAggregator( - *icmp_, snapshots, true /* collapse_deletions */)); - return wrapped_range_del_agg.get(); - } + protected: + class StripeRep { + public: + StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound) + : icmp_(icmp), + forward_iter_(icmp, &iters_), + reverse_iter_(icmp, &iters_), + upper_bound_(upper_bound), + lower_bound_(lower_bound) {} + + void AddTombstones(std::unique_ptr input_iter) { + iters_.push_back(std::move(input_iter)); + } - std::unique_ptr NewIterator() { - assert(wrapped_range_del_agg != nullptr); - return wrapped_range_del_agg->NewIterator(); - } + bool IsEmpty() const { return iters_.empty(); } + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode); + + void Invalidate() { + InvalidateForwardIter(); + InvalidateReverseIter(); + } + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + private: + bool InStripe(SequenceNumber seq) const { + return lower_bound_ <= seq && seq <= upper_bound_; + } + + void InvalidateForwardIter() { forward_iter_.Invalidate(); } + + void InvalidateReverseIter() { reverse_iter_.Invalidate(); } + + const InternalKeyComparator* icmp_; + std::vector> iters_; + ForwardRangeDelIterator forward_iter_; + ReverseRangeDelIterator reverse_iter_; + SequenceNumber upper_bound_; + SequenceNumber lower_bound_; + }; - private: const InternalKeyComparator* icmp_; - std::vector> iters_; + private: std::set files_seen_; +}; - ForwardRangeDelIterator forward_iter_; - ReverseRangeDelIterator reverse_iter_; +class ReadRangeDelAggregatorV2 : 
public RangeDelAggregatorV2 { + public: + ReadRangeDelAggregatorV2(const InternalKeyComparator* icmp, + SequenceNumber upper_bound) + : RangeDelAggregatorV2(icmp), + rep_(icmp, upper_bound, 0 /* lower_bound */) {} + ~ReadRangeDelAggregatorV2() override {} + + using RangeDelAggregatorV2::ShouldDelete; + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); } + + bool IsEmpty() const override { return rep_.IsEmpty(); } + + private: + StripeRep rep_; +}; + +class CompactionRangeDelAggregatorV2 : public RangeDelAggregatorV2 { + public: + CompactionRangeDelAggregatorV2(const InternalKeyComparator* icmp, + const std::vector& snapshots) + : RangeDelAggregatorV2(icmp), snapshots_(&snapshots) {} + ~CompactionRangeDelAggregatorV2() override {} + + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + using RangeDelAggregatorV2::ShouldDelete; + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { + for (auto& rep : reps_) { + rep.second.Invalidate(); + } + } + + bool IsEmpty() const override { + for (const auto& rep : reps_) { + if (!rep.second.IsEmpty()) { + return false; + } + } + return true; + } + + // Creates an iterator over all the range tombstones in the aggregator, for + // use in compaction. Nullptr arguments indicate that the iterator range is + // unbounded. 
+ // NOTE: the boundaries are used for optimization purposes to reduce the + // number of tombstones that are passed to the fragmenter; they do not + // guarantee that the resulting iterator only contains range tombstones that + // cover keys in the provided range. If required, these bounds must be + // enforced during iteration. + std::unique_ptr NewIterator( + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, + bool upper_bound_inclusive = false); + + private: + std::vector> parent_iters_; + std::map reps_; - // TODO: remove once V2 supports exposing tombstone iterators - std::unique_ptr wrapped_range_del_agg; + const std::vector* snapshots_; }; } // namespace rocksdb diff --git a/db/range_del_aggregator_v2_test.cc b/db/range_del_aggregator_v2_test.cc index 79cb548b1..64f8ed079 100644 --- a/db/range_del_aggregator_v2_test.cc +++ b/db/range_del_aggregator_v2_test.cc @@ -158,7 +158,7 @@ void VerifyShouldDelete(RangeDelAggregatorV2* range_del_agg, } void VerifyIsRangeOverlapped( - RangeDelAggregatorV2* range_del_agg, + ReadRangeDelAggregatorV2* range_del_agg, const std::vector& test_cases) { for (const auto& test_case : test_cases) { EXPECT_EQ(test_case.result, @@ -166,6 +166,30 @@ void VerifyIsRangeOverlapped( } } +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); + + // Test FragmentedRangeTombstoneIterator interface. 
+ EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); +} + +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); + } + EXPECT_FALSE(iter->Valid()); +} + } // namespace TEST_F(RangeDelAggregatorV2Test, EmptyTruncatedIter) { @@ -253,7 +277,7 @@ TEST_F(RangeDelAggregatorV2Test, UntruncatedIterWithSnapshot) { {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); } -TEST_F(RangeDelAggregatorV2Test, TruncatedIter) { +TEST_F(RangeDelAggregatorV2Test, TruncatedIterPartiallyCutTombstones) { auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), @@ -289,6 +313,36 @@ TEST_F(RangeDelAggregatorV2Test, TruncatedIter) { {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); } +TEST_F(RangeDelAggregatorV2Test, TruncatedIterFullyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("f", 7, kTypeValue); + InternalKey largest("i", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", UncutEndpoint(""), 
UncutEndpoint(""), 0, true /* invalid */}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); +} + TEST_F(RangeDelAggregatorV2Test, SingleIterInAggregator) { auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}}); FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), @@ -297,7 +351,7 @@ TEST_F(RangeDelAggregatorV2Test, SingleIterInAggregator) { new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, kMaxSequenceNumber)); - RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); range_del_agg.AddTombstones(std::move(input_iter)); VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, @@ -318,7 +372,7 @@ TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregator) { {{{"a", "e", 10}, {"c", "g", 8}}, {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); for (const auto& fragment_list : fragment_lists) { std::unique_ptr input_iter( new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, @@ -350,7 +404,7 @@ TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregatorWithUpperBound) { {{{"a", "e", 10}, {"c", "g", 8}}, {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); + ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); for (const auto& fragment_list : fragment_lists) { std::unique_ptr input_iter( new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, @@ -387,7 +441,7 @@ TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregator) { InternalKey("x", kMaxSequenceNumber, 
kTypeRangeDeletion)}, {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; - RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); + ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); for (size_t i = 0; i < fragment_lists.size(); i++) { const auto& fragment_list = fragment_lists[i]; const auto& bounds = iter_bounds[i]; @@ -427,7 +481,7 @@ TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregatorSameLevel) { InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; - RangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); + ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); auto add_iter_to_agg = [&](size_t i) { std::unique_ptr input_iter( @@ -461,6 +515,192 @@ TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregatorSameLevel) { {"zz", "zzz", false}}); } +TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorNoSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + auto range_del_compaction_iter = range_del_agg.NewIterator(); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, + {"b", "c", 10}, + 
{"c", "e", 10}, + {"e", "g", 8}, + {"h", "i", 25}, + {"ii", "j", 15}}); +} + +TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorWithSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + VerifyShouldDelete( + &range_del_agg, + { + {InternalValue("a", 19), false}, // [10, 19] + {InternalValue("a", 9), false}, // [0, 9] + {InternalValue("b", 9), false}, // [0, 9] + {InternalValue("d", 9), false}, // [0, 9] + {InternalValue("d", 7), true}, // [0, 9] + {InternalValue("e", 7), true}, // [0, 9] + {InternalValue("g", 7), false}, // [0, 9] + {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber] + {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber] + {InternalValue("ii", 14), true}, // [10, 19] + {InternalValue("j", 14), false} // [10, 19] + }); + + auto range_del_compaction_iter = range_del_agg.NewIterator(); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, + {"a", "b", 10}, + {"b", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"h", "i", 25}, + {"ii", "j", 15}}); +} + +TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorEmptyIteratorLeft) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 
kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("_"); + Slice end("__"); +} + +TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorEmptyIteratorRight) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("p"); + Slice end("q"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {}); +} + +TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorBoundedIterator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}}); + + auto range_del_compaction_iter2 = + 
range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels( + range_del_compaction_iter2.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}}); +} + +TEST_F(RangeDelAggregatorV2Test, + CompactionAggregatorBoundedIteratorExtraFragments) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "d", 10}, {"c", "g", 8}}, + {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } + + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/range_tombstone_fragmenter.cc b/db/range_tombstone_fragmenter.cc index 1748c5430..f9d9f2feb 100644 --- a/db/range_tombstone_fragmenter.cc +++ b/db/range_tombstone_fragmenter.cc @@ -174,6 +174,11 @@ void FragmentedRangeTombstoneList::FragmentTombstones( const Slice& ikey = unfragmented_tombstones->key(); Slice tombstone_start_key = ExtractUserKey(ikey); SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey); + if 
(!unfragmented_tombstones->IsKeyPinned()) { + pinned_slices_.emplace_back(tombstone_start_key.data(), + tombstone_start_key.size()); + tombstone_start_key = pinned_slices_.back(); + } no_tombstones = false; Slice tombstone_end_key = unfragmented_tombstones->value(); @@ -188,13 +193,7 @@ void FragmentedRangeTombstoneList::FragmentTombstones( // this new start key. flush_current_tombstones(tombstone_start_key); } - if (unfragmented_tombstones->IsKeyPinned()) { - cur_start_key = tombstone_start_key; - } else { - pinned_slices_.emplace_back(tombstone_start_key.data(), - tombstone_start_key.size()); - cur_start_key = pinned_slices_.back(); - } + cur_start_key = tombstone_start_key; cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion); } diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index 306a0347b..a0b77b677 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -146,6 +146,9 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { seq_pos_ = tombstones_->seq_end(); } + RangeTombstone Tombstone() const { + return RangeTombstone(start_key(), end_key(), seq()); + } Slice start_key() const { return pos_->start_key; } Slice end_key() const { return pos_->end_key; } SequenceNumber seq() const { return *seq_pos_; } diff --git a/db/repair.cc b/db/repair.cc index e7d72f917..4e93a161c 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -417,11 +417,16 @@ class Repairer { SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance(); auto write_hint = cfd->CalculateSSTWriteHint(0); + std::vector> + range_del_iters; + auto range_del_iter = + mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber); + if (range_del_iter != nullptr) { + range_del_iters.emplace_back(range_del_iter); + } status = BuildTable( dbname_, env_, *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(), - env_options_, table_cache_, iter.get(), - std::unique_ptr( - mem->NewRangeTombstoneIterator(ro, 
vset_.LastSequence())), + env_options_, table_cache_, iter.get(), std::move(range_del_iters), &meta, cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(), {}, kMaxSequenceNumber, snapshot_checker, kNoCompression, diff --git a/db/version_set.cc b/db/version_set.cc index 8349f2857..ad5f898d0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1057,8 +1057,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, Arena arena; Status status; - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); *overlap = false; diff --git a/util/heap.h b/util/heap.h index 8f253d27c..6093c20e2 100644 --- a/util/heap.h +++ b/util/heap.h @@ -92,9 +92,7 @@ class BinaryHeap { reset_root_cmp_cache(); } - bool empty() const { - return data_.empty(); - } + bool empty() const { return data_.empty(); } size_t size() const { return data_.size(); } diff --git a/utilities/debug.cc b/utilities/debug.cc index 3e4912fe7..3dfde980e 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -19,8 +19,8 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, DBImpl* idb = static_cast(db->GetRootDB()); auto icmp = InternalKeyComparator(idb->GetOptions().comparator); - RangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregatorV2 range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); Arena arena; ScopedArenaIterator iter( idb->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber)); From 33564d2c100b4b4669ce77c544b811cd04223f9d Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Mon, 17 Dec 2018 17:26:56 -0800 Subject: [PATCH 03/57] Remove v1 RangeDelAggregator (#4778) Summary: Now that v2 is fully functional, the v1 aggregator is removed. The v2 aggregator has been renamed. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4778 Differential Revision: D13495930 Pulled By: abhimadan fbshipit-source-id: 9d69500a60a283e79b6c4fa938fc68a8aa4d40d6 --- CMakeLists.txt | 2 - Makefile | 6 +- TARGETS | 6 - db/builder.cc | 6 +- db/column_family.cc | 4 +- db/compaction_iterator.cc | 4 +- db/compaction_iterator.h | 8 +- db/compaction_iterator_test.cc | 5 +- db/compaction_job.cc | 8 +- db/compaction_job.h | 4 +- db/db_compaction_filter_test.cc | 12 +- db/db_impl.cc | 13 +- db/db_impl.h | 15 +- db/db_impl_readonly.cc | 1 - db/db_iter.cc | 6 +- db/db_iter.h | 4 +- db/db_memtable_test.cc | 4 +- db/db_test_util.cc | 12 +- db/forward_iterator.cc | 14 +- db/memtable_list.cc | 2 +- db/memtable_list.h | 4 +- db/memtable_list_test.cc | 1 - db/merge_context.h | 6 +- db/merge_helper.cc | 2 +- db/merge_helper.h | 4 +- db/range_del_aggregator.cc | 1015 +++++++++++---------------- db/range_del_aggregator.h | 568 ++++++++++----- db/range_del_aggregator_bench.cc | 44 +- db/range_del_aggregator_test.cc | 1039 +++++++++++++++++----------- db/range_del_aggregator_v2.cc | 492 ------------- db/range_del_aggregator_v2.h | 436 ------------ db/range_del_aggregator_v2_test.cc | 709 ------------------- db/table_cache.cc | 2 +- db/table_cache.h | 4 +- db/version_set.cc | 14 +- db/version_set.h | 8 +- src.mk | 2 - utilities/debug.cc | 4 +- 38 files changed, 1519 insertions(+), 2971 deletions(-) delete mode 100644 db/range_del_aggregator_v2.cc delete mode 100644 db/range_del_aggregator_v2.h delete mode 100644 db/range_del_aggregator_v2_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a8eb39783..98e2e1973 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,7 +504,6 @@ set(SOURCES db/merge_helper.cc db/merge_operator.cc db/range_del_aggregator.cc - db/range_del_aggregator_v2.cc db/range_tombstone_fragmenter.cc db/repair.cc db/snapshot_impl.cc @@ -907,7 +906,6 @@ if(WITH_TESTS) db/plain_table_db_test.cc db/prefix_test.cc db/range_del_aggregator_test.cc 
- db/range_del_aggregator_v2_test.cc db/range_tombstone_fragmenter_test.cc db/repair_test.cc db/table_properties_collector_test.cc diff --git a/Makefile b/Makefile index d4d563f4d..09e2cd3ea 100644 --- a/Makefile +++ b/Makefile @@ -543,7 +543,6 @@ TESTS = \ persistent_cache_test \ statistics_test \ lua_test \ - range_del_aggregator_test \ lru_cache_test \ object_registry_test \ repair_test \ @@ -554,7 +553,7 @@ TESTS = \ trace_analyzer_test \ repeatable_thread_test \ range_tombstone_fragmenter_test \ - range_del_aggregator_v2_test \ + range_del_aggregator_test \ sst_file_reader_test \ PARALLEL_TEST = \ @@ -1588,9 +1587,6 @@ repeatable_thread_test: util/repeatable_thread_test.o $(LIBOBJECTS) $(TESTHARNES range_tombstone_fragmenter_test: db/range_tombstone_fragmenter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -range_del_aggregator_v2_test: db/range_del_aggregator_v2_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) - $(AM_LINK) - sst_file_reader_test: table/sst_file_reader_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 246c2efee..96b6e6e62 100644 --- a/TARGETS +++ b/TARGETS @@ -124,7 +124,6 @@ cpp_library( "db/merge_helper.cc", "db/merge_operator.cc", "db/range_del_aggregator.cc", - "db/range_del_aggregator_v2.cc", "db/range_tombstone_fragmenter.cc", "db/repair.cc", "db/snapshot_impl.cc", @@ -935,11 +934,6 @@ ROCKS_TESTS = [ "db/range_del_aggregator_test.cc", "serial", ], - [ - "range_del_aggregator_v2_test", - "db/range_del_aggregator_v2_test.cc", - "serial", - ], [ "range_tombstone_fragmenter_test", "db/range_tombstone_fragmenter_test.cc", diff --git a/db/builder.cc b/db/builder.cc index 60067c425..b13b68aeb 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -18,7 +18,7 @@ #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/table_cache.h" #include "db/version_edit.h" #include 
"monitoring/iostats_context_imp.h" @@ -88,8 +88,8 @@ Status BuildTable( Status s; meta->fd.file_size = 0; iter->SeekToFirst(); - std::unique_ptr range_del_agg( - new CompactionRangeDelAggregatorV2(&internal_comparator, snapshots)); + std::unique_ptr range_del_agg( + new CompactionRangeDelAggregator(&internal_comparator, snapshots)); for (auto& range_del_iter : range_del_iters) { range_del_agg->AddTombstones(std::move(range_del_iter)); } diff --git a/db/column_family.cc b/db/column_family.cc index c1a85a341..9a3ae99ca 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -25,7 +25,7 @@ #include "db/db_impl.h" #include "db/internal_stats.h" #include "db/job_context.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/table_properties_collector.h" #include "db/version_set.h" #include "db/write_controller.h" @@ -945,7 +945,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); auto read_seq = super_version->current->version_set()->LastSequence(); - ReadRangeDelAggregatorV2 range_del_agg(&internal_comparator_, read_seq); + ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq); auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(read_opts, read_seq); range_del_agg.AddTombstones( diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index ad45602cc..43583af4a 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -18,7 +18,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregatorV2* range_del_agg, const Compaction* compaction, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const SequenceNumber 
preserve_deletes_seqnum) @@ -36,7 +36,7 @@ CompactionIterator::CompactionIterator( SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index 1f6a135b8..6fbd3d0ef 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -13,7 +13,7 @@ #include "db/compaction_iteration_stats.h" #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/snapshot_checker.h" #include "options/cf_options.h" #include "rocksdb/compaction_filter.h" @@ -64,7 +64,7 @@ class CompactionIterator { SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, @@ -77,7 +77,7 @@ class CompactionIterator { SequenceNumber earliest_write_conflict_snapshot, const SnapshotChecker* snapshot_checker, Env* env, bool report_detailed_time, bool expect_valid_internal_key, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, std::unique_ptr compaction, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, @@ -141,7 +141,7 @@ class CompactionIterator { Env* env_; bool report_detailed_time_; bool expect_valid_internal_key_; - CompactionRangeDelAggregatorV2* range_del_agg_; + CompactionRangeDelAggregator* range_del_agg_; 
std::unique_ptr compaction_; const CompactionFilter* compaction_filter_; const std::atomic* shutting_down_; diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index a81efafaa..07a9e6ef8 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -228,8 +228,7 @@ class CompactionIteratorTest : public testing::TestWithParam { std::unique_ptr range_del_iter( new FragmentedRangeTombstoneIterator(tombstone_list, icmp_, kMaxSequenceNumber)); - range_del_agg_.reset( - new CompactionRangeDelAggregatorV2(&icmp_, snapshots_)); + range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_)); range_del_agg_->AddTombstones(std::move(range_del_iter)); std::unique_ptr compaction; @@ -298,7 +297,7 @@ class CompactionIteratorTest : public testing::TestWithParam { std::unique_ptr merge_helper_; std::unique_ptr iter_; std::unique_ptr c_iter_; - std::unique_ptr range_del_agg_; + std::unique_ptr range_del_agg_; std::unique_ptr snapshot_checker_; std::atomic shutting_down_{false}; FakeCompaction* compaction_proxy_; diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 17be3156b..0bdf78cfc 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -36,7 +36,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/version_set.h" #include "monitoring/iostats_context_imp.h" #include "monitoring/perf_context_imp.h" @@ -805,8 +805,8 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact != nullptr); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); - CompactionRangeDelAggregatorV2 range_del_agg(&cfd->internal_comparator(), - existing_snapshots_); + CompactionRangeDelAggregator range_del_agg(&cfd->internal_comparator(), + existing_snapshots_); // 
Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. @@ -1165,7 +1165,7 @@ void CompactionJob::RecordDroppedKeys( Status CompactionJob::FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key /* = nullptr */) { AutoThreadOperationStageUpdater stage_updater( diff --git a/db/compaction_job.h b/db/compaction_job.h index 86d97e1db..596b5cc60 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -25,7 +25,7 @@ #include "db/job_context.h" #include "db/log_writer.h" #include "db/memtable_list.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/version_edit.h" #include "db/write_controller.h" #include "db/write_thread.h" @@ -104,7 +104,7 @@ class CompactionJob { Status FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, CompactionIterationStats* range_del_out_stats, const Slice* next_table_min_key = nullptr); Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 63d2829d5..25045d01d 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -340,8 +340,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { Arena arena; { InternalKeyComparator icmp(options.comparator); - ReadRangeDelAggregatorV2 range_del_agg( - &icmp, kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); 
iter->SeekToFirst(); @@ -430,8 +430,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilter) { count = 0; { InternalKeyComparator icmp(options.comparator); - ReadRangeDelAggregatorV2 range_del_agg( - &icmp, kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber, handles_[1])); iter->SeekToFirst(); @@ -648,8 +648,8 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { int total = 0; Arena arena; InternalKeyComparator icmp(options.comparator); - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* snapshots */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* snapshots */); ScopedArenaIterator iter(dbfull()->NewInternalIterator( &arena, &range_del_agg, kMaxSequenceNumber)); iter->SeekToFirst(); diff --git a/db/db_impl.cc b/db/db_impl.cc index e22ce20a4..e259864d7 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -45,7 +45,6 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/merge_helper.h" -#include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "db/table_cache.h" #include "db/table_properties_collector.h" @@ -1033,7 +1032,7 @@ bool DBImpl::SetPreserveDeletesSequenceNumber(SequenceNumber seqnum) { } InternalIterator* DBImpl::NewInternalIterator( - Arena* arena, RangeDelAggregatorV2* range_del_agg, SequenceNumber sequence, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { @@ -1150,10 +1149,12 @@ static void CleanupIteratorState(void* arg1, void* /*arg2*/) { } } // namespace -InternalIterator* DBImpl::NewInternalIterator( - const ReadOptions& read_options, ColumnFamilyData* cfd, - SuperVersion* super_version, Arena* arena, - RangeDelAggregatorV2* range_del_agg, SequenceNumber 
sequence) { +InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, + ColumnFamilyData* cfd, + SuperVersion* super_version, + Arena* arena, + RangeDelAggregator* range_del_agg, + SequenceNumber sequence) { InternalIterator* internal_iter; assert(arena != nullptr); assert(range_del_agg != nullptr); diff --git a/db/db_impl.h b/db/db_impl.h index 2cabe756a..7d509c807 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -31,7 +31,7 @@ #include "db/log_writer.h" #include "db/logs_with_prep_tracker.h" #include "db/pre_release_callback.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/read_callback.h" #include "db/snapshot_checker.h" #include "db/snapshot_impl.h" @@ -374,8 +374,8 @@ class DBImpl : public DB { // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. InternalIterator* NewInternalIterator( - Arena* arena, RangeDelAggregatorV2* range_del_agg, - SequenceNumber sequence, ColumnFamilyHandle* column_family = nullptr); + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence, + ColumnFamilyHandle* column_family = nullptr); LogsWithPrepTracker* logs_with_prep_tracker() { return &logs_with_prep_tracker_; @@ -578,12 +578,9 @@ class DBImpl : public DB { const WriteController& write_controller() { return write_controller_; } - InternalIterator* NewInternalIterator(const ReadOptions&, - ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena, - RangeDelAggregatorV2* range_del_agg, - SequenceNumber sequence); + InternalIterator* NewInternalIterator( + const ReadOptions&, ColumnFamilyData* cfd, SuperVersion* super_version, + Arena* arena, RangeDelAggregator* range_del_agg, SequenceNumber sequence); // hollow transactions shell used for recovery. 
// these will then be passed to TransactionDB so that diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index c4a55b6ec..bd7099f00 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -9,7 +9,6 @@ #include "db/db_impl.h" #include "db/db_iter.h" #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "monitoring/perf_context_imp.h" namespace rocksdb { diff --git a/db/db_iter.cc b/db/db_iter.cc index cc4a0d5f4..348247aa3 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -171,7 +171,7 @@ class DBIter final: public Iterator { iter_ = iter; iter_->SetPinnedItersMgr(&pinned_iters_mgr_); } - virtual ReadRangeDelAggregatorV2* GetRangeDelAggregator() { + virtual ReadRangeDelAggregator* GetRangeDelAggregator() { return &range_del_agg_; } @@ -341,7 +341,7 @@ class DBIter final: public Iterator { const bool total_order_seek_; // List of operands for merge operator. MergeContext merge_context_; - ReadRangeDelAggregatorV2 range_del_agg_; + ReadRangeDelAggregator range_del_agg_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; ReadCallback* read_callback_; @@ -1479,7 +1479,7 @@ Iterator* NewDBIterator(Env* env, const ReadOptions& read_options, ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } -ReadRangeDelAggregatorV2* ArenaWrappedDBIter::GetRangeDelAggregator() { +ReadRangeDelAggregator* ArenaWrappedDBIter::GetRangeDelAggregator() { return db_iter_->GetRangeDelAggregator(); } diff --git a/db/db_iter.h b/db/db_iter.h index 6ee869135..a640f0296 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -12,7 +12,7 @@ #include #include "db/db_impl.h" #include "db/dbformat.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "options/cf_options.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" @@ -48,7 +48,7 @@ class ArenaWrappedDBIter : public Iterator { // Get the arena to be used to allocate memory for DBIter to be wrapped, // as well as child iterators in it. 
virtual Arena* GetArena() { return &arena_; } - virtual ReadRangeDelAggregatorV2* GetRangeDelAggregator(); + virtual ReadRangeDelAggregator* GetRangeDelAggregator(); // Set the internal iterator wrapped inside the DB Iterator. Usually it is // a merging iterator. diff --git a/db/db_memtable_test.cc b/db/db_memtable_test.cc index 5f47a9481..96025d7db 100644 --- a/db/db_memtable_test.cc +++ b/db/db_memtable_test.cc @@ -8,6 +8,7 @@ #include "db/db_test_util.h" #include "db/memtable.h" +#include "db/range_del_aggregator.h" #include "port/stack_trace.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice_transform.h" @@ -135,7 +136,8 @@ TEST_F(DBMemTableTest, DuplicateSeq) { MergeContext merge_context; Options options; InternalKeyComparator ikey_cmp(options.comparator); - RangeDelAggregator range_del_agg(ikey_cmp, {} /* snapshots */); + ReadRangeDelAggregator range_del_agg(&ikey_cmp, + kMaxSequenceNumber /* upper_bound */); // Create a MemTable InternalKeyComparator cmp(BytewiseComparator()); diff --git a/db/db_test_util.cc b/db/db_test_util.cc index eeff7be51..de096d254 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -814,8 +814,8 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { Arena arena; auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); ScopedArenaIterator iter; if (cf == 0) { iter.set(dbfull()->NewInternalIterator(&arena, &range_del_agg, @@ -1227,8 +1227,8 @@ void DBTestBase::validateNumberOfEntries(int numValues, int cf) { Arena arena; auto options = CurrentOptions(); InternalKeyComparator icmp(options.comparator); - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); // This should be defined after 
range_del_agg so that it destructs the // assigned iterator before it range_del_agg is already destructed. ScopedArenaIterator iter; @@ -1437,8 +1437,8 @@ void DBTestBase::VerifyDBInternal( std::vector> true_data) { Arena arena; InternalKeyComparator icmp(last_options_.comparator); - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); auto iter = dbfull()->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber); iter->SeekToFirst(); diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 226d56d5f..f44a09756 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -15,7 +15,7 @@ #include "db/db_iter.h" #include "db/dbformat.h" #include "db/job_context.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" @@ -73,8 +73,8 @@ class ForwardLevelIterator : public InternalIterator { delete file_iter_; } - ReadRangeDelAggregatorV2 range_del_agg( - &cfd_->internal_comparator(), kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); file_iter_ = cfd_->table_cache()->NewIterator( read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), *files_[file_index_], @@ -610,8 +610,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { // New sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); } - ReadRangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); if (!read_options_.ignore_range_deletions) { @@ -669,8 +669,8 @@ void 
ForwardIterator::RenewIterators() { mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_); svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_); - ReadRangeDelAggregatorV2 range_del_agg(&cfd_->internal_comparator(), - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(), + kMaxSequenceNumber /* upper_bound */); if (!read_options_.ignore_range_deletions) { std::unique_ptr range_del_iter( svnew->mem->NewRangeTombstoneIterator( diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 4c0af1e89..9145135d6 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -159,7 +159,7 @@ bool MemTableListVersion::GetFromList( Status MemTableListVersion::AddRangeTombstoneIterators( const ReadOptions& read_opts, Arena* /*arena*/, - RangeDelAggregatorV2* range_del_agg) { + RangeDelAggregator* range_del_agg) { assert(range_del_agg != nullptr); for (auto& m : memlist_) { // Using kMaxSequenceNumber is OK because these are immutable memtables. 
diff --git a/db/memtable_list.h b/db/memtable_list.h index 70bab1c38..6315167a1 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -15,7 +15,7 @@ #include "db/dbformat.h" #include "db/logs_with_prep_tracker.h" #include "db/memtable.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "monitoring/instrumented_mutex.h" #include "rocksdb/db.h" #include "rocksdb/iterator.h" @@ -91,7 +91,7 @@ class MemTableListVersion { } Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena, - RangeDelAggregatorV2* range_del_agg); + RangeDelAggregator* range_del_agg); void AddIterators(const ReadOptions& options, std::vector* iterator_list, diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 06554f1ab..96032a465 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -8,7 +8,6 @@ #include #include #include "db/merge_context.h" -#include "db/range_del_aggregator.h" #include "db/version_set.h" #include "db/write_controller.h" #include "rocksdb/db.h" diff --git a/db/merge_context.h b/db/merge_context.h index c226f64e5..fd06441f7 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -79,7 +79,8 @@ class MergeContext { return GetOperandsDirectionForward(); } - // Return all the operands in the order as they were merged (passed to FullMerge or FullMergeV2) + // Return all the operands in the order as they were merged (passed to + // FullMerge or FullMergeV2) const std::vector& GetOperandsDirectionForward() { if (!operand_list_) { return empty_operand_list; @@ -89,7 +90,8 @@ class MergeContext { return *operand_list_; } - // Return all the operands in the reversed order relative to how they were merged (passed to FullMerge or FullMergeV2) + // Return all the operands in the reversed order relative to how they were + // merged (passed to FullMerge or FullMergeV2) const std::vector& GetOperandsDirectionBackward() { if (!operand_list_) { return empty_operand_list; diff --git 
a/db/merge_helper.cc b/db/merge_helper.cc index 6f7e760ec..f33dafd8e 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -114,7 +114,7 @@ Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator, // TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator // and just pass the StripeRep corresponding to the stripe being merged. Status MergeHelper::MergeUntil(InternalIterator* iter, - CompactionRangeDelAggregatorV2* range_del_agg, + CompactionRangeDelAggregator* range_del_agg, const SequenceNumber stop_before, const bool at_bottom) { // Get a copy of the internal key, before it's invalidated by iter->Next() diff --git a/db/merge_helper.h b/db/merge_helper.h index 1c92a3492..670cba598 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -11,7 +11,7 @@ #include "db/dbformat.h" #include "db/merge_context.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/snapshot_checker.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -78,7 +78,7 @@ class MergeHelper { // // REQUIRED: The first key in the input is not corrupted. Status MergeUntil(InternalIterator* iter, - CompactionRangeDelAggregatorV2* range_del_agg = nullptr, + CompactionRangeDelAggregator* range_del_agg = nullptr, const SequenceNumber stop_before = 0, const bool at_bottom = false); diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 331758558..8a6b0a51f 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -1,709 +1,492 @@ -// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#include "db/range_del_aggregator.h" -#include "util/heap.h" -#include +#include "db/compaction_iteration_stats.h" +#include "db/dbformat.h" +#include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" +#include "include/rocksdb/comparator.h" +#include "include/rocksdb/types.h" +#include "table/internal_iterator.h" +#include "table/scoped_arena_iterator.h" +#include "table/table_builder.h" +#include "util/heap.h" +#include "util/kv_map.h" +#include "util/vector_iterator.h" namespace rocksdb { -struct TombstoneStartKeyComparator { - explicit TombstoneStartKeyComparator(const InternalKeyComparator* c) - : cmp(c) {} - - bool operator()(const TruncatedRangeTombstone& a, - const TruncatedRangeTombstone& b) const { - return cmp->Compare(a.start_key_, b.start_key_) < 0; - } - - const InternalKeyComparator* cmp; -}; - -// An UncollapsedRangeDelMap is quick to create but slow to answer ShouldDelete -// queries. 
-class UncollapsedRangeDelMap : public RangeDelMap { - typedef std::vector Rep; - - class Iterator : public RangeDelIterator { - const Rep& rep_; - Rep::const_iterator iter_; - - public: - Iterator(const Rep& rep) : rep_(rep), iter_(rep.begin()) {} - bool Valid() const override { return iter_ != rep_.end(); } - void Next() override { iter_++; } - - void Seek(const Slice&) override { - fprintf(stderr, - "UncollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); - abort(); +TruncatedRangeDelIterator::TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest) + : iter_(std::move(iter)), + icmp_(icmp), + smallest_ikey_(smallest), + largest_ikey_(largest) { + if (smallest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_smallest = pinned_bounds_.back(); + if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { + assert(false); } - - void Seek(const ParsedInternalKey&) override { - fprintf(stderr, - "UncollapsedRangeDelMap::Iterator::Seek(ParsedInternalKey&) " - "unimplemented\n"); - abort(); + smallest_ = &parsed_smallest; + } + if (largest != nullptr) { + pinned_bounds_.emplace_back(); + auto& parsed_largest = pinned_bounds_.back(); + if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { + assert(false); } - - RangeTombstone Tombstone() const override { return iter_->Tombstone(); } - }; - - Rep rep_; - const InternalKeyComparator* icmp_; - - public: - explicit UncollapsedRangeDelMap(const InternalKeyComparator* icmp) - : icmp_(icmp) {} - - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) override { - (void)mode; - assert(mode == RangeDelPositioningMode::kFullScan); - for (const auto& tombstone : rep_) { - if (icmp_->Compare(parsed, tombstone.start_key_) < 0) { - continue; - } - if (parsed.sequence < tombstone.seq_ && - icmp_->Compare(parsed, tombstone.end_key_) < 0) { - return true; - } + if (parsed_largest.type == 
kTypeRangeDeletion && + parsed_largest.sequence == kMaxSequenceNumber) { + // The file boundary has been artificially extended by a range tombstone. + // We do not need to adjust largest to properly truncate range + // tombstones that extend past the boundary. + } else if (parsed_largest.sequence == 0) { + // The largest key in the sstable has a sequence number of 0. Since we + // guarantee that no internal keys with the same user key and sequence + // number can exist in a DB, we know that the largest key in this sstable + // cannot exist as the smallest key in the next sstable. This further + // implies that no range tombstone in this sstable covers largest; + // otherwise, the file boundary would have been artificially extended. + // + // Therefore, we will never truncate a range tombstone at largest, so we + // can leave it unchanged. + } else { + // The same user key may straddle two sstable boundaries. To ensure that + // the truncated end key can cover the largest key in this sstable, reduce + // its sequence number by 1. 
+ parsed_largest.sequence -= 1; } - return false; + largest_ = &parsed_largest; } +} - bool IsRangeOverlapped(const ParsedInternalKey& start, - const ParsedInternalKey& end) override { - for (const auto& tombstone : rep_) { - if (icmp_->Compare(start, tombstone.end_key_) < 0 && - icmp_->Compare(tombstone.start_key_, end) <= 0 && - icmp_->Compare(tombstone.start_key_, tombstone.end_key_) < 0) { - return true; - } - } - return false; - } +bool TruncatedRangeDelIterator::Valid() const { + return iter_->Valid() && + (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) && + (largest_ == nullptr || + icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0); +} - void AddTombstone(TruncatedRangeTombstone tombstone) override { - rep_.emplace_back(tombstone); - } +void TruncatedRangeDelIterator::Next() { iter_->TopNext(); } - size_t Size() const override { return rep_.size(); } +void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); } - void InvalidatePosition() override {} // no-op +void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); } - std::unique_ptr NewIterator() override { - std::sort(rep_.begin(), rep_.end(), TombstoneStartKeyComparator(icmp_)); - return std::unique_ptr(new Iterator(this->rep_)); +// NOTE: target is a user key +void TruncatedRangeDelIterator::Seek(const Slice& target) { + if (largest_ != nullptr && + icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber, + kTypeRangeDeletion)) <= 0) { + iter_->Invalidate(); + return; } -}; - -// A CollapsedRangeDelMap is slow to create but quick to answer ShouldDelete -// queries. -// -// An explanation of the design follows. Suppose we have tombstones [b, n) @ 1, -// [e, h) @ 2, [q, t) @ 2, and [g, k) @ 3. 
Visually, the tombstones look like -// this: -// -// 3: g---k -// 2: e---h q--t -// 1: b------------n -// -// The CollapsedRangeDelMap representation is based on the observation that -// wherever tombstones overlap, we need only store the tombstone with the -// largest seqno. From the perspective of a read at seqno 4 or greater, this set -// of tombstones is exactly equivalent: -// -// 3: g---k -// 2: e--g q--t -// 1: b--e k--n -// -// Because these tombstones do not overlap, they can be efficiently represented -// in an ordered map from keys to sequence numbers. Each entry should be thought -// of as a transition from one tombstone to the next. In this example, the -// CollapsedRangeDelMap would store the following entries, in order: -// -// b → 1, e → 2, g → 3, k → 1, n → 0, q → 2, t → 0 -// -// If a tombstone ends before the next tombstone begins, a sentinel seqno of 0 -// is installed to indicate that no tombstone exists. This occurs at keys n and -// t in the example above. -// -// To check whether a key K is covered by a tombstone, the map is binary -// searched for the last key less than K. K is covered iff the map entry has a -// larger seqno than K. As an example, consider the key h @ 4. It would be -// compared against the map entry g → 3 and determined to be uncovered. By -// contrast, the key h @ 2 would be determined to be covered. 
-class CollapsedRangeDelMap : public RangeDelMap { - typedef std::map - Rep; - - class Iterator : public RangeDelIterator { - void MaybeSeekPastSentinel() { - if (Valid() && iter_->second == 0) { - iter_++; - } - } - - const Rep& rep_; - Rep::const_iterator iter_; - - public: - Iterator(const Rep& rep) : rep_(rep), iter_(rep.begin()) {} - - bool Valid() const override { return iter_ != rep_.end(); } - - void Next() override { - iter_++; - MaybeSeekPastSentinel(); - } - - void Seek(const Slice&) override { - fprintf(stderr, "CollapsedRangeDelMap::Iterator::Seek(Slice&) unimplemented\n"); - abort(); - } - - void Seek(const ParsedInternalKey& target) override { - iter_ = rep_.upper_bound(target); - if (iter_ != rep_.begin()) { - iter_--; - } - MaybeSeekPastSentinel(); - } - - RangeTombstone Tombstone() const override { - assert(Valid()); - assert(std::next(iter_) != rep_.end()); - assert(iter_->second != 0); - RangeTombstone tombstone; - tombstone.start_key_ = iter_->first.user_key; - tombstone.end_key_ = std::next(iter_)->first.user_key; - tombstone.seq_ = iter_->second; - return tombstone; - } - }; - - Rep rep_; - Rep::iterator iter_; - const InternalKeyComparator* icmp_; - - public: - explicit CollapsedRangeDelMap(const InternalKeyComparator* icmp) - : rep_(ParsedInternalKeyComparator(icmp)), - icmp_(icmp) { - InvalidatePosition(); + if (smallest_ != nullptr && + icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { + iter_->Seek(smallest_->user_key); + return; } + iter_->Seek(target); +} - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) override { - if (iter_ == rep_.end() && - (mode == RangeDelPositioningMode::kForwardTraversal || - mode == RangeDelPositioningMode::kBackwardTraversal)) { - // invalid (e.g., if AddTombstones() changed the deletions), so need to - // reseek - mode = RangeDelPositioningMode::kBinarySearch; - } - switch (mode) { - case RangeDelPositioningMode::kFullScan: - assert(false); - case 
RangeDelPositioningMode::kForwardTraversal: - assert(iter_ != rep_.end()); - if (iter_ == rep_.begin() && - icmp_->Compare(parsed, iter_->first) < 0) { - // before start of deletion intervals - return false; - } - while (std::next(iter_) != rep_.end() && - icmp_->Compare(std::next(iter_)->first, parsed) <= 0) { - ++iter_; - } - break; - case RangeDelPositioningMode::kBackwardTraversal: - assert(iter_ != rep_.end()); - while (iter_ != rep_.begin() && - icmp_->Compare(parsed, iter_->first) < 0) { - --iter_; - } - if (iter_ == rep_.begin() && - icmp_->Compare(parsed, iter_->first) < 0) { - // before start of deletion intervals - return false; - } - break; - case RangeDelPositioningMode::kBinarySearch: - iter_ = rep_.upper_bound(parsed); - if (iter_ == rep_.begin()) { - // before start of deletion intervals - return false; - } - --iter_; - break; - } - assert(iter_ != rep_.end() && - icmp_->Compare(iter_->first, parsed) <= 0); - assert(std::next(iter_) == rep_.end() || - icmp_->Compare(parsed, std::next(iter_)->first) < 0); - return parsed.sequence < iter_->second; +// NOTE: target is a user key +void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { + if (smallest_ != nullptr && + icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion), + *smallest_) < 0) { + iter_->Invalidate(); + return; } - - bool IsRangeOverlapped(const ParsedInternalKey&, - const ParsedInternalKey&) override { - // Unimplemented because the only client of this method, file ingestion, - // uses uncollapsed maps. - fprintf(stderr, "CollapsedRangeDelMap::IsRangeOverlapped unimplemented"); - abort(); + if (largest_ != nullptr && + icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { + iter_->SeekForPrev(largest_->user_key); + return; } + iter_->SeekForPrev(target); +} - void AddTombstone(TruncatedRangeTombstone t) override { - if (icmp_->Compare(t.start_key_, t.end_key_) >= 0 || t.seq_ == 0) { - // The tombstone covers no keys. Nothing to do. 
- return; - } - - auto it = rep_.upper_bound(t.start_key_); - auto prev_seq = [&]() { - return it == rep_.begin() ? 0 : std::prev(it)->second; - }; - - // end_seq stores the seqno of the last transition that the new tombstone - // covered. This is the seqno that we'll install if we need to insert a - // transition for the new tombstone's end key. - SequenceNumber end_seq = 0; - - // In the diagrams below, the new tombstone is always [c, k) @ 2. The - // existing tombstones are varied to depict different scenarios. Uppercase - // letters are used to indicate points that exist in the map, while - // lowercase letters are used to indicate points that do not exist in the - // map. The location of the iterator is marked with a caret; it may point - // off the end of the diagram to indicate that it is positioned at a - // entry with a larger key whose specific key is irrelevant. - - if (t.seq_ > prev_seq()) { - // The new tombstone's start point covers the existing tombstone: - // - // 3: 3: A--C 3: 3: - // 2: c--- OR 2: c--- OR 2: c--- OR 2: c------ - // 1: A--C 1: 1: A------ 1: C------ - // ^ ^ ^ ^ - end_seq = prev_seq(); - Rep::iterator pit; - if (it != rep_.begin() && (pit = std::prev(it)) != rep_.begin() && - icmp_->Compare(pit->first, t.start_key_) == 0 && - std::prev(pit)->second == t.seq_) { - // The new tombstone starts at the end of an existing tombstone with an - // identical seqno: - // - // 3: - // 2: A--C--- - // 1: - // ^ - // Merge the tombstones by removing the existing tombstone's end key. - it = rep_.erase(std::prev(it)); - } else { - // Insert a new transition at the new tombstone's start point, or raise - // the existing transition at that point to the new tombstone's seqno. - rep_[t.start_key_] = t.seq_; // operator[] will overwrite existing entry - } - } else { - // The new tombstone's start point is covered by an existing tombstone: - // - // 3: A----- OR 3: C------ OR - // 2: c--- 2: c------ 2: C------ - // ^ ^ ^ - // Do nothing. 
- } - - // Look at all the existing transitions that overlap the new tombstone. - while (it != rep_.end() && icmp_->Compare(it->first, t.end_key_) < 0) { - if (t.seq_ >= it->second) { - // The transition is to an existing tombstone that the new tombstone - // covers. Save the covered tombstone's seqno. We'll need to return to - // it if the new tombstone ends before the existing tombstone. - end_seq = it->second; - - if (t.seq_ == prev_seq()) { - // The previous transition is to the seqno of the new tombstone: - // - // 3: 3: 3: --F - // 2: C------ OR 2: C------ OR 2: F---- - // 1: F--- 1: ---F 1: H-- - // ^ ^ ^ - // - // Erase this transition. It's been superseded. - it = rep_.erase(it); - continue; // skip increment; erase positions iterator correctly - } else { - // The previous transition is to a tombstone that covers the new - // tombstone, but this transition is to a tombstone that is covered by - // the new tombstone. That is, this is the end of a run of existing - // tombstones that cover the new tombstone: - // - // 3: A---E OR 3: E-G - // 2: c---- 2: ------ - // ^ ^ - // Preserve this transition point, but raise it to the new tombstone's - // seqno. - it->second = t.seq_; - } - } else { - // The transition is to an existing tombstone that covers the new - // tombstone: - // - // 4: 4: --F - // 3: F-- OR 3: F-- - // 2: ----- 2: ----- - // ^ ^ - // Do nothing. - } - ++it; - } - - if (t.seq_ == prev_seq()) { - // The new tombstone is unterminated in the map. - if (it != rep_.end() && t.seq_ == it->second && - icmp_->Compare(it->first, t.end_key_) == 0) { - // The new tombstone ends at the start of another tombstone with an - // identical seqno. Merge the tombstones by removing the existing - // tombstone's start key. 
- rep_.erase(it); - } else if (end_seq == prev_seq() || - (it != rep_.end() && end_seq == it->second)) { - // The new tombstone is implicitly ended because its end point is - // contained within an existing tombstone with the same seqno: - // - // 2: ---k--N - // ^ - } else { - // The new tombstone needs an explicit end point. - // - // 3: OR 3: --G OR 3: --G K-- - // 2: C-------k 2: G---k 2: G---k - // ^ ^ ^ - // Install one that returns to the last seqno we covered. Because end - // keys are exclusive, if there's an existing transition at t.end_key_, - // it takes precedence over the transition that we install here. - rep_.emplace(t.end_key_, - end_seq); // emplace is a noop if existing entry - } - } else { - // The new tombstone is implicitly ended because its end point is covered - // by an existing tombstone with a higher seqno. - // - // 3: I---M OR 3: A-----------M - // 2: ----k 2: c-------k - // ^ ^ - // Do nothing. - } +void TruncatedRangeDelIterator::SeekToFirst() { + if (smallest_ != nullptr) { + iter_->Seek(smallest_->user_key); + return; } + iter_->SeekToTopFirst(); +} - size_t Size() const override { return rep_.empty() ? 
0 : rep_.size() - 1; } +void TruncatedRangeDelIterator::SeekToLast() { + if (largest_ != nullptr) { + iter_->SeekForPrev(largest_->user_key); + return; + } + iter_->SeekToTopLast(); +} - void InvalidatePosition() override { iter_ = rep_.end(); } +std::map> +TruncatedRangeDelIterator::SplitBySnapshot( + const std::vector& snapshots) { + using FragmentedIterPair = + std::pair>; + + auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); + std::map> + split_truncated_iters; + std::for_each( + split_untruncated_iters.begin(), split_untruncated_iters.end(), + [&](FragmentedIterPair& iter_pair) { + std::unique_ptr truncated_iter( + new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_, + smallest_ikey_, largest_ikey_)); + split_truncated_iters.emplace(iter_pair.first, + std::move(truncated_iter)); + }); + return split_truncated_iters; +} - std::unique_ptr NewIterator() override { - return std::unique_ptr(new Iterator(this->rep_)); +ForwardRangeDelIterator::ForwardRangeDelIterator( + const InternalKeyComparator* icmp, + const std::vector>* iters) + : icmp_(icmp), + iters_(iters), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(EndKeyMinComparator(icmp)), + inactive_iters_(StartKeyMinComparator(icmp)) {} + +bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { + assert(iters_ != nullptr); + // Move active iterators that end before parsed. + while (!active_iters_.empty() && + icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Next(); + } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that start before parsed. 
+ while (!inactive_iters_.empty() && + icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) { + iter->Next(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } -}; -RangeDelAggregator::RangeDelAggregator( - const InternalKeyComparator& icmp, - const std::vector& snapshots, - bool collapse_deletions /* = true */) - : upper_bound_(kMaxSequenceNumber), - icmp_(icmp), - collapse_deletions_(collapse_deletions) { - InitRep(snapshots); + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; } -RangeDelAggregator::RangeDelAggregator(const InternalKeyComparator& icmp, - SequenceNumber snapshot, - bool collapse_deletions /* = false */) - : upper_bound_(snapshot), - icmp_(icmp), - collapse_deletions_(collapse_deletions) {} - -void RangeDelAggregator::InitRep(const std::vector& snapshots) { - assert(rep_ == nullptr); - rep_.reset(new Rep()); - rep_->snapshots_ = snapshots; - // Data newer than any snapshot falls in this catch-all stripe - rep_->snapshots_.emplace_back(kMaxSequenceNumber); - rep_->pinned_iters_mgr_.StartPinning(); +void ForwardRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); } -std::unique_ptr RangeDelAggregator::NewRangeDelMap() { - RangeDelMap* tombstone_map; - if (collapse_deletions_) { - tombstone_map = new CollapsedRangeDelMap(&icmp_); - } else { - tombstone_map = new UncollapsedRangeDelMap(&icmp_); +ReverseRangeDelIterator::ReverseRangeDelIterator( + const InternalKeyComparator* icmp, + const std::vector>* iters) + : icmp_(icmp), + iters_(iters), + unused_idx_(0), + active_seqnums_(SeqMaxComparator()), + active_iters_(StartKeyMaxComparator(icmp)), + inactive_iters_(EndKeyMaxComparator(icmp)) {} + +bool ReverseRangeDelIterator::ShouldDelete(const 
ParsedInternalKey& parsed) { + assert(iters_ != nullptr); + // Move active iterators that start after parsed. + while (!active_iters_.empty() && + icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { + TruncatedRangeDelIterator* iter = PopActiveIter(); + do { + iter->Prev(); + } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + // Move inactive iterators that end after parsed. + while (!inactive_iters_.empty() && + icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) { + TruncatedRangeDelIterator* iter = PopInactiveIter(); + while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) { + iter->Prev(); + } + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); } - return std::unique_ptr(tombstone_map); -} -bool RangeDelAggregator::ShouldDeleteImpl(const Slice& internal_key, - RangeDelPositioningMode mode) { - assert(rep_ != nullptr); - ParsedInternalKey parsed; - if (!ParseInternalKey(internal_key, &parsed)) { - assert(false); - return false; - } - return ShouldDeleteImpl(parsed, mode); + return active_seqnums_.empty() + ? false + : (*active_seqnums_.begin())->seq() > parsed.sequence; } -bool RangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) { - assert(IsValueType(parsed.type)); - assert(rep_ != nullptr); - auto* tombstone_map = GetRangeDelMapIfExists(parsed.sequence); - if (tombstone_map == nullptr || tombstone_map->IsEmpty()) { - return false; - } - return tombstone_map->ShouldDelete(parsed, mode); +void ReverseRangeDelIterator::Invalidate() { + unused_idx_ = 0; + active_iters_.clear(); + active_seqnums_.clear(); + inactive_iters_.clear(); } -bool RangeDelAggregator::IsRangeOverlapped(const Slice& start, - const Slice& end) { - // Unimplemented because the only client of this method, file ingestion, - // uses uncollapsed maps. 
- assert(!collapse_deletions_); - if (rep_ == nullptr) { +bool RangeDelAggregator::StripeRep::ShouldDelete( + const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { + if (!InStripe(parsed.sequence) || IsEmpty()) { return false; } - ParsedInternalKey start_ikey(start, kMaxSequenceNumber, kMaxValue); - ParsedInternalKey end_ikey(end, 0, static_cast(0)); - for (const auto& stripe : rep_->stripe_map_) { - if (stripe.second.first->IsRangeOverlapped(start_ikey, end_ikey)) { - return true; - } + switch (mode) { + case RangeDelPositioningMode::kForwardTraversal: + InvalidateReverseIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); + it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { + auto& iter = *it; + forward_iter_.AddNewIter(iter.get(), parsed); + } + + return forward_iter_.ShouldDelete(parsed); + case RangeDelPositioningMode::kBackwardTraversal: + InvalidateForwardIter(); + + // Pick up previously unseen iterators. + for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx()); + it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) { + auto& iter = *it; + reverse_iter_.AddNewIter(iter.get(), parsed); + } + + return reverse_iter_.ShouldDelete(parsed); + default: + assert(false); + return false; } - return false; } -Status RangeDelAggregator::AddTombstones( - std::unique_ptr input, - const InternalKey* smallest, - const InternalKey* largest) { - if (input == nullptr) { - return Status::OK(); - } - input->SeekToFirst(); - bool first_iter = true; - while (input->Valid()) { - if (first_iter) { - if (rep_ == nullptr) { - InitRep({upper_bound_}); - } else { - InvalidateRangeDelMapPositions(); - } - first_iter = false; - } - ParsedInternalKey parsed_key; - bool parsed; - if (input->IsKeyPinned()) { - parsed = ParseInternalKey(input->key(), &parsed_key); - } else { - // The tombstone map holds slices into the iterator's memory. Make a - // copy of the key if it is not pinned. 
- rep_->pinned_slices_.emplace_back(input->key().data(), - input->key().size()); - parsed = ParseInternalKey(rep_->pinned_slices_.back(), &parsed_key); - } - if (!parsed) { - return Status::Corruption("Unable to parse range tombstone InternalKey"); - } - Slice end_user_key; - if (input->IsValuePinned()) { - end_user_key = input->value(); - } else { - // The tombstone map holds slices into the iterator's memory. Make a - // copy of the value if it is not pinned. - rep_->pinned_slices_.emplace_back(input->value().data(), - input->value().size()); - end_user_key = rep_->pinned_slices_.back(); - } - ParsedInternalKey start_key(parsed_key.user_key, kMaxSequenceNumber, - kMaxValue); - ParsedInternalKey end_key(end_user_key, kMaxSequenceNumber, kMaxValue); - // Truncate the tombstone to the range [smallest, largest]. - if (smallest != nullptr) { - ParsedInternalKey parsed_smallest; - if (ParseInternalKey(smallest->Encode(), &parsed_smallest) && - icmp_.Compare(start_key, parsed_smallest) < 0) { - start_key.user_key = parsed_smallest.user_key; - start_key.sequence = parsed_smallest.sequence; +bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start, + const Slice& end) { + Invalidate(); + + // Set the internal start/end keys so that: + // - if start_ikey has the same user key and sequence number as the + // current end key, start_ikey will be considered greater; and + // - if end_ikey has the same user key and sequence number as the current + // start key, end_ikey will be considered greater. 
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber, + static_cast(0)); + ParsedInternalKey end_ikey(end, 0, static_cast(0)); + for (auto& iter : iters_) { + bool checked_candidate_tombstones = false; + for (iter->SeekForPrev(start); + iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0; + iter->Next()) { + checked_candidate_tombstones = true; + if (icmp_->Compare(start_ikey, iter->end_key()) < 0 && + icmp_->Compare(iter->start_key(), end_ikey) <= 0) { + return true; } } - if (largest != nullptr) { - ParsedInternalKey parsed_largest; - if (ParseInternalKey(largest->Encode(), &parsed_largest) && - icmp_.Compare(end_key, parsed_largest) > 0) { - end_key.user_key = parsed_largest.user_key; - if (parsed_largest.sequence != kMaxSequenceNumber) { - // The same user key straddles two adjacent sstables. To make sure we - // can truncate to a range that includes the largest point key in the - // first sstable, set the tombstone end key's sequence number to 1 - // less than the largest key. - assert(parsed_largest.sequence != 0); - end_key.sequence = parsed_largest.sequence - 1; - } else { - // The SST file boundary was artificially extended by a range tombstone. - // We will not see any entries in this SST with this user key, so we - // can leave the seqnum at kMaxSequenceNumber. - } + + if (!checked_candidate_tombstones) { + // Do an additional check for when the end of the range is the begin + // key of a tombstone, which we missed earlier since SeekForPrev'ing + // to the start was invalid. 
+ iter->SeekForPrev(end); + if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 && + icmp_->Compare(iter->start_key(), end_ikey) <= 0) { + return true; } } - TruncatedRangeTombstone tombstone(start_key, end_key, parsed_key.sequence); - GetRangeDelMap(parsed_key.sequence).AddTombstone(std::move(tombstone)); - input->Next(); } - if (!first_iter) { - rep_->pinned_iters_mgr_.PinIterator(input.release(), false /* arena */); - } - return Status::OK(); + return false; } -void RangeDelAggregator::InvalidateRangeDelMapPositions() { - if (rep_ == nullptr) { +void ReadRangeDelAggregator::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { return; } - for (auto& stripe : rep_->stripe_map_) { - stripe.second.first->InvalidatePosition(); - } + rep_.AddTombstones( + std::unique_ptr(new TruncatedRangeDelIterator( + std::move(input_iter), icmp_, smallest, largest))); } -RangeDelMap* RangeDelAggregator::GetRangeDelMapIfExists(SequenceNumber seq) { - assert(rep_ != nullptr); - // The stripe includes seqnum for the snapshot above and excludes seqnum for - // the snapshot below. - if (rep_->stripe_map_.empty()) { - return nullptr; - } - StripeMap::iterator iter = rep_->stripe_map_.lower_bound(seq); - if (iter == rep_->stripe_map_.end()) { - return nullptr; - } - size_t snapshot_idx = iter->second.second; - if (snapshot_idx > 0 && seq <= rep_->snapshots_[snapshot_idx - 1]) { - return nullptr; - } - return iter->second.first.get(); +bool ReadRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { + return rep_.ShouldDelete(parsed, mode); } -RangeDelMap& RangeDelAggregator::GetRangeDelMap(SequenceNumber seq) { - assert(rep_ != nullptr); - // The stripe includes seqnum for the snapshot above and excludes seqnum for - // the snapshot below. 
- std::vector::iterator iter = - std::lower_bound(rep_->snapshots_.begin(), rep_->snapshots_.end(), seq); - // catch-all stripe justifies this assertion in either of above cases - assert(iter != rep_->snapshots_.end()); - if (rep_->stripe_map_.find(*iter) == rep_->stripe_map_.end()) { - rep_->stripe_map_.emplace( - *iter, - std::make_pair(NewRangeDelMap(), iter - rep_->snapshots_.begin())); - } - return *rep_->stripe_map_[*iter].first; +bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start, + const Slice& end) { + InvalidateRangeDelMapPositions(); + return rep_.IsRangeOverlapped(start, end); } -bool RangeDelAggregator::IsEmpty() { - if (rep_ == nullptr) { - return true; +void CompactionRangeDelAggregator::AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest, const InternalKey* largest) { + if (input_iter == nullptr || input_iter->empty()) { + return; } - for (const auto& stripe : rep_->stripe_map_) { - if (!stripe.second.first->IsEmpty()) { - return false; + assert(input_iter->lower_bound() == 0); + assert(input_iter->upper_bound() == kMaxSequenceNumber); + parent_iters_.emplace_back(new TruncatedRangeDelIterator( + std::move(input_iter), icmp_, smallest, largest)); + + auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_); + for (auto& split_iter : split_iters) { + auto it = reps_.find(split_iter.first); + if (it == reps_.end()) { + bool inserted; + SequenceNumber upper_bound = split_iter.second->upper_bound(); + SequenceNumber lower_bound = split_iter.second->lower_bound(); + std::tie(it, inserted) = reps_.emplace( + split_iter.first, StripeRep(icmp_, upper_bound, lower_bound)); + assert(inserted); } + assert(it != reps_.end()); + it->second.AddTombstones(std::move(split_iter.second)); } - return true; } -bool RangeDelAggregator::AddFile(uint64_t file_number) { - if (rep_ == nullptr) { - return true; +bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) { 
+ auto it = reps_.lower_bound(parsed.sequence); + if (it == reps_.end()) { + return false; } - return rep_->added_files_.emplace(file_number).second; + return it->second.ShouldDelete(parsed, mode); } -class MergingRangeDelIter : public RangeDelIterator { +namespace { + +class TruncatedRangeDelMergingIter : public InternalIterator { public: - MergingRangeDelIter(const Comparator* c) - : heap_(IterMinHeap(IterComparator(c))), current_(nullptr) {} - - void AddIterator(std::unique_ptr iter) { - if (iter->Valid()) { - heap_.push(iter.get()); - iters_.push_back(std::move(iter)); - current_ = heap_.top(); + TruncatedRangeDelMergingIter( + const InternalKeyComparator* icmp, const Slice* lower_bound, + const Slice* upper_bound, bool upper_bound_inclusive, + const std::vector>& children) + : icmp_(icmp), + lower_bound_(lower_bound), + upper_bound_(upper_bound), + upper_bound_inclusive_(upper_bound_inclusive), + heap_(StartKeyMinComparator(icmp)) { + for (auto& child : children) { + if (child != nullptr) { + assert(child->lower_bound() == 0); + assert(child->upper_bound() == kMaxSequenceNumber); + children_.push_back(child.get()); + } } } - bool Valid() const override { return current_ != nullptr; } + bool Valid() const override { + return !heap_.empty() && BeforeEndKey(heap_.top()); + } + Status status() const override { return Status::OK(); } + + void SeekToFirst() override { + heap_.clear(); + for (auto& child : children_) { + if (lower_bound_ != nullptr) { + child->Seek(*lower_bound_); + } else { + child->SeekToFirst(); + } + if (child->Valid()) { + heap_.push(child); + } + } + } void Next() override { - current_->Next(); - if (current_->Valid()) { - heap_.replace_top(current_); + auto* top = heap_.top(); + top->InternalNext(); + if (top->Valid()) { + heap_.replace_top(top); } else { heap_.pop(); } - current_ = heap_.empty() ? 
nullptr : heap_.top(); } - void Seek(const Slice& target) override { - ParsedInternalKey ikey(target, kMaxSequenceNumber, kMaxValue); - Seek(ikey); + Slice key() const override { + auto* top = heap_.top(); + cur_start_key_.Set(top->start_key().user_key, top->seq(), + kTypeRangeDeletion); + return cur_start_key_.Encode(); } - void Seek(const ParsedInternalKey& target) override { - heap_.clear(); - for (auto& iter : iters_) { - iter->Seek(target); - if (iter->Valid()) { - heap_.push(iter.get()); - } - } - current_ = heap_.empty() ? nullptr : heap_.top(); + Slice value() const override { + auto* top = heap_.top(); + assert(top->end_key().sequence == kMaxSequenceNumber); + return top->end_key().user_key; } - RangeTombstone Tombstone() const override { return current_->Tombstone(); } + // Unused InternalIterator methods + void Prev() override { assert(false); } + void Seek(const Slice& /* target */) override { assert(false); } + void SeekForPrev(const Slice& /* target */) override { assert(false); } + void SeekToLast() override { assert(false); } private: - struct IterComparator { - IterComparator(const Comparator* c) : cmp(c) {} - - bool operator()(const RangeDelIterator* a, - const RangeDelIterator* b) const { - // Note: counterintuitively, returning the tombstone with the larger start - // key puts the tombstone with the smallest key at the top of the heap. - return cmp->Compare(a->Tombstone().start_key_, - b->Tombstone().start_key_) > 0; + bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const { + if (upper_bound_ == nullptr) { + return true; } + int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key, + *upper_bound_); + return upper_bound_inclusive_ ? 
cmp <= 0 : cmp < 0; + } - const Comparator* cmp; - }; - - typedef BinaryHeap IterMinHeap; + const InternalKeyComparator* icmp_; + const Slice* lower_bound_; + const Slice* upper_bound_; + bool upper_bound_inclusive_; + BinaryHeap heap_; + std::vector children_; - std::vector> iters_; - IterMinHeap heap_; - RangeDelIterator* current_; + mutable InternalKey cur_start_key_; }; -std::unique_ptr RangeDelAggregator::NewIterator() { - std::unique_ptr iter( - new MergingRangeDelIter(icmp_.user_comparator())); - if (rep_ != nullptr) { - for (const auto& stripe : rep_->stripe_map_) { - iter->AddIterator(stripe.second.first->NewIterator()); - } - } - return std::move(iter); +} // namespace + +std::unique_ptr +CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound, + const Slice* upper_bound, + bool upper_bound_inclusive) { + InvalidateRangeDelMapPositions(); + std::unique_ptr merging_iter( + new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound, + upper_bound_inclusive, parent_iters_)); + + // TODO: add tests where tombstone fragments can be outside of upper and lower + // bound range + auto fragmented_tombstone_list = + std::make_shared( + std::move(merging_iter), *icmp_, true /* for_compaction */, + *snapshots_); + + return std::unique_ptr( + new FragmentedRangeTombstoneIterator( + fragmented_tombstone_list, *icmp_, + kMaxSequenceNumber /* upper_bound */)); } } // namespace rocksdb diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 8a89ec9f1..a59cbaf1b 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -1,10 +1,12 @@ -// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#pragma once +#include +#include #include #include #include @@ -14,220 +16,422 @@ #include "db/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/pinned_iterators_manager.h" +#include "db/range_del_aggregator.h" +#include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" #include "include/rocksdb/comparator.h" #include "include/rocksdb/types.h" #include "table/internal_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/table_builder.h" +#include "util/heap.h" #include "util/kv_map.h" namespace rocksdb { -// RangeDelMaps maintain position across calls to ShouldDelete. The caller may -// wish to specify a mode to optimize positioning the iterator during the next -// call to ShouldDelete. The non-kFullScan modes are only available when -// deletion collapsing is enabled. -// -// For example, if we invoke Next() on an iterator, kForwardTraversal should be -// specified to advance one-by-one through deletions until one is found with its -// interval containing the key. This will typically be faster than doing a full -// binary search (kBinarySearch). -enum class RangeDelPositioningMode { - kFullScan, // used iff collapse_deletions_ == false - kForwardTraversal, - kBackwardTraversal, - kBinarySearch, +class TruncatedRangeDelIterator { + public: + TruncatedRangeDelIterator( + std::unique_ptr iter, + const InternalKeyComparator* icmp, const InternalKey* smallest, + const InternalKey* largest); + + bool Valid() const; + + void Next(); + void Prev(); + + void InternalNext(); + + // Seeks to the tombstone with the highest visible sequence number that covers + // target (a user key). If no such tombstone exists, the position will be at + // the earliest tombstone that ends after target. + void Seek(const Slice& target); + + // Seeks to the tombstone with the highest visible sequence number that covers + // target (a user key).
If no such tombstone exists, the position will be at + // the latest tombstone that starts before target. + void SeekForPrev(const Slice& target); + + void SeekToFirst(); + void SeekToLast(); + + ParsedInternalKey start_key() const { + return (smallest_ == nullptr || + icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0) + ? iter_->parsed_start_key() + : *smallest_; + } + + ParsedInternalKey end_key() const { + return (largest_ == nullptr || + icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0) + ? iter_->parsed_end_key() + : *largest_; + } + + SequenceNumber seq() const { return iter_->seq(); } + + std::map> + SplitBySnapshot(const std::vector& snapshots); + + SequenceNumber upper_bound() const { return iter_->upper_bound(); } + + SequenceNumber lower_bound() const { return iter_->lower_bound(); } + + private: + std::unique_ptr iter_; + const InternalKeyComparator* icmp_; + const ParsedInternalKey* smallest_ = nullptr; + const ParsedInternalKey* largest_ = nullptr; + std::list pinned_bounds_; + + const InternalKey* smallest_ikey_; + const InternalKey* largest_ikey_; +}; + +struct SeqMaxComparator { + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return a->seq() > b->seq(); + } }; -// TruncatedRangeTombstones are a slight generalization of regular -// RangeTombstones that can represent truncations caused by SST boundaries. -// Instead of using user keys to represent the start and end keys, they instead -// use internal keys, whose sequence number indicates the sequence number of -// the smallest/largest SST key (in the case where a tombstone is untruncated, -// the sequence numbers will be kMaxSequenceNumber for both start and end -// keys). Like RangeTombstones, TruncatedRangeTombstone are also -// end-key-exclusive. 
-struct TruncatedRangeTombstone { - TruncatedRangeTombstone(const ParsedInternalKey& sk, - const ParsedInternalKey& ek, SequenceNumber s) - : start_key_(sk), end_key_(ek), seq_(s) {} - - RangeTombstone Tombstone() const { - // The RangeTombstone returned here can cover less than the - // TruncatedRangeTombstone when its end key has a seqnum that is not - // kMaxSequenceNumber. Since this method is only used by RangeDelIterators - // (which in turn are only used during flush/compaction), we avoid this - // problem by using truncation boundaries spanning multiple SSTs, which - // are selected in a way that guarantee a clean break at the end key. - assert(end_key_.sequence == kMaxSequenceNumber); - return RangeTombstone(start_key_.user_key, end_key_.user_key, seq_); - } - - ParsedInternalKey start_key_; - ParsedInternalKey end_key_; - SequenceNumber seq_; +struct StartKeyMinComparator { + explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return icmp->Compare(a->start_key(), b->start_key()) > 0; + } + + const InternalKeyComparator* icmp; }; -// A RangeDelIterator iterates over range deletion tombstones. -class RangeDelIterator { +class ForwardRangeDelIterator { public: - virtual ~RangeDelIterator() = default; - - virtual bool Valid() const = 0; - virtual void Next() = 0; - // NOTE: the Slice passed to this method must be a user key. 
- virtual void Seek(const Slice& target) = 0; - virtual void Seek(const ParsedInternalKey& target) = 0; - virtual RangeTombstone Tombstone() const = 0; + ForwardRangeDelIterator( + const InternalKeyComparator* icmp, + const std::vector>* iters); + + bool ShouldDelete(const ParsedInternalKey& parsed); + void Invalidate(); + + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->Seek(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + + private: + using ActiveSeqSet = + std::multiset; + + struct EndKeyMinComparator { + explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const ActiveSeqSet::const_iterator& a, + const ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0; + } + + const InternalKeyComparator* icmp; + }; + + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. 
+ return; + } + int cmp = icmp_->Compare(parsed, iter->start_key()); + if (cmp < 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } + + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + const std::vector>* iters_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; }; -// A RangeDelMap keeps track of range deletion tombstones within a snapshot -// stripe. -// -// RangeDelMaps are used internally by RangeDelAggregator. They are not intended -// to be used directly. 
-class RangeDelMap { +class ReverseRangeDelIterator { public: - virtual ~RangeDelMap() = default; + ReverseRangeDelIterator( + const InternalKeyComparator* icmp, + const std::vector>* iters); - virtual bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) = 0; - virtual bool IsRangeOverlapped(const ParsedInternalKey& start, - const ParsedInternalKey& end) = 0; - virtual void InvalidatePosition() = 0; + bool ShouldDelete(const ParsedInternalKey& parsed); + void Invalidate(); + + void AddNewIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + iter->SeekForPrev(parsed.user_key); + PushIter(iter, parsed); + assert(active_iters_.size() == active_seqnums_.size()); + } + + size_t UnusedIdx() const { return unused_idx_; } + void IncUnusedIdx() { unused_idx_++; } + + private: + using ActiveSeqSet = + std::multiset; + + struct EndKeyMaxComparator { + explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const TruncatedRangeDelIterator* a, + const TruncatedRangeDelIterator* b) const { + return icmp->Compare(a->end_key(), b->end_key()) < 0; + } + + const InternalKeyComparator* icmp; + }; + struct StartKeyMaxComparator { + explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} + + bool operator()(const ActiveSeqSet::const_iterator& a, + const ActiveSeqSet::const_iterator& b) const { + return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0; + } - virtual size_t Size() const = 0; - bool IsEmpty() const { return Size() == 0; } + const InternalKeyComparator* icmp; + }; + + void PushIter(TruncatedRangeDelIterator* iter, + const ParsedInternalKey& parsed) { + if (!iter->Valid()) { + // The iterator has been fully consumed, so we don't need to add it to + // either of the heaps. 
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) { + PushInactiveIter(iter); + } else { + PushActiveIter(iter); + } + } + + void PushActiveIter(TruncatedRangeDelIterator* iter) { + auto seq_pos = active_seqnums_.insert(iter); + active_iters_.push(seq_pos); + } + + TruncatedRangeDelIterator* PopActiveIter() { + auto active_top = active_iters_.top(); + auto iter = *active_top; + active_iters_.pop(); + active_seqnums_.erase(active_top); + return iter; + } - virtual void AddTombstone(TruncatedRangeTombstone tombstone) = 0; - virtual std::unique_ptr NewIterator() = 0; + void PushInactiveIter(TruncatedRangeDelIterator* iter) { + inactive_iters_.push(iter); + } + + TruncatedRangeDelIterator* PopInactiveIter() { + auto* iter = inactive_iters_.top(); + inactive_iters_.pop(); + return iter; + } + + const InternalKeyComparator* icmp_; + const std::vector>* iters_; + size_t unused_idx_; + ActiveSeqSet active_seqnums_; + BinaryHeap active_iters_; + BinaryHeap inactive_iters_; }; -// A RangeDelAggregator aggregates range deletion tombstones as they are -// encountered in memtables/SST files. It provides methods that check whether a -// key is covered by range tombstones or write the relevant tombstones to a new -// SST file. +enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal }; class RangeDelAggregator { public: - // @param snapshots These are used to organize the tombstones into snapshot - // stripes, which is the seqnum range between consecutive snapshots, - // including the higher snapshot and excluding the lower one. Currently, - // this is used by ShouldDelete() to prevent deletion of keys that are - // covered by range tombstones in other snapshot stripes. This constructor - // is used for writes (flush/compaction). All DB snapshots are provided - // such that no keys are removed that are uncovered according to any DB - // snapshot. - // Note this overload does not lazily initialize Rep. 
- RangeDelAggregator(const InternalKeyComparator& icmp, - const std::vector& snapshots, - bool collapse_deletions = true); - - // @param upper_bound Similar to snapshots above, except with a single - // snapshot, which allows us to store the snapshot on the stack and defer - // initialization of heap-allocating members (in Rep) until the first range - // deletion is encountered. This constructor is used in case of reads (get/ - // iterator), for which only the user snapshot (upper_bound) is provided - // such that the seqnum space is divided into two stripes. Only the older - // stripe will be used by ShouldDelete(). - RangeDelAggregator(const InternalKeyComparator& icmp, - SequenceNumber upper_bound, - bool collapse_deletions = false); - - // Returns whether the key should be deleted, which is the case when it is - // covered by a range tombstone residing in the same snapshot stripe. - // @param mode If collapse_deletions_ is true, this dictates how we will find - // the deletion whose interval contains this key. Otherwise, its - // value must be kFullScan indicating linear scan from beginning. 
- bool ShouldDelete( - const ParsedInternalKey& parsed, - RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) { - if (rep_ == nullptr) { + explicit RangeDelAggregator(const InternalKeyComparator* icmp) + : icmp_(icmp) {} + virtual ~RangeDelAggregator() {} + + virtual void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) = 0; + + bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { + ParsedInternalKey parsed; + if (!ParseInternalKey(key, &parsed)) { return false; } - return ShouldDeleteImpl(parsed, mode); + return ShouldDelete(parsed, mode); } - bool ShouldDelete( - const Slice& internal_key, - RangeDelPositioningMode mode = RangeDelPositioningMode::kFullScan) { - if (rep_ == nullptr) { - return false; + virtual bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) = 0; + + virtual void InvalidateRangeDelMapPositions() = 0; + + virtual bool IsEmpty() const = 0; + + bool AddFile(uint64_t file_number) { + return files_seen_.insert(file_number).second; + } + + protected: + class StripeRep { + public: + StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, + SequenceNumber lower_bound) + : icmp_(icmp), + forward_iter_(icmp, &iters_), + reverse_iter_(icmp, &iters_), + upper_bound_(upper_bound), + lower_bound_(lower_bound) {} + + void AddTombstones(std::unique_ptr input_iter) { + iters_.push_back(std::move(input_iter)); } - return ShouldDeleteImpl(internal_key, mode); - } - bool ShouldDeleteImpl(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode); - bool ShouldDeleteImpl(const Slice& internal_key, - RangeDelPositioningMode mode); - - // Checks whether range deletions cover any keys between `start` and `end`, - // inclusive. - // - // @param start User key representing beginning of range to check for overlap. - // @param end User key representing end of range to check for overlap. 
This - // argument is inclusive, so the existence of a range deletion covering - // `end` causes this to return true. + + bool IsEmpty() const { return iters_.empty(); } + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode); + + void Invalidate() { + InvalidateForwardIter(); + InvalidateReverseIter(); + } + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + private: + bool InStripe(SequenceNumber seq) const { + return lower_bound_ <= seq && seq <= upper_bound_; + } + + void InvalidateForwardIter() { forward_iter_.Invalidate(); } + + void InvalidateReverseIter() { reverse_iter_.Invalidate(); } + + const InternalKeyComparator* icmp_; + std::vector> iters_; + ForwardRangeDelIterator forward_iter_; + ReverseRangeDelIterator reverse_iter_; + SequenceNumber upper_bound_; + SequenceNumber lower_bound_; + }; + + const InternalKeyComparator* icmp_; + + private: + std::set files_seen_; +}; + +class ReadRangeDelAggregator : public RangeDelAggregator { + public: + ReadRangeDelAggregator(const InternalKeyComparator* icmp, + SequenceNumber upper_bound) + : RangeDelAggregator(icmp), + rep_(icmp, upper_bound, 0 /* lower_bound */) {} + ~ReadRangeDelAggregator() override {} + + using RangeDelAggregator::ShouldDelete; + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + bool IsRangeOverlapped(const Slice& start, const Slice& end); - // Adds tombstones to the tombstone aggregation structure maintained by this - // object. Tombstones are truncated to smallest and largest. If smallest (or - // largest) is null, it is not used for truncation. When adding range - // tombstones present in an sstable, smallest and largest should be set to - // the smallest and largest keys from the sstable file metadata. 
Note that - // tombstones end keys are exclusive while largest is inclusive. - // @return non-OK status if any of the tombstone keys are corrupted. - Status AddTombstones(std::unique_ptr input, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr); - - // Resets iterators maintained across calls to ShouldDelete(). This may be - // called when the tombstones change, or the owner may call explicitly, e.g., - // if it's an iterator that just seeked to an arbitrary position. The effect - // of invalidation is that the following call to ShouldDelete() will binary - // search for its tombstone. - void InvalidateRangeDelMapPositions(); - - bool IsEmpty(); - bool AddFile(uint64_t file_number); - - // Create a new iterator over the range deletion tombstones in all of the - // snapshot stripes in this aggregator. Tombstones are presented in start key - // order. Tombstones with the same start key are presented in arbitrary order. - // - // The iterator is invalidated after any call to AddTombstones. It is the - // caller's responsibility to avoid using invalid iterators. - std::unique_ptr NewIterator(); + void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); } + + bool IsEmpty() const override { return rep_.IsEmpty(); } private: - // Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e., - // their seqnums are greater than the next smaller snapshot's seqnum, and the - // corresponding index into the list of snapshots. Each entry is lazily - // initialized. - typedef std::map, size_t>> - StripeMap; - - struct Rep { - std::vector snapshots_; - StripeMap stripe_map_; - PinnedIteratorsManager pinned_iters_mgr_; - std::list pinned_slices_; - std::set added_files_; - }; - // Initializes rep_ lazily. This aggregator object is constructed for every - // read, so expensive members should only be created when necessary, i.e., - // once the first range deletion is encountered. 
- void InitRep(const std::vector& snapshots); - - std::unique_ptr NewRangeDelMap(); - RangeDelMap* GetRangeDelMapIfExists(SequenceNumber seq); - RangeDelMap& GetRangeDelMap(SequenceNumber seq); - - SequenceNumber upper_bound_; - std::unique_ptr rep_; - const InternalKeyComparator& icmp_; - // collapse range deletions so they're binary searchable - const bool collapse_deletions_; + StripeRep rep_; +}; + +class CompactionRangeDelAggregator : public RangeDelAggregator { + public: + CompactionRangeDelAggregator(const InternalKeyComparator* icmp, + const std::vector& snapshots) + : RangeDelAggregator(icmp), snapshots_(&snapshots) {} + ~CompactionRangeDelAggregator() override {} + + void AddTombstones( + std::unique_ptr input_iter, + const InternalKey* smallest = nullptr, + const InternalKey* largest = nullptr) override; + + using RangeDelAggregator::ShouldDelete; + bool ShouldDelete(const ParsedInternalKey& parsed, + RangeDelPositioningMode mode) override; + + bool IsRangeOverlapped(const Slice& start, const Slice& end); + + void InvalidateRangeDelMapPositions() override { + for (auto& rep : reps_) { + rep.second.Invalidate(); + } + } + + bool IsEmpty() const override { + for (const auto& rep : reps_) { + if (!rep.second.IsEmpty()) { + return false; + } + } + return true; + } + + // Creates an iterator over all the range tombstones in the aggregator, for + // use in compaction. Nullptr arguments indicate that the iterator range is + // unbounded. + // NOTE: the boundaries are used for optimization purposes to reduce the + // number of tombstones that are passed to the fragmenter; they do not + // guarantee that the resulting iterator only contains range tombstones that + // cover keys in the provided range. If required, these bounds must be + // enforced during iteration. 
+ std::unique_ptr NewIterator( + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, + bool upper_bound_inclusive = false); + + private: + std::vector> parent_iters_; + std::map reps_; + + const std::vector* snapshots_; }; } // namespace rocksdb diff --git a/db/range_del_aggregator_bench.cc b/db/range_del_aggregator_bench.cc index 0b8260960..7ecdbc5af 100644 --- a/db/range_del_aggregator_bench.cc +++ b/db/range_del_aggregator_bench.cc @@ -20,7 +20,6 @@ int main() { #include #include "db/range_del_aggregator.h" -#include "db/range_del_aggregator_v2.h" #include "db/range_tombstone_fragmenter.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -48,8 +47,6 @@ DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width"); DEFINE_double(tombstone_width_stddev, 0.0, "standard deviation of range tombstone width"); -DEFINE_bool(use_collapsed, true, "use the collapsed range tombstone map"); - DEFINE_int32(seed, 0, "random number generator seed"); DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); @@ -57,8 +54,6 @@ DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run"); DEFINE_int32(add_tombstones_per_run, 1, "number of AddTombstones calls per run"); -DEFINE_bool(use_v2_aggregator, false, "benchmark RangeDelAggregatorV2"); - namespace { struct Stats { @@ -187,14 +182,10 @@ int main(int argc, char** argv) { std::vector( FLAGS_num_range_tombstones); } - auto mode = FLAGS_use_collapsed - ? 
rocksdb::RangeDelPositioningMode::kForwardTraversal - : rocksdb::RangeDelPositioningMode::kFullScan; + auto mode = rocksdb::RangeDelPositioningMode::kForwardTraversal; for (int i = 0; i < FLAGS_num_runs; i++) { - rocksdb::RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, - FLAGS_use_collapsed); - rocksdb::ReadRangeDelAggregatorV2 range_del_agg_v2( + rocksdb::ReadRangeDelAggregator range_del_agg( &icmp, rocksdb::kMaxSequenceNumber /* upper_bound */); std::vector > @@ -223,17 +214,10 @@ int main(int argc, char** argv) { fragmented_range_tombstone_lists.back().get(), icmp, rocksdb::kMaxSequenceNumber)); - if (FLAGS_use_v2_aggregator) { - rocksdb::StopWatchNano stop_watch_add_tombstones( - rocksdb::Env::Default(), true /* auto_start */); - range_del_agg_v2.AddTombstones(std::move(fragmented_range_del_iter)); - stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); - } else { - rocksdb::StopWatchNano stop_watch_add_tombstones( - rocksdb::Env::Default(), true /* auto_start */); - range_del_agg.AddTombstones(std::move(range_del_iter)); - stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); - } + rocksdb::StopWatchNano stop_watch_add_tombstones(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.AddTombstones(std::move(fragmented_range_del_iter)); + stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos(); } rocksdb::ParsedInternalKey parsed_key; @@ -247,18 +231,10 @@ int main(int argc, char** argv) { std::string key_string = rocksdb::Key(first_key + j); parsed_key.user_key = key_string; - uint64_t call_time; - if (FLAGS_use_v2_aggregator) { - rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), - true /* auto_start */); - range_del_agg_v2.ShouldDelete(parsed_key, mode); - call_time = stop_watch_should_delete.ElapsedNanos(); - } else { - rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), - true /* auto_start */); - range_del_agg.ShouldDelete(parsed_key, mode); - 
call_time = stop_watch_should_delete.ElapsedNanos(); - } + rocksdb::StopWatchNano stop_watch_should_delete(rocksdb::Env::Default(), + true /* auto_start */); + range_del_agg.ShouldDelete(parsed_key, mode); + uint64_t call_time = stop_watch_should_delete.ElapsedNanos(); if (j == 0) { stats.time_first_should_delete += call_time; diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 2cfc6540e..28c8129ec 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -1,13 +1,17 @@ -// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. +// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include +#include "db/range_del_aggregator.h" + +#include +#include +#include #include "db/db_test_util.h" -#include "db/range_del_aggregator.h" -#include "rocksdb/comparator.h" +#include "db/dbformat.h" +#include "db/range_tombstone_fragmenter.h" #include "util/testutil.h" namespace rocksdb { @@ -16,452 +20,685 @@ class RangeDelAggregatorTest : public testing::Test {}; namespace { -struct ExpectedPoint { - Slice begin; - SequenceNumber seq; - bool expectAlive; -}; - -enum Direction { - kForward, - kReverse, -}; - -struct AddTombstonesArgs { - const std::vector tombstones; - const InternalKey* smallest; - const InternalKey* largest; -}; - static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); -void AddTombstones(RangeDelAggregator* range_del_agg, - const std::vector& range_dels, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) { +std::unique_ptr MakeRangeDelIter( + const std::vector& range_dels) { std::vector keys, values; for (const auto& range_del : range_dels) { auto key_and_value = range_del.Serialize(); keys.push_back(key_and_value.first.Encode().ToString()); 
values.push_back(key_and_value.second.ToString()); } - std::unique_ptr range_del_iter( + return std::unique_ptr( new test::VectorIterator(keys, values)); - range_del_agg->AddTombstones(std::move(range_del_iter), smallest, largest); } -void VerifyTombstonesEq(const RangeTombstone& a, const RangeTombstone& b) { - ASSERT_EQ(a.seq_, b.seq_); - ASSERT_EQ(a.start_key_, b.start_key_); - ASSERT_EQ(a.end_key_, b.end_key_); -} - -void VerifyRangeDelIter( - RangeDelIterator* range_del_iter, - const std::vector& expected_range_dels) { - size_t i = 0; - for (; range_del_iter->Valid(); range_del_iter->Next(), i++) { - VerifyTombstonesEq(expected_range_dels[i], range_del_iter->Tombstone()); +std::vector> +MakeFragmentedTombstoneLists( + const std::vector>& range_dels_list) { + std::vector> fragment_lists; + for (const auto& range_dels : range_dels_list) { + auto range_del_iter = MakeRangeDelIter(range_dels); + fragment_lists.emplace_back(new FragmentedRangeTombstoneList( + std::move(range_del_iter), bytewise_icmp)); } - ASSERT_EQ(expected_range_dels.size(), i); - ASSERT_FALSE(range_del_iter->Valid()); + return fragment_lists; } -void VerifyRangeDels( - const std::vector& all_args, - const std::vector& expected_points, - const std::vector& expected_collapsed_range_dels, - const InternalKeyComparator& icmp = bytewise_icmp) { - // Test same result regardless of which order the range deletions are added - // and regardless of collapsed mode. 
- for (bool collapsed : {false, true}) { - for (Direction dir : {kForward, kReverse}) { - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, collapsed); - std::vector all_range_dels; - - for (const auto& args : all_args) { - std::vector range_dels = args.tombstones; - if (dir == kReverse) { - std::reverse(range_dels.begin(), range_dels.end()); - } - all_range_dels.insert(all_range_dels.end(), range_dels.begin(), - range_dels.end()); - AddTombstones(&range_del_agg, range_dels, args.smallest, args.largest); - } - - auto mode = RangeDelPositioningMode::kFullScan; - if (collapsed) { - mode = RangeDelPositioningMode::kForwardTraversal; - } - - for (const auto expected_point : expected_points) { - ParsedInternalKey parsed_key; - parsed_key.user_key = expected_point.begin; - parsed_key.sequence = expected_point.seq; - parsed_key.type = kTypeValue; - std::string ikey; - AppendInternalKey(&ikey, parsed_key); - ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); - if (parsed_key.sequence > 0) { - --parsed_key.sequence; - ikey.clear(); - AppendInternalKey(&ikey, parsed_key); - if (expected_point.expectAlive) { - ASSERT_FALSE(range_del_agg.ShouldDelete(ikey, mode)); - } else { - ASSERT_TRUE(range_del_agg.ShouldDelete(ikey, mode)); - } - } - } - - if (collapsed) { - all_range_dels = expected_collapsed_range_dels; - VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); - } else if (all_args.size() == 1 && all_args[0].smallest == nullptr && - all_args[0].largest == nullptr) { - // Tombstones in an uncollapsed map are presented in start key - // order. Tombstones with the same start key are presented in - // insertion order. We don't handle tombstone truncation here, so the - // verification is only performed if no truncation was requested. 
- std::stable_sort(all_range_dels.begin(), all_range_dels.end(), - [&](const RangeTombstone& a, const RangeTombstone& b) { - return icmp.user_comparator()->Compare( - a.start_key_, b.start_key_) < 0; - }); - VerifyRangeDelIter(range_del_agg.NewIterator().get(), all_range_dels); - } - } - } - - RangeDelAggregator range_del_agg(icmp, {} /* snapshots */, - false /* collapse_deletions */); - for (const auto& args : all_args) { - AddTombstones(&range_del_agg, args.tombstones, args.smallest, args.largest); - } - for (size_t i = 1; i < expected_points.size(); ++i) { - bool overlapped = range_del_agg.IsRangeOverlapped( - expected_points[i - 1].begin, expected_points[i].begin); - if (expected_points[i - 1].seq > 0 || expected_points[i].seq > 0) { - ASSERT_TRUE(overlapped); - } else { - ASSERT_FALSE(overlapped); - } - } -} - -} // anonymous namespace - -TEST_F(RangeDelAggregatorTest, Empty) { VerifyRangeDels({}, {{"a", 0}}, {}); } - -TEST_F(RangeDelAggregatorTest, SameStartAndEnd) { - VerifyRangeDels({{{{"a", "a", 5}}}}, {{" ", 0}, {"a", 0}, {"b", 0}}, {}); -} - -TEST_F(RangeDelAggregatorTest, Single) { - VerifyRangeDels({{{{"a", "b", 10}}}}, {{" ", 0}, {"a", 10}, {"b", 0}}, - {{"a", "b", 10}}); -} - -TEST_F(RangeDelAggregatorTest, OverlapAboveLeft) { - VerifyRangeDels({{{{"a", "c", 10}, {"b", "d", 5}}}}, - {{" ", 0}, {"a", 10}, {"c", 5}, {"d", 0}}, - {{"a", "c", 10}, {"c", "d", 5}}); -} - -TEST_F(RangeDelAggregatorTest, OverlapAboveRight) { - VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 10}}}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, - {{"a", "b", 5}, {"b", "d", 10}}); -} - -TEST_F(RangeDelAggregatorTest, OverlapAboveMiddle) { - VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 10}}}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 5}, {"d", 0}}, - {{"a", "b", 5}, {"b", "c", 10}, {"c", "d", 5}}); -} - -TEST_F(RangeDelAggregatorTest, OverlapAboveMiddleReverse) { - VerifyRangeDels({{{{"d", "a", 5}, {"c", "b", 10}}}}, - {{"z", 0}, {"d", 5}, {"c", 10}, {"b", 5}, {"a", 0}}, - {{"d", 
"c", 5}, {"c", "b", 10}, {"b", "a", 5}}, - InternalKeyComparator(ReverseBytewiseComparator())); -} - -TEST_F(RangeDelAggregatorTest, OverlapFully) { - VerifyRangeDels({{{{"a", "d", 10}, {"b", "c", 5}}}}, - {{" ", 0}, {"a", 10}, {"d", 0}}, {{"a", "d", 10}}); -} +struct TruncatedIterScanTestCase { + ParsedInternalKey start; + ParsedInternalKey end; + SequenceNumber seq; +}; -TEST_F(RangeDelAggregatorTest, OverlapPoint) { - VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 10}}}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"c", 0}}, - {{"a", "b", 5}, {"b", "c", 10}}); -} +struct TruncatedIterSeekTestCase { + Slice target; + ParsedInternalKey start; + ParsedInternalKey end; + SequenceNumber seq; + bool invalid; +}; -TEST_F(RangeDelAggregatorTest, SameStartKey) { - VerifyRangeDels({{{{"a", "c", 5}, {"a", "b", 10}}}}, - {{" ", 0}, {"a", 10}, {"b", 5}, {"c", 0}}, - {{"a", "b", 10}, {"b", "c", 5}}); -} +struct ShouldDeleteTestCase { + ParsedInternalKey lookup_key; + bool result; +}; -TEST_F(RangeDelAggregatorTest, SameEndKey) { - VerifyRangeDels({{{{"a", "d", 5}, {"b", "d", 10}}}}, - {{" ", 0}, {"a", 5}, {"b", 10}, {"d", 0}}, - {{"a", "b", 5}, {"b", "d", 10}}); -} +struct IsRangeOverlappedTestCase { + Slice start; + Slice end; + bool result; +}; -TEST_F(RangeDelAggregatorTest, GapsBetweenRanges) { - VerifyRangeDels({{{{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}}}, - {{" ", 0}, - {"a", 5}, - {"b", 0}, - {"c", 10}, - {"d", 0}, - {"da", 0}, - {"e", 15}, - {"f", 0}}, - {{"a", "b", 5}, {"c", "d", 10}, {"e", "f", 15}}); +ParsedInternalKey UncutEndpoint(const Slice& s) { + return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion); } -TEST_F(RangeDelAggregatorTest, IdenticalSameSeqNo) { - VerifyRangeDels({{{{"a", "b", 5}, {"a", "b", 5}}}}, - {{" ", 0}, {"a", 5}, {"b", 0}}, - {{"a", "b", 5}}); +ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) { + return ParsedInternalKey(key, seq, kTypeValue); } -TEST_F(RangeDelAggregatorTest, ContiguousSameSeqNo) { - 
VerifyRangeDels({{{{"a", "b", 5}, {"b", "c", 5}}}}, - {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 0}}, - {{"a", "c", 5}}); +void VerifyIterator( + TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, + const std::vector& expected_range_dels) { + // Test forward iteration. + iter->SeekToFirst(); + for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end)); + EXPECT_EQ(expected_range_dels[i].seq, iter->seq()); + } + EXPECT_FALSE(iter->Valid()); + + // Test reverse iteration. + iter->SeekToLast(); + std::vector reverse_expected_range_dels( + expected_range_dels.rbegin(), expected_range_dels.rend()); + for (size_t i = 0; i < reverse_expected_range_dels.size(); + i++, iter->Prev()) { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), + reverse_expected_range_dels[i].start)); + EXPECT_EQ( + 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end)); + EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq()); + } + EXPECT_FALSE(iter->Valid()); } -TEST_F(RangeDelAggregatorTest, OverlappingSameSeqNo) { - VerifyRangeDels({{{{"a", "c", 5}, {"b", "d", 5}}}}, - {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 5}, {"d", 0}}, - {{"a", "d", 5}}); +void VerifySeek(TruncatedRangeDelIterator* iter, + const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->Seek(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); + } + } } -TEST_F(RangeDelAggregatorTest, CoverSameSeqNo) { - VerifyRangeDels({{{{"a", "d", 5}, {"b", "c", 5}}}}, - {{" ", 0}, {"a", 5}, {"b", 5}, {"c", 
5}, {"d", 0}}, - {{"a", "d", 5}}); +void VerifySeekForPrev( + TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + iter->SeekForPrev(test_case.target); + if (test_case.invalid) { + ASSERT_FALSE(iter->Valid()); + } else { + ASSERT_TRUE(iter->Valid()); + EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); + EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); + EXPECT_EQ(test_case.seq, iter->seq()); + } + } } -// Note the Cover* tests also test cases where tombstones are inserted under a -// larger one when VerifyRangeDels() runs them in reverse -TEST_F(RangeDelAggregatorTest, CoverMultipleFromLeft) { - VerifyRangeDels( - {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"a", "f", 20}}}}, - {{" ", 0}, {"a", 20}, {"f", 15}, {"g", 0}}, - {{"a", "f", 20}, {"f", "g", 15}}); +void VerifyShouldDelete(RangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal)); + } + for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) { + const auto& test_case = *it; + EXPECT_EQ( + test_case.result, + range_del_agg->ShouldDelete( + test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal)); + } } -TEST_F(RangeDelAggregatorTest, CoverMultipleFromRight) { - VerifyRangeDels( - {{{{"b", "d", 5}, {"c", "f", 10}, {"e", "g", 15}, {"c", "h", 20}}}}, - {{" ", 0}, {"b", 5}, {"c", 20}, {"h", 0}}, - {{"b", "c", 5}, {"c", "h", 20}}); +void VerifyIsRangeOverlapped( + ReadRangeDelAggregator* range_del_agg, + const std::vector& test_cases) { + for (const auto& test_case : test_cases) { + EXPECT_EQ(test_case.result, + range_del_agg->IsRangeOverlapped(test_case.start, test_case.end)); + } } -TEST_F(RangeDelAggregatorTest, CoverMultipleFully) { - VerifyRangeDels( - {{{{"b", "d", 5}, 
{"c", "f", 10}, {"e", "g", 15}, {"a", "h", 20}}}}, - {{" ", 0}, {"a", 20}, {"h", 0}}, {{"a", "h", 20}}); -} +void CheckIterPosition(const RangeTombstone& tombstone, + const FragmentedRangeTombstoneIterator* iter) { + // Test InternalIterator interface. + EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); + EXPECT_EQ(tombstone.end_key_, iter->value()); + EXPECT_EQ(tombstone.seq_, iter->seq()); -TEST_F(RangeDelAggregatorTest, AlternateMultipleAboveBelow) { - VerifyRangeDels( - {{{{"b", "d", 15}, {"c", "f", 10}, {"e", "g", 20}, {"a", "h", 5}}}}, - {{" ", 0}, {"a", 5}, {"b", 15}, {"d", 10}, {"e", 20}, {"g", 5}, {"h", 0}}, - {{"a", "b", 5}, - {"b", "d", 15}, - {"d", "e", 10}, - {"e", "g", 20}, - {"g", "h", 5}}); + // Test FragmentedRangeTombstoneIterator interface. + EXPECT_EQ(tombstone.start_key_, iter->start_key()); + EXPECT_EQ(tombstone.end_key_, iter->end_key()); + EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); } -TEST_F(RangeDelAggregatorTest, MergingIteratorAllEmptyStripes) { - for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(bytewise_icmp, {1, 2}, collapsed); - VerifyRangeDelIter(range_del_agg.NewIterator().get(), {}); +void VerifyFragmentedRangeDels( + FragmentedRangeTombstoneIterator* iter, + const std::vector& expected_tombstones) { + iter->SeekToFirst(); + for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { + ASSERT_TRUE(iter->Valid()); + CheckIterPosition(expected_tombstones[i], iter); } -} - -TEST_F(RangeDelAggregatorTest, MergingIteratorOverlappingStripes) { - for (bool collapsed : {true, false}) { - RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15, 25, 35}, collapsed); - AddTombstones( - &range_del_agg, - {{"d", "e", 10}, {"aa", "b", 20}, {"c", "d", 30}, {"a", "b", 10}}); - VerifyRangeDelIter( - range_del_agg.NewIterator().get(), - {{"a", "b", 10}, {"aa", "b", 20}, {"c", "d", 30}, {"d", "e", 10}}); + EXPECT_FALSE(iter->Valid()); +} + +} // namespace + 
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) { + auto range_del_iter = MakeRangeDelIter({}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + iter.SeekToFirst(); + ASSERT_FALSE(iter.Valid()); + + iter.SeekToLast(); + ASSERT_FALSE(iter.Valid()); +} + +TEST_F(RangeDelAggregatorTest, UntruncatedIter) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + 
FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + 9 /* snapshot */)); + + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, + nullptr); + + VerifyIterator(&iter, bytewise_icmp, + {{UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("d", 7, kTypeValue); + InternalKey largest("m", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("d", 7), UncutEndpoint("e"), 10}, + {UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {UncutEndpoint("j"), InternalValue("m", 8), 4}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), 
UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4}, + {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"", InternalValue("d", 7), UncutEndpoint("e"), 10}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, + {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, + {"n", UncutEndpoint("j"), InternalValue("m", 8), 4}, + {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); +} + +TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) { + auto range_del_iter = + MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + kMaxSequenceNumber)); + + InternalKey smallest("f", 7, kTypeValue); + InternalKey largest("i", 9, kTypeValue); + TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, + &smallest, &largest); + + VerifyIterator(&iter, bytewise_icmp, + {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); + + VerifySeek( + &iter, bytewise_icmp, + {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + + VerifySeekForPrev( + &iter, bytewise_icmp, + {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, + {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, + {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); +} + +TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) { + auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}}); + FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), + bytewise_icmp); + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, + 
kMaxSequenceNumber)); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + range_del_agg.AddTombstones(std::move(input_iter)); + + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); } -} -TEST_F(RangeDelAggregatorTest, MergingIteratorSeek) { - RangeDelAggregator range_del_agg(bytewise_icmp, {5, 15}, - true /* collapsed */); - AddTombstones(&range_del_agg, {{"a", "c", 10}, - {"b", "c", 11}, - {"f", "g", 10}, - {"c", "d", 20}, - {"e", "f", 20}}); - auto it = range_del_agg.NewIterator(); - - // Verify seek positioning. 
- it->Seek(""); - VerifyTombstonesEq(it->Tombstone(), {"a", "b", 10}); - it->Seek("a"); - VerifyTombstonesEq(it->Tombstone(), {"a", "b", 10}); - it->Seek("aa"); - VerifyTombstonesEq(it->Tombstone(), {"a", "b", 10}); - it->Seek("b"); - VerifyTombstonesEq(it->Tombstone(), {"b", "c", 11}); - it->Seek("c"); - VerifyTombstonesEq(it->Tombstone(), {"c", "d", 20}); - it->Seek("dd"); - VerifyTombstonesEq(it->Tombstone(), {"e", "f", 20}); - it->Seek("f"); - VerifyTombstonesEq(it->Tombstone(), {"f", "g", 10}); - it->Seek("g"); - ASSERT_EQ(it->Valid(), false); - it->Seek("h"); - ASSERT_EQ(it->Valid(), false); - - // Verify iteration after seek. - it->Seek("c"); - VerifyRangeDelIter(it.get(), - {{"c", "d", 20}, {"e", "f", 20}, {"f", "g", 10}}); -} + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, TruncateTombstones) { - const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest("e", kMaxSequenceNumber, 
kTypeRangeDeletion); - VerifyRangeDels( - {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, - {{"a", 10, true}, // truncated - {"b", 10, false}, // not truncated - {"d", 10, false}, // not truncated - {"e", 10, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}); -} + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, + {InternalValue("a", 9), true}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), false}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "c", true}, + {"d", "f", true}, + {"g", "l", true}, + {"x", "y", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}}); + std::vector> iter_bounds = { + {InternalKey("a", 4, kTypeValue), + InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("m", 20, kTypeValue), + InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + for (size_t i = 0; i < fragment_lists.size(); i++) { + const auto& fragment_list = fragment_lists[i]; + const auto& bounds = iter_bounds[i]; + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter), &bounds.first, + &bounds.second); + } -TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateBelowTombstone) { - const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest( - "e", 3, // could happen if "e" is in consecutive sstables - kTypeValue); - 
VerifyRangeDels( - {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, - {{"a", 10, true}, // truncated - {"b", 10, false}, // not truncated - {"d", 10, false}, // not truncated - {"e", 10, false}, // not truncated - {"e", 2, true}}, // truncated here - {{"b", "c", 10}, {"d", "e", 10}}); -} + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, + {InternalValue("a", 9), false}, + {InternalValue("a", 4), true}, + {InternalValue("m", 10), false}, + {InternalValue("m", 9), true}, + {InternalValue("x", 10), false}, + {InternalValue("x", 9), false}, + {InternalValue("x", 5), true}, + {InternalValue("z", 9), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "n", true}, + {"l", "x", true}, + {"w", "z", true}, + {"zzz", "zz", false}, + {"zz", "zzz", false}}); +} + +TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}}); + std::vector> iter_bounds = { + {InternalKey("a", 4, kTypeValue), + InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("m", 20, kTypeValue), + InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, + {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; + + ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19); + + auto add_iter_to_agg = [&](size_t i) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_lists[i].get(), + bytewise_icmp, 19 /* snapshot */)); + range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first, + &iter_bounds[i].second); + }; + + add_iter_to_agg(0); + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, + {InternalValue("a", 9), false}, + {InternalValue("a", 4), true}}); + + add_iter_to_agg(1); + VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false}, + {InternalValue("m", 9), true}}); + + add_iter_to_agg(2); + 
VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false}, + {InternalValue("x", 9), false}, + {InternalValue("x", 5), true}, + {InternalValue("z", 9), false}}); + + VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, + {"_", "a", true}, + {"a", "n", true}, + {"l", "x", true}, + {"w", "z", true}, + {"zzz", "zz", false}, + {"zz", "zzz", false}}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, OverlappingLargestKeyTruncateAboveTombstone) { - const InternalKey smallest("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest( - "e", 15, // could happen if "e" is in consecutive sstables - kTypeValue); - VerifyRangeDels( - {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, - {{"a", 10, true}, // truncated - {"b", 10, false}, // not truncated - {"d", 10, false}, // not truncated - {"e", kMaxSequenceNumber, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}); -} + VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, + {InternalValue("b", 19), false}, + {InternalValue("b", 9), true}, + {InternalValue("d", 9), true}, + {InternalValue("e", 7), true}, + {InternalValue("g", 7), false}, + {InternalValue("h", 24), true}, + {InternalValue("i", 24), false}, + {InternalValue("ii", 14), true}, + {InternalValue("j", 14), false}}); + + auto range_del_compaction_iter = range_del_agg.NewIterator(); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, + {"b", "c", 10}, + 
{"c", "e", 10}, + {"e", "g", 8}, + {"h", "i", 25}, + {"ii", "j", 15}}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateBelowTombstone) { - const InternalKey smallest("b", 5, kTypeValue); - const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); - VerifyRangeDels( - {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, - {{"a", 10, true}, // truncated - {"b", 10, true}, // truncated - {"b", 6, false}, // not truncated; start boundary moved - {"d", 10, false}, // not truncated - {"e", kMaxSequenceNumber, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}); -} + VerifyShouldDelete( + &range_del_agg, + { + {InternalValue("a", 19), false}, // [10, 19] + {InternalValue("a", 9), false}, // [0, 9] + {InternalValue("b", 9), false}, // [0, 9] + {InternalValue("d", 9), false}, // [0, 9] + {InternalValue("d", 7), true}, // [0, 9] + {InternalValue("e", 7), true}, // [0, 9] + {InternalValue("g", 7), false}, // [0, 9] + {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber] + {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber] + {InternalValue("ii", 14), true}, // [10, 19] + {InternalValue("j", 14), false} // [10, 19] + }); + + auto range_del_compaction_iter = range_del_agg.NewIterator(); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, + {"a", "b", 10}, + {"b", "c", 10}, + {"c", "e", 10}, + {"c", "e", 8}, + {"e", "g", 8}, + {"h", "i", 25}, + 
{"ii", "j", 15}}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, OverlappingSmallestKeyTruncateAboveTombstone) { - const InternalKey smallest("b", 15, kTypeValue); - const InternalKey largest("e", kMaxSequenceNumber, kTypeRangeDeletion); - VerifyRangeDels( - {{{{"a", "c", 10}, {"d", "f", 10}}, &smallest, &largest}}, - {{"a", 10, true}, // truncated - {"b", 15, true}, // truncated - {"b", 10, false}, // not truncated - {"d", 10, false}, // not truncated - {"e", kMaxSequenceNumber, true}}, // truncated - {{"b", "c", 10}, {"d", "e", 10}}); + Slice start("_"); + Slice end("__"); } -TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapAboveTombstone) { - const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest1("c", 20, kTypeValue); - const InternalKey smallest2("c", 10, kTypeValue); - const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); - VerifyRangeDels( - {{{{"b", "d", 5}}, &smallest1, &largest1}, - {{{"b", "d", 5}}, &smallest2, &largest2}}, - {{"b", 5, false}, // not truncated - {"c", 5, false}}, // not truncated - {{"b", "c", 5}, {"c", "d", 5}}); // not collapsed due to boundaries -} +TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); -TEST_F(RangeDelAggregatorTest, 
OverlappingBoundaryGapBelowTombstone) { - const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest1("c", 20, kTypeValue); - const InternalKey smallest2("c", 10, kTypeValue); - const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); - VerifyRangeDels( - {{{{"b", "d", 30}}, &smallest1, &largest1}, - {{{"b", "d", 30}}, &smallest2, &largest2}}, - {{"b", 30, false}, // not truncated - {"c", 30, false}, // not truncated - {"c", 19, true}, // truncated here (keys in this range should not exist) - {"c", 11, false}}, // not truncated again - {{"b", "c", 30}, {"c", "d", 30}}); // not collapsed due to boundaries -} + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, OverlappingBoundaryGapContainsTombstone) { - const InternalKey smallest1("b", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest1("c", 20, kTypeValue); - const InternalKey smallest2("c", 10, kTypeValue); - const InternalKey largest2("e", kMaxSequenceNumber, kTypeRangeDeletion); - VerifyRangeDels( - {{{{"b", "d", 15}}, &smallest1, &largest1}, - {{{"b", "d", 15}}, &smallest2, &largest2}}, - {{"b", 15, false}, // not truncated - {"c", 15, true}, // truncated (keys in this range should not exist) - {"c", 11, false}}, // not truncated here - {{"b", "c", 15}, {"c", "d", 15}}); // not collapsed due to boundaries -} + Slice start("p"); + Slice end("q"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* 
end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {}); +} + +TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "e", 10}, {"c", "g", 8}}, + {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneAbove) { - const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest("a", 20, kTypeValue); - VerifyRangeDels( - {{{{"a", "b", 35}}, &smallest, &largest}}, - {{"a", 40, true}, // not truncated - {"a", 35, false}}, // not truncated - {{"a", "a", 35}}); // empty tombstone but can't occur during a compaction -} + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels( + range_del_compaction_iter2.get(), + {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}}); +} + +TEST_F(RangeDelAggregatorTest, + CompactionAggregatorBoundedIteratorExtraFragments) { + auto fragment_lists = MakeFragmentedTombstoneLists( + {{{"a", "d", 10}, {"c", "g", 8}}, + {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}}); + + std::vector snapshots{9, 19}; + CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots); + for (const auto& fragment_list : fragment_lists) { + 
std::unique_ptr input_iter( + new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, + kMaxSequenceNumber)); + range_del_agg.AddTombstones(std::move(input_iter)); + } -TEST_F(RangeDelAggregatorTest, FileCoversOneKeyAndTombstoneBelow) { - const InternalKey smallest("a", kMaxSequenceNumber, kTypeRangeDeletion); - const InternalKey largest("a", 20, kTypeValue); - VerifyRangeDels( - {{{{"a", "b", 15}}, &smallest, &largest}}, - {{"a", 20, true}, // truncated here - {"a", 15, true}}, // truncated - {{"a", "a", 15}}); // empty tombstone but can't occur during a compaction + Slice start("bb"); + Slice end("e"); + auto range_del_compaction_iter1 = + range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); + + auto range_del_compaction_iter2 = + range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); + VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); } } // namespace rocksdb diff --git a/db/range_del_aggregator_v2.cc b/db/range_del_aggregator_v2.cc deleted file mode 100644 index b0667f6fd..000000000 --- a/db/range_del_aggregator_v2.cc +++ /dev/null @@ -1,492 +0,0 @@ -// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#include "db/range_del_aggregator_v2.h" - -#include "db/compaction_iteration_stats.h" -#include "db/dbformat.h" -#include "db/pinned_iterators_manager.h" -#include "db/range_del_aggregator.h" -#include "db/range_tombstone_fragmenter.h" -#include "db/version_edit.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/types.h" -#include "table/internal_iterator.h" -#include "table/scoped_arena_iterator.h" -#include "table/table_builder.h" -#include "util/heap.h" -#include "util/kv_map.h" -#include "util/vector_iterator.h" - -namespace rocksdb { - -TruncatedRangeDelIterator::TruncatedRangeDelIterator( - std::unique_ptr iter, - const InternalKeyComparator* icmp, const InternalKey* smallest, - const InternalKey* largest) - : iter_(std::move(iter)), - icmp_(icmp), - smallest_ikey_(smallest), - largest_ikey_(largest) { - if (smallest != nullptr) { - pinned_bounds_.emplace_back(); - auto& parsed_smallest = pinned_bounds_.back(); - if (!ParseInternalKey(smallest->Encode(), &parsed_smallest)) { - assert(false); - } - smallest_ = &parsed_smallest; - } - if (largest != nullptr) { - pinned_bounds_.emplace_back(); - auto& parsed_largest = pinned_bounds_.back(); - if (!ParseInternalKey(largest->Encode(), &parsed_largest)) { - assert(false); - } - if (parsed_largest.type == kTypeRangeDeletion && - parsed_largest.sequence == kMaxSequenceNumber) { - // The file boundary has been artificially extended by a range tombstone. - // We do not need to adjust largest to properly truncate range - // tombstones that extend past the boundary. - } else if (parsed_largest.sequence == 0) { - // The largest key in the sstable has a sequence number of 0. Since we - // guarantee that no internal keys with the same user key and sequence - // number can exist in a DB, we know that the largest key in this sstable - // cannot exist as the smallest key in the next sstable. 
This further - // implies that no range tombstone in this sstable covers largest; - // otherwise, the file boundary would have been artificially extended. - // - // Therefore, we will never truncate a range tombstone at largest, so we - // can leave it unchanged. - } else { - // The same user key may straddle two sstable boundaries. To ensure that - // the truncated end key can cover the largest key in this sstable, reduce - // its sequence number by 1. - parsed_largest.sequence -= 1; - } - largest_ = &parsed_largest; - } -} - -bool TruncatedRangeDelIterator::Valid() const { - return iter_->Valid() && - (smallest_ == nullptr || - icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) && - (largest_ == nullptr || - icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0); -} - -void TruncatedRangeDelIterator::Next() { iter_->TopNext(); } - -void TruncatedRangeDelIterator::Prev() { iter_->TopPrev(); } - -void TruncatedRangeDelIterator::InternalNext() { iter_->Next(); } - -// NOTE: target is a user key -void TruncatedRangeDelIterator::Seek(const Slice& target) { - if (largest_ != nullptr && - icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber, - kTypeRangeDeletion)) <= 0) { - iter_->Invalidate(); - return; - } - if (smallest_ != nullptr && - icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) { - iter_->Seek(smallest_->user_key); - return; - } - iter_->Seek(target); -} - -// NOTE: target is a user key -void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { - if (smallest_ != nullptr && - icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion), - *smallest_) < 0) { - iter_->Invalidate(); - return; - } - if (largest_ != nullptr && - icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) { - iter_->SeekForPrev(largest_->user_key); - return; - } - iter_->SeekForPrev(target); -} - -void TruncatedRangeDelIterator::SeekToFirst() { - if (smallest_ != nullptr) { - iter_->Seek(smallest_->user_key); - return; 
- } - iter_->SeekToTopFirst(); -} - -void TruncatedRangeDelIterator::SeekToLast() { - if (largest_ != nullptr) { - iter_->SeekForPrev(largest_->user_key); - return; - } - iter_->SeekToTopLast(); -} - -std::map> -TruncatedRangeDelIterator::SplitBySnapshot( - const std::vector& snapshots) { - using FragmentedIterPair = - std::pair>; - - auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots); - std::map> - split_truncated_iters; - std::for_each( - split_untruncated_iters.begin(), split_untruncated_iters.end(), - [&](FragmentedIterPair& iter_pair) { - std::unique_ptr truncated_iter( - new TruncatedRangeDelIterator(std::move(iter_pair.second), icmp_, - smallest_ikey_, largest_ikey_)); - split_truncated_iters.emplace(iter_pair.first, - std::move(truncated_iter)); - }); - return split_truncated_iters; -} - -ForwardRangeDelIterator::ForwardRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters) - : icmp_(icmp), - iters_(iters), - unused_idx_(0), - active_seqnums_(SeqMaxComparator()), - active_iters_(EndKeyMinComparator(icmp)), - inactive_iters_(StartKeyMinComparator(icmp)) {} - -bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { - assert(iters_ != nullptr); - // Move active iterators that end before parsed. - while (!active_iters_.empty() && - icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { - TruncatedRangeDelIterator* iter = PopActiveIter(); - do { - iter->Next(); - } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0); - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - // Move inactive iterators that start before parsed. 
- while (!inactive_iters_.empty() && - icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) { - TruncatedRangeDelIterator* iter = PopInactiveIter(); - while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) { - iter->Next(); - } - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - return active_seqnums_.empty() - ? false - : (*active_seqnums_.begin())->seq() > parsed.sequence; -} - -void ForwardRangeDelIterator::Invalidate() { - unused_idx_ = 0; - active_iters_.clear(); - active_seqnums_.clear(); - inactive_iters_.clear(); -} - -ReverseRangeDelIterator::ReverseRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters) - : icmp_(icmp), - iters_(iters), - unused_idx_(0), - active_seqnums_(SeqMaxComparator()), - active_iters_(StartKeyMaxComparator(icmp)), - inactive_iters_(EndKeyMaxComparator(icmp)) {} - -bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { - assert(iters_ != nullptr); - // Move active iterators that start after parsed. - while (!active_iters_.empty() && - icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { - TruncatedRangeDelIterator* iter = PopActiveIter(); - do { - iter->Prev(); - } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0); - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - // Move inactive iterators that end after parsed. - while (!inactive_iters_.empty() && - icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) { - TruncatedRangeDelIterator* iter = PopInactiveIter(); - while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) { - iter->Prev(); - } - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - return active_seqnums_.empty() - ? 
false - : (*active_seqnums_.begin())->seq() > parsed.sequence; -} - -void ReverseRangeDelIterator::Invalidate() { - unused_idx_ = 0; - active_iters_.clear(); - active_seqnums_.clear(); - inactive_iters_.clear(); -} - -bool RangeDelAggregatorV2::StripeRep::ShouldDelete( - const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { - if (!InStripe(parsed.sequence) || IsEmpty()) { - return false; - } - switch (mode) { - case RangeDelPositioningMode::kForwardTraversal: - InvalidateReverseIter(); - - // Pick up previously unseen iterators. - for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx()); - it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) { - auto& iter = *it; - forward_iter_.AddNewIter(iter.get(), parsed); - } - - return forward_iter_.ShouldDelete(parsed); - case RangeDelPositioningMode::kBackwardTraversal: - InvalidateForwardIter(); - - // Pick up previously unseen iterators. - for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx()); - it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) { - auto& iter = *it; - reverse_iter_.AddNewIter(iter.get(), parsed); - } - - return reverse_iter_.ShouldDelete(parsed); - default: - assert(false); - return false; - } -} - -bool RangeDelAggregatorV2::StripeRep::IsRangeOverlapped(const Slice& start, - const Slice& end) { - Invalidate(); - - // Set the internal start/end keys so that: - // - if start_ikey has the same user key and sequence number as the - // current end key, start_ikey will be considered greater; and - // - if end_ikey has the same user key and sequence number as the current - // start key, end_ikey will be considered greater. 
- ParsedInternalKey start_ikey(start, kMaxSequenceNumber, - static_cast(0)); - ParsedInternalKey end_ikey(end, 0, static_cast(0)); - for (auto& iter : iters_) { - bool checked_candidate_tombstones = false; - for (iter->SeekForPrev(start); - iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0; - iter->Next()) { - checked_candidate_tombstones = true; - if (icmp_->Compare(start_ikey, iter->end_key()) < 0 && - icmp_->Compare(iter->start_key(), end_ikey) <= 0) { - return true; - } - } - - if (!checked_candidate_tombstones) { - // Do an additional check for when the end of the range is the begin - // key of a tombstone, which we missed earlier since SeekForPrev'ing - // to the start was invalid. - iter->SeekForPrev(end); - if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 && - icmp_->Compare(iter->start_key(), end_ikey) <= 0) { - return true; - } - } - } - return false; -} - -void ReadRangeDelAggregatorV2::AddTombstones( - std::unique_ptr input_iter, - const InternalKey* smallest, const InternalKey* largest) { - if (input_iter == nullptr || input_iter->empty()) { - return; - } - rep_.AddTombstones( - std::unique_ptr(new TruncatedRangeDelIterator( - std::move(input_iter), icmp_, smallest, largest))); -} - -bool ReadRangeDelAggregatorV2::ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) { - return rep_.ShouldDelete(parsed, mode); -} - -bool ReadRangeDelAggregatorV2::IsRangeOverlapped(const Slice& start, - const Slice& end) { - InvalidateRangeDelMapPositions(); - return rep_.IsRangeOverlapped(start, end); -} - -void CompactionRangeDelAggregatorV2::AddTombstones( - std::unique_ptr input_iter, - const InternalKey* smallest, const InternalKey* largest) { - if (input_iter == nullptr || input_iter->empty()) { - return; - } - assert(input_iter->lower_bound() == 0); - assert(input_iter->upper_bound() == kMaxSequenceNumber); - parent_iters_.emplace_back(new TruncatedRangeDelIterator( - std::move(input_iter), icmp_, 
smallest, largest)); - - auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_); - for (auto& split_iter : split_iters) { - auto it = reps_.find(split_iter.first); - if (it == reps_.end()) { - bool inserted; - SequenceNumber upper_bound = split_iter.second->upper_bound(); - SequenceNumber lower_bound = split_iter.second->lower_bound(); - std::tie(it, inserted) = reps_.emplace( - split_iter.first, StripeRep(icmp_, upper_bound, lower_bound)); - assert(inserted); - } - assert(it != reps_.end()); - it->second.AddTombstones(std::move(split_iter.second)); - } -} - -bool CompactionRangeDelAggregatorV2::ShouldDelete( - const ParsedInternalKey& parsed, RangeDelPositioningMode mode) { - auto it = reps_.lower_bound(parsed.sequence); - if (it == reps_.end()) { - return false; - } - return it->second.ShouldDelete(parsed, mode); -} - -namespace { - -class TruncatedRangeDelMergingIter : public InternalIterator { - public: - TruncatedRangeDelMergingIter( - const InternalKeyComparator* icmp, const Slice* lower_bound, - const Slice* upper_bound, bool upper_bound_inclusive, - const std::vector>& children) - : icmp_(icmp), - lower_bound_(lower_bound), - upper_bound_(upper_bound), - upper_bound_inclusive_(upper_bound_inclusive), - heap_(StartKeyMinComparator(icmp)) { - for (auto& child : children) { - if (child != nullptr) { - assert(child->lower_bound() == 0); - assert(child->upper_bound() == kMaxSequenceNumber); - children_.push_back(child.get()); - } - } - } - - bool Valid() const override { - return !heap_.empty() && BeforeEndKey(heap_.top()); - } - Status status() const override { return Status::OK(); } - - void SeekToFirst() override { - heap_.clear(); - for (auto& child : children_) { - if (lower_bound_ != nullptr) { - child->Seek(*lower_bound_); - } else { - child->SeekToFirst(); - } - if (child->Valid()) { - heap_.push(child); - } - } - } - - void Next() override { - auto* top = heap_.top(); - top->InternalNext(); - if (top->Valid()) { - heap_.replace_top(top); 
- } else { - heap_.pop(); - } - } - - Slice key() const override { - auto* top = heap_.top(); - cur_start_key_.Set(top->start_key().user_key, top->seq(), - kTypeRangeDeletion); - return cur_start_key_.Encode(); - } - - Slice value() const override { - auto* top = heap_.top(); - assert(top->end_key().sequence == kMaxSequenceNumber); - return top->end_key().user_key; - } - - // Unused InternalIterator methods - void Prev() override { assert(false); } - void Seek(const Slice& /* target */) override { assert(false); } - void SeekForPrev(const Slice& /* target */) override { assert(false); } - void SeekToLast() override { assert(false); } - - private: - bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const { - if (upper_bound_ == nullptr) { - return true; - } - int cmp = icmp_->user_comparator()->Compare(iter->start_key().user_key, - *upper_bound_); - return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0; - } - - const InternalKeyComparator* icmp_; - const Slice* lower_bound_; - const Slice* upper_bound_; - bool upper_bound_inclusive_; - BinaryHeap heap_; - std::vector children_; - - mutable InternalKey cur_start_key_; -}; - -} // namespace - -std::unique_ptr -CompactionRangeDelAggregatorV2::NewIterator(const Slice* lower_bound, - const Slice* upper_bound, - bool upper_bound_inclusive) { - InvalidateRangeDelMapPositions(); - std::unique_ptr merging_iter( - new TruncatedRangeDelMergingIter(icmp_, lower_bound, upper_bound, - upper_bound_inclusive, parent_iters_)); - - // TODO: add tests where tombstone fragments can be outside of upper and lower - // bound range - auto fragmented_tombstone_list = - std::make_shared( - std::move(merging_iter), *icmp_, true /* for_compaction */, - *snapshots_); - - return std::unique_ptr( - new FragmentedRangeTombstoneIterator( - fragmented_tombstone_list, *icmp_, - kMaxSequenceNumber /* upper_bound */)); -} - -} // namespace rocksdb diff --git a/db/range_del_aggregator_v2.h b/db/range_del_aggregator_v2.h deleted file mode 100644 
index 306dbf249..000000000 --- a/db/range_del_aggregator_v2.h +++ /dev/null @@ -1,436 +0,0 @@ -// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "db/compaction_iteration_stats.h" -#include "db/dbformat.h" -#include "db/pinned_iterators_manager.h" -#include "db/range_del_aggregator.h" -#include "db/range_tombstone_fragmenter.h" -#include "db/version_edit.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/types.h" -#include "table/internal_iterator.h" -#include "table/scoped_arena_iterator.h" -#include "table/table_builder.h" -#include "util/heap.h" -#include "util/kv_map.h" - -namespace rocksdb { - -class TruncatedRangeDelIterator { - public: - TruncatedRangeDelIterator( - std::unique_ptr iter, - const InternalKeyComparator* icmp, const InternalKey* smallest, - const InternalKey* largest); - - bool Valid() const; - - void Next(); - void Prev(); - - void InternalNext(); - - // Seeks to the tombstone with the highest viisble sequence number that covers - // target (a user key). If no such tombstone exists, the position will be at - // the earliest tombstone that ends after target. - void Seek(const Slice& target); - - // Seeks to the tombstone with the highest viisble sequence number that covers - // target (a user key). If no such tombstone exists, the position will be at - // the latest tombstone that starts before target. - void SeekForPrev(const Slice& target); - - void SeekToFirst(); - void SeekToLast(); - - ParsedInternalKey start_key() const { - return (smallest_ == nullptr || - icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0) - ? 
iter_->parsed_start_key() - : *smallest_; - } - - ParsedInternalKey end_key() const { - return (largest_ == nullptr || - icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0) - ? iter_->parsed_end_key() - : *largest_; - } - - SequenceNumber seq() const { return iter_->seq(); } - - std::map> - SplitBySnapshot(const std::vector& snapshots); - - SequenceNumber upper_bound() const { return iter_->upper_bound(); } - - SequenceNumber lower_bound() const { return iter_->lower_bound(); } - - private: - std::unique_ptr iter_; - const InternalKeyComparator* icmp_; - const ParsedInternalKey* smallest_ = nullptr; - const ParsedInternalKey* largest_ = nullptr; - std::list pinned_bounds_; - - const InternalKey* smallest_ikey_; - const InternalKey* largest_ikey_; -}; - -struct SeqMaxComparator { - bool operator()(const TruncatedRangeDelIterator* a, - const TruncatedRangeDelIterator* b) const { - return a->seq() > b->seq(); - } -}; - -struct StartKeyMinComparator { - explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} - - bool operator()(const TruncatedRangeDelIterator* a, - const TruncatedRangeDelIterator* b) const { - return icmp->Compare(a->start_key(), b->start_key()) > 0; - } - - const InternalKeyComparator* icmp; -}; - -class ForwardRangeDelIterator { - public: - ForwardRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters); - - bool ShouldDelete(const ParsedInternalKey& parsed); - void Invalidate(); - - void AddNewIter(TruncatedRangeDelIterator* iter, - const ParsedInternalKey& parsed) { - iter->Seek(parsed.user_key); - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - size_t UnusedIdx() const { return unused_idx_; } - void IncUnusedIdx() { unused_idx_++; } - - private: - using ActiveSeqSet = - std::multiset; - - struct EndKeyMinComparator { - explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {} - - bool operator()(const ActiveSeqSet::const_iterator& a, - 
const ActiveSeqSet::const_iterator& b) const { - return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0; - } - - const InternalKeyComparator* icmp; - }; - - void PushIter(TruncatedRangeDelIterator* iter, - const ParsedInternalKey& parsed) { - if (!iter->Valid()) { - // The iterator has been fully consumed, so we don't need to add it to - // either of the heaps. - return; - } - int cmp = icmp_->Compare(parsed, iter->start_key()); - if (cmp < 0) { - PushInactiveIter(iter); - } else { - PushActiveIter(iter); - } - } - - void PushActiveIter(TruncatedRangeDelIterator* iter) { - auto seq_pos = active_seqnums_.insert(iter); - active_iters_.push(seq_pos); - } - - TruncatedRangeDelIterator* PopActiveIter() { - auto active_top = active_iters_.top(); - auto iter = *active_top; - active_iters_.pop(); - active_seqnums_.erase(active_top); - return iter; - } - - void PushInactiveIter(TruncatedRangeDelIterator* iter) { - inactive_iters_.push(iter); - } - - TruncatedRangeDelIterator* PopInactiveIter() { - auto* iter = inactive_iters_.top(); - inactive_iters_.pop(); - return iter; - } - - const InternalKeyComparator* icmp_; - const std::vector>* iters_; - size_t unused_idx_; - ActiveSeqSet active_seqnums_; - BinaryHeap active_iters_; - BinaryHeap inactive_iters_; -}; - -class ReverseRangeDelIterator { - public: - ReverseRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters); - - bool ShouldDelete(const ParsedInternalKey& parsed); - void Invalidate(); - - void AddNewIter(TruncatedRangeDelIterator* iter, - const ParsedInternalKey& parsed) { - iter->SeekForPrev(parsed.user_key); - PushIter(iter, parsed); - assert(active_iters_.size() == active_seqnums_.size()); - } - - size_t UnusedIdx() const { return unused_idx_; } - void IncUnusedIdx() { unused_idx_++; } - - private: - using ActiveSeqSet = - std::multiset; - - struct EndKeyMaxComparator { - explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} - - bool operator()(const 
TruncatedRangeDelIterator* a, - const TruncatedRangeDelIterator* b) const { - return icmp->Compare(a->end_key(), b->end_key()) < 0; - } - - const InternalKeyComparator* icmp; - }; - struct StartKeyMaxComparator { - explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {} - - bool operator()(const ActiveSeqSet::const_iterator& a, - const ActiveSeqSet::const_iterator& b) const { - return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0; - } - - const InternalKeyComparator* icmp; - }; - - void PushIter(TruncatedRangeDelIterator* iter, - const ParsedInternalKey& parsed) { - if (!iter->Valid()) { - // The iterator has been fully consumed, so we don't need to add it to - // either of the heaps. - } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) { - PushInactiveIter(iter); - } else { - PushActiveIter(iter); - } - } - - void PushActiveIter(TruncatedRangeDelIterator* iter) { - auto seq_pos = active_seqnums_.insert(iter); - active_iters_.push(seq_pos); - } - - TruncatedRangeDelIterator* PopActiveIter() { - auto active_top = active_iters_.top(); - auto iter = *active_top; - active_iters_.pop(); - active_seqnums_.erase(active_top); - return iter; - } - - void PushInactiveIter(TruncatedRangeDelIterator* iter) { - inactive_iters_.push(iter); - } - - TruncatedRangeDelIterator* PopInactiveIter() { - auto* iter = inactive_iters_.top(); - inactive_iters_.pop(); - return iter; - } - - const InternalKeyComparator* icmp_; - const std::vector>* iters_; - size_t unused_idx_; - ActiveSeqSet active_seqnums_; - BinaryHeap active_iters_; - BinaryHeap inactive_iters_; -}; - -class RangeDelAggregatorV2 { - public: - explicit RangeDelAggregatorV2(const InternalKeyComparator* icmp) - : icmp_(icmp) {} - virtual ~RangeDelAggregatorV2() {} - - virtual void AddTombstones( - std::unique_ptr input_iter, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) = 0; - - bool ShouldDelete(const Slice& key, RangeDelPositioningMode mode) { - 
ParsedInternalKey parsed; - if (!ParseInternalKey(key, &parsed)) { - return false; - } - return ShouldDelete(parsed, mode); - } - virtual bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) = 0; - - virtual void InvalidateRangeDelMapPositions() = 0; - - virtual bool IsEmpty() const = 0; - - bool AddFile(uint64_t file_number) { - return files_seen_.insert(file_number).second; - } - - protected: - class StripeRep { - public: - StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, - SequenceNumber lower_bound) - : icmp_(icmp), - forward_iter_(icmp, &iters_), - reverse_iter_(icmp, &iters_), - upper_bound_(upper_bound), - lower_bound_(lower_bound) {} - - void AddTombstones(std::unique_ptr input_iter) { - iters_.push_back(std::move(input_iter)); - } - - bool IsEmpty() const { return iters_.empty(); } - - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode); - - void Invalidate() { - InvalidateForwardIter(); - InvalidateReverseIter(); - } - - bool IsRangeOverlapped(const Slice& start, const Slice& end); - - private: - bool InStripe(SequenceNumber seq) const { - return lower_bound_ <= seq && seq <= upper_bound_; - } - - void InvalidateForwardIter() { forward_iter_.Invalidate(); } - - void InvalidateReverseIter() { reverse_iter_.Invalidate(); } - - const InternalKeyComparator* icmp_; - std::vector> iters_; - ForwardRangeDelIterator forward_iter_; - ReverseRangeDelIterator reverse_iter_; - SequenceNumber upper_bound_; - SequenceNumber lower_bound_; - }; - - const InternalKeyComparator* icmp_; - - private: - std::set files_seen_; -}; - -class ReadRangeDelAggregatorV2 : public RangeDelAggregatorV2 { - public: - ReadRangeDelAggregatorV2(const InternalKeyComparator* icmp, - SequenceNumber upper_bound) - : RangeDelAggregatorV2(icmp), - rep_(icmp, upper_bound, 0 /* lower_bound */) {} - ~ReadRangeDelAggregatorV2() override {} - - using RangeDelAggregatorV2::ShouldDelete; - void AddTombstones( - 
std::unique_ptr input_iter, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) override; - - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) override; - - bool IsRangeOverlapped(const Slice& start, const Slice& end); - - void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); } - - bool IsEmpty() const override { return rep_.IsEmpty(); } - - private: - StripeRep rep_; -}; - -class CompactionRangeDelAggregatorV2 : public RangeDelAggregatorV2 { - public: - CompactionRangeDelAggregatorV2(const InternalKeyComparator* icmp, - const std::vector& snapshots) - : RangeDelAggregatorV2(icmp), snapshots_(&snapshots) {} - ~CompactionRangeDelAggregatorV2() override {} - - void AddTombstones( - std::unique_ptr input_iter, - const InternalKey* smallest = nullptr, - const InternalKey* largest = nullptr) override; - - using RangeDelAggregatorV2::ShouldDelete; - bool ShouldDelete(const ParsedInternalKey& parsed, - RangeDelPositioningMode mode) override; - - bool IsRangeOverlapped(const Slice& start, const Slice& end); - - void InvalidateRangeDelMapPositions() override { - for (auto& rep : reps_) { - rep.second.Invalidate(); - } - } - - bool IsEmpty() const override { - for (const auto& rep : reps_) { - if (!rep.second.IsEmpty()) { - return false; - } - } - return true; - } - - // Creates an iterator over all the range tombstones in the aggregator, for - // use in compaction. Nullptr arguments indicate that the iterator range is - // unbounded. - // NOTE: the boundaries are used for optimization purposes to reduce the - // number of tombstones that are passed to the fragmenter; they do not - // guarantee that the resulting iterator only contains range tombstones that - // cover keys in the provided range. If required, these bounds must be - // enforced during iteration. 
- std::unique_ptr NewIterator( - const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, - bool upper_bound_inclusive = false); - - private: - std::vector> parent_iters_; - std::map reps_; - - const std::vector* snapshots_; -}; - -} // namespace rocksdb diff --git a/db/range_del_aggregator_v2_test.cc b/db/range_del_aggregator_v2_test.cc deleted file mode 100644 index 64f8ed079..000000000 --- a/db/range_del_aggregator_v2_test.cc +++ /dev/null @@ -1,709 +0,0 @@ -// Copyright (c) 2018-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -#include "db/range_del_aggregator_v2.h" - -#include -#include -#include - -#include "db/db_test_util.h" -#include "db/dbformat.h" -#include "db/range_tombstone_fragmenter.h" -#include "util/testutil.h" - -namespace rocksdb { - -class RangeDelAggregatorV2Test : public testing::Test {}; - -namespace { - -static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator()); - -std::unique_ptr MakeRangeDelIter( - const std::vector& range_dels) { - std::vector keys, values; - for (const auto& range_del : range_dels) { - auto key_and_value = range_del.Serialize(); - keys.push_back(key_and_value.first.Encode().ToString()); - values.push_back(key_and_value.second.ToString()); - } - return std::unique_ptr( - new test::VectorIterator(keys, values)); -} - -std::vector> -MakeFragmentedTombstoneLists( - const std::vector>& range_dels_list) { - std::vector> fragment_lists; - for (const auto& range_dels : range_dels_list) { - auto range_del_iter = MakeRangeDelIter(range_dels); - fragment_lists.emplace_back(new FragmentedRangeTombstoneList( - std::move(range_del_iter), bytewise_icmp)); - } - return fragment_lists; -} - -struct TruncatedIterScanTestCase { - ParsedInternalKey start; - ParsedInternalKey end; - SequenceNumber seq; -}; - -struct 
TruncatedIterSeekTestCase { - Slice target; - ParsedInternalKey start; - ParsedInternalKey end; - SequenceNumber seq; - bool invalid; -}; - -struct ShouldDeleteTestCase { - ParsedInternalKey lookup_key; - bool result; -}; - -struct IsRangeOverlappedTestCase { - Slice start; - Slice end; - bool result; -}; - -ParsedInternalKey UncutEndpoint(const Slice& s) { - return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion); -} - -ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq) { - return ParsedInternalKey(key, seq, kTypeValue); -} - -void VerifyIterator( - TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, - const std::vector& expected_range_dels) { - // Test forward iteration. - iter->SeekToFirst(); - for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) { - ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start)); - EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end)); - EXPECT_EQ(expected_range_dels[i].seq, iter->seq()); - } - EXPECT_FALSE(iter->Valid()); - - // Test reverse iteration. 
- iter->SeekToLast(); - std::vector reverse_expected_range_dels( - expected_range_dels.rbegin(), expected_range_dels.rend()); - for (size_t i = 0; i < reverse_expected_range_dels.size(); - i++, iter->Prev()) { - ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(0, icmp.Compare(iter->start_key(), - reverse_expected_range_dels[i].start)); - EXPECT_EQ( - 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end)); - EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq()); - } - EXPECT_FALSE(iter->Valid()); -} - -void VerifySeek(TruncatedRangeDelIterator* iter, - const InternalKeyComparator& icmp, - const std::vector& test_cases) { - for (const auto& test_case : test_cases) { - iter->Seek(test_case.target); - if (test_case.invalid) { - ASSERT_FALSE(iter->Valid()); - } else { - ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); - EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); - EXPECT_EQ(test_case.seq, iter->seq()); - } - } -} - -void VerifySeekForPrev( - TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp, - const std::vector& test_cases) { - for (const auto& test_case : test_cases) { - iter->SeekForPrev(test_case.target); - if (test_case.invalid) { - ASSERT_FALSE(iter->Valid()); - } else { - ASSERT_TRUE(iter->Valid()); - EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start)); - EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end)); - EXPECT_EQ(test_case.seq, iter->seq()); - } - } -} - -void VerifyShouldDelete(RangeDelAggregatorV2* range_del_agg, - const std::vector& test_cases) { - for (const auto& test_case : test_cases) { - EXPECT_EQ( - test_case.result, - range_del_agg->ShouldDelete( - test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal)); - } - for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) { - const auto& test_case = *it; - EXPECT_EQ( - test_case.result, - range_del_agg->ShouldDelete( - test_case.lookup_key, 
RangeDelPositioningMode::kBackwardTraversal)); - } -} - -void VerifyIsRangeOverlapped( - ReadRangeDelAggregatorV2* range_del_agg, - const std::vector& test_cases) { - for (const auto& test_case : test_cases) { - EXPECT_EQ(test_case.result, - range_del_agg->IsRangeOverlapped(test_case.start, test_case.end)); - } -} - -void CheckIterPosition(const RangeTombstone& tombstone, - const FragmentedRangeTombstoneIterator* iter) { - // Test InternalIterator interface. - EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key())); - EXPECT_EQ(tombstone.end_key_, iter->value()); - EXPECT_EQ(tombstone.seq_, iter->seq()); - - // Test FragmentedRangeTombstoneIterator interface. - EXPECT_EQ(tombstone.start_key_, iter->start_key()); - EXPECT_EQ(tombstone.end_key_, iter->end_key()); - EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key())); -} - -void VerifyFragmentedRangeDels( - FragmentedRangeTombstoneIterator* iter, - const std::vector& expected_tombstones) { - iter->SeekToFirst(); - for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) { - ASSERT_TRUE(iter->Valid()); - CheckIterPosition(expected_tombstones[i], iter); - } - EXPECT_FALSE(iter->Valid()); -} - -} // namespace - -TEST_F(RangeDelAggregatorV2Test, EmptyTruncatedIter) { - auto range_del_iter = MakeRangeDelIter({}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - kMaxSequenceNumber)); - - TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, - nullptr); - - iter.SeekToFirst(); - ASSERT_FALSE(iter.Valid()); - - iter.SeekToLast(); - ASSERT_FALSE(iter.Valid()); -} - -TEST_F(RangeDelAggregatorV2Test, UntruncatedIter) { - auto range_del_iter = - MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr input_iter( - new 
FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - kMaxSequenceNumber)); - - TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, - nullptr); - - VerifyIterator(&iter, bytewise_icmp, - {{UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); - - VerifySeek( - &iter, bytewise_icmp, - {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}}); - - VerifySeekForPrev( - &iter, bytewise_icmp, - {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); -} - -TEST_F(RangeDelAggregatorV2Test, UntruncatedIterWithSnapshot) { - auto range_del_iter = - MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - 9 /* snapshot */)); - - TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, - nullptr); - - VerifyIterator(&iter, bytewise_icmp, - {{UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); - - VerifySeek( - &iter, bytewise_icmp, - {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}}); - - VerifySeekForPrev( - &iter, bytewise_icmp, - 
{{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); -} - -TEST_F(RangeDelAggregatorV2Test, TruncatedIterPartiallyCutTombstones) { - auto range_del_iter = - MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - kMaxSequenceNumber)); - - InternalKey smallest("d", 7, kTypeValue); - InternalKey largest("m", 9, kTypeValue); - TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, - &smallest, &largest); - - VerifyIterator(&iter, bytewise_icmp, - {{InternalValue("d", 7), UncutEndpoint("e"), 10}, - {UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), InternalValue("m", 8), 4}}); - - VerifySeek( - &iter, bytewise_icmp, - {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), InternalValue("m", 8), 4}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", InternalValue("d", 7), UncutEndpoint("e"), 10}}); - - VerifySeekForPrev( - &iter, bytewise_icmp, - {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), InternalValue("m", 8), 4}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); -} - -TEST_F(RangeDelAggregatorV2Test, TruncatedIterFullyCutTombstones) { - auto range_del_iter = - MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr 
input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - kMaxSequenceNumber)); - - InternalKey smallest("f", 7, kTypeValue); - InternalKey largest("i", 9, kTypeValue); - TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, - &smallest, &largest); - - VerifyIterator(&iter, bytewise_icmp, - {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); - - VerifySeek( - &iter, bytewise_icmp, - {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); - - VerifySeekForPrev( - &iter, bytewise_icmp, - {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); -} - -TEST_F(RangeDelAggregatorV2Test, SingleIterInAggregator) { - auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}}); - FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter), - bytewise_icmp); - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp, - kMaxSequenceNumber)); - - ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); - range_del_agg.AddTombstones(std::move(input_iter)); - - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, - {InternalValue("b", 9), true}, - {InternalValue("d", 9), true}, - {InternalValue("e", 7), true}, - {InternalValue("g", 7), false}}); - - VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, - {"_", "a", true}, - {"a", "c", true}, - {"d", "f", true}, - {"g", "l", false}}); -} - -TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregator) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, kMaxSequenceNumber); - for (const auto& 
fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, - {InternalValue("b", 19), false}, - {InternalValue("b", 9), true}, - {InternalValue("d", 9), true}, - {InternalValue("e", 7), true}, - {InternalValue("g", 7), false}, - {InternalValue("h", 24), true}, - {InternalValue("i", 24), false}, - {InternalValue("ii", 14), true}, - {InternalValue("j", 14), false}}); - - VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, - {"_", "a", true}, - {"a", "c", true}, - {"d", "f", true}, - {"g", "l", true}, - {"x", "y", false}}); -} - -TEST_F(RangeDelAggregatorV2Test, MultipleItersInAggregatorWithUpperBound) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - 19 /* snapshot */)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false}, - {InternalValue("a", 9), true}, - {InternalValue("b", 9), true}, - {InternalValue("d", 9), true}, - {InternalValue("e", 7), true}, - {InternalValue("g", 7), false}, - {InternalValue("h", 24), false}, - {InternalValue("i", 24), false}, - {InternalValue("ii", 14), true}, - {InternalValue("j", 14), false}}); - - VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, - {"_", "a", true}, - {"a", "c", true}, - {"d", "f", true}, - {"g", "l", true}, - {"x", "y", false}}); -} - -TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregator) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "z", 10}}, 
{{"a", "z", 10}}, {{"a", "z", 10}}}); - std::vector> iter_bounds = { - {InternalKey("a", 4, kTypeValue), - InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, - {InternalKey("m", 20, kTypeValue), - InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, - {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; - - ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); - for (size_t i = 0; i < fragment_lists.size(); i++) { - const auto& fragment_list = fragment_lists[i]; - const auto& bounds = iter_bounds[i]; - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - 19 /* snapshot */)); - range_del_agg.AddTombstones(std::move(input_iter), &bounds.first, - &bounds.second); - } - - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, - {InternalValue("a", 9), false}, - {InternalValue("a", 4), true}, - {InternalValue("m", 10), false}, - {InternalValue("m", 9), true}, - {InternalValue("x", 10), false}, - {InternalValue("x", 9), false}, - {InternalValue("x", 5), true}, - {InternalValue("z", 9), false}}); - - VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, - {"_", "a", true}, - {"a", "n", true}, - {"l", "x", true}, - {"w", "z", true}, - {"zzz", "zz", false}, - {"zz", "zzz", false}}); -} - -TEST_F(RangeDelAggregatorV2Test, MultipleTruncatedItersInAggregatorSameLevel) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}}); - std::vector> iter_bounds = { - {InternalKey("a", 4, kTypeValue), - InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)}, - {InternalKey("m", 20, kTypeValue), - InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)}, - {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}}; - - ReadRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, 19); - - auto add_iter_to_agg = [&](size_t i) { - std::unique_ptr input_iter( - new 
FragmentedRangeTombstoneIterator(fragment_lists[i].get(), - bytewise_icmp, 19 /* snapshot */)); - range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first, - &iter_bounds[i].second); - }; - - add_iter_to_agg(0); - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false}, - {InternalValue("a", 9), false}, - {InternalValue("a", 4), true}}); - - add_iter_to_agg(1); - VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false}, - {InternalValue("m", 9), true}}); - - add_iter_to_agg(2); - VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false}, - {InternalValue("x", 9), false}, - {InternalValue("x", 5), true}, - {InternalValue("z", 9), false}}); - - VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false}, - {"_", "a", true}, - {"a", "n", true}, - {"l", "x", true}, - {"w", "z", true}, - {"zzz", "zz", false}, - {"zz", "zzz", false}}); -} - -TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorNoSnapshots) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true}, - {InternalValue("b", 19), false}, - {InternalValue("b", 9), true}, - {InternalValue("d", 9), true}, - {InternalValue("e", 7), true}, - {InternalValue("g", 7), false}, - {InternalValue("h", 24), true}, - {InternalValue("i", 24), false}, - {InternalValue("ii", 14), true}, - {InternalValue("j", 14), false}}); - - auto range_del_compaction_iter = range_del_agg.NewIterator(); - VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, - {"b", "c", 10}, - 
{"c", "e", 10}, - {"e", "g", 8}, - {"h", "i", 25}, - {"ii", "j", 15}}); -} - -TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorWithSnapshots) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots{9, 19}; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - VerifyShouldDelete( - &range_del_agg, - { - {InternalValue("a", 19), false}, // [10, 19] - {InternalValue("a", 9), false}, // [0, 9] - {InternalValue("b", 9), false}, // [0, 9] - {InternalValue("d", 9), false}, // [0, 9] - {InternalValue("d", 7), true}, // [0, 9] - {InternalValue("e", 7), true}, // [0, 9] - {InternalValue("g", 7), false}, // [0, 9] - {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber] - {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber] - {InternalValue("ii", 14), true}, // [10, 19] - {InternalValue("j", 14), false} // [10, 19] - }); - - auto range_del_compaction_iter = range_del_agg.NewIterator(); - VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20}, - {"a", "b", 10}, - {"b", "c", 10}, - {"c", "e", 10}, - {"c", "e", 8}, - {"e", "g", 8}, - {"h", "i", 25}, - {"ii", "j", 15}}); -} - -TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorEmptyIteratorLeft) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots{9, 19}; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - 
kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - Slice start("_"); - Slice end("__"); -} - -TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorEmptyIteratorRight) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots{9, 19}; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - Slice start("p"); - Slice end("q"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {}); - - auto range_del_compaction_iter2 = - range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {}); -} - -TEST_F(RangeDelAggregatorV2Test, CompactionAggregatorBoundedIterator) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "e", 10}, {"c", "g", 8}}, - {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots{9, 19}; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - Slice start("bb"); - Slice end("e"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), - {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}}); - - auto range_del_compaction_iter2 = - 
range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - VerifyFragmentedRangeDels( - range_del_compaction_iter2.get(), - {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}}); -} - -TEST_F(RangeDelAggregatorV2Test, - CompactionAggregatorBoundedIteratorExtraFragments) { - auto fragment_lists = MakeFragmentedTombstoneLists( - {{{"a", "d", 10}, {"c", "g", 8}}, - {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}}); - - std::vector snapshots{9, 19}; - CompactionRangeDelAggregatorV2 range_del_agg(&bytewise_icmp, snapshots); - for (const auto& fragment_list : fragment_lists) { - std::unique_ptr input_iter( - new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp, - kMaxSequenceNumber)); - range_del_agg.AddTombstones(std::move(input_iter)); - } - - Slice start("bb"); - Slice end("e"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, - {"b", "c", 20}, - {"b", "c", 10}, - {"c", "d", 10}, - {"c", "d", 8}, - {"d", "f", 30}, - {"d", "f", 8}, - {"f", "g", 8}}); - - auto range_del_compaction_iter2 = - range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, - {"b", "c", 20}, - {"b", "c", 10}, - {"c", "d", 10}, - {"c", "d", 8}, - {"d", "f", 30}, - {"d", "f", 8}, - {"f", "g", 8}}); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/db/table_cache.cc b/db/table_cache.cc index 829f5b21f..5c0f95716 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -185,7 +185,7 @@ Status TableCache::FindTable(const EnvOptions& env_options, InternalIterator* TableCache::NewIterator( const ReadOptions& options, const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileMetaData& 
file_meta, - RangeDelAggregatorV2* range_del_agg, const SliceTransform* prefix_extractor, + RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, bool for_compaction, Arena* arena, bool skip_filters, int level, const InternalKey* smallest_compaction_key, diff --git a/db/table_cache.h b/db/table_cache.h index 04485c4dc..e3936ab44 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -15,7 +15,7 @@ #include #include "db/dbformat.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "options/cf_options.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -52,7 +52,7 @@ class TableCache { InternalIterator* NewIterator( const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, RangeDelAggregatorV2* range_del_agg, + const FileMetaData& file_meta, RangeDelAggregator* range_del_agg, const SliceTransform* prefix_extractor = nullptr, TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, diff --git a/db/version_set.cc b/db/version_set.cc index ad5f898d0..cb7b08db0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -459,7 +459,7 @@ class LevelIterator final : public InternalIterator { const EnvOptions& env_options, const InternalKeyComparator& icomparator, const LevelFilesBrief* flevel, const SliceTransform* prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, bool for_compaction, - bool skip_filters, int level, RangeDelAggregatorV2* range_del_agg, + bool skip_filters, int level, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr) : table_cache_(table_cache), @@ -571,7 +571,7 @@ class LevelIterator final : public InternalIterator { bool skip_filters_; size_t file_index_; int level_; - RangeDelAggregatorV2* range_del_agg_; + RangeDelAggregator* range_del_agg_; 
IteratorWrapper file_iter_; // May be nullptr PinnedIteratorsManager* pinned_iters_mgr_; @@ -985,7 +985,7 @@ double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel( void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder, - RangeDelAggregatorV2* range_del_agg) { + RangeDelAggregator* range_del_agg) { assert(storage_info_.finalized_); for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { @@ -998,7 +998,7 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder, int level, - RangeDelAggregatorV2* range_del_agg) { + RangeDelAggregator* range_del_agg) { assert(storage_info_.finalized_); if (level >= storage_info_.num_non_empty_levels()) { // This is an empty level @@ -1057,8 +1057,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, Arena arena; Status status; - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); *overlap = false; @@ -4253,7 +4253,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } InternalIterator* VersionSet::MakeInputIterator( - const Compaction* c, RangeDelAggregatorV2* range_del_agg, + const Compaction* c, RangeDelAggregator* range_del_agg, const EnvOptions& env_options_compactions) { auto cfd = c->column_family_data(); ReadOptions read_options; diff --git a/db/version_set.h b/db/version_set.h index ec9084beb..b50f653ba 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -34,7 +34,7 @@ #include "db/dbformat.h" #include "db/file_indexer.h" #include "db/log_reader.h" -#include "db/range_del_aggregator_v2.h" +#include "db/range_del_aggregator.h" #include "db/read_callback.h" #include "db/table_cache.h" #include "db/version_builder.h" @@ -538,11 +538,11 @@ class Version { // REQUIRES: This version has been 
saved (see VersionSet::SaveTo) void AddIterators(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - RangeDelAggregatorV2* range_del_agg); + RangeDelAggregator* range_del_agg); void AddIteratorsForLevel(const ReadOptions&, const EnvOptions& soptions, MergeIteratorBuilder* merger_iter_builder, - int level, RangeDelAggregatorV2* range_del_agg); + int level, RangeDelAggregator* range_del_agg); Status OverlapWithLevelIterator(const ReadOptions&, const EnvOptions&, const Slice& smallest_user_key, @@ -935,7 +935,7 @@ class VersionSet { // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. InternalIterator* MakeInputIterator( - const Compaction* c, RangeDelAggregatorV2* range_del_agg, + const Compaction* c, RangeDelAggregator* range_del_agg, const EnvOptions& env_options_compactions); // Add all files listed in any live version to *live. diff --git a/src.mk b/src.mk index 5ba7f4b7c..990aa2ab7 100644 --- a/src.mk +++ b/src.mk @@ -44,7 +44,6 @@ LIB_SOURCES = \ db/merge_helper.cc \ db/merge_operator.cc \ db/range_del_aggregator.cc \ - db/range_del_aggregator_v2.cc \ db/range_tombstone_fragmenter.cc \ db/repair.cc \ db/snapshot_impl.cc \ @@ -334,7 +333,6 @@ MAIN_SOURCES = \ db/repair_test.cc \ db/range_del_aggregator_test.cc \ db/range_del_aggregator_bench.cc \ - db/range_del_aggregator_v2_test.cc \ db/range_tombstone_fragmenter_test.cc \ db/table_properties_collector_test.cc \ db/util_merge_operators_test.cc \ diff --git a/utilities/debug.cc b/utilities/debug.cc index 3dfde980e..72fcbf0f5 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -19,8 +19,8 @@ Status GetAllKeyVersions(DB* db, Slice begin_key, Slice end_key, DBImpl* idb = static_cast(db->GetRootDB()); auto icmp = InternalKeyComparator(idb->GetOptions().comparator); - ReadRangeDelAggregatorV2 range_del_agg(&icmp, - kMaxSequenceNumber /* upper_bound */); + ReadRangeDelAggregator 
range_del_agg(&icmp, + kMaxSequenceNumber /* upper_bound */); Arena arena; ScopedArenaIterator iter( idb->NewInternalIterator(&arena, &range_del_agg, kMaxSequenceNumber)); From de0891ec01ad762010380a0af37e74d59763df41 Mon Sep 17 00:00:00 2001 From: Abhishek Madan Date: Tue, 18 Dec 2018 14:10:31 -0800 Subject: [PATCH 04/57] Fix unused member compile error Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/4793 Differential Revision: D13509363 Pulled By: abhimadan fbshipit-source-id: 530b4765e3335d6ecd016bfaa89645f8aa98c61f --- db/range_del_aggregator.cc | 10 ++-------- db/range_del_aggregator.h | 14 ++++---------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 8a6b0a51f..3685d717d 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -154,17 +154,14 @@ TruncatedRangeDelIterator::SplitBySnapshot( } ForwardRangeDelIterator::ForwardRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters) + const InternalKeyComparator* icmp) : icmp_(icmp), - iters_(iters), unused_idx_(0), active_seqnums_(SeqMaxComparator()), active_iters_(EndKeyMinComparator(icmp)), inactive_iters_(StartKeyMinComparator(icmp)) {} bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { - assert(iters_ != nullptr); // Move active iterators that end before parsed. 
while (!active_iters_.empty() && icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) { @@ -200,17 +197,14 @@ void ForwardRangeDelIterator::Invalidate() { } ReverseRangeDelIterator::ReverseRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters) + const InternalKeyComparator* icmp) : icmp_(icmp), - iters_(iters), unused_idx_(0), active_seqnums_(SeqMaxComparator()), active_iters_(StartKeyMaxComparator(icmp)), inactive_iters_(EndKeyMaxComparator(icmp)) {} bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) { - assert(iters_ != nullptr); // Move active iterators that start after parsed. while (!active_iters_.empty() && icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) { diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index a59cbaf1b..712ae4583 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -110,9 +110,7 @@ struct StartKeyMinComparator { class ForwardRangeDelIterator { public: - ForwardRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters); + explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp); bool ShouldDelete(const ParsedInternalKey& parsed); void Invalidate(); @@ -181,7 +179,6 @@ class ForwardRangeDelIterator { } const InternalKeyComparator* icmp_; - const std::vector>* iters_; size_t unused_idx_; ActiveSeqSet active_seqnums_; BinaryHeap active_iters_; @@ -190,9 +187,7 @@ class ForwardRangeDelIterator { class ReverseRangeDelIterator { public: - ReverseRangeDelIterator( - const InternalKeyComparator* icmp, - const std::vector>* iters); + explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp); bool ShouldDelete(const ParsedInternalKey& parsed); void Invalidate(); @@ -268,7 +263,6 @@ class ReverseRangeDelIterator { } const InternalKeyComparator* icmp_; - const std::vector>* iters_; size_t unused_idx_; ActiveSeqSet active_seqnums_; BinaryHeap active_iters_; @@ -311,8 +305,8 @@ class 
RangeDelAggregator { StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound, SequenceNumber lower_bound) : icmp_(icmp), - forward_iter_(icmp, &iters_), - reverse_iter_(icmp, &iters_), + forward_iter_(icmp), + reverse_iter_(icmp), upper_bound_(upper_bound), lower_bound_(lower_bound) {} From 8a643b70fd5511cc16f8398148518ba7444b14ae Mon Sep 17 00:00:00 2001 From: Yi Wu Date: Thu, 3 Jan 2019 16:26:31 -0800 Subject: [PATCH 05/57] Detect if Jemalloc is linked with the binary (#4844) Summary: Declare Jemalloc non-standard APIs as weak symbols, so that if Jemalloc is linked with the binary, these symbols will be replaced by Jemalloc's, otherwise they will be nullptr. This is similar to how folly detects jemalloc, but we assume the main program uses jemalloc as long as jemalloc is linked: https://github.com/facebook/folly/blob/master/folly/memory/Malloc.h#L147 Pull Request resolved: https://github.com/facebook/rocksdb/pull/4844 Differential Revision: D13574934 Pulled By: yiwu-arbug fbshipit-source-id: 7ea871beb1be7d5a1259cc38f9b78078793db2db --- TARGETS | 8 ++--- buckifier/targets_cfg.py | 8 ++--- db/malloc_stats.cc | 17 +++++------ port/jemalloc_helper.h | 49 +++++++++++++++++++++++++++++++ util/jemalloc_nodump_allocator.cc | 10 +++++-- util/jemalloc_nodump_allocator.h | 2 +- 6 files changed, 71 insertions(+), 23 deletions(-) create mode 100644 port/jemalloc_helper.h diff --git a/TARGETS b/TARGETS index 96b6e6e62..43f8bd5b2 100644 --- a/TARGETS +++ b/TARGETS @@ -67,13 +67,11 @@ is_opt_mode = build_mode.startswith("opt") if is_opt_mode: rocksdb_compiler_flags.append("-DNDEBUG") -default_allocator = read_config("fbcode", "default_allocator") - sanitizer = read_config("fbcode", "sanitizer") -# Let RocksDB aware of jemalloc existence. -# Do not enable it if sanitizer presents. -if is_opt_mode and default_allocator.startswith("jemalloc") and sanitizer == "": +# Do not enable jemalloc if sanitizer presents.
RocksDB will further detect +# whether the binary is linked with jemalloc at runtime. +if sanitizer == "": rocksdb_compiler_flags.append("-DROCKSDB_JEMALLOC") rocksdb_external_deps.append(("jemalloc", None, "headers")) diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 5378f697a..7a2198fa7 100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -71,13 +71,11 @@ if is_opt_mode: rocksdb_compiler_flags.append("-DNDEBUG") -default_allocator = read_config("fbcode", "default_allocator") - sanitizer = read_config("fbcode", "sanitizer") -# Let RocksDB aware of jemalloc existence. -# Do not enable it if sanitizer presents. -if is_opt_mode and default_allocator.startswith("jemalloc") and sanitizer == "": +# Do not enable jemalloc if sanitizer presents. RocksDB will further detect +# whether the binary is linked with jemalloc at runtime. +if sanitizer == "": rocksdb_compiler_flags.append("-DROCKSDB_JEMALLOC") rocksdb_external_deps.append(("jemalloc", None, "headers")) """ diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index ba971b547..bcee5c3fb 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -13,17 +13,16 @@ #include #include +#include "port/jemalloc_helper.h" + + namespace rocksdb { #ifdef ROCKSDB_JEMALLOC -#ifdef __FreeBSD__ -#include -#else -#include "jemalloc/jemalloc.h" + #ifdef JEMALLOC_NO_RENAME #define malloc_stats_print je_malloc_stats_print #endif -#endif typedef struct { char* cur; @@ -41,10 +40,10 @@ static void GetJemallocStatus(void* mstat_arg, const char* status) { snprintf(mstat->cur, buf_size, "%s", status); mstat->cur += status_len; } -#endif // ROCKSDB_JEMALLOC - -#ifdef ROCKSDB_JEMALLOC void DumpMallocStats(std::string* stats) { + if (!HasJemalloc()) { + return; + } MallocStatus mstat; const unsigned int kMallocStatusLen = 1000000; std::unique_ptr buf{new char[kMallocStatusLen + 1]}; @@ -56,5 +55,5 @@ void DumpMallocStats(std::string* stats) { #else void DumpMallocStats(std::string*) {} #endif // 
ROCKSDB_JEMALLOC -} +} // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/port/jemalloc_helper.h b/port/jemalloc_helper.h new file mode 100644 index 000000000..412a80d26 --- /dev/null +++ b/port/jemalloc_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifdef ROCKSDB_JEMALLOC +#ifdef __FreeBSD__ +#include +#else +#include +#endif + +// Declare non-standard jemalloc APIs as weak symbols. We can null-check these +// symbols to detect whether jemalloc is linked with the binary. +extern "C" void* mallocx(size_t, int) __attribute__((__weak__)); +extern "C" void* rallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t xallocx(void*, size_t, size_t, int) __attribute__((__weak__)); +extern "C" size_t sallocx(const void*, int) __attribute__((__weak__)); +extern "C" void dallocx(void*, int) __attribute__((__weak__)); +extern "C" void sdallocx(void*, size_t, int) __attribute__((__weak__)); +extern "C" size_t nallocx(size_t, int) __attribute__((__weak__)); +extern "C" int mallctl(const char*, void*, size_t*, void*, size_t) + __attribute__((__weak__)); +extern "C" int mallctlnametomib(const char*, size_t*, size_t*) + __attribute__((__weak__)); +extern "C" int mallctlbymib(const size_t*, size_t, void*, size_t*, void*, + size_t) __attribute__((__weak__)); +extern "C" void malloc_stats_print(void (*)(void*, const char*), void*, + const char*) __attribute__((__weak__)); +extern "C" size_t malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void*) + JEMALLOC_CXX_THROW __attribute__((__weak__)); + +// Check if Jemalloc is linked with the binary. Note the main program might be +// using a different memory allocator even this method return true. 
+// It is loosely based on folly::usingJEMalloc(), minus the check that actually +// allocate memory and see if it is through jemalloc, to handle the dlopen() +// case: +// https://github.com/facebook/folly/blob/76cf8b5841fb33137cfbf8b224f0226437c855bc/folly/memory/Malloc.h#L147 +static inline bool HasJemalloc() { + return mallocx != nullptr && rallocx != nullptr && xallocx != nullptr && + sallocx != nullptr && dallocx != nullptr && sdallocx != nullptr && + nallocx != nullptr && mallctl != nullptr && + mallctlnametomib != nullptr && mallctlbymib != nullptr && + malloc_stats_print != nullptr && malloc_usable_size != nullptr; +} + +#endif // ROCKSDB_JEMALLOC diff --git a/util/jemalloc_nodump_allocator.cc b/util/jemalloc_nodump_allocator.cc index 1db939b4f..cdd08e932 100644 --- a/util/jemalloc_nodump_allocator.cc +++ b/util/jemalloc_nodump_allocator.cc @@ -133,12 +133,16 @@ Status NewJemallocNodumpAllocator( JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { *memory_allocator = nullptr; -#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR - (void) options; - return Status::NotSupported( + Status unsupported = Status::NotSupported( "JemallocNodumpAllocator only available with jemalloc version >= 5 " "and MADV_DONTDUMP is available."); +#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR + (void)options; + return unsupported; #else + if (!HasJemalloc()) { + return unsupported; + } if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); } diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h index 914088de1..e93c12237 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/util/jemalloc_nodump_allocator.h @@ -8,6 +8,7 @@ #include #include +#include "port/jemalloc_helper.h" #include "port/port.h" #include "rocksdb/memory_allocator.h" #include "util/core_local.h" @@ -15,7 +16,6 @@ #if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX) -#include #include #if (JEMALLOC_VERSION_MAJOR >= 5) && 
defined(MADV_DONTDUMP) From ec43385bf319a2476a96eeb00d712b16504289d5 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 7 Dec 2018 17:03:49 -0800 Subject: [PATCH 06/57] Enable checkpoint of read-only db (#4681) Summary: 1. DBImplReadOnly::GetLiveFiles should not return NotSupported. Instead, it should call DBImpl::GetLiveFiles(flush_memtable=false). 2. In DBImpl::Recover, we should also recover the OPTIONS file name and/or number so that an immediate subsequent GetLiveFiles will get the correct OPTIONS name. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4681 Differential Revision: D13069205 Pulled By: riversand963 fbshipit-source-id: 3e6a0174307d06db5a01feb099b306cea1f7f88a --- HISTORY.md | 10 +--- db/compacted_db_impl.h | 9 +-- db/db_impl_open.cc | 22 ++++++++ db/db_impl_readonly.h | 9 +-- utilities/checkpoint/checkpoint_test.cc | 73 +++++++++++++++++++++++++ 5 files changed, 107 insertions(+), 16 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 429c891db..0ead41a22 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,12 +1,4 @@ # Rocksdb Change Log -## Unreleased -### New Features - -### Public API Change - -### Bug Fixes -* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. - ## 5.18.0 (11/30/2018) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. When being use, block cache will be excluded from core dump. @@ -18,6 +10,7 @@ * Add xxhash64 checksum support * Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table. * Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental. +* Enabled checkpoint on readonly db (DBImplReadOnly). ### Public API Change * `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs.
@@ -34,6 +27,7 @@ * Fixed Get correctness bug in the presence of range tombstones where merge operands covered by a range tombstone always result in NotFound. * Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. * The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. +* Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. ## 5.17.0 (10/05/2018) ### Public API Change diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index 736002e1e..5c574b4b9 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -67,10 +67,11 @@ class CompactedDBImpl : public DBImpl { virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported in compacted db mode."); } - virtual Status GetLiveFiles(std::vector&, - uint64_t* /*manifest_file_size*/, - bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported in compacted db mode."); + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); } using DBImpl::Flush; virtual Status Flush(const FlushOptions& /*options*/, diff --git a/db/db_impl_open.cc b/db/db_impl_open.cc index 5ea8c61b5..5196be7ba 100644 --- a/db/db_impl_open.cc +++ b/db/db_impl_open.cc @@ -474,6 +474,28 @@ Status DBImpl::Recover( } } + if (read_only) { + // If we are opening as read-only, we need to update options_file_number_ + // to reflect the most recent OPTIONS file. It does not matter for regular + // read-write db instance because options_file_number_ will later be + // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile. 
+ std::vector file_names; + if (s.ok()) { + s = env_->GetChildren(GetName(), &file_names); + } + if (s.ok()) { + uint64_t number = 0; + uint64_t options_file_number = 0; + FileType type; + for (const auto& fname : file_names) { + if (ParseFileName(fname, &number, &type) && type == kOptionsFile) { + options_file_number = std::max(number, options_file_number); + } + } + versions_->options_file_number_ = options_file_number; + } + } + return s; } diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 6ebe1bce7..2d77dbac0 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -89,10 +89,11 @@ class DBImplReadOnly : public DBImpl { virtual Status EnableFileDeletions(bool /*force*/) override { return Status::NotSupported("Not supported operation in read only mode."); } - virtual Status GetLiveFiles(std::vector&, - uint64_t* /*manifest_file_size*/, - bool /*flush_memtable*/ = true) override { - return Status::NotSupported("Not supported operation in read only mode."); + virtual Status GetLiveFiles(std::vector& ret, + uint64_t* manifest_file_size, + bool /*flush_memtable*/) override { + return DBImpl::GetLiveFiles(ret, manifest_file_size, + false /* flush_memtable */); } using DBImpl::Flush; diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 62c78faa8..b8436ccf5 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -164,6 +164,16 @@ class CheckpointTest : public testing::Test { return DB::OpenForReadOnly(options, dbname_, &db_); } + Status ReadOnlyReopenWithColumnFamilies(const std::vector& cfs, + const Options& options) { + std::vector column_families; + for (const auto& cf : cfs) { + column_families.emplace_back(cf, options); + } + return DB::OpenForReadOnly(options, dbname_, column_families, &handles_, + &db_); + } + Status TryReopen(const Options& options) { Close(); last_options_ = options; @@ -612,6 +622,69 @@ TEST_F(CheckpointTest, 
CheckpointWithUnsyncedDataDropped) { db_ = nullptr; } +TEST_F(CheckpointTest, CheckpointReadOnlyDB) { + ASSERT_OK(Put("foo", "foo_value")); + ASSERT_OK(Flush()); + Close(); + Options options = CurrentOptions(); + ASSERT_OK(ReadOnlyReopen(options)); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + DB* snapshot_db = nullptr; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result)); + ASSERT_EQ("foo_value", get_result); + delete snapshot_db; +} + +TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + for (int i = 0; i != 3; ++i) { + ASSERT_OK(Put(i, "foo", "foo_value")); + ASSERT_OK(Flush(i)); + } + Close(); + Status s = ReadOnlyReopenWithColumnFamilies( + {kDefaultColumnFamilyName, "pikachu", "eevee"}, options); + ASSERT_OK(s); + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + Close(); + + std::vector column_families{ + {kDefaultColumnFamilyName, options}, + {"pikachu", options}, + {"eevee", options}}; + DB* snapshot_db = nullptr; + std::vector snapshot_handles; + s = DB::Open(options, snapshot_name_, column_families, &snapshot_handles, + &snapshot_db); + ASSERT_OK(s); + ReadOptions read_opts; + for (int i = 0; i != 3; ++i) { + std::string get_result; + s = snapshot_db->Get(read_opts, snapshot_handles[i], "foo", &get_result); + ASSERT_OK(s); + ASSERT_EQ("foo_value", get_result); + } + + for (auto snapshot_h : snapshot_handles) { + delete snapshot_h; + } + snapshot_handles.clear(); + delete snapshot_db; +} + } // namespace rocksdb int main(int 
argc, char** argv) { From 663d24f4678af90df8a147fe2a75dc64f81d1de8 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 13 Dec 2018 15:10:16 -0800 Subject: [PATCH 07/57] Improve flushing multiple column families (#4708) Summary: If one column family is dropped, we should simply skip it and continue to flush other active ones. Currently we use Status::ShutdownInProgress to notify caller of column families being dropped. In the future, we should consider using a different Status code. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4708 Differential Revision: D13378954 Pulled By: riversand963 fbshipit-source-id: 42f248cdf2d32d4c0f677cd39012694b8f1328ca --- HISTORY.md | 1 + db/db_flush_test.cc | 81 +++++++++++++++++++ db/db_impl_compaction_flush.cc | 83 ++++++++++++------- db/memtable_list.cc | 2 +- db/version_edit.cc | 2 +- db/version_set.cc | 79 ++++++++++++++++++- db/version_set_test.cc | 140 ++++++++++++++++++++++++++++++++- 7 files changed, 350 insertions(+), 38 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0ead41a22..bf5bb12a0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,6 +11,7 @@ * Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table. * Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental. * Enabled checkpoint on readonly db (DBImplReadOnly). +* Make DB ignore dropped column families while committing results of atomic flush. ### Public API Change * `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs. 
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index e9ae980b9..8a4d8fc63 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -407,6 +407,87 @@ TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) { Destroy(options); } +TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->EnableProcessing(); + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + std::vector cf_ids; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_OK(Put(cf_id, "key", "value", wopts)); + cf_ids.push_back(cf_id); + } + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + ASSERT_TRUE(Flush(cf_ids).IsShutdownInProgress()); + Destroy(options); +} + +TEST_P(DBAtomicFlushTest, + FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) { + bool atomic_flush = GetParam(); + if (!atomic_flush) { + return; + } + Options options = CurrentOptions(); + options.create_if_missing = true; + options.atomic_flush = atomic_flush; + + CreateAndReopenWithCF({"pikachu", "eevee"}, options); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush", + "DBAtomicFlushTest::BeforeDropCF"}, + {"DBAtomicFlushTest::AfterDropCF", + "DBImpl::BackgroundCallFlush:start"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + size_t num_cfs = handles_.size(); + ASSERT_EQ(3, num_cfs); + WriteOptions wopts; + wopts.disableWAL = true; + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); 
+ ASSERT_OK(Put(cf_id, "key", "value", wopts)); + } + port::Thread user_thread([&]() { + TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF"); + ASSERT_OK(dbfull()->DropColumnFamily(handles_[1])); + TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF"); + }); + FlushOptions flush_opts; + flush_opts.wait = true; + ASSERT_OK(dbfull()->Flush(flush_opts, handles_)); + user_thread.join(); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options); + num_cfs = handles_.size(); + ASSERT_EQ(2, num_cfs); + for (size_t i = 0; i != num_cfs; ++i) { + int cf_id = static_cast(i); + ASSERT_EQ("value", Get(cf_id, "key")); + } + Destroy(options); +} + INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, testing::Bool()); diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index fa0b91877..1b9c1bb7c 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -219,20 +219,25 @@ Status DBImpl::FlushMemTablesToOutputFiles( return AtomicFlushMemTablesToOutputFiles(bg_flush_args, made_progress, job_context, log_buffer); } - Status s; + Status status; for (auto& arg : bg_flush_args) { ColumnFamilyData* cfd = arg.cfd_; const MutableCFOptions& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); SuperVersionContext* superversion_context = arg.superversion_context_; - s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, - job_context, superversion_context, - log_buffer); + Status s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, + job_context, superversion_context, + log_buffer); if (!s.ok()) { - break; + status = s; + if (!s.IsShutdownInProgress()) { + // At this point, DB is not shutting down, nor is cfd dropped. + // Something is wrong, thus we break out of the loop. 
+ break; + } } } - return s; + return status; } /* @@ -331,8 +336,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector> exec_status; for (int i = 0; i != num_cfs; ++i) { // Initially all jobs are not executed, with status OK. - std::pair elem(false, Status::OK()); - exec_status.emplace_back(elem); + exec_status.emplace_back(false, Status::OK()); } if (s.ok()) { @@ -341,10 +345,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( exec_status[i].second = jobs[i].Run(&logs_with_prep_tracker_, &file_meta[i]); exec_status[i].first = true; - if (!exec_status[i].second.ok()) { - s = exec_status[i].second; - break; - } } if (num_cfs > 1) { TEST_SYNC_POINT( @@ -352,17 +352,27 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( TEST_SYNC_POINT( "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"); } - if (s.ok()) { - exec_status[0].second = - jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]); - exec_status[0].first = true; - if (!exec_status[0].second.ok()) { - s = exec_status[0].second; + exec_status[0].second = + jobs[0].Run(&logs_with_prep_tracker_, &file_meta[0]); + exec_status[0].first = true; + + Status error_status; + for (const auto& e : exec_status) { + if (!e.second.ok()) { + s = e.second; + if (!e.second.IsShutdownInProgress()) { + // If a flush job did not return OK, and the CF is not dropped, and + // the DB is not shutting down, then we have to return this result to + // caller later. + error_status = e.second; + } } } + + s = error_status.ok() ? s : error_status; } - if (s.ok()) { + if (s.ok() || s.IsShutdownInProgress()) { // Sync on all distinct output directories. 
for (auto dir : distinct_output_dirs) { if (dir != nullptr) { @@ -376,6 +386,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (s.ok()) { autovector*> mems_list; for (int i = 0; i != num_cfs; ++i) { + if (cfds[i]->IsDropped()) { + continue; + } const auto& mems = jobs[i].GetMemTables(); mems_list.emplace_back(&mems); } @@ -383,6 +396,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector imm_lists; autovector mutable_cf_options_list; for (auto cfd : *versions_->GetColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } all_cfds.emplace_back(cfd); imm_lists.emplace_back(cfd->imm()); mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions()); @@ -396,10 +412,13 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } } - if (s.ok()) { + if (s.ok() || s.IsShutdownInProgress()) { assert(num_cfs == static_cast(job_context->superversion_contexts.size())); for (int i = 0; i != num_cfs; ++i) { + if (cfds[i]->IsDropped()) { + continue; + } InstallSuperVersionAndScheduleWork(cfds[i], &job_context->superversion_contexts[i], *cfds[i]->GetLatestMutableCFOptions()); @@ -415,6 +434,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); for (int i = 0; i != num_cfs; ++i) { + if (cfds[i]->IsDropped()) { + continue; + } NotifyOnFlushCompleted(cfds[i], &file_meta[i], *cfds[i]->GetLatestMutableCFOptions(), job_context->job_id, jobs[i].GetTableProperties()); @@ -434,7 +456,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( #endif // ROCKSDB_LITE } - if (!s.ok()) { + // Need to undo atomic flush if something went wrong, i.e. s is not OK and + // it is not because of CF drop. + if (!s.ok() && !s.IsShutdownInProgress()) { // Have to cancel the flush jobs that have NOT executed because we need to // unref the versions. 
for (int i = 0; i != num_cfs; ++i) { @@ -442,17 +466,15 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( jobs[i].Cancel(); } } - if (!s.IsShutdownInProgress()) { - for (int i = 0; i != num_cfs; ++i) { - if (exec_status[i].first && exec_status[i].second.ok()) { - auto& mems = jobs[i].GetMemTables(); - cfds[i]->imm()->RollbackMemtableFlush(mems, - file_meta[i].fd.GetNumber()); - } + for (int i = 0; i != num_cfs; ++i) { + if (exec_status[i].first && exec_status[i].second.ok()) { + auto& mems = jobs[i].GetMemTables(); + cfds[i]->imm()->RollbackMemtableFlush(mems, + file_meta[i].fd.GetNumber()); } - Status new_bg_error = s; - error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } + Status new_bg_error = s; + error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } return s; @@ -1539,6 +1561,7 @@ Status DBImpl::AtomicFlushMemTables( write_thread_.ExitUnbatched(&w); } } + TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush"); if (s.ok() && flush_options.wait) { autovector flush_memtable_ids; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 9145135d6..36c0a8f1d 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -426,7 +426,7 @@ Status MemTableList::TryInstallMemtableFlushResults( imm_lists[pos]->InstallNewVersion(); } - if (s.ok()) { + if (s.ok() || s.IsShutdownInProgress()) { for (size_t i = 0; i != batch_sz; ++i) { if (tmp_cfds[i]->IsDropped()) { continue; diff --git a/db/version_edit.cc b/db/version_edit.cc index adeca134d..e9f497999 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -579,7 +579,7 @@ std::string VersionEdit::DebugString(bool hex_key) const { AppendNumberTo(&r, max_column_family_); } if (is_in_atomic_group_) { - r.append("\n AtomicGroup: "); + r.append("\n AtomicGroup: "); AppendNumberTo(&r, remaining_entries_); r.append(" entries remains"); } diff --git a/db/version_set.cc b/db/version_set.cc index cb7b08db0..2d6997601 100644 --- a/db/version_set.cc +++ 
b/db/version_set.cc @@ -2849,6 +2849,7 @@ Status VersionSet::ProcessManifestWrites( batch_edits.push_back(first_writer.edit_list.front()); } else { auto it = manifest_writers_.cbegin(); + size_t group_start = std::numeric_limits::max(); while (it != manifest_writers_.cend()) { if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) { // no group commits for column family add or drop @@ -2857,7 +2858,36 @@ Status VersionSet::ProcessManifestWrites( last_writer = *(it++); assert(last_writer != nullptr); assert(last_writer->cfd != nullptr); - if (last_writer->cfd != nullptr && last_writer->cfd->IsDropped()) { + if (last_writer->cfd->IsDropped()) { + // If we detect a dropped CF at this point, and the corresponding + // version edits belong to an atomic group, then we need to find out + // the preceding version edits in the same atomic group, and update + // their `remaining_entries_` member variable because we are NOT going + // to write the version edits' of dropped CF to the MANIFEST. If we + // don't update, then Recover can report corrupted atomic group because + // the `remaining_entries_` do not match. + if (!batch_edits.empty()) { + if (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ > 0) { + assert(group_start < batch_edits.size()); + const auto& edit_list = last_writer->edit_list; + size_t k = 0; + while (k < edit_list.size()) { + if (!edit_list[k]->is_in_atomic_group_) { + break; + } else if (edit_list[k]->remaining_entries_ == 0) { + ++k; + break; + } + ++k; + } + for (auto i = group_start; i < batch_edits.size(); ++i) { + assert(static_cast(k) <= + batch_edits.back()->remaining_entries_); + batch_edits[i]->remaining_entries_ -= static_cast(k); + } + } + } continue; } // We do a linear search on versions because versions is small. 
@@ -2888,6 +2918,15 @@ Status VersionSet::ProcessManifestWrites( } assert(builder != nullptr); // make checker happy for (const auto& e : last_writer->edit_list) { + if (e->is_in_atomic_group_) { + if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || + (batch_edits.back()->is_in_atomic_group_ && + batch_edits.back()->remaining_entries_ == 0)) { + group_start = batch_edits.size(); + } + } else if (group_start != std::numeric_limits::max()) { + group_start = std::numeric_limits::max(); + } LogAndApplyHelper(last_writer->cfd, builder, version, e, mu); batch_edits.push_back(e); } @@ -2900,6 +2939,42 @@ Status VersionSet::ProcessManifestWrites( } } +#ifndef NDEBUG + // Verify that version edits of atomic groups have correct + // remaining_entries_. + size_t k = 0; + while (k < batch_edits.size()) { + while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) { + ++k; + } + if (k == batch_edits.size()) { + break; + } + size_t i = k; + while (i < batch_edits.size()) { + if (!batch_edits[i]->is_in_atomic_group_) { + break; + } + assert(i - k + batch_edits[i]->remaining_entries_ == + batch_edits[k]->remaining_entries_); + if (batch_edits[i]->remaining_entries_ == 0) { + ++i; + break; + } + ++i; + } + assert(batch_edits[i - 1]->is_in_atomic_group_); + assert(0 == batch_edits[i - 1]->remaining_entries_); + std::vector tmp; + for (size_t j = k; j != i; ++j) { + tmp.emplace_back(batch_edits[j]); + } + TEST_SYNC_POINT_CALLBACK( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp); + k = i; + } +#endif // NDEBUG + uint64_t new_manifest_file_size = 0; Status s; @@ -3205,7 +3280,7 @@ Status VersionSet::LogAndApply( if (!manifest_writers_.empty()) { manifest_writers_.front()->cv.Signal(); } - return Status::OK(); + return Status::ShutdownInProgress(); } return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index c94ffb154..8b478ceb0 100644 --- 
a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -605,9 +605,13 @@ TEST_F(FindLevelFileTest, LevelOverlappingFiles) { ASSERT_TRUE(Overlaps("600", "700")); } -class VersionSetTest : public testing::Test { +class VersionSetTestBase { public: - VersionSetTest() + const static std::string kColumnFamilyName1; + const static std::string kColumnFamilyName2; + const static std::string kColumnFamilyName3; + + VersionSetTestBase() : env_(Env::Default()), dbname_(test::PerThreadDBPath("version_set_test")), db_options_(), @@ -635,8 +639,9 @@ class VersionSetTest : public testing::Test { new_db.SetNextFile(2); new_db.SetLastSequence(0); - const std::vector cf_names = {kDefaultColumnFamilyName, - "alice", "bob"}; + const std::vector cf_names = { + kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2, + kColumnFamilyName3}; const int kInitialNumOfCfs = static_cast(cf_names.size()); autovector new_cfs; uint64_t last_seq = 1; @@ -711,6 +716,15 @@ class VersionSetTest : public testing::Test { std::shared_ptr mock_table_factory_; }; +const std::string VersionSetTestBase::kColumnFamilyName1 = "alice"; +const std::string VersionSetTestBase::kColumnFamilyName2 = "bob"; +const std::string VersionSetTestBase::kColumnFamilyName3 = "charles"; + +class VersionSetTest : public VersionSetTestBase, public testing::Test { + public: + VersionSetTest() : VersionSetTestBase() {} +}; + TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; @@ -958,6 +972,124 @@ TEST_F(VersionSetTest, HandleIncorrectAtomicGroupSize) { versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); EXPECT_TRUE(incorrect_group_size); } + +class VersionSetTestDropOneCF : public VersionSetTestBase, + public testing::TestWithParam { + public: + VersionSetTestDropOneCF() : VersionSetTestBase() {} +}; + +// This test simulates the following execution sequence +// Time thread1 bg_flush_thr +// | Prepare version edits (e1,e2,e3) for atomic +// | flush cf1, cf2, cf3 +// | 
Enqueue e to drop cfi +// | to manifest_writers_ +// | Enqueue (e1,e2,e3) to manifest_writers_ +// | +// | Apply e, +// | cfi.IsDropped() is true +// | Apply (e1,e2,e3), +// | since cfi.IsDropped() == true, we need to +// | drop ei and write the rest to MANIFEST. +// V +// +// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and +// last column family in an atomic group. +TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + std::vector column_families; + SequenceNumber last_seqno; + std::unique_ptr log_writer; + PrepareManifest(&column_families, &last_seqno, &log_writer); + Status s = SetCurrentFile(env_, dbname_, 1, nullptr); + ASSERT_OK(s); + + EXPECT_OK(versions_->Recover(column_families, false /* read_only */)); + EXPECT_EQ(column_families.size(), + versions_->GetColumnFamilySet()->NumberOfColumnFamilies()); + + const int kAtomicGroupSize = 3; + const std::vector non_default_cf_names = { + kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3}; + + // Drop one column family + VersionEdit drop_cf_edit; + drop_cf_edit.DropColumnFamily(); + const std::string cf_to_drop_name(GetParam()); + auto cfd_to_drop = + versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name); + ASSERT_NE(nullptr, cfd_to_drop); + cfd_to_drop->Ref(); // Increase its refcount because cfd_to_drop is used later + drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); + mutex_.Lock(); + s = versions_->LogAndApply(cfd_to_drop, + *cfd_to_drop->GetLatestMutableCFOptions(), + &drop_cf_edit, &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + + std::vector edits(kAtomicGroupSize); + uint32_t remaining = kAtomicGroupSize; + size_t i = 0; + autovector cfds; + autovector mutable_cf_options_list; + autovector> edit_lists; + for (const auto& cf_name : non_default_cf_names) { + auto cfd = (cf_name != cf_to_drop_name) + ? 
versions_->GetColumnFamilySet()->GetColumnFamily(cf_name) + : cfd_to_drop; + ASSERT_NE(nullptr, cfd); + cfds.push_back(cfd); + mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions()); + edits[i].SetColumnFamily(cfd->GetID()); + edits[i].SetLogNumber(0); + edits[i].SetNextFile(2); + edits[i].MarkAtomicGroup(--remaining); + edits[i].SetLastSequence(last_seqno++); + autovector tmp_edits; + tmp_edits.push_back(&edits[i]); + edit_lists.emplace_back(tmp_edits); + ++i; + } + int called = 0; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) { + std::vector* tmp_edits = + reinterpret_cast*>(arg); + EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size()); + for (const auto e : *tmp_edits) { + bool found = false; + for (const auto& e2 : edits) { + if (&e2 == e) { + found = true; + break; + } + } + ASSERT_TRUE(found); + } + ++called; + }); + SyncPoint::GetInstance()->EnableProcessing(); + mutex_.Lock(); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, + &mutex_); + mutex_.Unlock(); + ASSERT_OK(s); + ASSERT_EQ(1, called); + if (cfd_to_drop->Unref()) { + delete cfd_to_drop; + cfd_to_drop = nullptr; + } +} + +INSTANTIATE_TEST_CASE_P( + AtomicGroup, VersionSetTestDropOneCF, + testing::Values(VersionSetTestBase::kColumnFamilyName1, + VersionSetTestBase::kColumnFamilyName2, + VersionSetTestBase::kColumnFamilyName3)); + } // namespace rocksdb int main(int argc, char** argv) { From e265e08a02135082d22cacf5673ecae68f78f63a Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 18 Dec 2018 16:43:12 -0800 Subject: [PATCH 08/57] Avoid switching empty memtable in certain cases (#4792) Summary: in certain cases, we do not perform memtable switching if the active memtable of the column family is empty. Two exceptions: 1. 
In manual flush, if cached_recoverable_state_empty_ is false, then we need to switch memtable due to requirement of transaction. 2. In switch WAL, we need to switch memtable anyway because we have to seal the memtable if the WAL on which it depends will be closed. This change can potentially delay the occurrence of write stalls because the number of memtables increases more slowly. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4792 Differential Revision: D13499501 Pulled By: riversand963 fbshipit-source-id: 91c9b17ae753578578039f3851667d93610005e1 --- db/db_impl_compaction_flush.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 1b9c1bb7c..858923f0b 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -1462,11 +1462,16 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, write_thread_.EnterUnbatched(&w, &mutex_); } - if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || - !cached_recoverable_state_empty_.load()) { + if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); - flush_memtable_id = cfd->imm()->GetLatestMemTableID(); - flush_req.emplace_back(cfd, flush_memtable_id); + } + + if (s.ok()) { + if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || + !cached_recoverable_state_empty_.load()) { + flush_memtable_id = cfd->imm()->GetLatestMemTableID(); + flush_req.emplace_back(cfd, flush_memtable_id); + } } if (s.ok() && !flush_req.empty()) { @@ -1540,6 +1545,9 @@ Status DBImpl::AtomicFlushMemTables( } } for (auto cfd : cfds) { + if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) { + continue; + } cfd->Ref(); s = SwitchMemtable(cfd, &context); cfd->Unref(); From 35c950a94e1e9c8efa6bf4abbfa840899c73ffab Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 3 Jan 2019 20:53:52 -0800 Subject: [PATCH 09/57] Refactor atomic flush
result installation to MANIFEST (#4791) Summary: As titled. Since different bg flush threads can flush different sets of column families (due to column family creation and drop), we decide not to let one thread perform atomic flush result installation for other threads. Bg flush threads will install their atomic flush results sequentially to MANIFEST, using a condition variable, i.e. atomic_flush_install_cv_, to coordinate. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4791 Differential Revision: D13498930 Pulled By: riversand963 fbshipit-source-id: dd7482fc41f4bd22dad1e1ef7d4764ef424688d7 --- db/db_impl.cc | 2 +- db/db_impl.h | 19 +- db/db_impl_compaction_flush.cc | 74 +++++-- db/flush_job_test.cc | 9 +- db/memtable.h | 12 +- db/memtable_list.cc | 323 ++++++++++------------------- db/memtable_list.h | 51 +++-- db/memtable_list_test.cc | 359 +++++++++------------------------ db/version_set_test.cc | 4 +- 9 files changed, 311 insertions(+), 542 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index e259864d7..3f6e44676 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -220,7 +220,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, preserve_deletes_(options.preserve_deletes), closed_(false), error_handler_(this, immutable_db_options_, &mutex_), - atomic_flush_commit_in_progress_(false) { + atomic_flush_install_cv_(&mutex_) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. assert(batch_per_txn_ || seq_per_batch_); diff --git a/db/db_impl.h b/db/db_impl.h index 7d509c807..58fcacc67 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1610,15 +1610,16 @@ class DBImpl : public DB { ErrorHandler error_handler_; - // True if the DB is committing atomic flush. - // TODO (yanqin) the current impl assumes that the entire DB belongs to - // a single atomic flush group.
In the future we need to add a new class - // (struct) similar to the following to make it more general. - // struct AtomicFlushGroup { - // bool commit_in_progress_; - // std::vector imm_lists; - // }; - bool atomic_flush_commit_in_progress_; + // Conditional variable to coordinate installation of atomic flush results. + // With atomic flush, each bg thread installs the result of flushing multiple + // column families, and different threads can flush different column + // families. It's difficult to rely on one thread to perform batch + // installation for all threads. This is different from the non-atomic flush + // case. + // atomic_flush_install_cv_ makes sure that threads install atomic flush + // results sequentially. Flush results of memtables with lower IDs get + // installed to MANIFEST first. + InstrumentedCondVar atomic_flush_install_cv_; }; extern Options SanitizeOptions(const std::string& db, diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 858923f0b..f6a3ffd1d 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -382,34 +382,65 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } } } + } - if (s.ok()) { - autovector*> mems_list; - for (int i = 0; i != num_cfs; ++i) { - if (cfds[i]->IsDropped()) { - continue; - } + if (s.ok()) { + auto wait_to_install_func = [&]() { + bool ready = true; + for (size_t i = 0; i != cfds.size(); ++i) { const auto& mems = jobs[i].GetMemTables(); - mems_list.emplace_back(&mems); - } - autovector all_cfds; - autovector imm_lists; - autovector mutable_cf_options_list; - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->IsDropped()) { + if (cfds[i]->IsDropped()) { + // If the column family is dropped, then do not wait. 
continue; + } else if (!mems.empty() && + cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) { + // If a flush job needs to install the flush result for mems and + // mems[0] is not the earliest memtable, it means another thread must + // be installing flush results for the same column family, then the + // current thread needs to wait. + ready = false; + break; + } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <= + bg_flush_args[i].max_memtable_id_) { + // If a flush job does not need to install flush results, then it has + // to wait until all memtables up to max_memtable_id_ (inclusive) are + // installed. + ready = false; + break; } - all_cfds.emplace_back(cfd); - imm_lists.emplace_back(cfd->imm()); - mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions()); } + return ready; + }; + + bool resuming_from_bg_err = error_handler_.IsDBStopped(); + while ((!error_handler_.IsDBStopped() || + error_handler_.GetRecoveryError().ok()) && + !wait_to_install_func()) { + atomic_flush_install_cv_.Wait(); + } + + s = resuming_from_bg_err ? 
error_handler_.GetRecoveryError() + : error_handler_.GetBGError(); + } - s = MemTableList::TryInstallMemtableFlushResults( - imm_lists, all_cfds, mutable_cf_options_list, mems_list, - &atomic_flush_commit_in_progress_, &logs_with_prep_tracker_, - versions_.get(), &mutex_, file_meta, &job_context->memtables_to_free, - directories_.GetDbDir(), log_buffer); + if (s.ok()) { + autovector tmp_cfds; + autovector*> mems_list; + autovector mutable_cf_options_list; + for (int i = 0; i != num_cfs; ++i) { + const auto& mems = jobs[i].GetMemTables(); + if (!cfds[i]->IsDropped() && !mems.empty()) { + tmp_cfds.emplace_back(cfds[i]); + mems_list.emplace_back(&mems); + mutable_cf_options_list.emplace_back( + cfds[i]->GetLatestMutableCFOptions()); + } } + + s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, + versions_.get(), &mutex_, file_meta, &job_context->memtables_to_free, + directories_.GetDbDir(), log_buffer); } if (s.ok() || s.IsShutdownInProgress()) { @@ -2077,6 +2108,7 @@ void DBImpl::BackgroundCallFlush() { bg_flush_scheduled_--; // See if there's more work to be done MaybeScheduleFlushOrCompaction(); + atomic_flush_install_cv_.SignalAll(); bg_cv_.SignalAll(); // IMPORTANT: there should be no code after calling SignalAll. This call may // signal the DB destructor that it's OK to proceed with destruction. 
In diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 519e01f2a..5ac5f2f93 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -279,7 +279,6 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber); mem->SetID(i); mem->Ref(); - mem->TEST_AtomicFlushSequenceNumber() = 123; for (size_t j = 0; j != num_keys_per_memtable; ++j) { std::string key(ToString(j + i * num_keys_per_memtable)); @@ -325,17 +324,13 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { const auto& mems = flush_jobs[i].GetMemTables(); mems_list.push_back(&mems); } - autovector imm_lists; autovector mutable_cf_options_list; for (auto cfd : all_cfds) { - imm_lists.push_back(cfd->imm()); mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions()); } - bool atomic_flush_commit_in_progress = false; - Status s = MemTableList::TryInstallMemtableFlushResults( - imm_lists, all_cfds, mutable_cf_options_list, mems_list, - &atomic_flush_commit_in_progress, nullptr /* logs_prep_tracker */, + Status s = InstallMemtableAtomicFlushResults( + nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, versions_.get(), &mutex_, file_metas, &job_context.memtables_to_free, nullptr /* db_directory */, nullptr /* log_buffer */); ASSERT_OK(s); diff --git a/db/memtable.h b/db/memtable.h index aac09af40..6ce28961e 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -386,13 +386,15 @@ class MemTable { uint64_t GetID() const { return id_; } - SequenceNumber& TEST_AtomicFlushSequenceNumber() { - return atomic_flush_seqno_; - } + void SetFlushCompleted(bool completed) { flush_completed_ = completed; } + + uint64_t GetFileNumber() const { return file_number_; } - void TEST_SetFlushCompleted(bool completed) { flush_completed_ = completed; } + void SetFileNumber(uint64_t file_num) { file_number_ = file_num; } - void TEST_SetFileNumber(uint64_t file_num) { file_number_ = file_num; } + void SetFlushInProgress(bool 
in_progress) { + flush_in_progress_ = in_progress; + } private: enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED }; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 36c0a8f1d..459d392d5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -260,228 +260,6 @@ void MemTableListVersion::TrimHistory(autovector* to_delete) { } } -// Try to record multiple successful flush to the MANIFEST as an atomic unit. -// This function may just return Status::OK if there has already been -// a concurrent thread performing actual recording. -Status MemTableList::TryInstallMemtableFlushResults( - autovector& imm_lists, - const autovector& cfds, - const autovector& mutable_cf_options_list, - const autovector*>& mems_list, - bool* atomic_flush_commit_in_progress, LogsWithPrepTracker* prep_tracker, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_metas, - autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer) { - AutoThreadOperationStageUpdater stage_updater( - ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); - mu->AssertHeld(); - - for (size_t k = 0; k != mems_list.size(); ++k) { - for (size_t i = 0; i != mems_list[k]->size(); ++i) { - assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); - (*mems_list[k])[i]->flush_completed_ = true; - (*mems_list[k])[i]->file_number_ = file_metas[k].fd.GetNumber(); - } - } - - assert(atomic_flush_commit_in_progress != nullptr); - Status s; - if (*atomic_flush_commit_in_progress) { - // If the function reaches here, there must be a concurrent thread that - // have already started recording to MANIFEST. Therefore we should just - // return Status::OK and let the othe thread finish writing to MANIFEST on - // our behalf. - return s; - } - - // If the function reaches here, the current thread will start writing to - // MANIFEST. It may record to MANIFEST the flush results of other flushes. 
- *atomic_flush_commit_in_progress = true; - - auto comp = [&imm_lists](size_t lh, size_t rh) { - const auto& memlist1 = imm_lists[lh]->current_->memlist_; - const auto& memlist2 = imm_lists[rh]->current_->memlist_; - auto it1 = memlist1.rbegin(); - auto it2 = memlist2.rbegin(); - return (*it1)->atomic_flush_seqno_ > (*it2)->atomic_flush_seqno_; - }; - // The top of the heap is the memtable with smallest atomic_flush_seqno_. - std::priority_queue, decltype(comp)> heap(comp); - // Sequence number of the oldest unfinished atomic flush. - SequenceNumber min_unfinished_seqno = kMaxSequenceNumber; - // Populate the heap with first element of each imm iff. it has been - // flushed to storage, i.e. flush_completed_ is true. - size_t num = imm_lists.size(); - assert(num == cfds.size()); - for (size_t i = 0; i != num; ++i) { - std::list& memlist = imm_lists[i]->current_->memlist_; - if (memlist.empty()) { - continue; - } - auto it = memlist.rbegin(); - if ((*it)->flush_completed_) { - heap.emplace(i); - } else if (min_unfinished_seqno > (*it)->atomic_flush_seqno_) { - min_unfinished_seqno = (*it)->atomic_flush_seqno_; - } - } - - while (s.ok() && !heap.empty()) { - autovector batch; - SequenceNumber seqno = kMaxSequenceNumber; - // Pop from the heap the memtables that belong to the same atomic flush, - // namely their atomic_flush_seqno_ are equal. - do { - size_t pos = heap.top(); - const auto& memlist = imm_lists[pos]->current_->memlist_; - MemTable* mem = *(memlist.rbegin()); - if (seqno == kMaxSequenceNumber) { - // First mem in this batch. - seqno = mem->atomic_flush_seqno_; - batch.emplace_back(pos); - heap.pop(); - } else if (mem->atomic_flush_seqno_ == seqno) { - // mem has the same atomic_flush_seqno_, thus in the same atomic flush. - batch.emplace_back(pos); - heap.pop(); - } else if (mem->atomic_flush_seqno_ > seqno) { - // mem belongs to another atomic flush with higher seqno, break the - // loop. 
- break; - } - } while (!heap.empty()); - if (seqno >= min_unfinished_seqno) { - // If there is an older, unfinished atomic flush, then we should not - // proceed. - TEST_SYNC_POINT_CALLBACK( - "MemTableList::TryInstallMemtableFlushResults:" - "HasOlderUnfinishedAtomicFlush:0", - nullptr); - break; - } - - // Found the earliest, complete atomic flush. No earlier atomic flush is - // pending. Therefore ready to record it to the MANIFEST. - uint32_t num_entries = 0; - autovector tmp_cfds; - autovector tmp_mutable_cf_options_list; - std::vector> memtables_to_flush; - autovector> edit_lists; - for (auto pos : batch) { - tmp_cfds.emplace_back(cfds[pos]); - tmp_mutable_cf_options_list.emplace_back(mutable_cf_options_list[pos]); - const auto& memlist = imm_lists[pos]->current_->memlist_; - uint64_t batch_file_number = 0; - autovector tmp_mems; - autovector edits; - for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { - MemTable* m = *it; - if (!m->flush_completed_ || - (it != memlist.rbegin() && m->file_number_ != batch_file_number)) { - break; - } - if (it == memlist.rbegin()) { - batch_file_number = m->file_number_; - edits.push_back(m->GetEdits()); - ++num_entries; - } - tmp_mems.push_back(m); - } - edit_lists.push_back(edits); - memtables_to_flush.push_back(tmp_mems); - } - TEST_SYNC_POINT_CALLBACK( - "MemTableList::TryInstallMemtableFlushResults:FoundBatchToCommit:0", - &num_entries); - - // Mark the version edits as an atomic group - uint32_t remaining = num_entries; - for (auto& edit_list : edit_lists) { - assert(edit_list.size() == 1); - edit_list[0]->MarkAtomicGroup(--remaining); - } - assert(remaining == 0); - - size_t batch_sz = batch.size(); - assert(batch_sz > 0); - assert(batch_sz == memtables_to_flush.size()); - assert(batch_sz == tmp_cfds.size()); - assert(batch_sz == edit_lists.size()); - - if (vset->db_options()->allow_2pc) { - for (size_t i = 0; i != batch_sz; ++i) { - auto& edit_list = edit_lists[i]; - assert(!edit_list.empty()); - 
edit_list.back()->SetMinLogNumberToKeep( - PrecomputeMinLogNumberToKeep(vset, *tmp_cfds[i], edit_list, - memtables_to_flush[i], prep_tracker)); - } - } - // this can release and reacquire the mutex. - s = vset->LogAndApply(tmp_cfds, tmp_mutable_cf_options_list, edit_lists, mu, - db_directory); - - for (const auto pos : batch) { - imm_lists[pos]->InstallNewVersion(); - } - - if (s.ok() || s.IsShutdownInProgress()) { - for (size_t i = 0; i != batch_sz; ++i) { - if (tmp_cfds[i]->IsDropped()) { - continue; - } - size_t pos = batch[i]; - for (auto m : memtables_to_flush[i]) { - assert(m->file_number_ > 0); - uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " done", - tmp_cfds[i]->GetName().c_str(), m->file_number_, - mem_id); - imm_lists[pos]->current_->Remove(m, to_delete); - } - } - } else { - for (size_t i = 0; i != batch_sz; ++i) { - size_t pos = batch[i]; - for (auto m : memtables_to_flush[i]) { - uint64_t mem_id = m->GetID(); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] Level-0 commit table #%" PRIu64 - ": memtable #%" PRIu64 " failed", - tmp_cfds[i]->GetName().c_str(), m->file_number_, - mem_id); - m->flush_completed_ = false; - m->flush_in_progress_ = false; - m->edit_.Clear(); - m->file_number_ = 0; - imm_lists[pos]->num_flush_not_started_++; - } - imm_lists[pos]->imm_flush_needed.store(true, std::memory_order_release); - } - } - // Adjust the heap AFTER installing new MemTableListVersions because the - // compare function 'comp' needs to capture the most up-to-date state of - // imm_lists. 
- for (auto pos : batch) { - const auto& memlist = imm_lists[pos]->current_->memlist_; - if (!memlist.empty()) { - MemTable* mem = *(memlist.rbegin()); - if (mem->flush_completed_) { - heap.emplace(pos); - } else if (min_unfinished_seqno > mem->atomic_flush_seqno_) { - min_unfinished_seqno = mem->atomic_flush_seqno_; - } - } - } - } - - *atomic_flush_commit_in_progress = false; - return s; -} - // Returns true if there is at least one memtable on which flush has // not yet started. bool MemTableList::IsFlushPending() const { @@ -749,4 +527,105 @@ uint64_t MemTableList::PrecomputeMinLogContainingPrepSection( return min_log; } +// Commit a successful atomic flush in the manifest file. +Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_metas, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer) { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); + mu->AssertHeld(); + + size_t num = mems_list.size(); + assert(cfds.size() == num); + if (imm_lists != nullptr) { + assert(imm_lists->size() == num); + } + for (size_t k = 0; k != num; ++k) { +#ifndef NDEBUG + const auto* imm = + (imm_lists == nullptr) ? 
cfds[k]->imm() : imm_lists->at(k); + if (!mems_list[k]->empty()) { + assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID()); + } +#endif + for (size_t i = 0; i != mems_list[k]->size(); ++i) { + assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); + (*mems_list[k])[i]->SetFlushCompleted(true); + (*mems_list[k])[i]->SetFileNumber(file_metas[k].fd.GetNumber()); + } + } + + Status s; + + autovector> edit_lists; + uint32_t num_entries = 0; + for (const auto mems : mems_list) { + assert(mems != nullptr); + autovector edits; + assert(!mems->empty()); + edits.emplace_back((*mems)[0]->GetEdits()); + ++num_entries; + edit_lists.emplace_back(edits); + } + // Mark the version edits as an atomic group + for (auto& edits : edit_lists) { + assert(edits.size() == 1); + edits[0]->MarkAtomicGroup(--num_entries); + } + assert(0 == num_entries); + + // this can release and reacquire the mutex. + s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, + db_directory); + + for (size_t k = 0; k != cfds.size(); ++k) { + auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); + imm->InstallNewVersion(); + } + + if (s.ok() || s.IsShutdownInProgress()) { + for (size_t i = 0; i != cfds.size(); ++i) { + if (cfds[i]->IsDropped()) { + continue; + } + auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + assert(m->GetFileNumber() > 0); + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + imm->current_->Remove(m, to_delete); + } + } + } else { + for (size_t i = 0; i != cfds.size(); ++i) { + auto* imm = (imm_lists == nullptr) ? 
cfds[i]->imm() : imm_lists->at(i); + for (auto m : *mems_list[i]) { + uint64_t mem_id = m->GetID(); + ROCKS_LOG_BUFFER(log_buffer, + "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + cfds[i]->GetName().c_str(), m->GetFileNumber(), + mem_id); + m->SetFlushCompleted(false); + m->SetFlushInProgress(false); + m->GetEdits()->Clear(); + m->SetFileNumber(0); + imm->num_flush_not_started_++; + } + imm->imm_flush_needed.store(true, std::memory_order_release); + } + } + + return s; +} + } // namespace rocksdb diff --git a/db/memtable_list.h b/db/memtable_list.h index 6315167a1..be3f93562 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -31,6 +31,7 @@ class ColumnFamilyData; class InternalKeyComparator; class InstrumentedMutex; class MergeIteratorBuilder; +class MemTableList; // keeps a list of immutable memtables in a vector. the list is immutable // if refcount is bigger than one. It is used as a state for Get() and @@ -114,6 +115,18 @@ class MemTableListVersion { SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const; private: + friend class MemTableList; + + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, InstrumentedMutex* mu, + const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); + // REQUIRE: m is an immutable memtable void Add(MemTable* m, autovector* to_delete); // REQUIRE: m is an immutable memtable @@ -132,8 +145,6 @@ class MemTableListVersion { void UnrefMemTable(autovector* to_delete, MemTable* m); - friend class MemTableList; - // Immutable MemTables that have not yet been flushed. std::list memlist_; @@ -163,18 +174,6 @@ class MemTableListVersion { // write thread.) 
class MemTableList { public: - // Commit a successful atomic flush in the manifest file - static Status TryInstallMemtableFlushResults( - autovector& imm_lists, - const autovector& cfds, - const autovector& mutable_cf_options_list, - const autovector*>& mems_list, - bool* atomic_flush_commit_in_progress, LogsWithPrepTracker* prep_tracker, - VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, - autovector* to_delete, Directory* db_directory, - LogBuffer* log_buffer); - // A list of memtables. explicit MemTableList(int min_write_buffer_number_to_merge, int max_write_buffer_number_to_maintain) @@ -296,6 +295,16 @@ class MemTableList { } private: + friend Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, + VersionSet* vset, InstrumentedMutex* mu, + const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); + // DB mutex held void InstallNewVersion(); @@ -317,4 +326,18 @@ class MemTableList { size_t current_memory_usage_; }; +// Installs memtable atomic flush results. +// In most cases, imm_lists is nullptr, and the function simply uses the +// immutable memtable lists associated with the cfds. There are unit tests that +// installs flush results for external immutable memtable lists other than the +// cfds' own immutable memtable lists, e.g. MemTableLIstTest. In this case, +// imm_lists parameter is not nullptr. 
+extern Status InstallMemtableAtomicFlushResults( + const autovector* imm_lists, + const autovector& cfds, + const autovector& mutable_cf_options_list, + const autovector*>& mems_list, VersionSet* vset, + InstrumentedMutex* mu, const autovector& file_meta, + autovector* to_delete, Directory* db_directory, + LogBuffer* log_buffer); } // namespace rocksdb diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 96032a465..d67eed9fa 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -85,17 +85,46 @@ class MemTableListTest : public testing::Test { Status Mock_InstallMemtableFlushResults( MemTableList* list, const MutableCFOptions& mutable_cf_options, const autovector& m, autovector* to_delete) { - autovector lists; - lists.emplace_back(list); - autovector*> mems_list; - mems_list.emplace_back(&m); - return Mock_InstallMemtableFlushResults( - lists, {0} /* cf_ids */, {&mutable_cf_options}, mems_list, to_delete); + // Create a mock Logger + test::NullLogger logger; + LogBuffer log_buffer(DEBUG_LEVEL, &logger); + + CreateDB(); + // Create a mock VersionSet + DBOptions db_options; + ImmutableDBOptions immutable_db_options(db_options); + EnvOptions env_options; + std::shared_ptr table_cache(NewLRUCache(50000, 16)); + WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size); + WriteController write_controller(10000000u); + + VersionSet versions(dbname, &immutable_db_options, env_options, + table_cache.get(), &write_buffer_manager, + &write_controller); + std::vector cf_descs; + cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); + cf_descs.emplace_back("one", ColumnFamilyOptions()); + cf_descs.emplace_back("two", ColumnFamilyOptions()); + + EXPECT_OK(versions.Recover(cf_descs, false)); + + // Create mock default ColumnFamilyData + auto column_family_set = versions.GetColumnFamilySet(); + LogsWithPrepTracker dummy_prep_tracker; + auto cfd = column_family_set->GetDefault(); + EXPECT_TRUE(nullptr != cfd); + 
uint64_t file_num = file_number.fetch_add(1); + // Create dummy mutex. + InstrumentedMutex mutex; + InstrumentedMutexLock l(&mutex); + return list->TryInstallMemtableFlushResults( + cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex, + file_num, to_delete, nullptr, &log_buffer); } // Calls MemTableList::InstallMemtableFlushResults() and sets up all // structures needed to call this function. - Status Mock_InstallMemtableFlushResults( + Status Mock_InstallMemtableAtomicFlushResults( autovector& lists, const autovector& cf_ids, const autovector& mutable_cf_options_list, const autovector*>& mems_list, @@ -127,25 +156,6 @@ class MemTableListTest : public testing::Test { auto column_family_set = versions.GetColumnFamilySet(); LogsWithPrepTracker dummy_prep_tracker; - if (1 == cf_ids.size()) { - auto cfd = column_family_set->GetColumnFamily(cf_ids[0]); - EXPECT_TRUE(nullptr != cfd); - EXPECT_EQ(1, lists.size()); - MemTableList* list = lists[0]; - EXPECT_EQ(1, mutable_cf_options_list.size()); - const MutableCFOptions& mutable_cf_options = - *(mutable_cf_options_list.at(0)); - const autovector* mems = mems_list.at(0); - EXPECT_TRUE(nullptr != mems); - - uint64_t file_num = file_number.fetch_add(1); - // Create dummy mutex. 
- InstrumentedMutex mutex; - InstrumentedMutexLock l(&mutex); - return list->TryInstallMemtableFlushResults( - cfd, mutable_cf_options, *mems, &dummy_prep_tracker, &versions, - &mutex, file_num, to_delete, nullptr, &log_buffer); - } autovector cfds; for (int i = 0; i != static_cast(cf_ids.size()); ++i) { cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i])); @@ -158,13 +168,11 @@ class MemTableListTest : public testing::Test { meta.fd = FileDescriptor(file_num, 0, 0); file_metas.emplace_back(meta); } - bool atomic_flush_commit_in_progress = false; InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); - return MemTableList::TryInstallMemtableFlushResults( - lists, cfds, mutable_cf_options_list, mems_list, - &atomic_flush_commit_in_progress, &dummy_prep_tracker, &versions, - &mutex, file_metas, to_delete, nullptr, &log_buffer); + return InstallMemtableAtomicFlushResults( + &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, + file_metas, to_delete, nullptr, &log_buffer); } }; @@ -730,18 +738,28 @@ TEST_F(MemTableListTest, FlushPendingTest) { to_delete.clear(); } -TEST_F(MemTableListTest, FlushMultipleCFsTest) { +TEST_F(MemTableListTest, EmptyAtomicFlusTest) { + autovector lists; + autovector cf_ids; + autovector options_list; + autovector*> to_flush; + autovector to_delete; + Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list, + to_flush, &to_delete); + ASSERT_OK(s); + ASSERT_TRUE(to_delete.empty()); +} + +TEST_F(MemTableListTest, AtomicFlusTest) { const int num_cfs = 3; - const int num_tables_per_cf = 5; + const int num_tables_per_cf = 2; SequenceNumber seq = 1; - Status s; auto factory = std::make_shared(); options.memtable_factory = factory; ImmutableCFOptions ioptions(options); InternalKeyComparator cmp(BytewiseComparator()); WriteBufferManager wb(options.db_write_buffer_size); - autovector to_delete; // Create MemTableLists int min_write_buffer_number_to_merge = 3; @@ -782,135 +800,72 @@ 
TEST_F(MemTableListTest, FlushMultipleCFsTest) { std::vector> flush_candidates(num_cfs); // Nothing to flush - for (int i = 0; i != num_cfs; ++i) { - auto list = lists[i]; + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); list->PickMemtablesToFlush(nullptr /* memtable_id */, &flush_candidates[i]); - ASSERT_EQ(0, static_cast(flush_candidates[i].size())); + ASSERT_EQ(0, flush_candidates[i].size()); } - // Request flush even though there is nothing to flush - for (int i = 0; i != num_cfs; ++i) { - auto list = lists[i]; + for (auto i = 0; i != num_cfs; ++i) { + auto* list = lists[i]; list->FlushRequested(); ASSERT_FALSE(list->IsFlushPending()); ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire)); } - - // Add tables to column families - for (int i = 0; i != num_cfs; ++i) { - for (int j = 0; j != num_tables_per_cf; ++j) { + autovector to_delete; + // Add tables to the immutable memtalbe lists associated with column families + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { lists[i]->Add(tables[i][j], &to_delete); } ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed()); ASSERT_TRUE(lists[i]->IsFlushPending()); ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); } - + std::vector flush_memtable_ids = {1, 1, 0}; + // +----+ + // list[0]: |0 1| + // list[1]: |0 1| + // | +--+ + // list[2]: |0| 1 + // +-+ + // Pick memtables to flush + for (auto i = 0; i != num_cfs; ++i) { + flush_candidates[i].clear(); + lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], + &flush_candidates[i]); + ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, + static_cast(flush_candidates[i].size())); + } + autovector tmp_lists; + autovector tmp_cf_ids; + autovector tmp_options_list; autovector*> to_flush; - std::vector prev_memtable_ids; - // For each column family, determine the memtables to flush - for 
(int k = 0; k != 4; ++k) { - std::vector flush_memtable_ids; - if (0 == k) { - // +----+ - // list[0]: |0 1| 2 3 4 - // list[1]: |0 1| 2 3 4 - // | +--+ - // list[2]: |0| 1 2 3 4 - // +-+ - flush_memtable_ids = {1, 1, 0}; - } else if (1 == k) { - // +----+ +---+ - // list[0]: |0 1| |2 3| 4 - // list[1]: |0 1| |2 3| 4 - // | +--+ +---+ - // list[2]: |0| 1 2 3 4 - // +-+ - flush_memtable_ids = {3, 3, 0}; - } else if (2 == k) { - // +-----+ +---+ - // list[0]: |0 1| |2 3| 4 - // list[1]: |0 1| |2 3| 4 - // | +---+ +---+ - // | | +-------+ - // list[2]: |0| |1 2 3| 4 - // +-+ +-------+ - flush_memtable_ids = {3, 3, 3}; - } else { - // +-----+ +---+ +-+ - // list[0]: |0 1| |2 3| |4| - // list[1]: |0 1| |2 3| |4| - // | +---+ +---+ | | - // | | +-------+ | | - // list[2]: |0| |1 2 3| |4| - // +-+ +-------+ +-+ - flush_memtable_ids = {4, 4, 4}; - } - assert(num_cfs == static_cast(flush_memtable_ids.size())); - - // Pick memtables to flush - for (int i = 0; i != num_cfs; ++i) { - flush_candidates[i].clear(); - lists[i]->PickMemtablesToFlush(&flush_memtable_ids[i], - &flush_candidates[i]); - for (auto mem : flush_candidates[i]) { - mem->TEST_AtomicFlushSequenceNumber() = SequenceNumber(k); - } - if (prev_memtable_ids.empty()) { - ASSERT_EQ(flush_memtable_ids[i] - 0 + 1, flush_candidates[i].size()); - } else { - ASSERT_EQ(flush_memtable_ids[i] - prev_memtable_ids[i], - flush_candidates[i].size()); - } - ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed()); - ASSERT_FALSE(lists[i]->HasFlushRequested()); - if (flush_memtable_ids[i] == num_tables_per_cf - 1) { - ASSERT_FALSE( - lists[i]->imm_flush_needed.load(std::memory_order_acquire)); - } else { - ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); - } - } - prev_memtable_ids = flush_memtable_ids; - - if (k < 3) { - for (const auto& mems : flush_candidates) { - uint64_t file_num = file_number.fetch_add(1); - for (auto m : mems) { - m->TEST_SetFlushCompleted(true); - m->TEST_SetFileNumber(file_num); - 
} - } - } - - if (k == 0) { - // Rollback first pick of tables - for (int i = 0; i != num_cfs; ++i) { - auto list = lists[i]; - const auto& mems = flush_candidates[i]; - for (auto m : mems) { - m->TEST_SetFileNumber(0); - } - list->RollbackMemtableFlush(flush_candidates[i], 0); - ASSERT_TRUE(list->IsFlushPending()); - ASSERT_TRUE(list->imm_flush_needed.load(std::memory_order_acquire)); - } - prev_memtable_ids.clear(); + for (auto i = 0; i != num_cfs; ++i) { + if (!flush_candidates[i].empty()) { + to_flush.push_back(&flush_candidates[i]); + tmp_lists.push_back(lists[i]); + tmp_cf_ids.push_back(i); + tmp_options_list.push_back(mutable_cf_options_list[i]); } + } + Status s = Mock_InstallMemtableAtomicFlushResults( + tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete); + ASSERT_OK(s); - if (k == 3) { - for (int i = 0; i != num_cfs; ++i) { - to_flush.emplace_back(&flush_candidates[i]); + for (auto i = 0; i != num_cfs; ++i) { + for (auto j = 0; j != num_tables_per_cf; ++j) { + if (static_cast(j) <= flush_memtable_ids[i]) { + ASSERT_LT(0, tables[i][j]->GetFileNumber()); } } + ASSERT_EQ( + static_cast(num_tables_per_cf) - flush_candidates[i].size(), + lists[i]->NumNotFlushed()); } - s = Mock_InstallMemtableFlushResults(lists, cf_ids, mutable_cf_options_list, - to_flush, &to_delete); - ASSERT_OK(s); - to_delete.clear(); for (auto list : lists) { list->current()->Unref(&to_delete); @@ -932,126 +887,6 @@ TEST_F(MemTableListTest, FlushMultipleCFsTest) { ASSERT_EQ(m, m->Unref()); delete m; } - to_delete.clear(); -} - -TEST_F(MemTableListTest, HasOlderAtomicFlush) { - const size_t num_cfs = 3; - const size_t num_memtables_per_cf = 2; - SequenceNumber seq = 1; - Status s; - - auto factory = std::make_shared(); - options.memtable_factory = factory; - ImmutableCFOptions ioptions(options); - InternalKeyComparator cmp(BytewiseComparator()); - WriteBufferManager wb(options.db_write_buffer_size); - autovector to_delete; - - // Create MemTableLists - int 
min_write_buffer_number_to_merge = 3; - int max_write_buffer_number_to_maintain = 7; - autovector lists; - for (size_t i = 0; i != num_cfs; ++i) { - lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge, - max_write_buffer_number_to_maintain)); - } - - autovector cf_ids; - std::vector> tables; - autovector mutable_cf_options_list; - uint32_t cf_id = 0; - for (size_t k = 0; k != num_cfs; ++k) { - std::vector elem; - mutable_cf_options_list.emplace_back(new MutableCFOptions(options)); - uint64_t memtable_id = 0; - for (int i = 0; i != num_memtables_per_cf; ++i) { - MemTable* mem = - new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb, - kMaxSequenceNumber, cf_id); - mem->SetID(memtable_id++); - mem->Ref(); - - std::string value; - - mem->Add(++seq, kTypeValue, "key1", ToString(i)); - mem->Add(++seq, kTypeValue, "keyN" + ToString(i), "valueN"); - mem->Add(++seq, kTypeValue, "keyX" + ToString(i), "value"); - mem->Add(++seq, kTypeValue, "keyM" + ToString(i), "valueM"); - mem->Add(++seq, kTypeDeletion, "keyX" + ToString(i), ""); - - elem.push_back(mem); - } - tables.emplace_back(elem); - cf_ids.push_back(cf_id++); - } - - // Add tables to column families' immutable memtable lists - for (size_t i = 0; i != num_cfs; ++i) { - for (size_t j = 0; j != num_memtables_per_cf; ++j) { - lists[i]->Add(tables[i][j], &to_delete); - } - lists[i]->FlushRequested(); - ASSERT_EQ(num_memtables_per_cf, lists[i]->NumNotFlushed()); - ASSERT_TRUE(lists[i]->IsFlushPending()); - ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire)); - } - std::vector> flush_candidates(num_cfs); - for (size_t i = 0; i != num_cfs; ++i) { - lists[i]->PickMemtablesToFlush(nullptr, &flush_candidates[i]); - for (auto m : flush_candidates[i]) { - m->TEST_AtomicFlushSequenceNumber() = 123; - } - lists[i]->RollbackMemtableFlush(flush_candidates[i], 0); - } - uint64_t memtable_id = num_memtables_per_cf - 1; - autovector other_flush_candidates; - 
lists[0]->PickMemtablesToFlush(&memtable_id, &other_flush_candidates); - for (auto m : other_flush_candidates) { - m->TEST_AtomicFlushSequenceNumber() = 124; - m->TEST_SetFlushCompleted(true); - m->TEST_SetFileNumber(1); - } - autovector*> to_flush; - to_flush.emplace_back(&other_flush_candidates); - bool has_older_unfinished_atomic_flush = false; - bool found_batch_to_commit = false; - - SyncPoint::GetInstance()->SetCallBack( - "MemTableList::TryInstallMemtableFlushResults:" - "HasOlderUnfinishedAtomicFlush:0", - [&](void* /*arg*/) { has_older_unfinished_atomic_flush = true; }); - SyncPoint::GetInstance()->SetCallBack( - "MemTableList::TryInstallMemtableFlushResults:FoundBatchToCommit:0", - [&](void* /*arg*/) { found_batch_to_commit = true; }); - SyncPoint::GetInstance()->EnableProcessing(); - - s = Mock_InstallMemtableFlushResults(lists, cf_ids, mutable_cf_options_list, - to_flush, &to_delete); - ASSERT_OK(s); - ASSERT_TRUE(has_older_unfinished_atomic_flush); - ASSERT_FALSE(found_batch_to_commit); - - SyncPoint::GetInstance()->ClearAllCallBacks(); - - ASSERT_TRUE(to_delete.empty()); - for (auto list : lists) { - list->current()->Unref(&to_delete); - delete list; - } - lists.clear(); - ASSERT_EQ(num_cfs * num_memtables_per_cf, to_delete.size()); - for (auto m : to_delete) { - m->Ref(); - ASSERT_EQ(m, m->Unref()); - delete m; - } - to_delete.clear(); - for (auto& opts : mutable_cf_options_list) { - delete opts; - opts = nullptr; - } - mutable_cf_options_list.clear(); } } // namespace rocksdb diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 8b478ceb0..0379bd58a 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1019,7 +1019,9 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { auto cfd_to_drop = versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name); ASSERT_NE(nullptr, cfd_to_drop); - cfd_to_drop->Ref(); // Increase its refcount because cfd_to_drop is used later + // Increase its refcount 
because cfd_to_drop is used later, and we need to + // prevent it from being deleted. + cfd_to_drop->Ref(); drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID()); mutex_.Lock(); s = versions_->LogAndApply(cfd_to_drop, From 97773d0967c9eb3140280b621b21b11192bc8ae2 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Mon, 7 Jan 2019 10:18:58 -0800 Subject: [PATCH 10/57] Update HISTORY.md --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index bf5bb12a0..822b919ef 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -11,7 +11,6 @@ * Introduced `MemoryAllocator`, which lets the user specify custom memory allocator for block based table. * Improved `DeleteRange` to prevent read performance degradation. The feature is no longer marked as experimental. * Enabled checkpoint on readonly db (DBImplReadOnly). -* Make DB ignore dropped column families while committing results of atomic flush. ### Public API Change * `DBOptions::use_direct_reads` now affects reads issued by `BackupEngine` on the database's SSTs. @@ -29,6 +28,7 @@ * Start populating `NO_FILE_CLOSES` ticker statistic, which was always zero previously. * The default value of NewBloomFilterPolicy()'s argument use_block_based_builder is changed to false. Note that this new default may cause large temp memory usage when building very large SST files. * Fix a deadlock caused by compaction and file ingestion waiting for each other in the event of write stalls. +* Make DB ignore dropped column families while committing results of atomic flush. ## 5.17.0 (10/05/2018) ### Public API Change From e78f5cfba73b50c4d7fdeb1a93391e6f85d4c248 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Fri, 4 Jan 2019 11:21:32 -0800 Subject: [PATCH 11/57] Fix point lookup on range tombstone sentinel endpoint (#4829) Summary: Previously for point lookup we decided which file to look into based on user key overlap only. We also did not truncate range tombstones in the point lookup code path. 
These two ideas did not interact well in cases like this: - L1 has range tombstone [a, c)#1 and point key b#2. The data is split between file1 with range [a#1,1, b#72057594037927935,15], and file2 with range [b#2, c#1]. - L1's file2 gets compacted to L2. - User issues `Get()` for b#3. - L1's file1 is opened and the range tombstone [a, c)#1 is found for b, while no point-key for b is found in L1. - `Get()` assumes that the range tombstone must cover all data in that range in lower levels, so short circuits and returns `NotFound`. The solution to this problem is to not look into files that only overlap with the point lookup at a range tombstone sentinel endpoint. In the above example, this would mean not opening L1's file1 or its tombstones during the `Get()`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4829 Differential Revision: D13561355 Pulled By: ajkr fbshipit-source-id: a13c21c816870a2f5d32a48af6dbd719a7d9d19f --- db/db_range_del_test.cc | 5 +++++ db/version_set.cc | 19 +++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index 508149a34..73bc4d275 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -1041,11 +1041,16 @@ TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) { // L2: // [key000000#1,1, key000000#1,1] // [key000002#6,1, key000004#72057594037927935,15] + // + // At the same time, verify the compaction does not cause the key at the + // endpoint (key000002#6,1) to disappear. 
+ ASSERT_EQ(value, Get(Key(2))); auto begin_str = Key(3); const rocksdb::Slice begin = begin_str; dbfull()->TEST_CompactRange(1, &begin, nullptr); ASSERT_EQ(1, NumTableFilesAtLevel(1)); ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_EQ(value, Get(Key(2))); } { diff --git a/db/version_set.cc b/db/version_set.cc index 2d6997601..5aa2fac6c 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -301,17 +301,28 @@ class FilePicker { // On Level-n (n>=1), files are sorted. Binary search to find the // earliest file whose largest key >= ikey. Search left bound and // right bound are used to narrow the range. - if (search_left_bound_ == search_right_bound_) { - start_index = search_left_bound_; - } else if (search_left_bound_ < search_right_bound_) { + if (search_left_bound_ <= search_right_bound_) { if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { search_right_bound_ = static_cast(curr_file_level_->num_files) - 1; } + // `search_right_bound_` is an inclusive upper-bound, but since it was + // determined based on user key, it is still possible the lookup key + // falls to the right of `search_right_bound_`'s corresponding file. + // So, pass a limit one higher, which allows us to detect this case. start_index = FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, static_cast(search_left_bound_), - static_cast(search_right_bound_)); + static_cast(search_right_bound_) + 1); + if (start_index == search_right_bound_ + 1) { + // `ikey_` comes after `search_right_bound_`. The lookup key does + // not exist on this level, so let's skip this level and do a full + // binary search on the next level. + search_left_bound_ = 0; + search_right_bound_ = FileIndexer::kLevelMaxIndex; + curr_level_++; + continue; + } } else { // search_left_bound > search_right_bound, key does not exist in // this level. 
Since no comparison is done in this level, it will From 3bcc31295a3e434e29c9af93f80fa16421464edd Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 9 Jan 2019 15:51:02 -0800 Subject: [PATCH 12/57] Initialize two members in PerfContext (#4859) Summary: as titled. Currently it's possible to create a local object of type PerfContext since it's part of public API. Then it's safe to initialize the two members to 0. If PerfContext is created as thread-local object, then all members are zero-initialized according to C++ standard. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4859 Differential Revision: D13614504 Pulled By: riversand963 fbshipit-source-id: 406ff548e105a074f379ad1054d56fece5f524a0 --- include/rocksdb/perf_context.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 1b11fa3cc..3f125c213 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -201,8 +201,8 @@ struct PerfContext { uint64_t env_lock_file_nanos; uint64_t env_unlock_file_nanos; uint64_t env_new_logger_nanos; - std::map* level_to_perf_context; - bool per_level_perf_context_enabled; + std::map* level_to_perf_context = nullptr; + bool per_level_perf_context_enabled = false; }; // Get Thread-local PerfContext object pointer From 4eeb1bf0a66033aab23ba2675aea38b44e19252b Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 9 Jan 2019 16:15:59 -0800 Subject: [PATCH 13/57] Bump version to 5.18.1 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 89802521d..fbf98bde7 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 5 #define ROCKSDB_MINOR 18 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. 
Now we have to live with our choice. We'll deprecate these From 9ae0528dc481add34ce6f4a072e51b827710bde2 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 16 Jan 2019 09:48:01 -0800 Subject: [PATCH 14/57] Use chrono::time_point instead of time_t (#4868) Summary: By convention, time_t almost always stores the integral number of seconds since 00:00 hours, Jan 1, 1970 UTC, according to http://www.cplusplus.com/reference/ctime/time_t/. We surely want more precision than seconds. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4868 Differential Revision: D13633046 Pulled By: riversand963 fbshipit-source-id: 4e01e23a22e8838023c51a91247a286dbf3a5396 --- HISTORY.md | 4 ++++ db/listener_test.cc | 9 +++++++++ include/rocksdb/listener.h | 12 +++++++++--- util/file_reader_writer.cc | 34 ++++++++++++++++++---------------- util/file_reader_writer.h | 20 ++++++++------------ 5 files changed, 48 insertions(+), 31 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 822b919ef..9040717fc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## 5.18.2 (01/23/2019) +### Public API Change +* Change time resolution in FileOperationInfo. + ## 5.18.0 (11/30/2018) ### New Features * Introduced `JemallocNodumpAllocator` memory allocator. When being use, block cache will be excluded from core dump. 
diff --git a/db/listener_test.cc b/db/listener_test.cc index cbbffc8cb..894769d88 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -905,6 +905,7 @@ class TestFileOperationListener : public EventListener { if (info.status.ok()) { ++file_reads_success_; } + ReportDuration(info); } void OnFileWriteFinish(const FileOperationInfo& info) override { @@ -912,6 +913,7 @@ class TestFileOperationListener : public EventListener { if (info.status.ok()) { ++file_writes_success_; } + ReportDuration(info); } bool ShouldBeNotifiedOnFileIO() override { return true; } @@ -920,6 +922,13 @@ class TestFileOperationListener : public EventListener { std::atomic file_reads_success_; std::atomic file_writes_; std::atomic file_writes_success_; + + private: + void ReportDuration(const FileOperationInfo& info) const { + auto duration = std::chrono::duration_cast( + info.finish_timestamp - info.start_timestamp); + ASSERT_GT(duration.count(), 0); + } }; TEST_F(EventListenerTest, OnFileOperationTest) { diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 8ceb2ed4c..9b4e8a866 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -144,13 +145,18 @@ struct TableFileDeletionInfo { }; struct FileOperationInfo { + using TimePoint = std::chrono::time_point; + const std::string& path; uint64_t offset; size_t length; - time_t start_timestamp; - time_t finish_timestamp; + const TimePoint& start_timestamp; + const TimePoint& finish_timestamp; Status status; - FileOperationInfo(const std::string& _path) : path(_path) {} + FileOperationInfo(const std::string& _path, const TimePoint& start, + const TimePoint& finish) + : path(_path), start_timestamp(start), finish_timestamp(finish) {} }; struct FlushJobInfo { diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 821d657b0..9e40d4d40 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -99,17 
+99,18 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, } Slice tmp; - time_t start_ts = 0; + FileOperationInfo::TimePoint start_ts; uint64_t orig_offset = 0; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::to_time_t( - std::chrono::system_clock::now()); + start_ts = std::chrono::system_clock::now(); orig_offset = aligned_offset + buf.CurrentSize(); } s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, &tmp, buf.Destination()); if (ShouldNotifyListeners()) { - NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, s); + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(orig_offset, tmp.size(), start_ts, finish_ts, + s); } buf.Size(buf.CurrentSize() + tmp.size()); @@ -145,16 +146,17 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, Slice tmp_result; #ifndef ROCKSDB_LITE - time_t start_ts = 0; + FileOperationInfo::TimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::to_time_t( - std::chrono::system_clock::now()); + start_ts = std::chrono::system_clock::now(); } #endif s = file_->Read(offset + pos, allowed, &tmp_result, scratch + pos); #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, s); + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, + finish_ts, s); } #endif @@ -442,18 +444,18 @@ Status WritableFileWriter::WriteBuffered(const char* data, size_t size) { TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); #ifndef ROCKSDB_LITE - time_t start_ts = 0; + FileOperationInfo::TimePoint start_ts; uint64_t old_size = writable_file_->GetFileSize(); if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::to_time_t( - std::chrono::system_clock::now()); + start_ts = std::chrono::system_clock::now(); old_size = next_write_offset_; } #endif s = 
writable_file_->Append(Slice(src, allowed)); #ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { - NotifyOnFileWriteFinish(old_size, allowed, start_ts, s); + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); } #endif if (!s.ok()) { @@ -518,15 +520,15 @@ Status WritableFileWriter::WriteDirect() { { IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); - time_t start_ts(0); + FileOperationInfo::TimePoint start_ts; if (ShouldNotifyListeners()) { - start_ts = std::chrono::system_clock::to_time_t( - std::chrono::system_clock::now()); + start_ts = std::chrono::system_clock::now(); } // direct writes must be positional s = writable_file_->PositionedAppend(Slice(src, size), write_offset); if (ShouldNotifyListeners()) { - NotifyOnFileWriteFinish(write_offset, size, start_ts, s); + auto finish_ts = std::chrono::system_clock::now(); + NotifyOnFileWriteFinish(write_offset, size, start_ts, finish_ts, s); } if (!s.ok()) { buf_.Size(file_advance + leftover_tail); diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index ec7acebcc..1083c685c 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -64,15 +64,13 @@ class SequentialFileReader { class RandomAccessFileReader { private: #ifndef ROCKSDB_LITE - void NotifyOnFileReadFinish(uint64_t offset, size_t length, time_t start_ts, + void NotifyOnFileReadFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, const Status& status) const { - FileOperationInfo info(file_name_); + FileOperationInfo info(file_name_, start_ts, finish_ts); info.offset = offset; info.length = length; - info.start_timestamp = start_ts; - time_t finish_ts = - std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - info.finish_timestamp = finish_ts; info.status = status; for (auto& listener : listeners_) { @@ -157,15 
+155,13 @@ class RandomAccessFileReader { class WritableFileWriter { private: #ifndef ROCKSDB_LITE - void NotifyOnFileWriteFinish(uint64_t offset, size_t length, time_t start_ts, + void NotifyOnFileWriteFinish(uint64_t offset, size_t length, + const FileOperationInfo::TimePoint& start_ts, + const FileOperationInfo::TimePoint& finish_ts, const Status& status) { - FileOperationInfo info(file_name_); + FileOperationInfo info(file_name_, start_ts, finish_ts); info.offset = offset; info.length = length; - info.start_timestamp = start_ts; - time_t finish_ts = - std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); - info.finish_timestamp = finish_ts; info.status = status; for (auto& listener : listeners_) { From 35c05bca0f5de46f9151dcf5db7dc5924dd581b4 Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Tue, 22 Jan 2019 16:57:40 -0800 Subject: [PATCH 15/57] Deleting Blob files also goes through SstFileManager (#4904) Summary: Right now, deleting blob files is not rate limited, even if SstFileManger is specified. On the other hand, rate limiting blob deletion is not supported. With this change, Blob file deletion will go through SstFileManager too. Pull Request resolved: https://github.com/facebook/rocksdb/pull/4904 Differential Revision: D13772545 Pulled By: siying fbshipit-source-id: bd1b1d0beb26d5167385e00b7ecb8b94b879de84 --- HISTORY.md | 1 + util/sst_file_manager_impl.cc | 1 + utilities/blob_db/blob_db_impl.cc | 6 ++-- utilities/blob_db/blob_db_test.cc | 47 +++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9040717fc..e81b3f30f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## 5.18.2 (01/23/2019) ### Public API Change * Change time resolution in FileOperationInfo. +* Deleting Blob files also go through SStFileManager. 
## 5.18.0 (11/30/2018) ### New Features diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index ee1394bc9..0b46b24b1 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -403,6 +403,7 @@ bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { Status SstFileManagerImpl::ScheduleFileDeletion( const std::string& file_path, const std::string& path_to_sync) { + TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion"); return delete_scheduler_.DeleteFile(file_path, path_to_sync); } diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index bdec65462..a3a93365c 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -26,6 +26,7 @@ #include "util/cast_util.h" #include "util/crc32c.h" #include "util/file_reader_writer.h" +#include "util/file_util.h" #include "util/filename.h" #include "util/logging.h" #include "util/mutexlock.h" @@ -1745,7 +1746,8 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { bfile->PathName().c_str()); blob_files_.erase(bfile->BlobFileNumber()); - Status s = env_->DeleteFile(bfile->PathName()); + Status s = DeleteSSTFile(&(db_impl_->immutable_db_options()), + bfile->PathName(), blob_dir_); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File failed to be deleted as obsolete %s", @@ -1835,7 +1837,7 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == kBlobFile) { - Status del = env->DeleteFile(blobdir + "/" + f); + Status del = DeleteSSTFile(&soptions, blobdir + "/" + f, blobdir); if (status.ok() && !del.ok()) { status = del; } diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 1c1867e4e..d9cca123e 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -18,6 +18,7 @@ #include "util/cast_util.h" #include 
"util/fault_injection_test_env.h" #include "util/random.h" +#include "util/sst_file_manager_impl.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" @@ -762,6 +763,52 @@ TEST_F(BlobDBTest, ReadWhileGC) { } } +TEST_F(BlobDBTest, SstFileManager) { + // run the same test for Get(), MultiGet() and Iterator each. + std::shared_ptr sst_file_manager( + NewSstFileManager(mock_env_.get())); + sst_file_manager->SetDeleteRateBytesPerSecond(1); + SstFileManagerImpl *sfm = + static_cast(sst_file_manager.get()); + + BlobDBOptions bdb_options; + bdb_options.min_blob_size = 0; + Options db_options; + + int files_deleted_directly = 0; + int files_scheduled_to_delete = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::ScheduleFileDeletion", + [&](void * /*arg*/) { files_scheduled_to_delete++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteFile", + [&](void * /*arg*/) { files_deleted_directly++; }); + SyncPoint::GetInstance()->EnableProcessing(); + db_options.sst_file_manager = sst_file_manager; + + Open(bdb_options, db_options); + + // Create one obselete file and clean it. + blob_db_->Put(WriteOptions(), "foo", "bar"); + auto blob_files = blob_db_impl()->TEST_GetBlobFiles(); + ASSERT_EQ(1, blob_files.size()); + std::shared_ptr bfile = blob_files[0]; + ASSERT_OK(blob_db_impl()->TEST_CloseBlobFile(bfile)); + GCStats gc_stats; + ASSERT_OK(blob_db_impl()->TEST_GCFileAndUpdateLSM(bfile, &gc_stats)); + blob_db_impl()->TEST_DeleteObsoleteFiles(); + + // Even if SSTFileManager is not set, DB is creating a dummy one. + ASSERT_EQ(1, files_scheduled_to_delete); + ASSERT_EQ(0, files_deleted_directly); + Destroy(); + // Make sure that DestroyBlobDB() also goes through delete scheduler. 
+ ASSERT_GE(2, files_scheduled_to_delete); + ASSERT_EQ(0, files_deleted_directly); + SyncPoint::GetInstance()->DisableProcessing(); + sfm->WaitForEmptyTrash(); +} + TEST_F(BlobDBTest, SnapshotAndGarbageCollection) { BlobDBOptions bdb_options; bdb_options.min_blob_size = 0; From 53f760b8a85ca23eedde96d98e8114e0562b6544 Mon Sep 17 00:00:00 2001 From: anand76 Date: Tue, 29 Jan 2019 14:27:30 -0800 Subject: [PATCH 16/57] Always delete Blob DB files in the background (#4928) Summary: Blob DB files are not tracked by the SFM, so they currently don't get deleted in the background. Force them to be deleted in background so rate limiting can be applied Pull Request resolved: https://github.com/facebook/rocksdb/pull/4928 Differential Revision: D13854649 Pulled By: anand1976 fbshipit-source-id: 8031ce66842ff0af440c715d886b377983dad7d8 --- util/delete_scheduler.cc | 7 ++++--- util/delete_scheduler.h | 7 +++++-- util/file_util.cc | 9 ++++++++- util/file_util.h | 5 +++++ util/sst_file_manager_impl.cc | 6 ++++-- util/sst_file_manager_impl.h | 7 +++++-- utilities/blob_db/blob_db_impl.cc | 6 +++--- 7 files changed, 34 insertions(+), 13 deletions(-) diff --git a/util/delete_scheduler.cc b/util/delete_scheduler.cc index a8078b94a..f5ee28448 100644 --- a/util/delete_scheduler.cc +++ b/util/delete_scheduler.cc @@ -52,11 +52,12 @@ DeleteScheduler::~DeleteScheduler() { } Status DeleteScheduler::DeleteFile(const std::string& file_path, - const std::string& dir_to_sync) { + const std::string& dir_to_sync, + const bool force_bg) { Status s; - if (rate_bytes_per_sec_.load() <= 0 || + if (rate_bytes_per_sec_.load() <= 0 || (!force_bg && total_trash_size_.load() > - sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load()) { + sst_file_manager_->GetTotalSize() * max_trash_db_ratio_.load())) { // Rate limiting is disabled or trash size makes up more than // max_trash_db_ratio_ (default 25%) of the total DB size TEST_SYNC_POINT("DeleteScheduler::DeleteFile"); diff --git 
a/util/delete_scheduler.h b/util/delete_scheduler.h index cbd13ecef..29b70517b 100644 --- a/util/delete_scheduler.h +++ b/util/delete_scheduler.h @@ -46,8 +46,11 @@ class DeleteScheduler { rate_bytes_per_sec_.store(bytes_per_sec); } - // Mark file as trash directory and schedule it's deletion - Status DeleteFile(const std::string& fname, const std::string& dir_to_sync); + // Mark file as trash directory and schedule it's deletion. If force_bg is + // set, it forces the file to always be deleted in the background thread, + // except when rate limiting is disabled + Status DeleteFile(const std::string& fname, const std::string& dir_to_sync, + const bool force_bg = false); // Wait for all files being deleteing in the background to finish or for // destructor to be called. diff --git a/util/file_util.cc b/util/file_util.cc index bf56592ef..3f730f3e8 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -89,16 +89,23 @@ Status CreateFile(Env* env, const std::string& destination, Status DeleteSSTFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& dir_to_sync) { + return DeleteDBFile(db_options, fname, dir_to_sync, false); +} + +Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, const std::string& dir_to_sync, + const bool force_bg) { #ifndef ROCKSDB_LITE auto sfm = static_cast(db_options->sst_file_manager.get()); if (sfm) { - return sfm->ScheduleFileDeletion(fname, dir_to_sync); + return sfm->ScheduleFileDeletion(fname, dir_to_sync, force_bg); } else { return db_options->env->DeleteFile(fname); } #else (void)dir_to_sync; + (void)force_bg; // SstFileManager is not supported in ROCKSDB_LITE return db_options->env->DeleteFile(fname); #endif diff --git a/util/file_util.h b/util/file_util.h index 5c05c9def..cd054518e 100644 --- a/util/file_util.h +++ b/util/file_util.h @@ -25,4 +25,9 @@ extern Status DeleteSSTFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& 
path_to_sync); +extern Status DeleteDBFile(const ImmutableDBOptions* db_options, + const std::string& fname, + const std::string& path_to_sync, + const bool force_bg); + } // namespace rocksdb diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc index 0b46b24b1..733cd9cf6 100644 --- a/util/sst_file_manager_impl.cc +++ b/util/sst_file_manager_impl.cc @@ -402,9 +402,11 @@ bool SstFileManagerImpl::CancelErrorRecovery(ErrorHandler* handler) { } Status SstFileManagerImpl::ScheduleFileDeletion( - const std::string& file_path, const std::string& path_to_sync) { + const std::string& file_path, const std::string& path_to_sync, + const bool force_bg) { TEST_SYNC_POINT("SstFileManagerImpl::ScheduleFileDeletion"); - return delete_scheduler_.DeleteFile(file_path, path_to_sync); + return delete_scheduler_.DeleteFile(file_path, path_to_sync, + force_bg); } void SstFileManagerImpl::WaitForEmptyTrash() { diff --git a/util/sst_file_manager_impl.h b/util/sst_file_manager_impl.h index d11035df8..211b4fa71 100644 --- a/util/sst_file_manager_impl.h +++ b/util/sst_file_manager_impl.h @@ -111,9 +111,12 @@ class SstFileManagerImpl : public SstFileManager { // not guaranteed bool CancelErrorRecovery(ErrorHandler* db); - // Mark file as trash and schedule it's deletion. + // Mark file as trash and schedule it's deletion. If force_bg is set, it + // forces the file to be deleting in the background regardless of DB size, + // except when rate limited delete is disabled virtual Status ScheduleFileDeletion(const std::string& file_path, - const std::string& dir_to_sync); + const std::string& dir_to_sync, + const bool force_bg = false); // Wait for all files being deleteing in the background to finish or for // destructor to be called. 
diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index a3a93365c..bf46bf6b1 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1746,8 +1746,8 @@ std::pair BlobDBImpl::DeleteObsoleteFiles(bool aborted) { bfile->PathName().c_str()); blob_files_.erase(bfile->BlobFileNumber()); - Status s = DeleteSSTFile(&(db_impl_->immutable_db_options()), - bfile->PathName(), blob_dir_); + Status s = DeleteDBFile(&(db_impl_->immutable_db_options()), + bfile->PathName(), blob_dir_, true); if (!s.ok()) { ROCKS_LOG_ERROR(db_options_.info_log, "File failed to be deleted as obsolete %s", @@ -1837,7 +1837,7 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == kBlobFile) { - Status del = DeleteSSTFile(&soptions, blobdir + "/" + f, blobdir); + Status del = DeleteDBFile(&soptions, blobdir + "/" + f, blobdir, true); if (status.ok() && !del.ok()) { status = del; } From acba14b3d903bed94a3345c471bd62b282c84f33 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 11 Jan 2019 17:40:44 -0800 Subject: [PATCH 17/57] Make a copy of MutableCFOptions to avoid race condition (#4876) Summary: If we do not do this, then reading MutableCFOptions may have a race condition with SetOptions which modifies MutableCFOptions. Also reserve space in advance for vectors to avoid reallocation changing the address of its elements. 
Test plan ``` $make clean && make -j32 all check $make clean && COMPILE_WITH_TSAN=1 make -j32 all check $make clean && COMPILE_WITH_ASAN=1 make -j32 all check ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/4876 Differential Revision: D13644500 Pulled By: riversand963 fbshipit-source-id: 4b8112c5c819d5a2922bb61ad1521b3d2fb2fd47 --- db/db_impl_compaction_flush.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index f6a3ffd1d..5847f05dd 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -222,8 +222,7 @@ Status DBImpl::FlushMemTablesToOutputFiles( Status status; for (auto& arg : bg_flush_args) { ColumnFamilyData* cfd = arg.cfd_; - const MutableCFOptions& mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); + MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); SuperVersionContext* superversion_context = arg.superversion_context_; Status s = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, job_context, superversion_context, @@ -276,7 +275,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } autovector distinct_output_dirs; std::vector jobs; + std::vector all_mutable_cf_options; int num_cfs = static_cast(cfds.size()); + all_mutable_cf_options.reserve(num_cfs); for (int i = 0; i < num_cfs; ++i) { auto cfd = cfds[i]; Directory* data_dir = GetDataDir(cfd, 0U); @@ -295,8 +296,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( distinct_output_dirs.emplace_back(data_dir); } - const MutableCFOptions& mutable_cf_options = - *cfd->GetLatestMutableCFOptions(); + all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions()); + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back(); const uint64_t* max_memtable_id = &(bg_flush_args[i].max_memtable_id_); jobs.emplace_back( dbname_, cfds[i], immutable_db_options_, mutable_cf_options, @@ -432,8 +433,7 @@ 
Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (!cfds[i]->IsDropped() && !mems.empty()) { tmp_cfds.emplace_back(cfds[i]); mems_list.emplace_back(&mems); - mutable_cf_options_list.emplace_back( - cfds[i]->GetLatestMutableCFOptions()); + mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); } } @@ -1460,6 +1460,7 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, void DBImpl::GenerateFlushRequest(const autovector& cfds, FlushRequest* req) { assert(req != nullptr); + req->reserve(cfds.size()); for (const auto cfd : cfds) { if (nullptr == cfd) { // cfd may be null, see DBImpl::ScheduleFlushes From 65b229851093f1819b42b16784fd3668e32154d4 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 31 Jan 2019 14:28:53 -0800 Subject: [PATCH 18/57] Use correct FileMeta for atomic flush result install (#4932) Summary: 1. this commit fixes our handling of a combination of two separate edge cases. If a flush job does not pick any memtable to flush (because another flush job has already picked the same memtables), and the column family assigned to the flush job is dropped right before RocksDB calls rocksdb::InstallMemtableAtomicFlushResults, our original code passes a FileMetaData object whose file number is 0, failing the assertion in rocksdb::InstallMemtableAtomicFlushResults (assert(m->GetFileNumber() > 0)). 2. Also piggyback a small change: since we already create a local copy of column family's mutable CF options to eliminate potential race condition with `SetOptions` call, we might as well use the local copy in other function calls in the same scope. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4932 Differential Revision: D13901322 Pulled By: riversand963 fbshipit-source-id: b936580af7c127ea0c6c19ea10cd5fcede9fb0f9 --- db/db_impl_compaction_flush.cc | 22 ++++++++++------------ db/flush_job_test.cc | 10 ++++++++-- db/memtable_list.cc | 5 +++-- db/memtable_list.h | 6 +++--- db/memtable_list_test.cc | 9 +++++++-- 5 files changed, 31 insertions(+), 21 deletions(-) diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index 5847f05dd..a42e60f85 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -310,21 +310,18 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( jobs.back().PickMemTable(); } - autovector file_meta; + std::vector file_meta(num_cfs); Status s; assert(num_cfs == static_cast(jobs.size())); - for (int i = 0; i != num_cfs; ++i) { - file_meta.emplace_back(); - #ifndef ROCKSDB_LITE - const MutableCFOptions& mutable_cf_options = - *cfds[i]->GetLatestMutableCFOptions(); + for (int i = 0; i != num_cfs; ++i) { + const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); // may temporarily unlock and lock the mutex. 
NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, job_context->job_id, jobs[i].GetTableProperties()); -#endif /* !ROCKSDB_LITE */ } +#endif /* !ROCKSDB_LITE */ if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for @@ -428,19 +425,21 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( autovector tmp_cfds; autovector*> mems_list; autovector mutable_cf_options_list; + autovector tmp_file_meta; for (int i = 0; i != num_cfs; ++i) { const auto& mems = jobs[i].GetMemTables(); if (!cfds[i]->IsDropped() && !mems.empty()) { tmp_cfds.emplace_back(cfds[i]); mems_list.emplace_back(&mems); mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); + tmp_file_meta.emplace_back(&file_meta[i]); } } s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_meta, &job_context->memtables_to_free, - directories_.GetDbDir(), log_buffer); + versions_.get(), &mutex_, tmp_file_meta, + &job_context->memtables_to_free, directories_.GetDbDir(), log_buffer); } if (s.ok() || s.IsShutdownInProgress()) { @@ -452,7 +451,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } InstallSuperVersionAndScheduleWork(cfds[i], &job_context->superversion_contexts[i], - *cfds[i]->GetLatestMutableCFOptions()); + all_mutable_cf_options[i]); VersionStorageInfo::LevelSummaryStorage tmp; ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n", cfds[i]->GetName().c_str(), @@ -468,8 +467,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (cfds[i]->IsDropped()) { continue; } - NotifyOnFlushCompleted(cfds[i], &file_meta[i], - *cfds[i]->GetLatestMutableCFOptions(), + NotifyOnFlushCompleted(cfds[i], &file_meta[i], all_mutable_cf_options[i], job_context->job_id, jobs[i].GetTableProperties()); if (sfm) { std::string file_path = MakeTableFileName( diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 5ac5f2f93..1f7bc7b84 100644 --- a/db/flush_job_test.cc 
+++ b/db/flush_job_test.cc @@ -308,7 +308,9 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { k++; } HistogramData hist; - autovector file_metas; + std::vector file_metas; + // Call reserve to avoid auto-resizing + file_metas.reserve(flush_jobs.size()); mutex_.Lock(); for (auto& job : flush_jobs) { job.PickMemTable(); @@ -319,6 +321,10 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { ASSERT_OK(job.Run(nullptr /**/, &meta)); file_metas.emplace_back(meta); } + autovector file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } autovector*> mems_list; for (size_t i = 0; i != all_cfds.size(); ++i) { const auto& mems = flush_jobs[i].GetMemTables(); @@ -331,7 +337,7 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, - versions_.get(), &mutex_, file_metas, &job_context.memtables_to_free, + versions_.get(), &mutex_, file_meta_ptrs, &job_context.memtables_to_free, nullptr /* db_directory */, nullptr /* log_buffer */); ASSERT_OK(s); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 459d392d5..9397dbc7e 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -533,7 +533,7 @@ Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_metas, + InstrumentedMutex* mu, const autovector& file_metas, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer) { AutoThreadOperationStageUpdater stage_updater( @@ -553,10 +553,11 @@ Status InstallMemtableAtomicFlushResults( assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID()); } #endif + assert(nullptr != file_metas[k]); for (size_t i = 0; i != mems_list[k]->size(); ++i) { assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0); 
(*mems_list[k])[i]->SetFlushCompleted(true); - (*mems_list[k])[i]->SetFileNumber(file_metas[k].fd.GetNumber()); + (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); } } diff --git a/db/memtable_list.h b/db/memtable_list.h index be3f93562..b56ad4932 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -123,7 +123,7 @@ class MemTableListVersion { const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + const autovector& file_meta, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); @@ -301,7 +301,7 @@ class MemTableList { const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, InstrumentedMutex* mu, - const autovector& file_meta, + const autovector& file_meta, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); @@ -337,7 +337,7 @@ extern Status InstallMemtableAtomicFlushResults( const autovector& cfds, const autovector& mutable_cf_options_list, const autovector*>& mems_list, VersionSet* vset, - InstrumentedMutex* mu, const autovector& file_meta, + InstrumentedMutex* mu, const autovector& file_meta, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); } // namespace rocksdb diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index d67eed9fa..f0f4b0bb0 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -161,18 +161,23 @@ class MemTableListTest : public testing::Test { cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i])); EXPECT_NE(nullptr, cfds[i]); } - autovector file_metas; + std::vector file_metas; + file_metas.reserve(cf_ids.size()); for (size_t i = 0; i != cf_ids.size(); ++i) { FileMetaData meta; uint64_t file_num = file_number.fetch_add(1); meta.fd = FileDescriptor(file_num, 0, 0); file_metas.emplace_back(meta); } + autovector file_meta_ptrs; + for (auto& meta : file_metas) { + file_meta_ptrs.push_back(&meta); + } 
InstrumentedMutex mutex; InstrumentedMutexLock l(&mutex); return InstallMemtableAtomicFlushResults( &lists, cfds, mutable_cf_options_list, mems_list, &versions, &mutex, - file_metas, to_delete, nullptr, &log_buffer); + file_meta_ptrs, to_delete, nullptr, &log_buffer); } }; From a1774dde9a5bd51bc6ece5988781c6f28cc69d48 Mon Sep 17 00:00:00 2001 From: Sagar Vemuri Date: Thu, 31 Jan 2019 15:49:35 -0800 Subject: [PATCH 19/57] Bump version to 5.18.2 --- HISTORY.md | 2 +- include/rocksdb/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e81b3f30f..0d1382092 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## 5.18.2 (01/23/2019) +## 5.18.2 (01/31/2019) ### Public API Change * Change time resolution in FileOperationInfo. * Deleting Blob files also go through SStFileManager. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index fbf98bde7..3e1d5e04c 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 5 #define ROCKSDB_MINOR 18 -#define ROCKSDB_PATCH 1 +#define ROCKSDB_PATCH 2 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From b7434c29d2a6643de879160b179c663dd38283f6 Mon Sep 17 00:00:00 2001 From: yangzhijia Date: Tue, 5 Feb 2019 10:15:33 -0800 Subject: [PATCH 20/57] Properly set upper bound of subcompaction output (#4879) (#4898) Summary: Fix the output overlap bug when using subcompactions, the upper bound of output file was extended incorrectly.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4898 Differential Revision: D13736107 Pulled By: ajkr fbshipit-source-id: 21dca09f81d5f07bf2766bf566f9b50dcab7d8e3 --- db/compaction_job.cc | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 0bdf78cfc..10aaef098 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1204,10 +1204,19 @@ Status CompactionJob::FinishCompactionOutputFile( lower_bound = nullptr; } if (next_table_min_key != nullptr) { - // This isn't the last file in the subcompaction, so extend until the next - // file starts. + // This may be the last file in the subcompaction in some cases, so we + // need to compare the end key of subcompaction with the next file start + // key. When the end key is chosen by the subcompaction, we know that + // it must be the biggest key in output file. Therefore, it is safe to + // use the smaller key as the upper bound of the output file, to ensure + // that there is no overlapping between different output files. upper_bound_guard = ExtractUserKey(*next_table_min_key); - upper_bound = &upper_bound_guard; + if (sub_compact->end != nullptr && + ucmp->Compare(upper_bound_guard, *sub_compact->end) >= 0) { + upper_bound = sub_compact->end; + } else { + upper_bound = &upper_bound_guard; + } } else { // This is the last file in the subcompaction, so extend until the // subcompaction ends. @@ -1225,6 +1234,13 @@ Status CompactionJob::FinishCompactionOutputFile( has_overlapping_endpoints = false; } + // The end key of the subcompaction must be bigger or equal to the upper + // bound. If the end of subcompaction is null or the upper bound is null, + // it means that this file is the last file in the compaction. So there + // will be no overlapping between this file and others. 
+ assert(sub_compact->end == nullptr || + upper_bound == nullptr || + ucmp->Compare(*upper_bound , *sub_compact->end) <= 0); auto it = range_del_agg->NewIterator(lower_bound, upper_bound, has_overlapping_endpoints); // Position the range tombstone output iterator. There may be tombstone From 641fae60f63619ed5d0c9d9e4c4ea5a0ffa3e253 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Mon, 11 Feb 2019 14:01:16 -0800 Subject: [PATCH 21/57] update history and bump version --- HISTORY.md | 4 ++++ include/rocksdb/version.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index 0d1382092..b26450022 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,4 +1,8 @@ # Rocksdb Change Log +## 5.18.3 (2/11/2019) +### Bug Fixes +* Fix possible LSM corruption when both range deletions and subcompactions are used. The symptom of this corruption is L1+ files overlapping in the user key space. + ## 5.18.2 (01/31/2019) ### Public API Change * Change time resolution in FileOperationInfo. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 3e1d5e04c..24cef677f 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -6,7 +6,7 @@ #define ROCKSDB_MAJOR 5 #define ROCKSDB_MINOR 18 -#define ROCKSDB_PATCH 2 +#define ROCKSDB_PATCH 3 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 5e2f968c07e012e85f4477c3c75718266fd5bfa0 Mon Sep 17 00:00:00 2001 From: leventov Date: Mon, 4 Jun 2018 14:23:47 +0400 Subject: [PATCH 22/57] Revert "Core-local statistics" This reverts commit ac39d6bec5b2c23a2c3fd0f0e61d468be4f3e803. 
--- monitoring/statistics.cc | 193 ++++++++++++++++++++++++++------------- monitoring/statistics.h | 125 +++++++++++++++++-------- util/core_local.h | 9 +- 3 files changed, 222 insertions(+), 105 deletions(-) diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index cba427ae4..6a0c5e43d 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -10,10 +10,10 @@ #endif #include -#include "rocksdb/statistics.h" -#include "port/likely.h" #include #include +#include "port/likely.h" +#include "rocksdb/statistics.h" namespace rocksdb { @@ -223,8 +223,11 @@ std::shared_ptr CreateDBStatistics() { return std::make_shared(nullptr); } -StatisticsImpl::StatisticsImpl(std::shared_ptr stats) - : stats_(std::move(stats)) {} +StatisticsImpl::StatisticsImpl(std::shared_ptr stats, + bool enable_internal_stats) + : stats_shared_(stats), + stats_(stats.get()), + enable_internal_stats_(enable_internal_stats) {} StatisticsImpl::~StatisticsImpl() {} @@ -234,34 +237,80 @@ uint64_t StatisticsImpl::getTickerCount(uint32_t tickerType) const { } uint64_t StatisticsImpl::getTickerCountLocked(uint32_t tickerType) const { - assert(tickerType < TICKER_ENUM_MAX); - uint64_t res = 0; - for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { - res += per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType]; + assert(enable_internal_stats_ ? 
tickerType < INTERNAL_TICKER_ENUM_MAX + : tickerType < TICKER_ENUM_MAX); + + uint64_t thread_local_sum = 0; + tickers_[tickerType].thread_value->Fold( + [](void* curr_ptr, void* res) { + auto* sum_ptr = static_cast(res); + *sum_ptr += static_cast(curr_ptr)->load( + std::memory_order_relaxed); + }, + &thread_local_sum); + return thread_local_sum + + tickers_[tickerType].merged_sum.load(std::memory_order_relaxed); +} + +std::unique_ptr +StatisticsImpl::HistogramInfo::getMergedHistogram() const { + std::unique_ptr res_hist(new HistogramImpl()); + { + MutexLock lock(&merge_lock); + res_hist->Merge(merged_hist); } - return res; + + thread_value->Fold( + [](void* curr_ptr, void* res) { + auto tmp_res_hist = static_cast(res); + auto curr_hist = static_cast(curr_ptr); + tmp_res_hist->Merge(*curr_hist); + }, + res_hist.get()); + return res_hist; } void StatisticsImpl::histogramData(uint32_t histogramType, HistogramData* const data) const { MutexLock lock(&aggregate_lock_); - getHistogramImplLocked(histogramType)->Data(data); + histogramDataLocked(histogramType, data); } -std::unique_ptr StatisticsImpl::getHistogramImplLocked( - uint32_t histogramType) const { - assert(histogramType < HISTOGRAM_ENUM_MAX); - std::unique_ptr res_hist(new HistogramImpl()); - for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { - res_hist->Merge( - per_core_stats_.AccessAtCore(core_idx)->histograms_[histogramType]); - } - return res_hist; -} +void StatisticsImpl::histogramDataLocked(uint32_t histogramType, + HistogramData* const data) const { + assert(enable_internal_stats_ ? histogramType < INTERNAL_HISTOGRAM_ENUM_MAX + : histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].getMergedHistogram()->Data(data); +} // namespace rocksdb std::string StatisticsImpl::getHistogramString(uint32_t histogramType) const { MutexLock lock(&aggregate_lock_); - return getHistogramImplLocked(histogramType)->ToString(); + assert(enable_internal_stats_ ? 
histogramType < INTERNAL_HISTOGRAM_ENUM_MAX + : histogramType < HISTOGRAM_ENUM_MAX); + return histograms_[histogramType].getMergedHistogram()->ToString(); +} +StatisticsImpl::ThreadTickerInfo* StatisticsImpl::getThreadTickerInfo( + uint32_t tickerType) { + auto info_ptr = + static_cast(tickers_[tickerType].thread_value->Get()); + if (info_ptr == nullptr) { + info_ptr = + new ThreadTickerInfo(0 /* value */, &tickers_[tickerType].merged_sum); + tickers_[tickerType].thread_value->Reset(info_ptr); + } + return info_ptr; +} + +StatisticsImpl::ThreadHistogramInfo* StatisticsImpl::getThreadHistogramInfo( + uint32_t histogram_type) { + auto info_ptr = static_cast( + histograms_[histogram_type].thread_value->Get()); + if (info_ptr == nullptr) { + info_ptr = new ThreadHistogramInfo(&histograms_[histogram_type].merged_hist, + &histograms_[histogram_type].merge_lock); + histograms_[histogram_type].thread_value->Reset(info_ptr); + } + return info_ptr; } void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) { @@ -275,13 +324,17 @@ void StatisticsImpl::setTickerCount(uint32_t tickerType, uint64_t count) { } void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { - if (core_idx == 0) { - per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = count; - } else { - per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType] = 0; - } + assert(enable_internal_stats_ ? 
tickerType < INTERNAL_TICKER_ENUM_MAX + : tickerType < TICKER_ENUM_MAX); + + if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) { + tickers_[tickerType].thread_value->Fold( + [](void* curr_ptr, void* res) { + static_cast*>(curr_ptr)->store( + 0, std::memory_order_relaxed); + }, + nullptr /* res */); + tickers_[tickerType].merged_sum.store(count, std::memory_order_relaxed); } } @@ -289,11 +342,19 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { uint64_t sum = 0; { MutexLock lock(&aggregate_lock_); - assert(tickerType < TICKER_ENUM_MAX); - for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { - sum += - per_core_stats_.AccessAtCore(core_idx)->tickers_[tickerType].exchange( - 0, std::memory_order_relaxed); + assert(enable_internal_stats_ ? tickerType < INTERNAL_TICKER_ENUM_MAX + : tickerType < TICKER_ENUM_MAX); + + if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) { + tickers_[tickerType].thread_value->Fold( + [](void* curr_ptr, void* res) { + auto* sum_ptr = static_cast(res); + *sum_ptr += static_cast*>(curr_ptr)->exchange( + 0, std::memory_order_relaxed); + }, + &sum); + sum += tickers_[tickerType].merged_sum.exchange( + 0, std::memory_order_relaxed); } } if (stats_ && tickerType < TICKER_ENUM_MAX) { @@ -303,17 +364,25 @@ uint64_t StatisticsImpl::getAndResetTickerCount(uint32_t tickerType) { } void StatisticsImpl::recordTick(uint32_t tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - per_core_stats_.Access()->tickers_[tickerType].fetch_add( - count, std::memory_order_relaxed); + assert(enable_internal_stats_ ? 
tickerType < INTERNAL_TICKER_ENUM_MAX + : tickerType < TICKER_ENUM_MAX); + + if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) { + auto info_ptr = getThreadTickerInfo(tickerType); + info_ptr->value.fetch_add(count, std::memory_order_relaxed); + } if (stats_ && tickerType < TICKER_ENUM_MAX) { stats_->recordTick(tickerType, count); } } void StatisticsImpl::measureTime(uint32_t histogramType, uint64_t value) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - per_core_stats_.Access()->histograms_[histogramType].Add(value); + assert(enable_internal_stats_ ? histogramType < INTERNAL_HISTOGRAM_ENUM_MAX + : histogramType < HISTOGRAM_ENUM_MAX); + + if (histogramType < HISTOGRAM_ENUM_MAX || enable_internal_stats_) { + getThreadHistogramInfo(histogramType)->value.Add(value); + } if (stats_ && histogramType < HISTOGRAM_ENUM_MAX) { stats_->measureTime(histogramType, value); } @@ -325,9 +394,11 @@ Status StatisticsImpl::Reset() { setTickerCountLocked(i, 0); } for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) { - for (size_t core_idx = 0; core_idx < per_core_stats_.Size(); ++core_idx) { - per_core_stats_.AccessAtCore(core_idx)->histograms_[i].Clear(); - } + histograms_[i].thread_value->Fold( + [](void* curr_ptr, void* res) { + static_cast(curr_ptr)->Clear(); + }, + nullptr /* res */); } return Status::OK(); } @@ -337,36 +408,32 @@ namespace { // a buffer size used for temp string buffers const int kTmpStrBufferSize = 200; -} // namespace +} // namespace std::string StatisticsImpl::ToString() const { MutexLock lock(&aggregate_lock_); std::string res; res.reserve(20000); for (const auto& t : TickersNameMap) { - assert(t.first < TICKER_ENUM_MAX); - char buffer[kTmpStrBufferSize]; - snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 "\n", - t.second.c_str(), getTickerCountLocked(t.first)); - res.append(buffer); + if (t.first < TICKER_ENUM_MAX || enable_internal_stats_) { + char buffer[kTmpStrBufferSize]; + snprintf(buffer, kTmpStrBufferSize, "%s COUNT : %" PRIu64 
"\n", + t.second.c_str(), getTickerCountLocked(t.first)); + res.append(buffer); + } } for (const auto& h : HistogramsNameMap) { - assert(h.first < HISTOGRAM_ENUM_MAX); - char buffer[kTmpStrBufferSize]; - HistogramData hData; - getHistogramImplLocked(h.first)->Data(&hData); - // don't handle failures - buffer should always be big enough and arguments - // should be provided correctly - int ret = snprintf( - buffer, kTmpStrBufferSize, - "%s P50 : %f P95 : %f P99 : %f P100 : %f COUNT : %" PRIu64 " SUM : %" - PRIu64 "\n", h.second.c_str(), hData.median, hData.percentile95, - hData.percentile99, hData.max, hData.count, hData.sum); - if (ret < 0 || ret >= kTmpStrBufferSize) { - assert(false); - continue; + if (h.first < HISTOGRAM_ENUM_MAX || enable_internal_stats_) { + char buffer[kTmpStrBufferSize]; + HistogramData hData; + histogramDataLocked(h.first, &hData); + snprintf( + buffer, kTmpStrBufferSize, + "%s statistics Percentiles :=> 50 : %f 95 : %f 99 : %f 100 : %f\n", + h.second.c_str(), hData.median, hData.percentile95, + hData.percentile99, hData.max); + res.append(buffer); } - res.append(buffer); } res.shrink_to_fit(); return res; @@ -376,4 +443,4 @@ bool StatisticsImpl::HistEnabledForType(uint32_t type) const { return type < HISTOGRAM_ENUM_MAX; } -} // namespace rocksdb +} // namespace rocksdb diff --git a/monitoring/statistics.h b/monitoring/statistics.h index dcd5f7a01..5ac4a4319 100644 --- a/monitoring/statistics.h +++ b/monitoring/statistics.h @@ -4,23 +4,17 @@ // (found in the LICENSE.Apache file in the root directory). 
// #pragma once -#include "rocksdb/statistics.h" -#include #include #include - +#include #include "monitoring/histogram.h" #include "port/likely.h" #include "port/port.h" +#include "rocksdb/statistics.h" #include "util/core_local.h" #include "util/mutexlock.h" - -#ifdef __clang__ -#define ROCKSDB_FIELD_UNUSED __attribute__((__unused__)) -#else -#define ROCKSDB_FIELD_UNUSED -#endif // __clang__ +#include "util/thread_local.h" #ifndef STRINGIFY #define STRINGIFY(x) #x @@ -41,7 +35,7 @@ enum HistogramsInternal : uint32_t { class StatisticsImpl : public Statistics { public: - StatisticsImpl(std::shared_ptr stats); + StatisticsImpl(std::shared_ptr stats, bool enable_internal_stats); virtual ~StatisticsImpl(); virtual uint64_t getTickerCount(uint32_t ticker_type) const override; @@ -59,41 +53,98 @@ class StatisticsImpl : public Statistics { virtual bool HistEnabledForType(uint32_t type) const override; private: - // If non-nullptr, forwards updates to the object pointed to by `stats_`. - std::shared_ptr stats_; - // Synchronizes anything that operates across other cores' local data, + std::shared_ptr stats_shared_; + Statistics* stats_; + bool enable_internal_stats_; + + // Synchronizes anything that operates on other threads' thread-specific data // such that operations like Reset() can be performed atomically. mutable port::Mutex aggregate_lock_; - // The ticker/histogram data are stored in this structure, which we will store - // per-core. It is cache-aligned, so tickers/histograms belonging to different - // cores can never share the same cache line. 
- // - // Alignment attributes expand to nothing depending on the platform - struct ALIGN_AS(CACHE_LINE_SIZE) StatisticsData { - std::atomic_uint_fast64_t tickers_[INTERNAL_TICKER_ENUM_MAX] = {{0}}; - HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX]; -#ifndef HAVE_ALIGNED_NEW - char - padding[(CACHE_LINE_SIZE - - (INTERNAL_TICKER_ENUM_MAX * sizeof(std::atomic_uint_fast64_t) + - INTERNAL_HISTOGRAM_ENUM_MAX * sizeof(HistogramImpl)) % - CACHE_LINE_SIZE)] ROCKSDB_FIELD_UNUSED; -#endif - void *operator new(size_t s) { return port::cacheline_aligned_alloc(s); } - void *operator new[](size_t s) { return port::cacheline_aligned_alloc(s); } - void operator delete(void *p) { port::cacheline_aligned_free(p); } - void operator delete[](void *p) { port::cacheline_aligned_free(p); } + // Holds data maintained by each thread for implementing tickers. + struct ThreadTickerInfo { + std::atomic_uint_fast64_t value; + // During teardown, value will be summed into *merged_sum. + std::atomic_uint_fast64_t* merged_sum; + + ThreadTickerInfo(uint_fast64_t _value, + std::atomic_uint_fast64_t* _merged_sum) + : value(_value), merged_sum(_merged_sum) {} }; - static_assert(sizeof(StatisticsData) % CACHE_LINE_SIZE == 0, "Expected " TOSTRING(CACHE_LINE_SIZE) "-byte aligned"); + // Holds data maintained by each thread for implementing histograms. + struct ThreadHistogramInfo { + HistogramImpl value; + // During teardown, value will be merged into *merged_hist while holding + // *merge_lock, which also syncs with the merges necessary for reads. + HistogramImpl* merged_hist; + port::Mutex* merge_lock; + ThreadHistogramInfo(HistogramImpl* _merged_hist, port::Mutex* _merge_lock) + : value(), merged_hist(_merged_hist), merge_lock(_merge_lock) {} + }; + + // Holds global data for implementing tickers. 
+ struct TickerInfo { + TickerInfo() + : thread_value(new ThreadLocalPtr(&mergeThreadValue)), merged_sum(0) {} + // Holds thread-specific pointer to ThreadTickerInfo + std::unique_ptr thread_value; + // Sum of thread-specific values for tickers that have been reset due to + // thread termination or ThreadLocalPtr destruction. Also, this is used by + // setTickerCount() to conveniently change the global value by setting this + // while simultaneously zeroing all thread-local values. + std::atomic_uint_fast64_t merged_sum; + + static void mergeThreadValue(void* ptr) { + auto info_ptr = static_cast(ptr); + *info_ptr->merged_sum += info_ptr->value; + delete info_ptr; + } + }; - CoreLocalArray per_core_stats_; + // Holds global data for implementing histograms. + struct HistogramInfo { + HistogramInfo() + : merged_hist(), + merge_lock(), + thread_value(new ThreadLocalPtr(&mergeThreadValue)) {} + // Merged thread-specific values for histograms that have been reset due to + // thread termination or ThreadLocalPtr destruction. Note these must be + // destroyed after thread_value since its destructor accesses them. + HistogramImpl merged_hist; + mutable port::Mutex merge_lock; + // Holds thread-specific pointer to ThreadHistogramInfo + std::unique_ptr thread_value; + + static void mergeThreadValue(void* ptr) { + auto info_ptr = static_cast(ptr); + { + MutexLock lock(info_ptr->merge_lock); + info_ptr->merged_hist->Merge(info_ptr->value); + } + delete info_ptr; + } + + // Returns a histogram that merges all histograms (thread-specific and + // previously merged ones). + std::unique_ptr getMergedHistogram() const; + }; uint64_t getTickerCountLocked(uint32_t ticker_type) const; - std::unique_ptr getHistogramImplLocked( - uint32_t histogram_type) const; + + void histogramDataLocked(uint32_t histogram_type, + HistogramData* const data) const; void setTickerCountLocked(uint32_t ticker_type, uint64_t count); + + // Returns the info for this tickerType/thread. 
It sets a new info with zeroed + // counter if none exists. + ThreadTickerInfo* getThreadTickerInfo(uint32_t ticker_type); + // Returns the info for this histogramType/thread. It sets a new histogram + // with zeroed data if none exists. + ThreadHistogramInfo* getThreadHistogramInfo(uint32_t histogram_type); + + TickerInfo tickers_[INTERNAL_TICKER_ENUM_MAX]; + HistogramInfo histograms_[INTERNAL_HISTOGRAM_ENUM_MAX]; }; // Utility functions @@ -118,4 +169,4 @@ inline void SetTickerCount(Statistics* statistics, uint32_t ticker_type, } } -} +} // namespace rocksdb diff --git a/util/core_local.h b/util/core_local.h index 4cc4fd90c..eb0b5a56d 100644 --- a/util/core_local.h +++ b/util/core_local.h @@ -5,15 +5,14 @@ #pragma once -#include -#include -#include -#include - #include "port/likely.h" #include "port/port.h" #include "util/random.h" +#include +#include +#include + namespace rocksdb { // An array of core-local values. Ideally the value type, T, is cache aligned to From 4e6b92c1ea56f14a6209197f17e65d10c56797c1 Mon Sep 17 00:00:00 2001 From: Alexander Toktarev Date: Thu, 7 Jun 2018 15:21:37 +0300 Subject: [PATCH 23/57] Update cmake to not build tests when not in Debug --- CMakeLists.txt | 1 + examples/perf.cc | 143 ++++++++++++++++++++++++++++++++++++++++++++++ util/sync_point.h | 7 ++- 3 files changed, 148 insertions(+), 3 deletions(-) create mode 100644 examples/perf.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 98e2e1973..85c59064e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,6 +174,7 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + add_definitions(-DNDEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") include(CheckCXXCompilerFlag) CHECK_CXX_COMPILER_FLAG("-momit-leaf-frame-pointer" HAVE_OMIT_LEAF_FRAME_POINTER) diff --git a/examples/perf.cc b/examples/perf.cc new file mode 100644 index 000000000..5de6c9a4d --- /dev/null +++ b/examples/perf.cc @@ -0,0 +1,143 
@@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include +#include +#include +#include +#include +#include "rocksdb/db.h" +#include + +using namespace std; +using namespace rocksdb; +using namespace std::chrono; + +#include +#include + +std::string kDBPath = "/repos/rocksdata"; + +template +T swap_endian(T u) { + static_assert(CHAR_BIT == 8, "CHAR_BIT != 8"); + + union { + T u; + unsigned char u8[sizeof(T)]; + } source, dest; + + source.u = u; + + for (size_t k = 0; k < sizeof(T); k++) + dest.u8[k] = source.u8[sizeof(T) - k - 1]; + + return dest.u; +} + +rocksdb::TableFactory *makeDictionaryTableFactory() { + auto block_opts = rocksdb::BlockBasedTableOptions{}; + block_opts.checksum = ChecksumType::kCRC32c; + block_opts.index_type = BlockBasedTableOptions::kHashSearch; + block_opts.filter_policy.reset(NewBloomFilterPolicy(10, false)); + block_opts.block_cache = + NewLRUCache(static_cast(1024 * 1024 * 1024)); + block_opts.cache_index_and_filter_blocks = true; + block_opts.cache_index_and_filter_blocks_with_high_priority = block_opts.cache_index_and_filter_blocks; + + auto *pPolicy = rocksdb::NewBloomFilterPolicy(10, false); + auto filter_ptr = std::shared_ptr(pPolicy); + block_opts.filter_policy = filter_ptr; + + return rocksdb::NewBlockBasedTableFactory(block_opts); +} + +int main() { + system("rm -rf /repos/rocksdata/*"); + + DB *db; + Options options; + // Optimize RocksDB. 
This is the easiest way to get RocksDB to perform well + //options.IncreaseParallelism(); + //options.OptimizeLevelStyleCompaction(); + // create the DB if it's not already present + options.create_if_missing = true; + options.compression = CompressionType::kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + // open DB + Status s = DB::Open(options, kDBPath, &db); + + if (!s.ok()) { + std::cout << s.ToString(); + } + + assert(s.ok()); + + ColumnFamilyOptions cf_options{}; + + cf_options.table_factory.reset(makeDictionaryTableFactory()); + cf_options.prefix_extractor.reset(rocksdb::NewNoopTransform()); + cf_options.memtable_prefix_bloom_size_ratio = 0.02; + + std::string name("Name"); + ColumnFamilyHandle *cf; + Status status = db->CreateColumnFamily(cf_options, name, &cf); + + assert(s.ok()); + + u_int64_t *buffer = new u_int64_t[4]; + char *pointer = reinterpret_cast(buffer); + WriteBatch writeBatch{}; + u_int64_t max = 10000000; + + Slice key(pointer, 32); + Slice value(reinterpret_cast(&max), 8); + + + u_int64_t *buffer1 = new u_int64_t[4]; + char *pointer1 = reinterpret_cast(buffer1); + Slice key1(pointer1, 32); + + for (u_int64_t i = 0; i < 1000000000; i++) { + *(buffer) = swap_endian(i); + *(buffer + 1) = i + 1; + *(buffer + 2) = i + 2; + *(buffer + 3) = i + 3; + + writeBatch.Put(cf, key, value); + + if (i % 1000 == 0) { + Status s1 = db->Write(WriteOptions(), &writeBatch); + assert(s1.ok()); + writeBatch.Clear(); + } + + if (i % 10000000 == 0) { + std::string valuee; + // get value + uint64_t start = (uint64_t) std::chrono::duration_cast( + system_clock::now().time_since_epoch()).count(); + + u_int64_t k = i / 2; + *(buffer1) = swap_endian(k); + *(buffer1 + 1) = k + 1; + *(buffer1 + 2) = k + 2; + *(buffer1 + 3) = k + 3; + + s = db->Get(ReadOptions(), cf, key1, &valuee); + + uint64_t end = (uint64_t) std::chrono::duration_cast( + system_clock::now().time_since_epoch()).count(); + u_int64_t delta = end - start; + + printf("%" PRIu64 "\n", 
delta); + db->Flush(FlushOptions(), cf); + } + } + + db->DestroyColumnFamilyHandle(cf); + delete db; + return 0; +} diff --git a/util/sync_point.h b/util/sync_point.h index cb4b1e717..7aa114f2f 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -130,11 +130,12 @@ class SyncPoint { // utilized to re-produce race conditions between threads. // See TransactionLogIteratorRace in db_test.cc for an example use case. // TEST_SYNC_POINT is no op in release build. -#define TEST_SYNC_POINT(x) rocksdb::SyncPoint::GetInstance()->Process(x) +#define TEST_SYNC_POINT(x) + //rocksdb::SyncPoint::GetInstance()->Process(x) #define TEST_IDX_SYNC_POINT(x, index) \ - rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index)) + //rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index)) #define TEST_SYNC_POINT_CALLBACK(x, y) \ - rocksdb::SyncPoint::GetInstance()->Process(x, y) + //rocksdb::SyncPoint::GetInstance()->Process(x, y) #define INIT_SYNC_POINT_SINGLETONS() \ (void)rocksdb::SyncPoint::GetInstance(); #endif // NDEBUG From 05b54b1d36a488565c5e7351dd88e3a201b564b9 Mon Sep 17 00:00:00 2001 From: Alexander Toktarev Date: Wed, 4 Jul 2018 17:46:08 +0300 Subject: [PATCH 24/57] Column family destructor should print column family name on assertion error --- db/column_family.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/column_family.cc b/db/column_family.cc index 9a3ae99ca..8236efc6b 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include "db/compaction_picker.h" #include "db/compaction_picker_fifo.h" @@ -1224,6 +1225,11 @@ ColumnFamilySet::~ColumnFamilySet() { auto cfd = column_family_data_.begin()->second; bool last_ref __attribute__((__unused__)); last_ref = cfd->Unref(); + + if (!last_ref) { + std::cout << " Assertion error "+cfd->name_+"\n\n"; + } + assert(last_ref); delete cfd; } From b1cc1a88237e113902f379ad16284b0b7483e454 Mon Sep 17 00:00:00 2001 From: James Pack Date: 
Wed, 1 Aug 2018 02:08:42 -0400 Subject: [PATCH 25/57] Use the C++ steady_clock on Mac OSX, rather than using the Mach kernel clock service --- env/env_posix.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env/env_posix.cc b/env/env_posix.cc index c2e456a66..1db4776a9 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -830,7 +830,7 @@ class PosixEnv : public Env { return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; #elif defined(OS_SOLARIS) return gethrtime(); -#elif defined(__MACH__) +#elif defined(__MACH__) && !defined(__APPLE__) clock_serv_t cclock; mach_timespec_t ts; host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); From 33659a375acca9b5d00c57b171da6038add180f0 Mon Sep 17 00:00:00 2001 From: Toktarev Alexander Date: Wed, 13 Mar 2019 16:31:01 +0300 Subject: [PATCH 26/57] Post cherry-pick fixes --- monitoring/statistics.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 6a0c5e43d..0964e1497 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -220,7 +220,7 @@ const std::vector> HistogramsNameMap = { }; std::shared_ptr CreateDBStatistics() { - return std::make_shared(nullptr); + return std::make_shared(nullptr, false); } StatisticsImpl::StatisticsImpl(std::shared_ptr stats, @@ -329,7 +329,7 @@ void StatisticsImpl::setTickerCountLocked(uint32_t tickerType, uint64_t count) { if (tickerType < TICKER_ENUM_MAX || enable_internal_stats_) { tickers_[tickerType].thread_value->Fold( - [](void* curr_ptr, void* res) { + [](void* curr_ptr, void* /*res*/) { static_cast*>(curr_ptr)->store( 0, std::memory_order_relaxed); }, @@ -395,7 +395,7 @@ Status StatisticsImpl::Reset() { } for (uint32_t i = 0; i < HISTOGRAM_ENUM_MAX; ++i) { histograms_[i].thread_value->Fold( - [](void* curr_ptr, void* res) { + [](void* curr_ptr, void* /*res*/) { static_cast(curr_ptr)->Clear(); }, nullptr /* res */); From fdb84c8a56c97d20e8fdedef8a8d0e13142dcf05 
Mon Sep 17 00:00:00 2001 From: Toktarev Alexander Date: Tue, 19 Mar 2019 15:08:59 +0300 Subject: [PATCH 27/57] Upgrade to 5.18.3 --- build_tools/setup_centos7.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_tools/setup_centos7.sh b/build_tools/setup_centos7.sh index c633131de..5a6d00797 100755 --- a/build_tools/setup_centos7.sh +++ b/build_tools/setup_centos7.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e -ROCKSDB_VERSION="5.10.3" +ROCKSDB_VERSION="5.18.3" ZSTD_VERSION="1.1.3" echo "This script configures CentOS with everything needed to build and run RocksDB" @@ -22,7 +22,7 @@ mkdir -pv /usr/local/rocksdb-${ROCKSDB_VERSION} ln -sfT /usr/local/rocksdb-${ROCKSDB_VERSION} /usr/local/rocksdb wget -qO /tmp/zstd-${ZSTD_VERSION}.tar.gz https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz -wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz +wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/stardog-union/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz cd /tmp From 318276132e6aafa822603b24df4f805dea2cfa83 Mon Sep 17 00:00:00 2001 From: James Pack Date: Sat, 29 Sep 2018 02:50:38 -0400 Subject: [PATCH 28/57] Working Bazel build --- .gitignore | 1 - BUILD | 78 + WORKSPACE | 23 + cache/BUILD | 69 + db/BUILD | 1849 +++++++++++++++++ empty_main.cc | 3 + env/BUILD | 155 ++ include/BUILD | 12 + memtable/BUILD | 100 + monitoring/BUILD | 107 + options/BUILD | 71 + port/BUILD | 85 + rocksdb.bzl | 67 + table/BUILD | 277 +++ third_party/gtest/BUILD | 19 + third_party/gtest/compile_test.cpp | 9 + third_party/lz4/BUILD | 18 + third_party/lz4/BUILD.external | 8 + third_party/lz4/compile_test.cpp | 10 + third_party/snappy/BUILD | 19 + third_party/snappy/compile_test.cpp | 10 + tools/BUILD | 18 + util/BUILD | 508 +++++ util/build_version.cc | 4 + util/compression.h | 10 +- utilities/BUILD | 151 ++ utilities/backupable/BUILD | 48 + utilities/checkpoint/BUILD | 49 + 
utilities/leveldb_options/BUILD | 6 + utilities/merge_operators/BUILD | 21 + utilities/merge_operators/string_append/BUILD | 46 + utilities/options/BUILD | 48 + utilities/table_properties_collectors/BUILD | 38 + utilities/transactions/BUILD | 104 + utilities/ttl/BUILD | 47 + utilities/write_batch_with_index/BUILD | 48 + 36 files changed, 4130 insertions(+), 6 deletions(-) create mode 100644 BUILD create mode 100644 WORKSPACE create mode 100644 cache/BUILD create mode 100644 db/BUILD create mode 100644 empty_main.cc create mode 100644 env/BUILD create mode 100644 include/BUILD create mode 100644 memtable/BUILD create mode 100644 monitoring/BUILD create mode 100644 options/BUILD create mode 100644 port/BUILD create mode 100644 rocksdb.bzl create mode 100644 table/BUILD create mode 100644 third_party/gtest/BUILD create mode 100644 third_party/gtest/compile_test.cpp create mode 100644 third_party/lz4/BUILD create mode 100644 third_party/lz4/BUILD.external create mode 100644 third_party/lz4/compile_test.cpp create mode 100644 third_party/snappy/BUILD create mode 100644 third_party/snappy/compile_test.cpp create mode 100644 tools/BUILD create mode 100644 util/BUILD create mode 100644 util/build_version.cc create mode 100644 utilities/BUILD create mode 100644 utilities/backupable/BUILD create mode 100644 utilities/checkpoint/BUILD create mode 100644 utilities/leveldb_options/BUILD create mode 100644 utilities/merge_operators/BUILD create mode 100644 utilities/merge_operators/string_append/BUILD create mode 100644 utilities/options/BUILD create mode 100644 utilities/table_properties_collectors/BUILD create mode 100644 utilities/transactions/BUILD create mode 100644 utilities/ttl/BUILD create mode 100644 utilities/write_batch_with_index/BUILD diff --git a/.gitignore b/.gitignore index e88ccfc00..dd210d814 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,6 @@ manifest_dump sst_dump blob_dump column_aware_encoding_exp -util/build_version.cc build_tools/VALGRIND_LOGS/ 
coverage/COVERAGE_REPORT .gdbhistory diff --git a/BUILD b/BUILD new file mode 100644 index 000000000..e924a44d9 --- /dev/null +++ b/BUILD @@ -0,0 +1,78 @@ +config_setting( + name = "linux", + constraint_values = [ + "@bazel_tools//platforms:linux", + ], + visibility = ["//visibility:public"], +) + +config_setting( + name = "osx", + constraint_values = [ + "@bazel_tools//platforms:osx", + ], + visibility = ["//visibility:public"], +) + +config_setting( + name = "windows", + constraint_values = [ + "@bazel_tools//platforms:windows", + ], + visibility = ["//visibility:public"], +) + +config_setting( + name = "tests_enabled_debug_mode", + values = { + "compilation_mode": "dbg", + "define": "enable_tests=1", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "tests_enabled_fastbuild_mode", + values = { + "compilation_mode": "fastbuild", + "define": "enable_tests=1", + }, + visibility = ["//visibility:public"], +) + +cc_library( + name = "rocksdb", + deps = [ + "//cache", + "//db", + "//env", + "//include", + "//memtable", + "//monitoring", + "//options", + "//port", + "//table", + "//third_party/gtest", + "//third_party/lz4", + "//third_party/snappy", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/table_properties_collectors", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "empty_main", + srcs = ["empty_main.cc"], + visibility = ["//visibility:public"], +) diff --git a/WORKSPACE b/WORKSPACE new file mode 100644 index 000000000..6b77be6d4 --- /dev/null +++ b/WORKSPACE @@ -0,0 +1,23 @@ +workspace(name = "rocksdb") +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") + +http_archive( + name = "com_google_googletest", + urls = 
["https://github.com/google/googletest/archive/release-1.8.1.zip"], + strip_prefix = "googletest-release-1.8.1", + sha256 = "927827c183d01734cc5cfef85e0ff3f5a92ffe6188e0d18e909c5efebf28a0c7", +) + +http_archive( + name = "com_google_snappy", + urls = ["https://github.com/stardog-union/snappy/archive/add_bazel.zip"], + strip_prefix = "snappy-add_bazel", +) + +new_http_archive( + name = "org_lz4", + urls = ["https://github.com/lz4/lz4/archive/v1.8.2.zip"], + strip_prefix = "lz4-1.8.2", + build_file = "third_party/lz4/BUILD.external", + sha256 = "6df2bc7b830d4a23ca6f0a19a772fc0a61100f98baa843f9bbf873a80b6840d5", +) diff --git a/cache/BUILD b/cache/BUILD new file mode 100644 index 000000000..216411109 --- /dev/null +++ b/cache/BUILD @@ -0,0 +1,69 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "cache", + srcs = glob(["*.cc"], exclude=["cache_bench.cc", "*_test.cc"]), + deps = [ + ":headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//include", + "//port:headers", + "//util:headers", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "cache_test", + srcs = ["cache_test.cc"], + deps = [ + ":cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "lru_cache_test", + srcs = ["lru_cache_test.cc"], + deps = [ + ":cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + 
"//utilities/merge_operators/string_append", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/db/BUILD b/db/BUILD new file mode 100644 index 000000000..350c59bb5 --- /dev/null +++ b/db/BUILD @@ -0,0 +1,1849 @@ +load("//:rocksdb.bzl", "constrained_library", "constrained_test") + +EXCLUDED_FILES = ["c.cc", "forward_iterator_bench.cc"] + +cc_library( + name = "db", + srcs = glob(["*.cc"], exclude=["*_test.cc", "*_test2.cc", "*_test_util.cc"] + EXCLUDED_FILES), + deps = [ + ":headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//env:headers", + "//include", + "//include:with_extra_prefix", + "//memtable:headers", + "//monitoring:headers", + "//options:headers", + "//port:headers", + "//table:headers", + "//tools:headers", + "//util:headers", + "//utilities:headers", + ], + visibility = ["//visibility:public"], +) + +constrained_library( + name = "test_utils", + hdrs = ["db_test_util.h"], + srcs = ["db_test_util.cc"], + deps = [ + ":headers", + "//env", + "//env:test_utils", + "//table:test_utils", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "column_family_test", + srcs = ["column_family_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + tags = ["manual"], + timeout = "eternal", +) + +constrained_test( + name = "compact_files_test", + srcs = ["compact_files_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + 
"//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "compaction_iterator_test", + srcs = ["compaction_iterator_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "compaction_job_stats_test", + srcs = ["compaction_job_stats_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "compaction_job_test", + srcs = ["compaction_job_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + 
"//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "compaction_picker_test", + srcs = ["compaction_picker_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "comparator_db_test", + srcs = ["comparator_db_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "corruption_test", + srcs = ["corruption_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = 
"cuckoo_table_db_test", + srcs = ["cuckoo_table_db_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_basic_test", + srcs = ["db_basic_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_blob_index_test", + srcs = ["db_blob_index_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_block_cache_test", + srcs = ["db_block_cache_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + 
"//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_bloom_filter_test", + srcs = ["db_bloom_filter_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_compaction_filter_test", + srcs = ["db_compaction_filter_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_compaction_test", + srcs = ["db_compaction_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + 
"//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + timeout = "long", +) + +constrained_test( + name = "db_dynamic_level_test", + srcs = ["db_dynamic_level_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_encryption_test", + srcs = ["db_encryption_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_flush_test", + srcs = ["db_flush_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_inplace_update_test", + srcs = ["db_inplace_update_test.cc"], + deps = [ + ":db", + ":test_utils", + 
"//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_io_failure_test", + srcs = ["db_io_failure_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_iter_test", + srcs = ["db_iter_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_iterator_test", + srcs = ["db_iterator_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + 
"//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_log_iter_test", + srcs = ["db_log_iter_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_memtable_test", + srcs = ["db_memtable_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_merge_operator_test", + srcs = ["db_merge_operator_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_options_test", + 
srcs = ["db_options_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_properties_test", + srcs = ["db_properties_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_range_del_test", + srcs = ["db_range_del_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_sst_test", + srcs = ["db_sst_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + 
"//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + tags = ["manual"], + timeout = "eternal", +) + +constrained_test( + name = "db_statistics_test", + srcs = ["db_statistics_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_table_properties_test", + srcs = ["db_table_properties_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/table_properties_collectors", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_tailing_iter_test", + srcs = ["db_tailing_iter_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + 
"//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_test", + srcs = ["db_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + tags = ["manual"], + timeout = "eternal", +) + +constrained_test( + name = "db_universal_compaction_test", + srcs = ["db_universal_compaction_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + timeout = "long", +) + +constrained_test( + name = "db_wal_test", + srcs = ["db_wal_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "db_write_test", + srcs = ["db_write_test.cc"], + 
deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "dbformat_test", + srcs = ["dbformat_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "deletefile_test", + srcs = ["deletefile_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "external_sst_file_basic_test", + srcs = ["external_sst_file_basic_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + 
"//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "external_sst_file_test", + srcs = ["external_sst_file_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + tags = ["manual"], + timeout = "eternal", +) + +constrained_test( + name = "fault_injection_test", + srcs = ["fault_injection_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "file_indexer_test", + srcs = ["file_indexer_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + 
"//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "filename_test", + srcs = ["filename_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "flush_job_test", + srcs = ["flush_job_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "listener_test", + srcs = ["listener_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "log_test", + srcs = ["log_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", 
+ "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "manual_compaction_test", + srcs = ["manual_compaction_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "memtable_list_test", + srcs = ["memtable_list_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "merge_helper_test", + srcs = ["merge_helper_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", 
+ "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "merge_test", + srcs = ["merge_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "options_file_test", + srcs = ["options_file_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "perf_context_test", + srcs = ["perf_context_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "plain_table_db_test", + srcs = ["plain_table_db_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + 
"//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "prefix_test", + srcs = ["prefix_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "range_del_aggregator_test", + srcs = ["range_del_aggregator_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "repair_test", + srcs = ["repair_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + 
"//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "table_properties_collector_test", + srcs = ["table_properties_collector_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "version_builder_test", + srcs = ["version_builder_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "version_edit_test", + srcs = ["version_edit_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "version_set_test", + srcs 
= ["version_set_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "wal_manager_test", + srcs = ["wal_manager_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "write_batch_test", + srcs = ["write_batch_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "write_callback_test", + srcs = ["write_callback_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + 
"//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + tags = ["manual"], + timeout = "eternal", +) + +constrained_test( + name = "write_controller_test", + srcs = ["write_controller_test.cc"], + deps = [ + ":db", + ":test_utils", + "//cache", + "//env", + "//memtable", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/empty_main.cc b/empty_main.cc new file mode 100644 index 000000000..ca68d24cc --- /dev/null +++ b/empty_main.cc @@ -0,0 +1,3 @@ +int main(int argc, char* argv[]) { + return 0; +} diff --git a/env/BUILD b/env/BUILD new file mode 100644 index 000000000..55d49c55f --- /dev/null +++ b/env/BUILD @@ -0,0 +1,155 @@ +load("//:rocksdb.bzl", "constrained_library", "constrained_test") + +COMMON_HDRS = [] + +PLATFORM_HDRS = select({ + "//:linux" : [ + "env_chroot.h", + "io_posix.h", + "posix_logger.h", + ], + "//:osx" : [ + "env_chroot.h", + "io_posix.h", + "posix_logger.h", + ], + "//:windows" : [], +}) + +COMMON_SRCS = [ + "env.cc", + "env_encryption.cc", +] + +PLATFORM_SRCS = select({ + "//:linux" : [ + "env_chroot.cc", + "env_posix.cc", + "io_posix.cc", + ], + "//:osx" : [ + "env_chroot.cc", + "env_posix.cc", + "io_posix.cc", + ], + "//:windows" : [], +}) + + +DEFAULT_DEFINES = select({ + "//:linux": ["ROCKSDB_PLATFORM_POSIX=1"], + "//:osx": ["ROCKSDB_PLATFORM_POSIX=1"], + "//conditions:default": [], +}) + +cc_library( + name = "env", + srcs = COMMON_SRCS + PLATFORM_SRCS, + deps = [ 
+ ":headers", + "//include", + "//options", + ], + visibility = ["//visibility:public"], + defines = DEFAULT_DEFINES, +) + +cc_library( + name = "headers", + hdrs = COMMON_HDRS + PLATFORM_HDRS, + visibility = ["//visibility:public"], + defines = DEFAULT_DEFINES, +) + +constrained_library( + name = "test_utils", + hdrs = ["mock_env.h"], + srcs = ["mock_env.cc"], + deps = [ + "//db:headers", + "//port:headers", + "//third_party/gtest", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "env_basic_test", + srcs = ["env_basic_test.cc"], + deps = [ + ":env", + ":test_utils", + "//cache", + "//db", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "env_test", + srcs = ["env_test.cc"], + deps = [ + ":env", + ":test_utils", + "//cache", + "//db", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "mock_env_test", + srcs = ["mock_env_test.cc"], + deps = [ + ":env", + ":test_utils", + "//cache", + "//db", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + 
"//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/include/BUILD b/include/BUILD new file mode 100644 index 000000000..ec9fc4a6b --- /dev/null +++ b/include/BUILD @@ -0,0 +1,12 @@ +cc_library( + name = "include", + hdrs = glob(["**/*.h"]), + visibility = ["//visibility:public"], + strip_include_prefix = "/include", +) + +cc_library( + name = "with_extra_prefix", + hdrs = glob(["**/*.h"]), + visibility = ["//visibility:public"], +) diff --git a/memtable/BUILD b/memtable/BUILD new file mode 100644 index 000000000..ac5673f4f --- /dev/null +++ b/memtable/BUILD @@ -0,0 +1,100 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "memtable", + srcs = glob(["*.cc"], exclude=["memtablerep_bench.cc", "*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + ], + visibility = ["//visibility:public"], + alwayslink = 1, +) + +cc_library( + name = "headers", + srcs = glob(["*.h"]), + visibility = ["//visibility:public"], +) + +constrained_test( + name = "inlineskiplist_test", + srcs = ["inlineskiplist_test.cc"], + deps = [ + ":memtable", + "//cache", + "//db", + "//env", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "skiplist_test", + srcs = ["skiplist_test.cc"], + deps = [ + ":memtable", + "//cache", + "//db", + "//env", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + 
"//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "write_buffer_manager_test", + srcs = ["write_buffer_manager_test.cc"], + deps = [ + ":memtable", + "//cache", + "//db", + "//env", + "//monitoring", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/monitoring/BUILD b/monitoring/BUILD new file mode 100644 index 000000000..2e9f565ab --- /dev/null +++ b/monitoring/BUILD @@ -0,0 +1,107 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "monitoring", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//include:with_extra_prefix", + "//options:headers", + "//port:headers", + "//table:headers", + "//util:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//include", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "histogram_test", + srcs = ["histogram_test.cc"], + deps = [ + ":monitoring", + "//cache", + "//db", + "//env", + "//memtable", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "iostats_context_test", + 
srcs = ["iostats_context_test.cc"], + deps = [ + ":monitoring", + "//cache", + "//db", + "//env", + "//memtable", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "statistics_test", + srcs = ["statistics_test.cc"], + deps = [ + ":monitoring", + "//cache", + "//db", + "//env", + "//memtable", + "//options", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/options/BUILD b/options/BUILD new file mode 100644 index 000000000..29e158702 --- /dev/null +++ b/options/BUILD @@ -0,0 +1,71 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "options", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + srcs = glob(["*.h"]), + visibility = ["//visibility:public"], +) + +constrained_test( + name = "options_test", + srcs = ["options_test.cc"], + deps = [ + ":options", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + 
"//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "options_settable_test", + srcs = ["options_settable_test.cc"], + deps = [ + ":options", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/port/BUILD b/port/BUILD new file mode 100644 index 000000000..9c249cb2f --- /dev/null +++ b/port/BUILD @@ -0,0 +1,85 @@ +PLATFORM_DEFINES = select({ + "//:linux": [ + "OS_LINUX", + "ROCKSDB_PLATFORM_POSIX", + "HAVE_SSE42", + "ROCKSDB_FALLOCATE_PRESENT", + "ROCKSDB_LIB_IO_POSIX", + "ROCKSDB_MALLOC_USABLE_SIZE", + "ROCKSDB_PTHREAD_ADAPTIVE_MUTEX", + "ROCKSDB_RANGESYNC_PRESENT", + "ROCKSDB_SCHED_GETCPU_PRESENT", + "ROCKSDB_SUPPORT_THREAD_LOCAL", + "rocksdb_shared_EXPORTS", + ], + "//:osx": [ + "OS_MACOSX", + "ROCKSDB_PLATFORM_POSIX", + ], + "//:windows": [ + "OS_WIN", + "ROCKSDB_PLATFORM_POSIX", + ], + "//conditions:default": [], +}) + +FEATURE_DEFINES = [ + "LZ4", + "SNAPPY", +] + +DEFAULT_DEFINES = PLATFORM_DEFINES + FEATURE_DEFINES + +COMMON_HDRS = [ + "dirent.h", + "likely.h", + "port.h", + "stack_trace.h", + "sys_time.h", + "util_logger.h", +] + +PLATFORM_HDRS = select({ + "//:linux" : [ + "port_posix.h", + ], + "//:osx" : [ + "port_posix.h", + ], + "//:windows" : glob(["win/*.h"]) + ["xpress.h"], +}) + +COMMON_SRCS = [ + "stack_trace.cc", +] + +PLATFORM_SRCS = select({ + "//:linux" : [ + "port_posix.cc", + ], + "//:osx" : [ + "port_posix.cc", + ], + "//:windows" : glob(["win/*.cc"]), +}) + +cc_library( + name = "port", + srcs = COMMON_SRCS + PLATFORM_SRCS, + deps = [ + ":headers", + "//util:headers", + 
], + visibility = ["//visibility:public"], + defines = DEFAULT_DEFINES, +) + +cc_library( + name = "headers", + hdrs = COMMON_HDRS + PLATFORM_HDRS, + deps = [ + "//env:headers", + ], + visibility = ["//visibility:public"], + defines = DEFAULT_DEFINES, +) diff --git a/rocksdb.bzl b/rocksdb.bzl new file mode 100644 index 000000000..1355dabbd --- /dev/null +++ b/rocksdb.bzl @@ -0,0 +1,67 @@ +def constrained_test( + name, + srcs = [], + deps = [], + copts = [], + linkopts = [], + defines = [], + includes = [], + visibility = ["//visibility:public"], + tags = [], + timeout = None): + return native.cc_test( + name = name, + srcs = select({ + "//:tests_enabled_debug_mode": srcs, + "//:tests_enabled_fastbuild_mode": srcs, + "//conditions:default": [], + }), + deps = select({ + "//:tests_enabled_debug_mode": deps, + "//:tests_enabled_fastbuild_mode": deps, + "//conditions:default": ["//:empty_main"], + }), + copts = copts, + linkopts = linkopts, + defines = defines, + includes = includes, + visibility = visibility, + tags = tags, + timeout = timeout, + ) + +def constrained_library( + name, + hdrs = [], + srcs = [], + deps = [], + copts = [], + linkopts = [], + defines = [], + includes = [], + visibility = ["//visibility:public"], + testonly = True): + return native.cc_library( + name = name, + hdrs = select({ + "//:tests_enabled_debug_mode": hdrs, + "//:tests_enabled_fastbuild_mode": hdrs, + "//conditions:default": [], + }), + srcs = select({ + "//:tests_enabled_debug_mode": srcs, + "//:tests_enabled_fastbuild_mode": srcs, + "//conditions:default": [], + }), + deps = select({ + "//:tests_enabled_debug_mode": deps, + "//:tests_enabled_fastbuild_mode": deps, + "//conditions:default": ["//:empty_main"], + }), + copts = copts, + linkopts = linkopts, + defines = defines, + includes = includes, + visibility = visibility, + testonly = testonly, + ) diff --git a/table/BUILD b/table/BUILD new file mode 100644 index 000000000..3d70f7a0a --- /dev/null +++ b/table/BUILD @@ -0,0 +1,277 
@@ +load("//:rocksdb.bzl", "constrained_library", "constrained_test") + +cc_library( + name = "table", + srcs = glob(["*.cc"], exclude=["table_reader_bench.cc", "*_test.*", "mock_table.*"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//options:headers", + "//util:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"], exclude=["*_test.*", "mock_table.*"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_library( + name = "test_utils", + hdrs = glob(["mock_table.h"]), + srcs = glob(["mock_table.cc"]), + deps = [ + "//memtable:headers", + "//options", + "//util:test_utils", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "block_based_filter_block_test", + srcs = ["block_based_filter_block_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "block_test", + srcs = ["block_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "cleanable_test", + srcs = ["cleanable_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + 
"//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "cuckoo_table_builder_test", + srcs = ["cuckoo_table_builder_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "cuckoo_table_reader_test", + srcs = ["cuckoo_table_reader_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "full_filter_block_test", + srcs = ["full_filter_block_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + 
"//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "merger_test", + srcs = ["merger_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "partitioned_filter_block_test", + srcs = ["partitioned_filter_block_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "table_test", + srcs = ["table_test.cc"], + deps = [ + ":table", + ":test_utils", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//options", + "//third_party/gtest", + "//util", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/leveldb_options", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/third_party/gtest/BUILD b/third_party/gtest/BUILD new file mode 100644 index 000000000..2102b7179 --- /dev/null +++ b/third_party/gtest/BUILD @@ -0,0 +1,19 @@ +licenses(["notice"]) + +cc_library( + 
name = "gtest", + deps = [ + "@com_google_googletest//:gtest_main", + ], + visibility = ["//visibility:public"], +) + +cc_test( + name = "compile_test", + srcs = [ + "compile_test.cpp", + ], + deps = [ + ":gtest", + ], +) diff --git a/third_party/gtest/compile_test.cpp b/third_party/gtest/compile_test.cpp new file mode 100644 index 000000000..02d15dfae --- /dev/null +++ b/third_party/gtest/compile_test.cpp @@ -0,0 +1,9 @@ +#include "gtest/gtest.h" + +namespace stardog { + namespace { + TEST(CompileTest, Compiles) { + EXPECT_TRUE(true); + } + } +} diff --git a/third_party/lz4/BUILD b/third_party/lz4/BUILD new file mode 100644 index 000000000..0ea72e479 --- /dev/null +++ b/third_party/lz4/BUILD @@ -0,0 +1,18 @@ +licenses(["notice"]) + +cc_library( + name = "lz4", + deps = ["@org_lz4//:lib"], + visibility = ["//visibility:public"], +) + +cc_test( + name = "compile_test", + srcs = [ + "compile_test.cpp", + ], + deps = [ + ":lz4", + "//third_party/gtest", + ], +) diff --git a/third_party/lz4/BUILD.external b/third_party/lz4/BUILD.external new file mode 100644 index 000000000..80d78fa06 --- /dev/null +++ b/third_party/lz4/BUILD.external @@ -0,0 +1,8 @@ +cc_library( + name = "lib", + # We include the lz4.c as a header as lz4hc.c actually does #include "lz4.c". 
+ hdrs = glob(["lib/*.h"]) + ["lib/lz4.c"], + srcs = glob(["lib/*.c"]), + visibility = ["//visibility:public"], + strip_include_prefix = "lib", +) diff --git a/third_party/lz4/compile_test.cpp b/third_party/lz4/compile_test.cpp new file mode 100644 index 000000000..37eb69215 --- /dev/null +++ b/third_party/lz4/compile_test.cpp @@ -0,0 +1,10 @@ +#include "lz4.h" +#include "gtest/gtest.h" + +namespace stardog { + namespace { + TEST(CompileTest, Compiles) { + EXPECT_TRUE(true); + } + } +} diff --git a/third_party/snappy/BUILD b/third_party/snappy/BUILD new file mode 100644 index 000000000..667b5a787 --- /dev/null +++ b/third_party/snappy/BUILD @@ -0,0 +1,19 @@ +licenses(["notice"]) + +cc_library( + name = "snappy", + deps = ["@com_google_snappy//:snappy"], + visibility = ["//visibility:public"], +) + +cc_test( + name = "compile_test", + srcs = [ + "compile_test.cpp", + ], + deps = [ + ":snappy", + "//third_party/gtest", + ], +) + diff --git a/third_party/snappy/compile_test.cpp b/third_party/snappy/compile_test.cpp new file mode 100644 index 000000000..121703740 --- /dev/null +++ b/third_party/snappy/compile_test.cpp @@ -0,0 +1,10 @@ +#include "snappy.h" +#include "gtest/gtest.h" + +namespace stardog { + namespace { + TEST(CompileTest, Compiles) { + EXPECT_TRUE(true); + } + } +} diff --git a/tools/BUILD b/tools/BUILD new file mode 100644 index 000000000..c5b5ed253 --- /dev/null +++ b/tools/BUILD @@ -0,0 +1,18 @@ +cc_library( + name = "tools", + deps = [ + ":headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = ["sst_dump_tool_imp.h"], + deps = [ + "//include", + "//options:headers", + "//util:headers", + ], + visibility = ["//visibility:public"], +) diff --git a/util/BUILD b/util/BUILD new file mode 100644 index 000000000..4ef948ee0 --- /dev/null +++ b/util/BUILD @@ -0,0 +1,508 @@ +load("//:rocksdb.bzl", "constrained_library", "constrained_test") + +cc_library( + name = "util", + srcs = glob(["*.cc"], 
exclude=["log_write_bench.cc", "*_test.cc", "testharness.*", "*_test_env.*"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//options:headers", + "//port", + ], + copts = ["-msse4.2", "-mpclmul"], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"], exclude=["testharness.*", "*_test_env.*"]), + deps = [ + "//port:headers", + "//third_party/lz4", + "//third_party/snappy", + ], + visibility = ["//visibility:public"], +) + +constrained_library( + name = "test_utils", + hdrs = glob(["*_test_env.h"]) + ["testharness.h"], + srcs = glob(["*_test_env.cc"]) + ["testharness.cc"], + deps = [ + "//db:headers", + "//port:headers", + "//third_party/gtest", + "//env:test_utils", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "arena_test", + srcs = ["arena_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "auto_roll_logger_test", + srcs = ["auto_roll_logger_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "autovector_test", + srcs = ["autovector_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + 
"//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "bloom_test", + srcs = ["bloom_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "coding_test", + srcs = ["coding_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "crc32c_test", + srcs = ["crc32c_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "delete_scheduler_test", + srcs = 
["delete_scheduler_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "dynamic_bloom_test", + srcs = ["dynamic_bloom_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "event_logger_test", + srcs = ["event_logger_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "filelock_test", + srcs = ["filelock_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + 
"//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "file_reader_writer_test", + srcs = ["file_reader_writer_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "hash_test", + srcs = ["hash_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "heap_test", + srcs = ["heap_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "rate_limiter_test", + srcs = ["rate_limiter_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//db:test_utils", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + 
"//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "slice_transform_test", + srcs = ["slice_transform_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "thread_list_test", + srcs = ["thread_list_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "thread_local_test", + srcs = ["thread_local_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "timer_queue_test", + srcs = ["timer_queue_test.cc"], + deps = [ + ":test_utils", + ":util", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + 
"//table", + "//table:test_utils", + "//third_party/gtest", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/util/build_version.cc b/util/build_version.cc new file mode 100644 index 000000000..b323f6aff --- /dev/null +++ b/util/build_version.cc @@ -0,0 +1,4 @@ +#include "build_version.h" +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:REDACTED"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:REDACTED"; +const char* rocksdb_build_compile_date = __DATE__; diff --git a/util/compression.h b/util/compression.h index e91faeac6..ebb305b83 100644 --- a/util/compression.h +++ b/util/compression.h @@ -20,20 +20,20 @@ #include "util/memory_allocator.h" #ifdef SNAPPY -#include +#include "snappy.h" #endif #ifdef ZLIB -#include +#include "zlib.h" #endif #ifdef BZIP2 -#include +#include "bzlib.h" #endif #if defined(LZ4) -#include -#include +#include "lz4.h" +#include "lz4hc.h" #endif #if defined(ZSTD) diff --git a/utilities/BUILD b/utilities/BUILD new file mode 100644 index 000000000..5617e6c79 --- /dev/null +++ b/utilities/BUILD @@ -0,0 +1,151 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "utilities", + srcs = glob(["*.cc"], exclude = ["column_aware_encoding_exp.cc", "*_test.cc", "env_librados.cc"]), + deps = [ + ":headers", + "//db:headers", + "//monitoring:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//util:headers", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "column_aware_encoding_test", + srcs = ["column_aware_encoding_test.cc"], + deps = [ + ":utilities", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + 
"//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "env_mirror_test", + srcs = ["env_mirror_test.cc"], + deps = [ + ":utilities", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "env_timed_test", + srcs = ["env_timed_test.cc"], + deps = [ + ":utilities", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "object_registry_test", + srcs = ["object_registry_test.cc"], + deps = [ + ":utilities", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "util_merge_operators_test", + srcs = 
["util_merge_operators_test.cc"], + deps = [ + ":utilities", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/backupable/BUILD b/utilities/backupable/BUILD new file mode 100644 index 000000000..e7b27986e --- /dev/null +++ b/utilities/backupable/BUILD @@ -0,0 +1,48 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "backupable", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", + "//utilities/checkpoint:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "test", + srcs = glob(["*_test.cc"]), + deps = [ + ":backupable", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/checkpoint/BUILD b/utilities/checkpoint/BUILD new file mode 100644 index 000000000..5e5c6b183 --- /dev/null +++ b/utilities/checkpoint/BUILD @@ -0,0 +1,49 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "checkpoint", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", 
+ "//utilities/write_batch_with_index", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "checkpoint_test", + srcs = ["checkpoint_test.cc"], + deps = [ + ":checkpoint", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/ttl", + "//utilities/transactions", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/leveldb_options/BUILD b/utilities/leveldb_options/BUILD new file mode 100644 index 000000000..bb57131e3 --- /dev/null +++ b/utilities/leveldb_options/BUILD @@ -0,0 +1,6 @@ +cc_library( + name = "leveldb_options", + srcs = ["leveldb_options.cc"], + deps = ["//include"], + visibility = ["//visibility:public"], +) diff --git a/utilities/merge_operators/BUILD b/utilities/merge_operators/BUILD new file mode 100644 index 000000000..629ae615a --- /dev/null +++ b/utilities/merge_operators/BUILD @@ -0,0 +1,21 @@ +cc_library( + name = "merge_operators", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", + "//utilities/checkpoint:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) diff --git a/utilities/merge_operators/string_append/BUILD b/utilities/merge_operators/string_append/BUILD new file mode 100644 index 000000000..cef16063b --- /dev/null +++ b/utilities/merge_operators/string_append/BUILD @@ -0,0 +1,46 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "string_append", + srcs = glob(["*.cc"], 
exclude=["*_test.cc"]), + deps = [ + ":headers", + "//include", + "//utilities:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "stringappend_test", + srcs = ["stringappend_test.cc"], + deps = [ + ":string_append", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/options/BUILD b/utilities/options/BUILD new file mode 100644 index 000000000..97d27e021 --- /dev/null +++ b/utilities/options/BUILD @@ -0,0 +1,48 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "options", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", + "//utilities/checkpoint:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "options_util_test", + srcs = ["options_util_test.cc"], + deps = [ + ":options", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/transactions", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/table_properties_collectors/BUILD b/utilities/table_properties_collectors/BUILD new file mode 
100644 index 000000000..f577049f7 --- /dev/null +++ b/utilities/table_properties_collectors/BUILD @@ -0,0 +1,38 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "table_properties_collectors", + hdrs = glob(["*.h"]), + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + "//include", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "compact_on_deletion_collector_test", + srcs = ["compact_on_deletion_collector_test.cc"], + deps = [ + ":table_properties_collectors", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/ttl", + "//utilities/transactions", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/transactions/BUILD b/utilities/transactions/BUILD new file mode 100644 index 000000000..db8fa3a95 --- /dev/null +++ b/utilities/transactions/BUILD @@ -0,0 +1,104 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "transactions", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", + "//utilities/checkpoint:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//utilities/merge_operators/string_append:headers", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "optimistic_transaction_test", + srcs = ["optimistic_transaction_test.cc"], + deps = [ + ":transactions", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", 
+ "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], +) + +constrained_test( + name = "transaction_test", + srcs = ["transaction_test.cc"], + deps = [ + ":transactions", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + timeout = "eternal", + tags = ["manual"], +) + +constrained_test( + name = "write_prepared_transaction_test", + srcs = ["write_prepared_transaction_test.cc"], + deps = [ + ":transactions", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/ttl", + "//utilities/write_batch_with_index", + ], + timeout = "eternal", + tags = ["manual"], +) diff --git a/utilities/ttl/BUILD b/utilities/ttl/BUILD new file mode 100644 index 000000000..6d837ff5b --- /dev/null +++ b/utilities/ttl/BUILD @@ -0,0 +1,47 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "ttl", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//port:headers", + "//util:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "ttl_test", + srcs = ["ttl_test.cc"], + deps = [ + ":ttl", + "//cache", + "//db", + "//env", + 
"//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/write_batch_with_index", + ], +) diff --git a/utilities/write_batch_with_index/BUILD b/utilities/write_batch_with_index/BUILD new file mode 100644 index 000000000..000c3d3f3 --- /dev/null +++ b/utilities/write_batch_with_index/BUILD @@ -0,0 +1,48 @@ +load("//:rocksdb.bzl", "constrained_test") + +cc_library( + name = "write_batch_with_index", + srcs = glob(["*.cc"], exclude=["*_test.cc"]), + deps = [ + ":headers", + "//db:headers", + "//include", + "//memtable:headers", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "headers", + hdrs = glob(["*.h"]), + deps = [ + "//options:headers", + ], + visibility = ["//visibility:public"], +) + +constrained_test( + name = "test", + srcs = glob(["*_test.cc"]), + deps = [ + ":write_batch_with_index", + "//cache", + "//db", + "//env", + "//memtable", + "//monitoring", + "//table", + "//table:test_utils", + "//third_party/gtest", + "//util", + "//util:test_utils", + "//utilities", + "//utilities/backupable", + "//utilities/checkpoint", + "//utilities/merge_operators", + "//utilities/merge_operators/string_append", + "//utilities/options", + "//utilities/transactions", + "//utilities/ttl", + ], +) From 2520fc9dc3e64c07d1acf72c69719e164ca49801 Mon Sep 17 00:00:00 2001 From: James Pack Date: Thu, 11 Oct 2018 06:41:31 -0700 Subject: [PATCH 29/57] Update OSX defines --- port/BUILD | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/port/BUILD b/port/BUILD index 9c249cb2f..b5e795221 100644 --- a/port/BUILD +++ b/port/BUILD @@ -3,18 +3,21 @@ PLATFORM_DEFINES = select({ "OS_LINUX", "ROCKSDB_PLATFORM_POSIX", "HAVE_SSE42", - "ROCKSDB_FALLOCATE_PRESENT", 
"ROCKSDB_LIB_IO_POSIX", + "ROCKSDB_SUPPORT_THREAD_LOCAL", + "ROCKSDB_FALLOCATE_PRESENT", "ROCKSDB_MALLOC_USABLE_SIZE", "ROCKSDB_PTHREAD_ADAPTIVE_MUTEX", "ROCKSDB_RANGESYNC_PRESENT", "ROCKSDB_SCHED_GETCPU_PRESENT", - "ROCKSDB_SUPPORT_THREAD_LOCAL", "rocksdb_shared_EXPORTS", ], "//:osx": [ "OS_MACOSX", "ROCKSDB_PLATFORM_POSIX", + "HAVE_SSE42", + "ROCKSDB_LIB_IO_POSIX", + "rocksdb_shared_EXPORTS", ], "//:windows": [ "OS_WIN", From 4704cbea0c686f7c363d988d61abeff5ae7504b1 Mon Sep 17 00:00:00 2001 From: James Pack Date: Mon, 15 Oct 2018 06:39:19 -0400 Subject: [PATCH 30/57] Build with toolchain --- .bazelrc | 1 + WORKSPACE | 13 +++++++++++++ 2 files changed, 14 insertions(+) create mode 100644 .bazelrc diff --git a/.bazelrc b/.bazelrc new file mode 100644 index 000000000..0f380d34b --- /dev/null +++ b/.bazelrc @@ -0,0 +1 @@ +build --crosstool_top=@toolchain//cpp:toolchain diff --git a/WORKSPACE b/WORKSPACE index 6b77be6d4..4d4228dbf 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -21,3 +21,16 @@ new_http_archive( build_file = "third_party/lz4/BUILD.external", sha256 = "6df2bc7b830d4a23ca6f0a19a772fc0a61100f98baa843f9bbf873a80b6840d5", ) + +http_archive( + name = "toolchain", + urls = [ + # The file: URL is useful for testing the build, but is not generally necessary since Bazel handles caching + # external dependencies. + # TODO(james): Remove this URL when the Bazel build is stable. 
+ # "file:///home/james/git/toolchain-master.tgz", + "https://github.com/stardog-union/toolchain/archive/master.zip", + ], + strip_prefix = "toolchain-master", + sha256 = "d0740cacb99833911baba82041bb4429f9d3182522fe0fd4c131335ac8343891", +) From e9a1ddf7911fba0f3c143accc05670dce2e3dd8d Mon Sep 17 00:00:00 2001 From: James Pack Date: Mon, 15 Oct 2018 07:40:47 -0400 Subject: [PATCH 31/57] Fixed link issue --- db/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/db/BUILD b/db/BUILD index 350c59bb5..28543cbf4 100644 --- a/db/BUILD +++ b/db/BUILD @@ -9,6 +9,7 @@ cc_library( ":headers", ], visibility = ["//visibility:public"], + alwayslink = 1, ) cc_library( From 30b49bc613d5471d053dfed1c72331545dbb017b Mon Sep 17 00:00:00 2001 From: James Pack Date: Thu, 25 Oct 2018 16:18:00 -0400 Subject: [PATCH 32/57] Windows is not POSIX --- port/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/port/BUILD b/port/BUILD index b5e795221..afb85c817 100644 --- a/port/BUILD +++ b/port/BUILD @@ -21,7 +21,6 @@ PLATFORM_DEFINES = select({ ], "//:windows": [ "OS_WIN", - "ROCKSDB_PLATFORM_POSIX", ], "//conditions:default": [], }) From 34703d91c445414f5d3e3adf506e4bed02f3d223 Mon Sep 17 00:00:00 2001 From: James Pack Date: Thu, 6 Dec 2018 06:45:58 -0500 Subject: [PATCH 33/57] Exclude benchmarking executable range_del_aggregator_bench.cc from db cc_library --- db/BUILD | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/db/BUILD b/db/BUILD index 28543cbf4..c9b3d756d 100644 --- a/db/BUILD +++ b/db/BUILD @@ -1,6 +1,10 @@ load("//:rocksdb.bzl", "constrained_library", "constrained_test") -EXCLUDED_FILES = ["c.cc", "forward_iterator_bench.cc"] +EXCLUDED_FILES = [ + "c.cc", + "forward_iterator_bench.cc", + "range_del_aggregator_bench.cc", +] cc_library( name = "db", From f4644de43b25fae88a35fe6935aa61e1b4127ae8 Mon Sep 17 00:00:00 2001 From: James Pack Date: Sun, 28 Oct 2018 12:16:37 -0400 Subject: [PATCH 34/57] Remove dependency on gtest_main as that seems to cause some 
bad dependencies in starrocks tests --- third_party/gtest/BUILD | 2 +- third_party/gtest/compile_test.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/third_party/gtest/BUILD b/third_party/gtest/BUILD index 2102b7179..c197318d4 100644 --- a/third_party/gtest/BUILD +++ b/third_party/gtest/BUILD @@ -3,7 +3,7 @@ licenses(["notice"]) cc_library( name = "gtest", deps = [ - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], visibility = ["//visibility:public"], ) diff --git a/third_party/gtest/compile_test.cpp b/third_party/gtest/compile_test.cpp index 02d15dfae..91617474e 100644 --- a/third_party/gtest/compile_test.cpp +++ b/third_party/gtest/compile_test.cpp @@ -7,3 +7,8 @@ namespace stardog { } } } + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} From 0b86b91403b0678496e5c161e86004c181121982 Mon Sep 17 00:00:00 2001 From: Toktarev Alexander Date: Tue, 19 Mar 2019 21:50:11 +0300 Subject: [PATCH 35/57] Include jemalloc_helper.h only if ROCKSDB_JEMALLOC is enabled --- db/malloc_stats.cc | 3 ++- util/jemalloc_nodump_allocator.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index bcee5c3fb..02b43a67d 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -13,8 +13,9 @@ #include #include +#ifdef ROCKSDB_JEMALLOC #include "port/jemalloc_helper.h" - +#endif // ROCKSDB_JEMALLOC namespace rocksdb { diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h index e93c12237..499c28d6c 100644 --- a/util/jemalloc_nodump_allocator.h +++ b/util/jemalloc_nodump_allocator.h @@ -8,8 +8,11 @@ #include #include +#ifdef ROCKSDB_JEMALLOC #include "port/jemalloc_helper.h" #include "port/port.h" +#endif // ROCKSDB_JEMALLOC + #include "rocksdb/memory_allocator.h" #include "util/core_local.h" #include "util/thread_local.h" From 98bf022593396cb6c5ec0bcddfc4fde4e66b17c8 Mon Sep 17 00:00:00 2001 
From: Toktarev Alexander Date: Thu, 21 Mar 2019 15:58:49 +0300 Subject: [PATCH 36/57] Added jemalloc_helper.h as a dependency --- port/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/port/BUILD b/port/BUILD index afb85c817..cd653849e 100644 --- a/port/BUILD +++ b/port/BUILD @@ -36,6 +36,7 @@ COMMON_HDRS = [ "dirent.h", "likely.h", "port.h", + "jemalloc_helper.h", "stack_trace.h", "sys_time.h", "util_logger.h", From feb54904c799dbe35d48023765a9dc5289585f52 Mon Sep 17 00:00:00 2001 From: Toktarev Alexander Date: Tue, 16 Apr 2019 12:50:47 +0300 Subject: [PATCH 37/57] Make content_flags_ to be available for public API --- include/rocksdb/write_batch.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index c40c448fd..ce82ceabd 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -171,6 +171,10 @@ class WriteBatch : public WriteBatchBase { // Otherwise returns Status::OK(). Status PopSavePoint() override; + void setContentFlag(uint32_t theContentFlag) { + content_flags_.store(theContentFlag,std::memory_order_seq_cst); + } + // Support for iterating over the contents of a batch. 
class Handler { public: From 41129d5bf8791d863682523c589495de4bf9bdbe Mon Sep 17 00:00:00 2001 From: James Pack Date: Thu, 26 Sep 2019 12:13:15 -0400 Subject: [PATCH 38/57] Turned off performance measurement by default --- monitoring/perf_context_imp.h | 4 +++- port/BUILD | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index d67654914..a30a2eb9c 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -23,11 +23,13 @@ extern thread_local PerfContext perf_context; #if defined(NPERF_CONTEXT) #define PERF_TIMER_GUARD(metric) -#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ + ticker_type) #define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) #define PERF_COUNTER_ADD(metric, value) +#define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) #else diff --git a/port/BUILD b/port/BUILD index cd653849e..ff9ed9886 100644 --- a/port/BUILD +++ b/port/BUILD @@ -11,6 +11,7 @@ PLATFORM_DEFINES = select({ "ROCKSDB_RANGESYNC_PRESENT", "ROCKSDB_SCHED_GETCPU_PRESENT", "rocksdb_shared_EXPORTS", + "NPERF_CONTEXT", ], "//:osx": [ "OS_MACOSX", @@ -18,9 +19,11 @@ PLATFORM_DEFINES = select({ "HAVE_SSE42", "ROCKSDB_LIB_IO_POSIX", "rocksdb_shared_EXPORTS", + "NPERF_CONTEXT", ], "//:windows": [ "OS_WIN", + "NPERF_CONTEXT", ], "//conditions:default": [], }) From 5636fbae4d90b3fcdbd34ee2efb54c96580f52ac Mon Sep 17 00:00:00 2001 From: matthewvon Date: Tue, 21 Jan 2020 12:40:02 -0500 Subject: [PATCH 39/57] clean include prefix from rocksdb include paths to fix pragma once problem in Windows build. 
--- db/range_del_aggregator.cc | 4 ++-- db/range_del_aggregator.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index 3685d717d..f2a941f93 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -11,8 +11,8 @@ #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/types.h" +#include "rocksdb/comparator.h" +#include "rocksdb/types.h" #include "table/internal_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/table_builder.h" diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 712ae4583..015f8b7e4 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -19,8 +19,8 @@ #include "db/range_del_aggregator.h" #include "db/range_tombstone_fragmenter.h" #include "db/version_edit.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/types.h" +#include "rocksdb/comparator.h" +#include "rocksdb/types.h" #include "table/internal_iterator.h" #include "table/scoped_arena_iterator.h" #include "table/table_builder.h" From 75c14f96c59c19245feb98bab5e7f6685a374672 Mon Sep 17 00:00:00 2001 From: matthewvon Date: Wed, 22 Jan 2020 11:29:13 -0500 Subject: [PATCH 40/57] force win_jemalloc out of the bazel build --- port/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/port/BUILD b/port/BUILD index ff9ed9886..684199889 100644 --- a/port/BUILD +++ b/port/BUILD @@ -66,7 +66,7 @@ PLATFORM_SRCS = select({ "//:osx" : [ "port_posix.cc", ], - "//:windows" : glob(["win/*.cc"]), + "//:windows" : glob(["win/*.cc"], exclude = ["win/win_jemalloc.cc"]), }) cc_library( From 8847d69bf1a3133fe27a012023be8212dc026b45 Mon Sep 17 00:00:00 2001 From: matthewvon Date: Wed, 22 Jan 2020 13:24:31 -0500 Subject: [PATCH 41/57] wow, we do build these utility files that have the same stupid include 
paths. --- utilities/column_aware_encoding_util.cc | 4 ++-- utilities/column_aware_encoding_util.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/utilities/column_aware_encoding_util.cc b/utilities/column_aware_encoding_util.cc index 222ee4680..f50925df6 100644 --- a/utilities/column_aware_encoding_util.cc +++ b/utilities/column_aware_encoding_util.cc @@ -17,8 +17,8 @@ #include #include #include -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/slice.h" +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" #include "rocksdb/env.h" #include "rocksdb/status.h" #include "table/block_based_table_builder.h" diff --git a/utilities/column_aware_encoding_util.h b/utilities/column_aware_encoding_util.h index c2c4fa2d6..9d99b9ef2 100644 --- a/utilities/column_aware_encoding_util.h +++ b/utilities/column_aware_encoding_util.h @@ -8,10 +8,10 @@ #include #include #include "db/dbformat.h" -#include "include/rocksdb/env.h" -#include "include/rocksdb/listener.h" -#include "include/rocksdb/options.h" -#include "include/rocksdb/status.h" +#include "rocksdb/env.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/status.h" #include "options/cf_options.h" #include "table/block_based_table_reader.h" From c16455044c8690a6e9b02047a4f7b40d316a87aa Mon Sep 17 00:00:00 2001 From: matthewvon Date: Mon, 27 Jan 2020 12:02:35 -0500 Subject: [PATCH 42/57] additional deps to help Windows build --- port/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/port/BUILD b/port/BUILD index 684199889..475465292 100644 --- a/port/BUILD +++ b/port/BUILD @@ -75,6 +75,8 @@ cc_library( deps = [ ":headers", "//util:headers", + "//monitoring:headers", + "//include:include", ], visibility = ["//visibility:public"], defines = DEFAULT_DEFINES, From 4556837de2971d7e5d616dd1c254f0a82798c873 Mon Sep 17 00:00:00 2001 From: matthewvon Date: Tue, 11 Feb 2020 15:08:27 -0500 Subject: [PATCH 43/57] mac os cross build suddenly needs 
std::move definition? why? --- include/rocksdb/status.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 40b374ecf..23db3d6e7 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include "rocksdb/slice.h" namespace rocksdb { From 6def808d0066b8153eeb7c9502bedfcd5885e8d8 Mon Sep 17 00:00:00 2001 From: John Bresnahan Date: Thu, 19 Mar 2020 07:18:49 -1000 Subject: [PATCH 44/57] Adding "_WIN32_WINNT=_WIN32_WINNT_VISTA" to compile defines --- port/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/port/BUILD b/port/BUILD index 475465292..f84f06e56 100644 --- a/port/BUILD +++ b/port/BUILD @@ -24,6 +24,7 @@ PLATFORM_DEFINES = select({ "//:windows": [ "OS_WIN", "NPERF_CONTEXT", + "_WIN32_WINNT=_WIN32_WINNT_VISTA", ], "//conditions:default": [], }) From c6063a449904eded64391bccfbce37e53475a1f7 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Thu, 28 May 2020 16:26:30 -0400 Subject: [PATCH 45/57] Feature: AES CTR encryption (#6) * mostly compiling version of env_encrypt_2.h ... one std::move error ... fix next * cleans up build (must add export EXTRA_LDFLAGS=-lcrypto -lssl manually) * hmm, somebody messed with SYNC_POINT defines ... and made debug unbuildable ... unit tests too * Add the original encryption Env to the env_basic_test unit test suite. * I lied. Forced to clean up Java style code in env_encryption by moving declaration half to env_encryption.h to be able to unit test new OpenSSL encryption. What a pain. * Going back to original layout. Rebuilt env_encryption.h/.cc into proper C++ declaration / definition split to allow proper inheritance (resuse). * first batch of tests, other than one for file size, work with new encryption code that does not yet encrypt (infrastructure validation). * ok, GetFileSize() corrected (slow but corrected). 
Considering whether or not to do same for GetChildrenFileAttributes * and add GetChildrenFileAttributes() update * Activate AESBlockAccessCipherStream. Remove dead code from original copying of env_encryption.cc. Unit test works. * change unique_ptr with deleter to traditional pointer code. Circle build did not want to compile it. And add some error checking to EncryptBlock() * use EVP_MD_CTX_create/destroy instead of new/free. This is openssl 1.0 syntax that is compatible in openssl 1.1 * move the definition of Sha1Description_t(std::string &) to .cc file in hopes of eliminating link issues in starrocks unit tests * again move some AES stuff from .h to .cc * add helper constructor to AesCtrKey_t (NOT TESTED). add IsValid() to Sha1Description and AesCtrKey_t * attempt to make initialization easier with Sha1Description_t as const. * const was a really bad idea * need explicit copy constructor with move disabled * removed delete of move constructor ... removal seems suspect * add env_encrypt_2_test to builds. Test Sha1Description_t. * saving for safety. first NIST AES case matches. code is in hack state. will clean and add other cases tomorrow. * code clean up within EncryptBlock. push to see if circle compiles * add remaining NIST cases. * added operator== for unit testing * remove non-portable byteswap.h ... not using it anymore ... and breaks OSX build * attempt include fix for osx * make openssl dependency OS specific * backport files used in Facebook/rocksdb PR * rename our env_encrypt_2 to more rocksdb-like env_encrypt2 * create conditional build of EnvEncrypt2 based on flag ROCKSDB_OPENSSL_AES_CTR * linux library loader code. not integrated. not yet supporting OSX * Linux library load ready for libcrypto SHA1 and RAND functions (includes unit tests). AES CTR functions next. 
* Add remaining functions from libcrypto that are used in EncryptedEnv2 * slight change to have .dylib names instead of .so names on OSX build * Create EncryptedEnv2::WriteKey_t and ReadKeys_t to simplify look of code. * create EncryptedEnv2::Default() to help time static loading of libcrypto. * clang-format applied * remove conditional openssl from OSX build * hmm, missed removing include files for openssl * Revert "hmm, missed removing include files for openssl" This reverts commit e22a1f63cb0e6a6ca8ca44621284ff9aad3c2221. * Revert "remove conditional openssl from OSX build" This reverts commit 4eef8d4161ccf63d1856f238389f1f54a4d9d814. * address PR comments from Alex Co-authored-by: matthewvon Co-authored-by: MatthewVon --- CMakeLists.txt | 3 + Makefile | 8 + TARGETS | 12 + env/BUILD | 12 + env/env_basic_test.cc | 32 +- env/env_encrypt2.cc | 558 ++++++++++++++++++++++++ env/env_encrypt2_test.cc | 704 +++++++++++++++++++++++++++++++ env/env_encryption.cc | 602 ++++++++++++++------------ include/BUILD | 12 + include/rocksdb/env_encrypt2.h | 469 ++++++++++++++++++++ include/rocksdb/env_encryption.h | 508 ++++++++++++++++------ include/rocksdb/perf_context.h | 6 + port/BUILD | 2 + src.mk | 4 + util/BUILD | 8 + util/build_version.cc | 4 +- util/library_loader.cc | 114 +++++ util/library_loader.h | 169 ++++++++ util/library_loader_test.cc | 87 ++++ util/sync_point.h | 8 +- 20 files changed, 2890 insertions(+), 432 deletions(-) create mode 100644 env/env_encrypt2.cc create mode 100644 env/env_encrypt2_test.cc create mode 100644 include/rocksdb/env_encrypt2.h create mode 100644 util/library_loader.cc create mode 100644 util/library_loader.h create mode 100644 util/library_loader_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 85c59064e..92011100e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,6 +522,7 @@ set(SOURCES env/env.cc env/env_chroot.cc env/env_encryption.cc + env/env_encrypt2.cc env/env_hdfs.cc env/mock_env.cc memtable/alloc_tracker.cc @@ 
-918,6 +919,7 @@ if(WITH_TESTS) db/write_callback_test.cc db/write_controller_test.cc env/env_basic_test.cc + env/env_encrypt2_test.cc env/env_test.cc env/mock_env_test.cc memtable/inlineskiplist_test.cc @@ -956,6 +958,7 @@ if(WITH_TESTS) util/hash_test.cc util/heap_test.cc util/rate_limiter_test.cc + util/library_loader_test.cc util/repeatable_thread_test.cc util/slice_transform_test.cc util/timer_queue_test.cc diff --git a/Makefile b/Makefile index 09e2cd3ea..b6b0abf46 100644 --- a/Makefile +++ b/Makefile @@ -419,8 +419,10 @@ TESTS = \ coding_test \ inlineskiplist_test \ env_basic_test \ + env_encrypt2_test \ env_test \ hash_test \ + library_loader_test \ thread_local_test \ rate_limiter_test \ perf_context_test \ @@ -1135,6 +1137,9 @@ coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) hash_test: util/hash_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +library_loader_test: util/library_loader_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + option_change_migration_test: utilities/option_change_migration/option_change_migration_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -1306,6 +1311,9 @@ sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(L spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +env_encrypt2_test: env/env_encrypt2_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 43f8bd5b2..8a34df4ac 100644 --- a/TARGETS +++ b/TARGETS @@ -139,6 +139,7 @@ cpp_library( "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", + "env/env_encrypt2.cc", "env/env_hdfs.cc", "env/env_posix.cc", "env/io_posix.cc", @@ -227,6 +228,7 @@ cpp_library( "util/filter_policy.cc", "util/hash.cc", "util/jemalloc_nodump_allocator.cc", + "util/library_loader.cc", "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", @@ -702,6 +704,11 @@ ROCKS_TESTS = 
[ "env/env_basic_test.cc", "serial", ], + [ + "env_encrypt2_test", + "env/env_encrypt2_test.cc", + "serial", + ], [ "env_test", "env/env_test.cc", @@ -812,6 +819,11 @@ ROCKS_TESTS = [ "tools/ldb_cmd_test.cc", "serial", ], + [ + "library_loader_test", + "util/library_loader_test.cc", + "serial", + ], [ "listener_test", "db/listener_test.cc", diff --git a/env/BUILD b/env/BUILD index 55d49c55f..22176e63d 100644 --- a/env/BUILD +++ b/env/BUILD @@ -19,6 +19,7 @@ PLATFORM_HDRS = select({ COMMON_SRCS = [ "env.cc", "env_encryption.cc", + "env_encrypt2.cc", ] PLATFORM_SRCS = select({ @@ -100,6 +101,17 @@ constrained_test( ], ) +constrained_test( + name = "env_encrypt2_test", + srcs = ["env_encrypt2_test.cc"], + deps = [ + ":env", + ":test_utils", + "//util", + "//util:test_utils", + ], +) + constrained_test( name = "env_test", srcs = ["env_test.cc"], diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 22983dbec..42a6a98ef 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -1,17 +1,19 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#include #include #include #include -#include #include "env/mock_env.h" #include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" #include "rocksdb/utilities/object_registry.h" #include "util/testharness.h" - namespace rocksdb { // Normalizes trivial differences across Envs such that these test cases can @@ -21,8 +23,8 @@ class NormalizingEnvWrapper : public EnvWrapper { explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} // Removes . and .. 
from directory listing - virtual Status GetChildren(const std::string& dir, - std::vector* result) override { + Status GetChildren(const std::string& dir, + std::vector* result) override { Status status = EnvWrapper::GetChildren(dir, result); if (status.ok()) { result->erase(std::remove_if(result->begin(), result->end(), @@ -35,7 +37,7 @@ class NormalizingEnvWrapper : public EnvWrapper { } // Removes . and .. from directory listing - virtual Status GetChildrenFileAttributes( + Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override { Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); if (status.ok()) { @@ -60,11 +62,9 @@ class EnvBasicTestWithParam : public testing::Test, test_dir_ = test::PerThreadDBPath(env_, "env_basic_test"); } - void SetUp() { - env_->CreateDirIfMissing(test_dir_); - } + void SetUp() override { env_->CreateDirIfMissing(test_dir_); } - void TearDown() { + void TearDown() override { std::vector files; env_->GetChildren(test_dir_, &files); for (const auto& file : files) { @@ -90,6 +90,19 @@ INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, ::testing::Values(mock_env.get())); + +// next statements run env test against default encryption code. 
+static ROT13BlockCipher encrypt_block_rot13(32); + +static CTREncryptionProvider encrypt_provider_ctr(encrypt_block_rot13); + +static std::unique_ptr encrypt_env(new NormalizingEnvWrapper( + NewEncryptedEnv(Env::Default(), &encrypt_provider_ctr))); +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, + ::testing::Values(encrypt_env.get())); +INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, + ::testing::Values(encrypt_env.get())); + #ifndef ROCKSDB_LITE static std::unique_ptr mem_env(NewMemEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MemEnv, EnvBasicTestWithParam, @@ -111,6 +124,7 @@ std::vector GetCustomEnvs() { const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { custom_env = NewCustomObject(uri, &custom_env_guard); +// Env::LoadEnv(uri, &custom_env); } } diff --git a/env/env_encrypt2.cc b/env/env_encrypt2.cc new file mode 100644 index 000000000..a2af1087d --- /dev/null +++ b/env/env_encrypt2.cc @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// +// env_encryption.cc copied to this file then modified. 
+ +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include "rocksdb/env_encrypt2.h" + +#include +#include +#include + +#include "util/coding.h" +#include "util/random.h" + +#endif + +namespace rocksdb { + +// following define block from page 70: +// https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf +#if !defined(ALIGN16) +#if defined(__GNUC__) +#define ALIGN16 __attribute__((aligned(16))) +#else +#define ALIGN16 __declspec(align(16)) +#endif +#endif + +#ifndef ROCKSDB_LITE + +Sha1Description_t::Sha1Description_t(const std::string& key_desc_str) { + bool good = {true}; + int ret_val; + unsigned len; + + memset(desc, 0, EVP_MAX_MD_SIZE); + if (0 != key_desc_str.length() && EncryptedEnv2::crypto_.IsValid()) { + std::unique_ptr context( + EncryptedEnv2::crypto_.EVP_MD_CTX_new(), + EncryptedEnv2::crypto_.EVP_MD_CTX_free_ptr()); + + ret_val = EncryptedEnv2::crypto_.EVP_DigestInit_ex( + context.get(), EncryptedEnv2::crypto_.EVP_sha1(), nullptr); + good = (1 == ret_val); + if (good) { + ret_val = EncryptedEnv2::crypto_.EVP_DigestUpdate( + context.get(), key_desc_str.c_str(), key_desc_str.length()); + good = (1 == ret_val); + } + + if (good) { + ret_val = + EncryptedEnv2::crypto_.EVP_DigestFinal_ex(context.get(), desc, &len); + good = (1 == ret_val); + } + } else { + good = false; + } + + valid = good; +} + +AesCtrKey_t::AesCtrKey_t(const std::string& key_str) : valid(false) { + memset(key, 0, EVP_MAX_KEY_LENGTH); + + // simple parse: must be 64 characters long and hexadecimal values + if (64 == key_str.length()) { + auto bad_pos = key_str.find_first_not_of("abcdefABCDEF0123456789"); + if (std::string::npos == bad_pos) { + for (size_t idx = 0, idx2 = 0; idx < key_str.length(); idx += 2, ++idx2) { + std::string hex_string(key_str.substr(idx, 2)); + key[idx2] = std::stoul(hex_string, 0, 16); + } + valid = true; + } + } +} + +// +// AES_BLOCK_SIZE assumed to be 16 +// +typedef union { + uint64_t nums[2]; + 
uint8_t bytes[AES_BLOCK_SIZE]; +} AesAlignedBlock_t; + +Status AESBlockAccessCipherStream::EncryptBlock(uint64_t blockIndex, char* data, + char* /*scratch*/) { + // + // AES_BLOCK_SIZE assumed to be 16 + // + assert(AES_BLOCK_SIZE == 16); + assert(sizeof(AesAlignedBlock_t) == AES_BLOCK_SIZE); + + Status status; + ALIGN16 AesAlignedBlock_t block_in, block_out, iv; + int out_len = 0, in_len = {AES_BLOCK_SIZE}, ret_val; + + if (EncryptedEnv2::crypto_.IsValid()) { + std::unique_ptr context( + EncryptedEnv2::crypto_.EVP_CIPHER_CTX_new(), + EncryptedEnv2::crypto_.EVP_CIPHER_CTX_free_ptr()); + + // https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf + memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE / 2); + EncodeFixed64((char*)&iv.bytes[AES_BLOCK_SIZE / 2], + blockIndex); // this will be little endian + block_in.nums[0] = 0; + block_in.nums[1] = 0; + + ret_val = EncryptedEnv2::crypto_.EVP_EncryptInit_ex( + context.get(), EncryptedEnv2::crypto_.EVP_aes_256_ctr(), nullptr, + key_.key, iv.bytes); + if (1 == ret_val) { + ret_val = EncryptedEnv2::crypto_.EVP_EncryptUpdate( + context.get(), block_out.bytes, &out_len, block_in.bytes, in_len); + + if (1 != ret_val || AES_BLOCK_SIZE != out_len) { + status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", + AES_BLOCK_SIZE == out_len + ? "bad return value" + : "output length short"); + } + } else { + status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); + } + + // XOR data with ciphertext. 
+ if (status.ok()) { // block_out holds keystream only when both EVP calls succeeded + uint64_t* data_ptr = (uint64_t*)data; + data_ptr[0] ^= block_out.nums[0]; + data_ptr[1] ^= block_out.nums[1]; + } + } else { + status = Status::NotSupported( + "libcrypto not available for encryption/decryption."); + } + + return status; +} + +Status AESBlockAccessCipherStream::DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) { + return EncryptBlock(blockIndex, data, scratch); +} + +Status CTREncryptionProvider2::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) { + Status s; + if (EncryptedEnv2::crypto_.IsValid()) { + if (sizeof(Prefix0_t) <= prefixLength) { + int ret_val; + + Prefix0_t* pf = {(Prefix0_t*)prefix}; + memcpy(pf->key_description_, key_desc_.desc, sizeof(key_desc_.desc)); + ret_val = EncryptedEnv2::crypto_.RAND_bytes( + (unsigned char*)&pf->nonce_, + AES_BLOCK_SIZE / 2); // RAND_poll() to initialize + if (1 != ret_val) { + s = Status::NotSupported("RAND_bytes failed"); + } + } else { + s = Status::NotSupported("Prefix size needs to be 28 or more"); + } + } else { + s = Status::NotSupported("RAND_bytes() from libcrypto not available."); + } + + return s; +} + +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from disk. +Env* NewEncryptedEnv2(Env* base_env, EncryptedEnv2::ReadKeys_t encrypt_read, + EncryptedEnv2::WriteKey_t encrypt_write) { + Env* ret_env{base_env}; + EncryptedEnv2* new_env{nullptr}; + + if (Env::Default() == base_env) { + // use safer static construction so libcrypto is synchronously loaded + new_env = + (EncryptedEnv2*)EncryptedEnv2::Default(encrypt_read, encrypt_write); + } else if (nullptr != base_env) { + new_env = new EncryptedEnv2(base_env, encrypt_read, encrypt_write); + } + + // warning, dynamic loading of libcrypto could be delayed ...
making this + // false + if (nullptr != new_env && new_env->IsValid()) { + ret_env = new_env; + } + + return ret_env; +} + +EncryptedEnv2::EncryptedEnv2(Env* base_env, + EncryptedEnv2::ReadKeys_t encrypt_read, + EncryptedEnv2::WriteKey_t encrypt_write) + : EnvWrapper(base_env), + encrypt_read_(encrypt_read), + encrypt_write_(encrypt_write), + valid_(false) { + valid_ = crypto_.IsValid(); + + // warning, dynamic loading of libcrypto could be delayed ... making this + // false + if (IsValid()) { + crypto_.RAND_poll(); + } +} + +EncryptedEnv2::EncryptedEnv2(Env* base_env) + : EnvWrapper(base_env), valid_(false) {} + +void EncryptedEnv2::SetKeys(EncryptedEnv2::ReadKeys_t encrypt_read, + EncryptedEnv2::WriteKey_t encrypt_write) { + encrypt_read_ = encrypt_read; + encrypt_write_ = encrypt_write; + + valid_ = crypto_.IsValid(); + + if (IsValid()) { + crypto_.RAND_poll(); + } +} + +// NewSequentialFile opens a file for sequential reading. +Status EncryptedEnv2::NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + result->reset(); + if (options.use_mmap_reads) { + return Status::InvalidArgument(); + } + + // Open file using underlying Env implementation + std::unique_ptr underlying; + auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); + if (status.ok()) { + std::shared_ptr provider; + std::unique_ptr stream; + status = ReadSeqEncryptionPrefix(underlying.get(), provider, + stream); + + if (status.ok()) { + if (provider) { + (*result) = std::unique_ptr( + new EncryptedSequentialFile(underlying.release(), stream.release(), + provider->GetPrefixLength())); + + } else { + // normal file, not encrypted + // sequential file might not allow backing up to begining, close and + // reopen + underlying.reset(nullptr); + status = EnvWrapper::NewSequentialFile(fname, result, options); + } + } + } + + return status; +} + +// NewRandomAccessFile opens a file for random read access. 
+Status EncryptedEnv2::NewRandomAccessFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& options) { + result->reset(); + if (options.use_mmap_reads) { + return Status::InvalidArgument(); + } + + // Open file using underlying Env implementation + std::unique_ptr underlying; + auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); + if (status.ok()) { + std::shared_ptr provider; + std::unique_ptr stream; + status = ReadRandEncryptionPrefix(underlying.get(), + provider, stream); + + if (status.ok()) { + if (provider) { + (*result) = + std::unique_ptr(new EncryptedRandomAccessFile( + underlying.release(), stream.release(), + provider->GetPrefixLength())); + + } else { + // normal file, not encrypted + (*result).reset(underlying.release()); + } + } + } + return status; +} + +// NewWritableFile opens a file for sequential writing. +Status EncryptedEnv2::NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status status; + result->reset(); + + if (!options.use_mmap_writes) { + // Open file using underlying Env implementation + std::unique_ptr underlying; + status = EnvWrapper::NewWritableFile(fname, &underlying, options); + + if (status.ok()) { + if (IsWriteEncrypted()) { + std::unique_ptr stream; + + status = WriteSeqEncryptionPrefix(underlying.get(), stream); + + if (status.ok()) { + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), + encrypt_write_.second->GetPrefixLength())); + } + } else { + (*result).reset(underlying.release()); + } + } + } else { + status = Status::InvalidArgument(); + } + + return status; +} + +// Create an object that writes to a new file with the specified +// name. Deletes any existing file with the same name and creates a +// new file. On success, stores a pointer to the new file in +// *result and returns OK. On failure stores nullptr in *result and +// returns non-OK. 
+// +// The returned file will only be accessed by one thread at a time. +Status EncryptedEnv2::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status status; + result->reset(); + + if (!options.use_mmap_writes) { + // Open file using underlying Env implementation + std::unique_ptr underlying; + status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); + + if (status.ok()) { + if (IsWriteEncrypted()) { + std::unique_ptr stream; + + status = WriteSeqEncryptionPrefix(underlying.get(), stream); + + if (status.ok()) { + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), + encrypt_write_.second->GetPrefixLength())); + } + } else { + (*result).reset(underlying.release()); + } + } + } else { + status = Status::InvalidArgument(); + } + + return status; +} + +// Reuse an existing file by renaming it and opening it as writable. +Status EncryptedEnv2::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status status; + result->reset(); + + if (!options.use_mmap_writes) { + // Open file using underlying Env implementation + std::unique_ptr underlying; + status = + EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + + if (status.ok()) { + if (IsWriteEncrypted()) { + std::unique_ptr stream; + + status = WriteSeqEncryptionPrefix(underlying.get(), stream); + + if (status.ok()) { + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), + encrypt_write_.second->GetPrefixLength())); + } + } else { + (*result).reset(underlying.release()); + } + } + } else { + status = Status::InvalidArgument(); + } + + return status; +} + +// Open `fname` for random read and write, if file doesn't exist the file +// will be created. On success, stores a pointer to the new file in +// *result and returns OK. On failure returns non-OK. 
+// +// The returned file will only be accessed by one thread at a time. +Status EncryptedEnv2::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { + Status status; + result->reset(); + + // Check file exists + bool isNewFile = !FileExists(fname).ok(); + + if (!options.use_mmap_writes && !options.use_mmap_reads) { + // Open file using underlying Env implementation + std::unique_ptr underlying; + status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); + + if (status.ok()) { + std::shared_ptr provider; + std::unique_ptr stream; + + if (!isNewFile) { + // file exists, get existing crypto info + status = ReadRandEncryptionPrefix(underlying.get(), + provider, stream); + + } else { + // new file + if (IsWriteEncrypted()) { + status = WriteRandEncryptionPrefix(underlying.get(), stream); + provider = encrypt_write_.second; + } + } + + // establish encrypt or not, finalize file object + if (status.ok()) { + if (provider) { + (*result) = std::unique_ptr( + new EncryptedRandomRWFile(underlying.release(), stream.release(), + provider->GetPrefixLength())); + } else { + (*result).reset(underlying.release()); + } + } + } + } else { + status = Status::InvalidArgument(); + } + + return status; +} + +// Store in *result the attributes of the children of the specified directory. +// In case the implementation lists the directory prior to iterating the files +// and files are concurrently deleted, the deleted files will be omitted from +// result. +// The name attributes are relative to "dir". +// Original contents of *results are dropped. +// Returns OK if "dir" exists and "*result" contains its children. +// NotFound if "dir" does not exist, the calling process does not have +// permission to access "dir", or if "dir" is invalid. 
+// IOError if an IO Error was encountered +Status EncryptedEnv2::GetChildrenFileAttributes( + const std::string& dir, std::vector* result) { + auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); + if (status.ok()) { + // this is slightly expensive, but fortunately not used heavily + std::shared_ptr provider; + + for (auto it = std::begin(*result); it != std::end(*result); ++it) { + status = GetEncryptionProvider(dir + "/" + it->name, provider); + + if (status.ok() && provider) { + size_t prefixLength = provider->GetPrefixLength(); + + if (prefixLength <= it->size_bytes) it->size_bytes -= prefixLength; + } + } + } + + return status; +} + +// Store the size of fname in *file_size. +Status EncryptedEnv2::GetFileSize(const std::string& fname, + uint64_t* file_size) { + Status status; + status = EnvWrapper::GetFileSize(fname, file_size); + + if (status.ok()) { + // this is slightly expensive, but fortunately not used heavily + std::shared_ptr provider; + status = GetEncryptionProvider(fname, provider); + if (status.ok() && provider) { + size_t prefixLength = provider->GetPrefixLength(); + if (prefixLength <= *file_size) *file_size -= prefixLength; + } + } + + return status; +} + +Status EncryptedEnv2::GetEncryptionProvider( + const std::string& fname, std::shared_ptr& provider) { + std::unique_ptr underlying; + EnvOptions options; + Status status; + + provider.reset(); + status = Env::Default()->NewSequentialFile(fname, &underlying, options); + + if (status.ok()) { + std::unique_ptr stream; + status = EncryptedEnv2::ReadSeqEncryptionPrefix(underlying.get(), provider, + stream); + } + + return status; +} + +UnixLibCrypto EncryptedEnv2::crypto_; + +Env* EncryptedEnv2::Default() { + // the rational for this routine is to help force the static + // loading of UnixLibCrypto before other routines start + // using the encryption code.
+ static EncryptedEnv2 default_env(Env::Default()); + return &default_env; +} + +Env* EncryptedEnv2::Default(EncryptedEnv2::ReadKeys_t encrypt_read, + EncryptedEnv2::WriteKey_t encrypt_write) { + EncryptedEnv2* default_env = (EncryptedEnv2*)Default(); + default_env->SetKeys(encrypt_read, encrypt_write); + return default_env; +} + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/env/env_encrypt2_test.cc b/env/env_encrypt2_test.cc new file mode 100644 index 000000000..ca27b4754 --- /dev/null +++ b/env/env_encrypt2_test.cc @@ -0,0 +1,704 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/env_encrypt2.h" + +#include "rocksdb/options.h" +#include "rocksdb/sst_file_writer.h" +#include "util/testharness.h" + +#ifdef ROCKSDB_OPENSSL_AES_CTR + +namespace rocksdb { + +class EnvEncrypt2_Sha1 {}; + +TEST(EnvEncrypt2_Sha1, Default) { + Sha1Description_t desc; + + ASSERT_FALSE(desc.IsValid()); + for (size_t idx = 0; idx < sizeof(desc.desc); ++idx) { + ASSERT_TRUE('\0' == desc.desc[idx]); + } +} + +TEST(EnvEncrypt2_Sha1, Constructors) { + Sha1Description_t desc; + + // verify we know size of desc.desc + ASSERT_TRUE(64 == sizeof(desc.desc)); + + uint8_t bytes[128], *ptr; + for (size_t idx = 0; idx < sizeof(bytes); ++idx) { + bytes[idx] = idx + 1; + } + + Sha1Description_t desc_bad1(bytes, 128); + ASSERT_FALSE(desc_bad1.IsValid()); + + Sha1Description_t desc_bad2(bytes, 65); + ASSERT_FALSE(desc_bad2.IsValid()); + + Sha1Description_t desc_good1(bytes, 64); + ASSERT_TRUE(desc_good1.IsValid()); + ptr = (uint8_t*)memchr(desc_good1.desc, 0, 64); + ASSERT_TRUE(nullptr == ptr); + + Sha1Description_t desc_good2(bytes, 63); + ASSERT_TRUE(desc_good2.IsValid()); + ptr = (uint8_t*)memchr(desc_good2.desc, 0, 64); + ASSERT_TRUE(&desc_good2.desc[63] == ptr); 
+ + Sha1Description_t desc_good3(bytes, 1); + ASSERT_TRUE(desc_good3.IsValid()); + ptr = (uint8_t*)memchr(desc_good3.desc, 0, 64); + ASSERT_TRUE(&desc_good3.desc[1] == ptr); + + Sha1Description_t desc_good4(bytes, 0); + ASSERT_TRUE(desc_good4.IsValid()); + ptr = (uint8_t*)memchr(desc_good4.desc, 0, 64); + ASSERT_TRUE(&desc_good4.desc[0] == ptr); + + Sha1Description_t desc_str1(""); + ASSERT_FALSE(desc_str1.IsValid()); + + uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 0x79, 0x13, 0xb0, 0x4c, 0x54, 0x57, + 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, 0x39, 0x54, 0x28, 0xab}; + Sha1Description_t desc_str2("1"); + ASSERT_TRUE(desc_str2.IsValid()); + ASSERT_TRUE(0 == memcmp(md2, desc_str2.desc, sizeof(md2))); + for (size_t idx = sizeof(md2); idx < sizeof(desc_str2.desc); ++idx) { + ASSERT_TRUE(0 == desc_str2.desc[idx]); + } + + uint8_t md3[] = {0x7b, 0x52, 0x00, 0x9b, 0x64, 0xfd, 0x0a, 0x2a, 0x49, 0xe6, + 0xd8, 0xa9, 0x39, 0x75, 0x30, 0x77, 0x79, 0x2b, 0x05, 0x54}; + Sha1Description_t desc_str3("12"); + ASSERT_TRUE(desc_str3.IsValid()); + ASSERT_TRUE(0 == memcmp(md3, desc_str3.desc, sizeof(md3))); + for (size_t idx = sizeof(md3); idx < sizeof(desc_str3.desc); ++idx) { + ASSERT_TRUE(0 == desc_str3.desc[idx]); + } +} + +TEST(EnvEncrypt2_Sha1, Copy) { + // assignment + uint8_t md1[] = {0xdb, 0x8a, 0xc1, 0xc2, 0x59, 0xeb, 0x89, 0xd4, 0xa1, 0x31, + 0xb2, 0x53, 0xba, 0xcf, 0xca, 0x5f, 0x31, 0x9d, 0x54, 0xf2}; + Sha1Description_t desc1("HelloWorld"), desc2; + ASSERT_TRUE(desc1.IsValid()); + ASSERT_FALSE(desc2.IsValid()); + + desc2 = desc1; + ASSERT_TRUE(desc1.IsValid()); + ASSERT_TRUE(desc2.IsValid()); + ASSERT_TRUE(0 == memcmp(md1, desc1.desc, sizeof(md1))); + for (size_t idx = sizeof(md1); idx < sizeof(desc1.desc); ++idx) { + ASSERT_TRUE(0 == desc1.desc[idx]); + } + ASSERT_TRUE(0 == memcmp(md1, desc2.desc, sizeof(md1))); + for (size_t idx = sizeof(md1); idx < sizeof(desc2.desc); ++idx) { + ASSERT_TRUE(0 == desc2.desc[idx]); + } + + // copy constructor + uint8_t md3[] = {0x17, 0x09, 0xcc, 
0x51, 0x65, 0xf5, 0x50, 0x4d, 0x46, 0xde, + 0x2f, 0x3a, 0x7a, 0xff, 0x57, 0x45, 0x20, 0x8a, 0xed, 0x44}; + Sha1Description_t desc3("A little be longer title for a key"); + ASSERT_TRUE(desc3.IsValid()); + + Sha1Description_t desc4(desc3); + ASSERT_TRUE(desc3.IsValid()); + ASSERT_TRUE(desc4.IsValid()); + ASSERT_TRUE(0 == memcmp(md3, desc3.desc, sizeof(md3))); + for (size_t idx = sizeof(md3); idx < sizeof(desc3.desc); ++idx) { + ASSERT_TRUE(0 == desc3.desc[idx]); + } + ASSERT_TRUE(0 == memcmp(md3, desc4.desc, sizeof(md3))); + for (size_t idx = sizeof(md3); idx < sizeof(desc4.desc); ++idx) { + ASSERT_TRUE(0 == desc4.desc[idx]); + } +} + +class EnvEncrypt2_Key {}; + +TEST(EnvEncrypt2_Key, Default) { + AesCtrKey_t key; + + ASSERT_FALSE(key.IsValid()); + for (size_t idx = 0; idx < sizeof(key.key); ++idx) { + ASSERT_TRUE('\0' == key.key[idx]); + } +} + +TEST(EnvEncrypt2_Key, Constructors) { + AesCtrKey_t key; + + // verify we know size of key.key + ASSERT_TRUE(64 == sizeof(key.key)); + + uint8_t bytes[128], *ptr; + for (size_t idx = 0; idx < sizeof(bytes); ++idx) { + bytes[idx] = idx + 1; + } + + AesCtrKey_t key_bad1(bytes, 128); + ASSERT_FALSE(key_bad1.IsValid()); + + AesCtrKey_t key_bad2(bytes, 65); + ASSERT_FALSE(key_bad2.IsValid()); + + AesCtrKey_t key_good1(bytes, 64); + ASSERT_TRUE(key_good1.IsValid()); + ptr = (uint8_t*)memchr(key_good1.key, 0, 64); + ASSERT_TRUE(nullptr == ptr); + + AesCtrKey_t key_good2(bytes, 63); + ASSERT_TRUE(key_good2.IsValid()); + ptr = (uint8_t*)memchr(key_good2.key, 0, 64); + ASSERT_TRUE(&key_good2.key[63] == ptr); + + AesCtrKey_t key_good3(bytes, 1); + ASSERT_TRUE(key_good3.IsValid()); + ptr = (uint8_t*)memchr(key_good3.key, 0, 64); + ASSERT_TRUE(&key_good3.key[1] == ptr); + + AesCtrKey_t key_good4(bytes, 0); + ASSERT_TRUE(key_good4.IsValid()); + ptr = (uint8_t*)memchr(key_good4.key, 0, 64); + ASSERT_TRUE(&key_good4.key[0] == ptr); + + AesCtrKey_t key_str1(""); + ASSERT_FALSE(key_str1.IsValid()); + + AesCtrKey_t key_str2("0x35"); + 
ASSERT_FALSE(key_str2.IsValid()); + + // 1234567890123456789012345678901234567890123456789012345678901234 + AesCtrKey_t key_str3( + "RandomSixtyFourCharactersLaLaLaLaJust a bunch of letters, not 0x"); + ASSERT_FALSE(key_str3.IsValid()); + + uint8_t key4[] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, + 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20}; + // 1234567890123456789012345678901234567890123456789012345678901234 + AesCtrKey_t key_str4( + "0102030405060708090A0B0C0D0E0F101112131415161718191a1b1c1d1e1f20"); + ASSERT_TRUE(key_str4.IsValid()); + ASSERT_TRUE(0 == memcmp(key4, key_str4.key, sizeof(key4))); +} + +TEST(EnvEncrypt2_Key, Copy) { + // assignment + uint8_t data1[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, + 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; + AesCtrKey_t key1(data1, sizeof(data1)), key2; + ASSERT_TRUE(key1.IsValid()); + ASSERT_FALSE(key2.IsValid()); + + key2 = key1; + ASSERT_TRUE(key1.IsValid()); + ASSERT_TRUE(key2.IsValid()); + ASSERT_TRUE(0 == memcmp(data1, key1.key, sizeof(data1))); + ASSERT_TRUE(0 == memcmp(data1, key2.key, sizeof(data1))); + + // copy constructor + uint8_t data3[] = {0x21, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0x22, 0x20, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; + AesCtrKey_t key3(data3, sizeof(data3)); + ASSERT_TRUE(key3.IsValid()); + + AesCtrKey_t key4(key3); + ASSERT_TRUE(key3.IsValid()); + ASSERT_TRUE(key4.IsValid()); + ASSERT_TRUE(0 == memcmp(data3, key3.key, sizeof(data3))); + ASSERT_TRUE(0 == memcmp(data3, key4.key, sizeof(data3))); +} + +class EnvEncrypt2_Provider {}; + +class CipherStreamWrapper : public BlockAccessCipherStream { + public: + Status TESTEncryptBlock(uint64_t blockIndex, char* data,
char* scratch) { + return EncryptBlock(blockIndex, data, scratch); + } + Status TESTDecryptBlock(uint64_t blockIndex, char* data, char* scratch) { + return DecryptBlock(blockIndex, data, scratch); + } +}; + +TEST(EnvEncrypt2_Provider, NistExamples) { + uint8_t key[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, + 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; + uint8_t init[] = {0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}; + + uint8_t plain1[] = {0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, + 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a}; + uint8_t cypher1[] = {0x60, 0x1e, 0xc3, 0x13, 0x77, 0x57, 0x89, 0xa5, + 0xb7, 0xa7, 0xf5, 0x04, 0xbb, 0xf3, 0xd2, 0x28}; + + uint8_t plain2[] = {0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, + 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51}; + uint8_t cypher2[] = {0xf4, 0x43, 0xe3, 0xca, 0x4d, 0x62, 0xb5, 0x9a, + 0xca, 0x84, 0xe9, 0x90, 0xca, 0xca, 0xf5, 0xc5}; + + uint8_t plain3[] = {0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, + 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef}; + uint8_t cypher3[] = {0x2b, 0x09, 0x30, 0xda, 0xa2, 0x3d, 0xe9, 0x4c, + 0xe8, 0x70, 0x17, 0xba, 0x2d, 0x84, 0x98, 0x8d}; + + uint8_t plain4[] = {0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, + 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10}; + uint8_t cypher4[] = {0xdf, 0xc9, 0xc5, 0x8d, 0xb6, 0x7a, 0xad, 0xa6, + 0x13, 0xc2, 0xdd, 0x08, 0x45, 0x79, 0x41, 0xa6}; + + CTREncryptionProvider2 provider("NistExampleKey", key, sizeof(key)); + // only first 8 bytes of init taken in next call + std::unique_ptr stream( + provider.CreateCipherStream2(1, init)); + + uint64_t offset; + uint8_t block[sizeof(plain1)]; + uint8_t* patch = (uint8_t*)&offset; // little endian assumed + + // + // forward ... 
encryption + // + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)plain1, 16); + CipherStreamWrapper* wrap = (CipherStreamWrapper*)stream.get(); + + Status status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(cypher1, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)plain2, 16); + *(patch + 7) = 0x00; + *(patch + 6) = 0xff; + + status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(cypher2, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)plain3, 16); + *(patch + 7) = 0x01; + *(patch + 6) = 0xff; + + status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(cypher3, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)plain4, 16); + *(patch + 7) = 0x02; + *(patch + 6) = 0xff; + + status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(cypher4, block, sizeof(block))); + + // + // backward -- decryption + // + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)cypher1, 16); + + status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(plain1, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)cypher2, 16); + *(patch + 7) = 0x00; + *(patch + 6) = 0xff; + + status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(plain2, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)cypher3, 16); + *(patch + 7) = 0x01; + *(patch + 6) = 0xff; + + status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(plain3, block, sizeof(block))); + + memcpy((void*)&offset, (void*)&init[8], 8); + memcpy((void*)block, (void*)cypher4, 16); + *(patch + 7) = 0x02; + 
*(patch + 6) = 0xff; + + status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + ASSERT_TRUE(0 == memcmp(plain4, block, sizeof(block))); +} + +// +// The following is copied from env_basic_test.cc +// + +// Normalizes trivial differences across Envs such that these test cases can +// run on all Envs. +class NormalizingEnvWrapper : public EnvWrapper { + public: + explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} + + // Removes . and .. from directory listing + virtual Status GetChildren(const std::string& dir, + std::vector* result) override { + Status status = EnvWrapper::GetChildren(dir, result); + if (status.ok()) { + result->erase(std::remove_if(result->begin(), result->end(), + [](const std::string& s) { + return s == "." || s == ".."; + }), + result->end()); + } + return status; + } + + // Removes . and .. from directory listing + virtual Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { + Status status = EnvWrapper::GetChildrenFileAttributes(dir, result); + if (status.ok()) { + result->erase(std::remove_if(result->begin(), result->end(), + [](const FileAttributes& fa) { + return fa.name == "." || fa.name == ".."; + }), + result->end()); + } + return status; + } +}; + +class EnvBasicTestWithParam : public testing::Test, + public ::testing::WithParamInterface { + public: + Env* env_; + const EnvOptions soptions_; + std::string test_dir_; + + EnvBasicTestWithParam() : env_(GetParam()) { + test_dir_ = test::PerThreadDBPath(env_, "env_encrypt2_test"); + } + + void SetUp() { env_->CreateDirIfMissing(test_dir_); } + + void TearDown() { + std::vector files; + env_->GetChildren(test_dir_, &files); + for (const auto& file : files) { + // don't know whether it's file or directory, try both. The tests must + // only create files or empty directories, so one must succeed, else the + // directory's corrupted. 
+ Status s = env_->DeleteFile(test_dir_ + "/" + file); + if (!s.ok()) { + ASSERT_OK(env_->DeleteDir(test_dir_ + "/" + file)); + } + } + } +}; + +class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; + +// next statements run env test against encrypt_2 code. +static std::string KeyName = {"A key name"}; +static Sha1Description_t KeyDesc(KeyName); + +// this key is from +// https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf, +// example F.5.5 +static uint8_t key256[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, + 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; +std::shared_ptr encrypt2_provider_ctr( + new CTREncryptionProvider2(KeyName, key256, 32)); + +static EncryptedEnv2::ReadKeys_t encrypt_readers = { + {KeyDesc, encrypt2_provider_ctr}}; +static EncryptedEnv2::WriteKey_t encrypt_writer = {KeyDesc, + encrypt2_provider_ctr}; + +static std::unique_ptr encrypt2_env(new NormalizingEnvWrapper( + EncryptedEnv2::Default(encrypt_readers, encrypt_writer))); + +INSTANTIATE_TEST_CASE_P(EncryptedEnv2, EnvBasicTestWithParam, + ::testing::Values(encrypt2_env.get())); + +TEST_P(EnvBasicTestWithParam, Basics) { + uint64_t file_size; + std::unique_ptr writable_file; + std::vector children; + + // Check that the directory is empty. + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent")); + ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Check that the file exists. 
+ ASSERT_OK(env_->FileExists(test_dir_ + "/f")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + ASSERT_OK(env_->DeleteFile(test_dir_ + "/f")); + + // Write to the file. + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/f1", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/f2", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize(test_dir_ + "/f1", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE( + !env_->RenameFile(test_dir_ + "/non_existent", test_dir_ + "/g").ok()); + ASSERT_OK(env_->RenameFile(test_dir_ + "/f1", test_dir_ + "/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/f1")); + ASSERT_OK(env_->FileExists(test_dir_ + "/g")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming overwriting works + ASSERT_OK(env_->RenameFile(test_dir_ + "/f2", test_dir_ + "/g")); + ASSERT_OK(env_->GetFileSize(test_dir_ + "/g", &file_size)); + ASSERT_EQ(0U, file_size); + + // Check that opening non-existent file fails. + std::unique_ptr seq_file; + std::unique_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile(test_dir_ + "/non_existent", &seq_file, + soptions_) + .ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile(test_dir_ + "/non_existent", + &rand_file, soptions_) + .ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. 
+ ASSERT_TRUE(!env_->DeleteFile(test_dir_ + "/non_existent").ok()); + ASSERT_OK(env_->DeleteFile(test_dir_ + "/g")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/g")); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_TRUE( + env_->GetChildren(test_dir_ + "/non_existent", &children).IsNotFound()); +} + +TEST_P(EnvBasicTestWithParam, ReadWrite) { + std::unique_ptr writable_file; + std::unique_ptr seq_file; + std::unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile(test_dir_ + "/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. 
+ ASSERT_TRUE(rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST_P(EnvBasicTestWithParam, Misc) { + std::unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. + ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST_P(EnvBasicTestWithParam, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + std::unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile(test_dir_ + "/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + + std::unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile(test_dir_ + "/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
+ ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete[] scratch; +} + +TEST_P(EnvMoreTestWithParam, GetModTime) { + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/dir1")); + uint64_t mtime1 = 0x0; + ASSERT_OK(env_->GetFileModificationTime(test_dir_ + "/dir1", &mtime1)); +} + +TEST_P(EnvMoreTestWithParam, MakeDir) { + ASSERT_OK(env_->CreateDir(test_dir_ + "/j")); + ASSERT_OK(env_->FileExists(test_dir_ + "/j")); + std::vector children; + env_->GetChildren(test_dir_, &children); + ASSERT_EQ(1U, children.size()); + // fail because file already exists + ASSERT_TRUE(!env_->CreateDir(test_dir_ + "/j").ok()); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/j")); + ASSERT_OK(env_->DeleteDir(test_dir_ + "/j")); + ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/j")); +} + +TEST_P(EnvMoreTestWithParam, GetChildren) { + // empty folder returns empty vector + std::vector children; + std::vector childAttr; + ASSERT_OK(env_->CreateDirIfMissing(test_dir_)); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_OK(env_->FileExists(test_dir_)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); + ASSERT_EQ(0U, children.size()); + ASSERT_EQ(0U, childAttr.size()); + + // folder with contents returns relative path to test dir + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/niu")); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/you")); + ASSERT_OK(env_->CreateDirIfMissing(test_dir_ + "/guo")); + ASSERT_OK(env_->GetChildren(test_dir_, &children)); + ASSERT_OK(env_->GetChildrenFileAttributes(test_dir_, &childAttr)); + ASSERT_EQ(3U, children.size()); + ASSERT_EQ(3U, childAttr.size()); + for (auto each : children) { + env_->DeleteDir(test_dir_ + "/" + each); + } // necessary for default 
POSIX env + + // non-exist directory returns IOError + ASSERT_OK(env_->DeleteDir(test_dir_)); + ASSERT_TRUE(!env_->FileExists(test_dir_).ok()); + ASSERT_TRUE(!env_->GetChildren(test_dir_, &children).ok()); + ASSERT_TRUE(!env_->GetChildrenFileAttributes(test_dir_, &childAttr).ok()); + + // if dir is a file, returns IOError + ASSERT_OK(env_->CreateDir(test_dir_)); + std::unique_ptr writable_file; + ASSERT_OK( + env_->NewWritableFile(test_dir_ + "/file", &writable_file, soptions_)); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); + ASSERT_TRUE(!env_->GetChildren(test_dir_ + "/file", &children).ok()); + ASSERT_EQ(0U, children.size()); +} + +class SstWriterBug : public testing::Test { + public: + std::string test_dir_; + Env* env_default_ = Env::Default(); + + SstWriterBug() { + test_dir_ = test::PerThreadDBPath(env_default_, "env_encrypt2_test"); + } + + void SetUp() { env_default_->CreateDirIfMissing(test_dir_); } + + void TearDown() { + std::vector files; + env_default_->GetChildren(test_dir_, &files); + for (const auto& file : files) { + // don't know whether it's file or directory, try both. The tests must + // only create files or empty directories, so one must succeed, else the + // directory's corrupted. 
+ Status s = env_default_->DeleteFile(test_dir_ + "/" + file); + if (!s.ok()) { + ASSERT_OK(env_default_->DeleteDir(test_dir_ + "/" + file)); + } + } + } +}; + +#if 0 +TEST(SstWriterBug, BugCheck) { + + Options sstOptions; + + sstOptions.env = encrypt2_env.get(); + + // auto* cf = reinterpret_cast(theCfHandle); + rocksdb::ColumnFamilyHandle * cf = nullptr; + // sstOptions.compression = (CompressionType)theCompression; + auto* sst_file_writer = new rocksdb::SstFileWriter(EnvOptions(), sstOptions, sstOptions.comparator, cf); + std::string path = test::PerThreadDBPath("BugCheck1"); + Status ss = sst_file_writer->Open(path); + ASSERT_OK(ss); +} +#endif + +} // namespace rocksdb + +#endif // ROCKSDB_OPENSSL_AES_CTR + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/env/env_encryption.cc b/env/env_encryption.cc index e38693e3c..dfbcccaf4 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -5,11 +5,14 @@ #ifndef ROCKSDB_LITE +#include "rocksdb/env_encryption.h" + #include +#include #include #include -#include "rocksdb/env_encryption.h" +#include "monitoring/perf_context_imp.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" @@ -20,19 +23,6 @@ namespace rocksdb { #ifndef ROCKSDB_LITE -class EncryptedSequentialFile : public SequentialFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - uint64_t offset_; - size_t prefixLength_; - - public: - // Default ctor. Given underlying sequential file is supposed to be at - // offset == prefixLength. - EncryptedSequentialFile(SequentialFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), offset_(prefixLength), prefixLength_(prefixLength) { - } // Read up to "n" bytes from the file. "scratch[0..n-1]" may be // written by this routine. 
Sets "*result" to the data that was @@ -42,78 +32,73 @@ class EncryptedSequentialFile : public SequentialFile { // If an error was encountered, returns a non-OK status. // // REQUIRES: External synchronization - virtual Status Read(size_t n, Slice* result, char* scratch) override { - assert(scratch); - Status status = file_->Read(n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); - offset_ += result->size(); // We've already ready data from disk, so update offset_ even if decryption fails. +Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { + assert(scratch); + Status status = file_->Read(n, result, scratch); + if (!status.ok()) { return status; } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = stream_->Decrypt(offset_, (char*)result->data(), result->size()); + } + offset_ += result->size(); // We've already ready data from disk, so update + // offset_ even if decryption fails. + return status; +} - // Skip "n" bytes from the file. This is guaranteed to be no - // slower that reading the same data, but may be faster. - // - // If end of file is reached, skipping will stop at the end of the - // file, and Skip will return OK. - // - // REQUIRES: External synchronization - virtual Status Skip(uint64_t n) override { - auto status = file_->Skip(n); - if (!status.ok()) { - return status; - } - offset_ += n; +// Skip "n" bytes from the file. This is guaranteed to be no +// slower that reading the same data, but may be faster. +// +// If end of file is reached, skipping will stop at the end of the +// file, and Skip will return OK. +// +// REQUIRES: External synchronization +Status EncryptedSequentialFile::Skip(uint64_t n) { + auto status = file_->Skip(n); + if (!status.ok()) { return status; } + offset_ += n; + return status; +} - // Indicates the upper layers if the current SequentialFile implementation - // uses direct IO. 
- virtual bool use_direct_io() const override { - return file_->use_direct_io(); - } +// Indicates the upper layers if the current SequentialFile implementation +// uses direct IO. +bool EncryptedSequentialFile::use_direct_io() const { + return file_->use_direct_io(); +} - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O +size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. - virtual Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } +Status EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // Positioned Read for direct I/O // If Direct I/O enabled, offset, n, and scratch should be properly aligned - virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, char* scratch) override { - assert(scratch); - offset += prefixLength_; // Skip prefix - auto status = file_->PositionedRead(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - offset_ = offset + result->size(); - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); +Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, + Slice* result, char* scratch) { + assert(scratch); + offset += prefixLength_; // Skip prefix + auto status = file_->PositionedRead(offset, n, result, scratch); + if (!status.ok()) { return status; } - -}; - -// A file abstraction for randomly 
reading the contents of a file. -class EncryptedRandomAccessFile : public RandomAccessFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - EncryptedRandomAccessFile(RandomAccessFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) { } + offset_ = offset + result->size(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + } + return status; +} // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" @@ -125,22 +110,26 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // // Safe for concurrent use by multiple threads. // If Direct I/O enabled, offset, n, and scratch should be aligned properly. - virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); +Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, result, scratch); + if (!status.ok()) { return status; } + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = stream_->Decrypt(offset, (char*)result->data(), result->size()); + } + return status; +} // Readahead the file starting from offset by n bytes for caching. 
- virtual Status Prefetch(uint64_t offset, size_t n) override { - //return Status::OK(); - return file_->Prefetch(offset + prefixLength_, n); - } +Status EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n) { + // return Status::OK(); + return file_->Prefetch(offset + prefixLength_, n); +} // Tries to get an unique ID for this file that will be the same each time // the file is opened (and will stay the same while the file is open). @@ -157,124 +146,128 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // a single varint. // // Note: these IDs are only valid for the duration of the process. - virtual size_t GetUniqueId(char* id, size_t max_size) const override { - return file_->GetUniqueId(id, max_size); - }; +size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return file_->GetUniqueId(id, max_size); +}; - virtual void Hint(AccessPattern pattern) override { - file_->Hint(pattern); - } +void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { + file_->Hint(pattern); +} // Indicates the upper layers if the current RandomAccessFile implementation // uses direct IO. - virtual bool use_direct_io() const override { - return file_->use_direct_io(); - } +bool EncryptedRandomAccessFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. 
- virtual Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } -}; +Status EncryptedRandomAccessFile::InvalidateCache(size_t offset, + size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // A file abstraction for sequential writing. The implementation // must provide buffering since callers may append small fragments // at a time to the file. -class EncryptedWritableFile : public WritableFileWrapper { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - - public: - // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : WritableFileWrapper(f), file_(f), stream_(s), prefixLength_(prefixLength) { } - - Status Append(const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - if (data.size() > 0) { - auto offset = file_->GetFileSize(); // size including prefix - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), data.size()); +Status EncryptedWritableFile::Append(const Slice& data) { + AlignedBuffer buf; + Status status; + Slice dataToAppend(data); + if (data.size() > 0) { + auto offset = file_->GetFileSize(); // size including prefix + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + // TODO (sagar0): Modify AlignedBuffer.Append to allow doing a memmove + // so that the next two lines can be replaced with buf.Append(). 
+ memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); } - status = file_->Append(dataToAppend); if (!status.ok()) { return status; } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + status = file_->Append(dataToAppend); + if (!status.ok()) { return status; } + return status; +} - Status PositionedAppend(const Slice& data, uint64_t offset) override { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); - if (!status.ok()) { - return status; - } - dataToAppend = Slice(buf.BufferStart(), data.size()); +Status EncryptedWritableFile::PositionedAppend(const Slice& data, + uint64_t offset) { + AlignedBuffer buf; + Status status; + Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); } - status = file_->PositionedAppend(dataToAppend, offset); if (!status.ok()) { return status; } + dataToAppend = Slice(buf.BufferStart(), buf.CurrentSize()); + } + status = file_->PositionedAppend(dataToAppend, offset); + if (!status.ok()) { return status; } + return status; +} // Indicates the upper layers if the current WritableFile implementation // uses direct IO. 
- virtual bool use_direct_io() const override { return file_->use_direct_io(); } +bool EncryptedWritableFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { return file_->GetRequiredBufferAlignment(); } +size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} - /* - * Get the size of valid data in the file. - */ - virtual uint64_t GetFileSize() override { - return file_->GetFileSize() - prefixLength_; - } +/* + * Get the size of valid data in the file. + */ +uint64_t EncryptedWritableFile::GetFileSize() { + return file_->GetFileSize() - prefixLength_; +} // Truncate is necessary to trim the file to the correct size // before closing. It is not always possible to keep track of the file // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. - virtual Status Truncate(uint64_t size) override { - return file_->Truncate(size + prefixLength_); - } +Status EncryptedWritableFile::Truncate(uint64_t size) { + return file_->Truncate(size + prefixLength_); +} // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. // This call has no effect on dirty pages in the cache. - virtual Status InvalidateCache(size_t offset, size_t length) override { - return file_->InvalidateCache(offset + prefixLength_, length); - } +Status EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { + return file_->InvalidateCache(offset + prefixLength_, length); +} // Sync a file range with disk. // offset is the starting byte of the file range to be synchronized. 
@@ -282,103 +275,95 @@ class EncryptedWritableFile : public WritableFileWrapper { // This asks the OS to initiate flushing the cached data to disk, // without waiting for completion. // Default implementation does nothing. - virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override { - return file_->RangeSync(offset + prefixLength_, nbytes); - } +Status EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { + return file_->RangeSync(offset + prefixLength_, nbytes); +} // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation // of space on devices where it can result in less file // fragmentation and/or less waste from over-zealous filesystem // pre-allocation. - virtual void PrepareWrite(size_t offset, size_t len) override { - file_->PrepareWrite(offset + prefixLength_, len); - } +void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len) { + file_->PrepareWrite(offset + prefixLength_, len); +} // Pre-allocates space for a file. - virtual Status Allocate(uint64_t offset, uint64_t len) override { - return file_->Allocate(offset + prefixLength_, len); - } -}; +Status EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len) { + return file_->Allocate(offset + prefixLength_, len); +} // A file abstraction for random reading and writing. 
-class EncryptedRandomRWFile : public RandomRWFile { - private: - std::unique_ptr file_; - std::unique_ptr stream_; - size_t prefixLength_; - public: - EncryptedRandomRWFile(RandomRWFile* f, BlockAccessCipherStream* s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) {} - - // Indicates if the class makes use of direct I/O - // If false you must pass aligned buffer to Write() - virtual bool use_direct_io() const override { return file_->use_direct_io(); } +// Indicates if the class makes use of direct I/O +// If false you must pass aligned buffer to Write() +bool EncryptedRandomRWFile::use_direct_io() const { + return file_->use_direct_io(); +} // Use the returned alignment value to allocate // aligned buffer for Direct I/O - virtual size_t GetRequiredBufferAlignment() const override { - return file_->GetRequiredBufferAlignment(); - } +size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { + return file_->GetRequiredBufferAlignment(); +} // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Pass aligned buffer when use_direct_io() returns true. 
- virtual Status Write(uint64_t offset, const Slice& data) override { - AlignedBuffer buf; - Status status; - Slice dataToWrite(data); - offset += prefixLength_; - if (data.size() > 0) { - // Encrypt in cloned buffer - buf.Alignment(GetRequiredBufferAlignment()); - buf.AllocateNewBuffer(data.size()); - memmove(buf.BufferStart(), data.data(), data.size()); - status = stream_->Encrypt(offset, buf.BufferStart(), data.size()); - if (!status.ok()) { - return status; - } - dataToWrite = Slice(buf.BufferStart(), data.size()); +Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { + AlignedBuffer buf; + Status status; + Slice dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + // Encrypt in cloned buffer + buf.Alignment(GetRequiredBufferAlignment()); + buf.AllocateNewBuffer(data.size()); + memmove(buf.BufferStart(), data.data(), data.size()); + buf.Size(data.size()); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset, buf.BufferStart(), buf.CurrentSize()); } - status = file_->Write(offset, dataToWrite); - return status; + if (!status.ok()) { + return status; + } + dataToWrite = Slice(buf.BufferStart(), buf.CurrentSize()); } + status = file_->Write(offset, dataToWrite); + return status; +} // Read up to `n` bytes starting from offset `offset` and store them in // result, provided `scratch` size should be at least `n`. // Returns Status::OK() on success. 
- virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { - assert(scratch); - offset += prefixLength_; - auto status = file_->Read(offset, n, result, scratch); - if (!status.ok()) { - return status; - } - status = stream_->Decrypt(offset, (char*)result->data(), result->size()); +Status EncryptedRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + assert(scratch); + offset += prefixLength_; + auto status = file_->Read(offset, n, result, scratch); + if (!status.ok()) { return status; } - - virtual Status Flush() override { - return file_->Flush(); + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = stream_->Decrypt(offset, (char*)result->data(), result->size()); } + return status; +} - virtual Status Sync() override { - return file_->Sync(); - } +Status EncryptedRandomRWFile::Flush() { return file_->Flush(); } - virtual Status Fsync() override { - return file_->Fsync(); - } +Status EncryptedRandomRWFile::Sync() { return file_->Sync(); } - virtual Status Close() override { - return file_->Close(); - } -}; +Status EncryptedRandomRWFile::Fsync() { return file_->Fsync(); } + +Status EncryptedRandomRWFile::Close() { return file_->Close(); } -// EncryptedEnv implements an Env wrapper that adds encryption to files stored on disk. +// EncryptedEnv implements an Env wrapper that adds encryption to files stored +// on disk. 
class EncryptedEnv : public EnvWrapper { public: - EncryptedEnv(Env* base_env, EncryptionProvider *provider) + EncryptedEnv(Env* base_env, EncryptionProvider* provider) : EnvWrapper(base_env) { provider_ = provider; } @@ -402,21 +387,25 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); + status = + underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + provider_->CreateCipherStream(fname, options, prefixSlice, &stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedSequentialFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedSequentialFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } @@ -439,21 +428,25 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Read prefix + // Read prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); + status = underlying->Read(0, prefixLength, &prefixSlice, + prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + provider_->CreateCipherStream(fname, options, prefixSlice, 
&stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedRandomAccessFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedRandomAccessFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } @@ -476,12 +469,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -489,11 +483,13 @@ class EncryptedEnv : public EnvWrapper { } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + provider_->CreateCipherStream(fname, options, prefixSlice, &stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } @@ -522,12 +518,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + 
prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -535,11 +532,13 @@ class EncryptedEnv : public EnvWrapper { } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + provider_->CreateCipherStream(fname, options, prefixSlice, &stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } @@ -554,7 +553,8 @@ class EncryptedEnv : public EnvWrapper { } // Open file using underlying Env implementation std::unique_ptr underlying; - Status status = EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); + Status status = + EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); if (!status.ok()) { return status; } @@ -563,12 +563,13 @@ class EncryptedEnv : public EnvWrapper { Slice prefixSlice; size_t prefixLength = provider_->GetPrefixLength(); if (prefixLength > 0) { - // Initialize prefix + // Initialize prefix prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); prefixBuf.AllocateNewBuffer(prefixLength); provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Append(prefixSlice); if (!status.ok()) { return status; @@ -576,11 +577,13 @@ class EncryptedEnv : public EnvWrapper { } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + 
provider_->CreateCipherStream(fname, options, prefixSlice, &stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedWritableFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedWritableFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } @@ -614,15 +617,19 @@ class EncryptedEnv : public EnvWrapper { prefixBuf.AllocateNewBuffer(prefixLength); if (!isNewFile) { // File already exists, read prefix - status = underlying->Read(0, prefixLength, &prefixSlice, prefixBuf.BufferStart()); + status = underlying->Read(0, prefixLength, &prefixSlice, + prefixBuf.BufferStart()); if (!status.ok()) { return status; } + prefixBuf.Size(prefixLength); } else { - // File is new, initialize & write prefix - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixLength); - // Write prefix + // File is new, initialize & write prefix + provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), + prefixLength); + prefixBuf.Size(prefixLength); + prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); + // Write prefix status = underlying->Write(0, prefixSlice); if (!status.ok()) { return status; @@ -631,39 +638,51 @@ class EncryptedEnv : public EnvWrapper { } // Create cipher stream std::unique_ptr stream; - status = provider_->CreateCipherStream(fname, options, prefixSlice, &stream); + status = + provider_->CreateCipherStream(fname, options, prefixSlice, &stream); if (!status.ok()) { return status; } - (*result) = std::unique_ptr(new EncryptedRandomRWFile(underlying.release(), stream.release(), prefixLength)); + (*result) = std::unique_ptr(new EncryptedRandomRWFile( + underlying.release(), stream.release(), prefixLength)); return Status::OK(); } - // Store in *result the attributes of the children of the specified directory. 
- // In case the implementation lists the directory prior to iterating the files - // and files are concurrently deleted, the deleted files will be omitted from + // Store in *result the attributes of the children of the specified + // directory. + // In case the implementation lists the directory prior to iterating the + // files + // and files are concurrently deleted, the deleted files will be omitted + // from // result. // The name attributes are relative to "dir". // Original contents of *results are dropped. // Returns OK if "dir" exists and "*result" contains its children. - // NotFound if "dir" does not exist, the calling process does not have + // NotFound if "dir" does not exist, the calling process does not + // have // permission to access "dir", or if "dir" is invalid. // IOError if an IO Error was encountered - virtual Status GetChildrenFileAttributes(const std::string& dir, std::vector* result) override { + virtual Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); if (!status.ok()) { return status; } size_t prefixLength = provider_->GetPrefixLength(); - for (auto it = std::begin(*result); it!=std::end(*result); ++it) { - assert(it->size_bytes >= prefixLength); + for (auto it = std::begin(*result); it != std::end(*result); ++it) { + // assert(it->size_bytes >= prefixLength); + // breaks env_basic_test when called on directory containing + // directories + // which makes subtraction of prefixLength worrisome since + // FileAttributes does not identify directories it->size_bytes -= prefixLength; } return Status::OK(); - } + } // Store the size of fname in *file_size. 
- virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) override { + virtual Status GetFileSize(const std::string& fname, + uint64_t* file_size) override { auto status = EnvWrapper::GetFileSize(fname, file_size); if (!status.ok()) { return status; @@ -671,15 +690,14 @@ class EncryptedEnv : public EnvWrapper { size_t prefixLength = provider_->GetPrefixLength(); assert(*file_size >= prefixLength); *file_size -= prefixLength; - return Status::OK(); + return Status::OK(); } private: - EncryptionProvider *provider_; + EncryptionProvider* provider_; }; - -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { return new EncryptedEnv(base_env, provider); @@ -702,14 +720,14 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not encrypting a full block. + // We're not encrypting a full block. // Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy plain data to block buffer + // Copy plain data to block buffer memmove(block + blockOffset, data, n); } auto status = EncryptBlock(blockIndex, block, (char*)scratch.data()); @@ -747,14 +765,14 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { - // We're not decrypting a full block. + // We're not decrypting a full block. 
// Copy data to blockBuffer if (!blockBuffer.get()) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } block = blockBuffer.get(); - // Copy encrypted data to block buffer + // Copy encrypted data to block buffer memmove(block + blockOffset, data, n); } auto status = DecryptBlock(blockIndex, block, (char*)scratch.data()); @@ -765,6 +783,14 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // Copy decrypted data back to `data`. memmove(data, block + blockOffset, n); } + + // Simply decrementing dataSize by n could cause it to underflow, + // which will very likely make it read over the original bounds later + assert(dataSize >= n); + if (dataSize < n) { + return Status::Corruption("Cannot decrypt data at given offset"); + } + dataSize -= n; if (dataSize == 0) { return Status::OK(); @@ -805,7 +831,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce+counter auto status = cipher_.Encrypt(scratch); if (!status.ok()) { return status; @@ -821,13 +847,13 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { - // For CTR decryption & encryption are the same + // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of +// For optimal performance, the prefix length should be a multiple of // the page size. 
size_t CTREncryptionProvider::GetPrefixLength() { return defaultPrefixLength; @@ -842,7 +868,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory +// CreateNewPrefix initialized an allocated block of prefix memory // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, @@ -862,16 +888,22 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, // Now populate the rest of the prefix, starting from the third block. PopulateSecretPrefixPart(prefix + (2 * blockSize), prefixLength - (2 * blockSize), blockSize); - // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial counter & IV unencrypted) + // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial + // counter & IV unencrypted) CTRCipherStream cipherStream(cipher_, prefixIV.data(), initialCounter); - auto status = cipherStream.Encrypt(0, prefix + (2 * blockSize), prefixLength - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = cipherStream.Encrypt(0, prefix + (2 * blockSize), + prefixLength - (2 * blockSize)); + } if (!status.ok()) { return status; } return Status::OK(); } -// PopulateSecretPrefixPart initializes the data into a new prefix block +// PopulateSecretPrefixPart initializes the data into a new prefix block // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. 
@@ -891,14 +923,28 @@ Status CTREncryptionProvider::CreateCipherStream( Slice iv; decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); - // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 with initial counter & IV are unencrypted) + // If the prefix is smaller than twice the block size, we would below read a + // very large chunk of the file (and very likely read over the bounds) + assert(prefix.size() >= 2 * blockSize); + if (prefix.size() < 2 * blockSize) { + return Status::Corruption("Unable to read from file " + fname + + ": read attempt would read beyond file bounds"); + } + + // Decrypt the encrypted part of the prefix, starting from block 2 (block 0, 1 + // with initial counter & IV are unencrypted) CTRCipherStream cipherStream(cipher_, iv.data(), initialCounter); - auto status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), prefix.size() - (2 * blockSize)); + Status status; + { + PERF_TIMER_GUARD(decrypt_data_nanos); + status = cipherStream.Decrypt(0, (char*)prefix.data() + (2 * blockSize), + prefix.size() - (2 * blockSize)); + } if (!status.ok()) { return status; } - // Create cipher stream + // Create cipher stream return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); } diff --git a/include/BUILD b/include/BUILD index ec9fc4a6b..a95c20ba3 100644 --- a/include/BUILD +++ b/include/BUILD @@ -1,12 +1,24 @@ +DEPS = select({ + "@toolchain//:is_target_linux": [ + ], + "@toolchain//:is_target_osx": [ + "@openssl_osx//:openssl_dev", + ], + "@toolchain//:is_target_windows": [ + ], +}) + cc_library( name = "include", hdrs = glob(["**/*.h"]), visibility = ["//visibility:public"], + deps = DEPS, strip_include_prefix = "/include", ) cc_library( name = "with_extra_prefix", hdrs = glob(["**/*.h"]), + deps = DEPS, visibility = ["//visibility:public"], ) diff --git a/include/rocksdb/env_encrypt2.h b/include/rocksdb/env_encrypt2.h new file mode 100644 index 000000000..dc2669d94 
--- /dev/null +++ b/include/rocksdb/env_encrypt2.h @@ -0,0 +1,469 @@ +// copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// +// env_encryption.cc copied to this file then modified. + +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "openssl/aes.h" +#include "openssl/evp.h" +#include "openssl/rand.h" +#include "rocksdb/env_encryption.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" +#include "util/library_loader.h" +#include "util/random.h" + +#endif + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +struct Sha1Description_t { + uint8_t desc[EVP_MAX_MD_SIZE]; + bool valid; + + Sha1Description_t() : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); } + + Sha1Description_t(const Sha1Description_t& rhs) { *this = rhs; } + + Sha1Description_t& operator=(const Sha1Description_t& rhs) { + memcpy(desc, rhs.desc, sizeof(desc)); + valid = rhs.valid; + return *this; + } + + Sha1Description_t(uint8_t* Desc, size_t DescLen) : valid(false) { + memset(desc, 0, EVP_MAX_MD_SIZE); + if (DescLen <= EVP_MAX_MD_SIZE) { + memcpy(desc, Desc, DescLen); + valid = true; + } + } + + Sha1Description_t(const std::string& key_desc_str); + + // see AesCtrKey_t destructor below. This data is not really + // essential to clear, but trying to set pattern for future work. 
+ // goal is to explicitly remove desc from memory once no longer needed + ~Sha1Description_t() { + memset(desc, 0, EVP_MAX_MD_SIZE); + valid = false; + } + + bool operator<(const Sha1Description_t& rhs) const { + return memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) < 0; + } + + bool operator==(const Sha1Description_t& rhs) const { + return 0 == memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) && valid == rhs.valid; + } + + bool IsValid() const { return valid; } +}; + +struct AesCtrKey_t { + uint8_t key[EVP_MAX_KEY_LENGTH]; + bool valid; + + AesCtrKey_t() : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); } + + AesCtrKey_t(const uint8_t* Key, size_t KeyLen) : valid(false) { + memset(key, 0, EVP_MAX_KEY_LENGTH); + if (KeyLen <= EVP_MAX_KEY_LENGTH) { + memcpy(key, Key, KeyLen); + valid = true; + } else { + valid = false; + } + } + + AesCtrKey_t(const std::string& key_str); + + // see Writing Solid Code, 2nd edition + // Chapter 9, page 321, Managing Secrets in Memory ... bullet 4 "Scrub the memory" + // Not saying this is essential or effective in initial implementation since current + // usage model loads all keys at start and only deletes them at shutdown. But does + // establish presidence. 
+ // goal is to explicitly remove key from memory once no longer needed + ~AesCtrKey_t() { + memset(key, 0, EVP_MAX_KEY_LENGTH); + valid = false; + } + + bool operator==(const AesCtrKey_t& rhs) const { + return (0 == memcmp(key, rhs.key, EVP_MAX_KEY_LENGTH)) && + (valid == rhs.valid); + } + + bool IsValid() const { return valid; } +}; + +typedef char EncryptMarker_t[8]; +static EncryptMarker_t Marker = "Encrypt"; + +// long term: code_version could be used in a switch statement or factory +// parameter version 0 is 12 byte sha1 description hash, 128 bit (16 byte) +// nounce (assumed to be packed/byte aligned) +typedef struct { + uint8_t key_description_[EVP_MAX_MD_SIZE]; + uint8_t nonce_[AES_BLOCK_SIZE / 2]; // block size is 16 +} Prefix0_t; + +class AESBlockAccessCipherStream : public BlockAccessCipherStream { + public: + AESBlockAccessCipherStream(const AesCtrKey_t& key, uint8_t code_version, + uint8_t nonce[]) + : key_(key), code_version_(code_version) { + memcpy(&nonce_, nonce, AES_BLOCK_SIZE / 2); + } + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return AES_BLOCK_SIZE; }; + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) override{}; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; + + // Decrypt a block of data at the given block index. 
+ // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; + + AesCtrKey_t key_; + uint8_t code_version_; + uint8_t nonce_[AES_BLOCK_SIZE / 2]; +}; + +class CTREncryptionProvider2 : public EncryptionProvider { + public: + CTREncryptionProvider2() = delete; + + CTREncryptionProvider2(const CTREncryptionProvider&&) = delete; + + CTREncryptionProvider2(const Sha1Description_t& key_desc, + const AesCtrKey_t& key) + : valid_(false), key_desc_(key_desc), key_(key) { + valid_ = key_desc_.IsValid() && key_.IsValid(); + } + + CTREncryptionProvider2(const std::string& key_desc_str, + const uint8_t unformatted_key[], int bytes) + : valid_(false), key_desc_(key_desc_str), key_(unformatted_key, bytes) { + valid_ = key_desc_.IsValid() && key_.IsValid(); + } + + virtual size_t GetPrefixLength() override { + return sizeof(Prefix0_t) + sizeof(EncryptMarker_t); + } + + virtual Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, + size_t prefixLength) override; + + virtual Status CreateCipherStream( + const std::string& /*fname*/, const EnvOptions& /*options*/, + Slice& /*prefix*/, + std::unique_ptr* /*result*/) override { + return Status::NotSupported("Wrong EncryptionProvider assumed"); + } + + virtual BlockAccessCipherStream* CreateCipherStream2(uint8_t code_version, + uint8_t nonce[]) { + return new AESBlockAccessCipherStream(key_, code_version, nonce); + } + + bool Valid() const { return valid_; }; + const Sha1Description_t& key_desc() const { return key_desc_; }; + const AesCtrKey_t& key() const { return key_; }; + + protected: + bool valid_; + Sha1Description_t key_desc_; + AesCtrKey_t key_; +}; + +// EncryptedEnv2 implements an Env wrapper that adds encryption to files stored +// on disk. 
+ +class EncryptedEnv2 : public EnvWrapper { + public: + using WriteKey_t = + std::pair>; + using ReadKeys_t = + std::map>; + + static Env* Default(); + static Env* Default(ReadKeys_t encrypt_read, WriteKey_t encrypt_write); + + EncryptedEnv2(Env* base_env); + + EncryptedEnv2(Env* base_env, ReadKeys_t encrypt_read, + WriteKey_t encrypt_write); + + void SetKeys(ReadKeys_t encrypt_read, WriteKey_t encrypt_write); + + bool IsWriteEncrypted() const { return nullptr != encrypt_write_.second; } + + // NewSequentialFile opens a file for sequential reading. + virtual Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // NewRandomAccessFile opens a file for random read access. + virtual Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // NewWritableFile opens a file for sequential writing. + virtual Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // Create an object that writes to a new file with the specified + // name. Deletes any existing file with the same name and creates a + // new file. On success, stores a pointer to the new file in + // *result and returns OK. On failure stores nullptr in *result and + // returns non-OK. + // + // The returned file will only be accessed by one thread at a time. + virtual Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // Reuse an existing file by renaming it and opening it as writable. + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // Open `fname` for random read and write, if file doesn't exist the file + // will be created. On success, stores a pointer to the new file in + // *result and returns OK. On failure returns non-OK. 
+ // + // The returned file will only be accessed by one thread at a time. + virtual Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; + + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. + // The name attributes are relative to "dir". + // Original contents of *results are dropped. + // Returns OK if "dir" exists and "*result" contains its children. + // NotFound if "dir" does not exist, the calling process does not have + // permission to access "dir", or if "dir" is invalid. + // IOError if an IO Error was encountered + virtual Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override; + + // Store the size of fname in *file_size. + virtual Status GetFileSize(const std::string& fname, + uint64_t* file_size) override; + + // only needed for GetChildrenFileAttributes & GetFileSize + virtual Status GetEncryptionProvider( + const std::string& fname, std::shared_ptr& provider); + + template + Status ReadSeqEncryptionPrefix( + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream) { + Status status; + + provider.reset(); // nullptr for provider implies "no encryption" + stream.release(); + + // Look for encryption marker + EncryptMarker_t marker; + Slice marker_slice; + status = f->Read(sizeof(marker), &marker_slice, marker); + if (status.ok()) { + if (sizeof(marker) == marker_slice.size() && + marker_slice.starts_with(Marker)) { + // code_version currently unused + uint8_t code_version = (uint8_t)marker_slice[7]; + + Slice prefix_slice; + Prefix0_t prefix_buffer; + status = + f->Read(sizeof(Prefix0_t), &prefix_slice, (char*)&prefix_buffer); + if (status.ok() && sizeof(Prefix0_t) == prefix_slice.size()) { + Sha1Description_t desc(prefix_buffer.key_description_, 
+ sizeof(prefix_buffer.key_description_)); + + auto it = encrypt_read_.find(desc); + if (encrypt_read_.end() != it) { + CTREncryptionProvider2* ptr = + (CTREncryptionProvider2*)it->second.get(); + provider = it->second; + stream.reset(new AESBlockAccessCipherStream( + ptr->key(), code_version, prefix_buffer.nonce_)); + } else { + status = Status::NotSupported( + "No encryption key found to match input file"); + } + } + } + } + return status; + } + + template + Status ReadRandEncryptionPrefix( + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream) { + Status status; + + provider.reset(); // nullptr for provider implies "no encryption" + stream.release(); + + // Look for encryption marker + EncryptMarker_t marker; + Slice marker_slice; + status = f->Read(0, sizeof(marker), &marker_slice, marker); + if (status.ok()) { + if (sizeof(marker) == marker_slice.size() && + marker_slice.starts_with(Marker)) { + // code_version currently unused + uint8_t code_version = (uint8_t)marker_slice[7]; + + Slice prefix_slice; + Prefix0_t prefix_buffer; + status = f->Read(sizeof(marker), sizeof(Prefix0_t), &prefix_slice, + (char*)&prefix_buffer); + if (status.ok() && sizeof(Prefix0_t) == prefix_slice.size()) { + Sha1Description_t desc(prefix_buffer.key_description_, + sizeof(prefix_buffer.key_description_)); + + auto it = encrypt_read_.find(desc); + if (encrypt_read_.end() != it) { + CTREncryptionProvider2* ptr = + (CTREncryptionProvider2*)it->second.get(); + provider = it->second; + stream.reset(new AESBlockAccessCipherStream( + ptr->key(), code_version, prefix_buffer.nonce_)); + } else { + status = Status::NotSupported( + "No encryption key found to match input file"); + } + } + } + } + return status; + } + + template + Status WriteSeqEncryptionPrefix( + TypeFile* f, std::unique_ptr& stream) { + Status status; + + // set up Encryption maker, code version '0' + uint8_t code_version = {'0'}; + Prefix0_t prefix; + EncryptMarker_t marker; + strncpy(marker, Marker, 
sizeof(Marker)); + marker[sizeof(EncryptMarker_t) - 1] = code_version; + + Slice marker_slice(marker, sizeof(EncryptMarker_t)); + status = f->Append(marker_slice); + + if (status.ok()) { + // create nonce, then write it and key description + Slice prefix_slice((char*)&prefix, sizeof(prefix)); + + status = encrypt_write_.second->CreateNewPrefix( + std::string(), (char*)&prefix, + encrypt_write_.second->GetPrefixLength()); + + if (status.ok()) { + status = f->Append(prefix_slice); + } + } + + if (status.ok()) { + CTREncryptionProvider2* ptr = + (CTREncryptionProvider2*)encrypt_write_.second.get(); + stream.reset(new AESBlockAccessCipherStream(ptr->key(), code_version, + prefix.nonce_)); + } + + return status; + } + + template + Status WriteRandEncryptionPrefix( + TypeFile* f, std::unique_ptr& stream) { + Status status; + + // set up Encryption maker, code version '0' + uint8_t code_version = {'0'}; + Prefix0_t prefix; + EncryptMarker_t marker; + strncpy(marker, Marker, sizeof(Marker)); + marker[sizeof(EncryptMarker_t) - 1] = code_version; + + Slice marker_slice(marker, sizeof(EncryptMarker_t)); + status = f->Write(0, marker_slice); + + if (status.ok()) { + // create nonce, then write it and key description + Slice prefix_slice((char*)&prefix, sizeof(prefix)); + + status = encrypt_write_.second->CreateNewPrefix( + std::string(), (char*)&prefix, + encrypt_write_.second->GetPrefixLength()); + + if (status.ok()) { + status = f->Write(sizeof(EncryptMarker_t), prefix_slice); + } + } + + if (status.ok()) { + CTREncryptionProvider2* ptr = + (CTREncryptionProvider2*)encrypt_write_.second.get(); + stream.reset(new AESBlockAccessCipherStream(ptr->key(), code_version, + prefix.nonce_)); + } + + return status; + } + + bool IsValid() const { return valid_; } + + static UnixLibCrypto crypto_; + + protected: + std::map> + encrypt_read_; + std::pair> + encrypt_write_; + + bool valid_; +}; + +// Returns an Env that encrypts data when stored on disk and decrypts data when +// read from 
disk. Prefer EncryptedEnv2::Default(). +Env* NewEncryptedEnv2(Env* base_env, EncryptedEnv2::ReadKeys_t encrypt_read, + EncryptedEnv2::WriteKey_t encrypt_write); + +#endif // ROCKSDB_LITE + +} // namespace rocksdb + +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index a6e919546..ff2898f4f 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -5,7 +5,7 @@ #pragma once -#if !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include @@ -15,184 +15,414 @@ namespace rocksdb { class EncryptionProvider; -// Returns an Env that encrypts data when stored on disk and decrypts data when +// Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); -// BlockAccessCipherStream is the base class for any cipher stream that -// supports random access at block level (without requiring data from other blocks). -// E.g. CTR (Counter operation mode) supports this requirement. +// BlockAccessCipherStream is the base class for any cipher stream that +// supports random access at block level (without requiring data from other +// blocks). E.g. CTR (Counter operation mode) supports this requirement. class BlockAccessCipherStream { - public: - virtual ~BlockAccessCipherStream() {}; + public: + virtual ~BlockAccessCipherStream(){}; - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() = 0; + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; - // Encrypt one or more (partial) blocks of data at the file offset. - // Length of data is given in dataSize. - virtual Status Encrypt(uint64_t fileOffset, char *data, size_t dataSize); + // Encrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. 
+ virtual Status Encrypt(uint64_t fileOffset, char* data, size_t dataSize); - // Decrypt one or more (partial) blocks of data at the file offset. - // Length of data is given in dataSize. - virtual Status Decrypt(uint64_t fileOffset, char *data, size_t dataSize); + // Decrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in dataSize. + virtual Status Decrypt(uint64_t fileOffset, char* data, size_t dataSize); - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) = 0; + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) = 0; - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char *data, char* scratch) = 0; + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char *data, char* scratch) = 0; + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) = 0; }; -// BlockCipher +// BlockCipher class BlockCipher { - public: - virtual ~BlockCipher() {}; + public: + virtual ~BlockCipher(){}; - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() = 0; + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() = 0; - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char *data) = 0; + // Encrypt a block of data. 
+ // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) = 0; - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char *data) = 0; + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) = 0; }; // Implements a BlockCipher using ROT13. // -// Note: This is a sample implementation of BlockCipher, +// Note: This is a sample implementation of BlockCipher, // it is NOT considered safe and should NOT be used in production. class ROT13BlockCipher : public BlockCipher { - private: - size_t blockSize_; - public: - ROT13BlockCipher(size_t blockSize) - : blockSize_(blockSize) {} - virtual ~ROT13BlockCipher() {}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return blockSize_; } - - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char *data) override; - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char *data) override; + private: + size_t blockSize_; + + public: + ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} + virtual ~ROT13BlockCipher(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Encrypt(char* data) override; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + virtual Status Decrypt(char* data) override; }; -// CTRCipherStream implements BlockAccessCipherStream using an -// Counter operations mode. +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. 
// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation // -// Note: This is a possible implementation of BlockAccessCipherStream, +// Note: This is a possible implementation of BlockAccessCipherStream, // it is considered suitable for use. class CTRCipherStream final : public BlockAccessCipherStream { - private: - BlockCipher& cipher_; - std::string iv_; - uint64_t initialCounter_; - public: - CTRCipherStream(BlockCipher& c, const char *iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter) {}; - virtual ~CTRCipherStream() {}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return cipher_.BlockSize(); } - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char *data, char *scratch) override; - - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char *data, char *scratch) override; + private: + BlockCipher& cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) + : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + virtual size_t BlockSize() override { return cipher_.BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. + virtual void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. 
+ // Length of data is equal to BlockSize(); + virtual Status EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + virtual Status DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) override; }; -// The encryption provider is used to create a cipher stream for a specific file. -// The returned cipher stream will be used for actual encryption/decryption -// actions. +// The encryption provider is used to create a cipher stream for a specific +// file. The returned cipher stream will be used for actual +// encryption/decryption actions. class EncryptionProvider { public: - virtual ~EncryptionProvider() {}; - - // GetPrefixLength returns the length of the prefix that is added to every file - // and used for storing encryption options. - // For optimal performance, the prefix length should be a multiple of - // the page size. - virtual size_t GetPrefixLength() = 0; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) = 0; - - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream( - const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) = 0; + virtual ~EncryptionProvider(){}; + + // GetPrefixLength returns the length of the prefix that is added to every + // file and used for storing encryption options. For optimal performance, the + // prefix length should be a multiple of the page size. + virtual size_t GetPrefixLength() = 0; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. 
+ virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) = 0; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) = 0; }; -// This encryption provider uses a CTR cipher stream, with a given block cipher +// This encryption provider uses a CTR cipher stream, with a given block cipher // and IV. // -// Note: This is a possible implementation of EncryptionProvider, +// Note: This is a possible implementation of EncryptionProvider, // it is considered suitable for use, provided a safe BlockCipher is used. class CTREncryptionProvider : public EncryptionProvider { - private: - BlockCipher& cipher_; - protected: - const static size_t defaultPrefixLength = 4096; + private: + BlockCipher& cipher_; + + protected: + const static size_t defaultPrefixLength = 4096; + + public: + CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; + virtual ~CTREncryptionProvider() {} + + // GetPrefixLength returns the length of the prefix that is added to every + // file + // and used for storing encryption options. + // For optimal performance, the prefix length should be a multiple of + // the page size. + virtual size_t GetPrefixLength() override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + virtual Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. + virtual Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) override; + + protected: + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. 
+ // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize); + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given + // given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr* result); +}; + +class EncryptedSequentialFile : public SequentialFile { + private: + std::unique_ptr file_; + std::unique_ptr stream_; + uint64_t offset_; + size_t prefixLength_; + + public: + // Default ctor. Given underlying sequential file is supposed to be at + // offset == prefixLength. + EncryptedSequentialFile(SequentialFile* f, BlockAccessCipherStream* s, + size_t prefixLength) + : file_(f), + stream_(s), + offset_(prefixLength), + prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization + virtual Status Read(size_t n, Slice* result, char* scratch) override; + + // Skip "n" bytes from the file. This is guaranteed to be no + // slower that reading the same data, but may be faster. + // + // If end of file is reached, skipping will stop at the end of the + // file, and Skip will return OK. 
+ // + // REQUIRES: External synchronization + virtual Status Skip(uint64_t n) override; + + // Indicates the upper layers if the current SequentialFile implementation + // uses direct IO. + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t offset, size_t length) override; + + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned + virtual Status PositionedRead(uint64_t offset, size_t n, Slice* result, + char* scratch) override; +}; + +// A file abstraction for randomly reading the contents of a file. +class EncryptedRandomAccessFile : public RandomAccessFile { + private: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + EncryptedRandomAccessFile(RandomAccessFile* f, BlockAccessCipherStream* s, + size_t prefixLength) + : file_(f), stream_(s), prefixLength_(prefixLength) {} + + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
+ virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + // Readahead the file starting from offset by n bytes for caching. + virtual Status Prefetch(uint64_t offset, size_t n) override; + + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. + virtual size_t GetUniqueId(char* id, size_t max_size) const override; + + virtual void Hint(AccessPattern pattern) override; + + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +// A file abstraction for sequential writing. The implementation +// must provide buffering since callers may append small fragments +// at a time to the file. 
+class EncryptedWritableFile : public WritableFileWrapper { + private: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; + + public: + // Default ctor. Prefix is assumed to be written already. + EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, + size_t prefixLength) + : WritableFileWrapper(f), + file_(f), + stream_(s), + prefixLength_(prefixLength) {} + + Status Append(const Slice& data) override; + + Status PositionedAppend(const Slice& data, uint64_t offset) override; + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const override; + + /* + * Get the size of valid data in the file. + */ + virtual uint64_t GetFileSize() override; + + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. + virtual Status Truncate(uint64_t size) override; + + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. + virtual Status InvalidateCache(size_t offset, size_t length) override; + + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. 
+ virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override; + + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. + virtual void PrepareWrite(size_t offset, size_t len) override; + + // Pre-allocates space for a file. + virtual Status Allocate(uint64_t offset, uint64_t len) override; +}; + +// A file abstraction for random reading and writing. +class EncryptedRandomRWFile : public RandomRWFile { + private: + std::unique_ptr file_; + std::unique_ptr stream_; + size_t prefixLength_; public: - CTREncryptionProvider(BlockCipher& c) - : cipher_(c) {}; - virtual ~CTREncryptionProvider() {} - - // GetPrefixLength returns the length of the prefix that is added to every file - // and used for storing encryption options. - // For optimal performance, the prefix length should be a multiple of - // the page size. - virtual size_t GetPrefixLength() override; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char *prefix, size_t prefixLength) override; - - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream( - const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) override; - - protected: - // PopulateSecretPrefixPart initializes the data into a new prefix block - // that will be encrypted. This function will store the data in plain text. - // It will be encrypted later (before written to disk). - // Returns the amount of space (starting from the start of the prefix) - // that has been initialized. 
- virtual size_t PopulateSecretPrefixPart(char *prefix, size_t prefixLength, size_t blockSize); - - // CreateCipherStreamFromPrefix creates a block access cipher stream for a file given - // given name and options. The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix( - const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, - std::unique_ptr* result); + EncryptedRandomRWFile(RandomRWFile* f, BlockAccessCipherStream* s, + size_t prefixLength) + : file_(f), stream_(s), prefixLength_(prefixLength) {} + + // Indicates if the class makes use of direct I/O + // If false you must pass aligned buffer to Write() + virtual bool use_direct_io() const override; + + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O + virtual size_t GetRequiredBufferAlignment() const override; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. + virtual Status Write(uint64_t offset, const Slice& data) override; + + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + + virtual Status Flush() override; + + virtual Status Sync() override; + + virtual Status Fsync() override; + + virtual Status Close() override; }; } // namespace rocksdb diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 3f125c213..dad4413b4 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -201,6 +201,12 @@ struct PerfContext { uint64_t env_lock_file_nanos; uint64_t env_unlock_file_nanos; uint64_t env_new_logger_nanos; + + // Time spent in encrypting data. Populated when EncryptedEnv is used. 
+ uint64_t encrypt_data_nanos; + // Time spent in decrypting data. Populated when EncryptedEnv is used. + uint64_t decrypt_data_nanos; + std::map* level_to_perf_context = nullptr; bool per_level_perf_context_enabled = false; }; diff --git a/port/BUILD b/port/BUILD index f84f06e56..d79463fbf 100644 --- a/port/BUILD +++ b/port/BUILD @@ -12,6 +12,7 @@ PLATFORM_DEFINES = select({ "ROCKSDB_SCHED_GETCPU_PRESENT", "rocksdb_shared_EXPORTS", "NPERF_CONTEXT", + "ROCKSDB_OPENSSL_AES_CTR", ], "//:osx": [ "OS_MACOSX", @@ -20,6 +21,7 @@ PLATFORM_DEFINES = select({ "ROCKSDB_LIB_IO_POSIX", "rocksdb_shared_EXPORTS", "NPERF_CONTEXT", + "ROCKSDB_OPENSSL_AES_CTR", ], "//:windows": [ "OS_WIN", diff --git a/src.mk b/src.mk index 990aa2ab7..f32bf6945 100644 --- a/src.mk +++ b/src.mk @@ -61,6 +61,7 @@ LIB_SOURCES = \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ + env/env_encrypt2.cc \ env/env_hdfs.cc \ env/env_posix.cc \ env/io_posix.cc \ @@ -146,6 +147,7 @@ LIB_SOURCES = \ util/filter_policy.cc \ util/hash.cc \ util/jemalloc_nodump_allocator.cc \ + util/library_loader.cc \ util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ @@ -344,6 +346,7 @@ MAIN_SOURCES = \ db/write_callback_test.cc \ db/write_controller_test.cc \ env/env_basic_test.cc \ + env/env_encrypt2_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ memtable/inlineskiplist_test.cc \ @@ -382,6 +385,7 @@ MAIN_SOURCES = \ util/dynamic_bloom_test.cc \ util/event_logger_test.cc \ util/filelock_test.cc \ + util/library_loader_test.cc \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ util/repeatable_thread_test.cc \ diff --git a/util/BUILD b/util/BUILD index 4ef948ee0..63534c67f 100644 --- a/util/BUILD +++ b/util/BUILD @@ -376,6 +376,14 @@ constrained_test( ], ) +constrained_test( + name = "library_loader_test", + srcs = ["library_loader_test.cc"], + deps = [ + ":test_utils", + ":util", + ], +) constrained_test( name = "rate_limiter_test", srcs = ["rate_limiter_test.cc"], diff --git a/util/build_version.cc 
b/util/build_version.cc index b323f6aff..8465b15ea 100644 --- a/util/build_version.cc +++ b/util/build_version.cc @@ -1,4 +1,4 @@ #include "build_version.h" -const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:REDACTED"; -const char* rocksdb_build_git_date = "rocksdb_build_git_date:REDACTED"; +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:42f9f56c6fc201804d3905deb5f6337669578220"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:2020-04-22"; const char* rocksdb_build_compile_date = __DATE__; diff --git a/util/library_loader.cc b/util/library_loader.cc new file mode 100644 index 000000000..081a929a2 --- /dev/null +++ b/util/library_loader.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/library_loader.h" + +#include + +// link with -ldl + +namespace rocksdb { + +#ifdef OS_MACOSX + const char * UnixLibCrypto::crypto_lib_name_ = "libcrypto.dylib"; +#else + const char * UnixLibCrypto::crypto_lib_name_ = "libcrypto.so"; +#endif + +UnixLibraryLoader::UnixLibraryLoader(const char * library_name) + : dl_handle_(nullptr) { + + if (nullptr != library_name && '\0' != *library_name) { + dl_handle_ = dlopen(library_name, RTLD_NOW | RTLD_GLOBAL); + + is_valid_ = (nullptr != dl_handle_); + + if (!is_valid_) { + last_error_msg_ = dlerror(); + } + } +} + + +UnixLibraryLoader::~UnixLibraryLoader() { + if (nullptr != dl_handle_ ) { + int ret_val = dlclose(dl_handle_); + dl_handle_ = nullptr; + is_valid_ = false; + + if (0 != ret_val) { + last_error_msg_ = dlerror(); + } + } +} + + +void * UnixLibraryLoader::GetEntryPoint(const char * function_name) { + void * ret_ptr = {nullptr}; + + if (is_valid_) { + ret_ptr = dlsym(dl_handle_, function_name); + if (nullptr == ret_ptr) { + last_error_msg_ = dlerror(); + 
} + } + + return ret_ptr; +} + + +size_t UnixLibraryLoader::GetEntryPoints(std::map & functions) { + size_t num_found {0}; + + if (is_valid_) { + for (auto& func : functions) { + void * tmp_ptr; + + tmp_ptr = GetEntryPoint(func.first.c_str()); + if (nullptr != tmp_ptr) { + ++num_found; + func.second = tmp_ptr; + } + } + } + + return num_found; +} + +UnixLibCrypto::UnixLibCrypto() + : UnixLibraryLoader(crypto_lib_name_) { + if (is_valid_) { + // size of map minus two since _new/_create and _free/_destroy + // only resolve one of the two. + is_valid_ = ((functions_.size()-2) == GetEntryPoints(functions_)); + + ctx_new_ = (EVP_MD_CTX_new_t) functions_["EVP_MD_CTX_new"]; + if (nullptr == ctx_new_) { + ctx_new_ = (EVP_MD_CTX_new_t) functions_["EVP_MD_CTX_create"]; + } + + digest_init_ = (EVP_DigestInit_ex_t) functions_["EVP_DigestInit_ex"]; + sha1_ = (EVP_sha1_t) functions_["EVP_sha1"]; + digest_update_ = (EVP_DigestUpdate_t) functions_["EVP_DigestUpdate"]; + digest_final_ = (EVP_DigestFinal_ex_t) functions_["EVP_DigestFinal_ex"]; + + ctx_free_ = (EVP_MD_CTX_free_t) functions_["EVP_MD_CTX_free"]; + if (nullptr == ctx_free_) { + ctx_free_ = (EVP_MD_CTX_free_t) functions_["EVP_MD_CTX_destroy"]; + } + + rand_bytes_ = (RAND_bytes_t) functions_["RAND_bytes"]; + rand_poll_ = (RAND_poll_t) functions_["RAND_poll"]; + + cipher_new_ = (EVP_CIPHER_CTX_new_t) functions_["EVP_CIPHER_CTX_new"]; + cipher_free_ = (EVP_CIPHER_CTX_free_t) functions_["EVP_CIPHER_CTX_free"]; + encrypt_init_ = (EVP_EncryptInit_ex_t) functions_["EVP_EncryptInit_ex"]; + aes_256_ctr_ = (EVP_aes_256_ctr_t) functions_["EVP_aes_256_ctr"]; + encrypt_update_ = (EVP_EncryptUpdate_t) functions_["EVP_EncryptUpdate"]; + + } +} + +} // namespace rocksdb diff --git a/util/library_loader.h b/util/library_loader.h new file mode 100644 index 000000000..dd9cffd94 --- /dev/null +++ b/util/library_loader.h @@ -0,0 +1,169 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include +#include +#include + +namespace rocksdb { + +// Base class / interface +// expectation is to derive one class for Unix and one for Windows +// +class LibraryLoader { + public: + LibraryLoader() : is_valid_(false) {}; + virtual ~LibraryLoader() = default; + + bool IsValid() const {return is_valid_;} + + virtual void * GetEntryPoint(const char * function_name) = 0; + + protected: + bool is_valid_; +}; + + +class UnixLibraryLoader : public LibraryLoader { + public: + UnixLibraryLoader() = delete; + + UnixLibraryLoader(const char * library_name); + + virtual ~UnixLibraryLoader(); + + virtual void * GetEntryPoint(const char * function_name) override; + + virtual size_t GetEntryPoints(std::map & functions); + +protected: + void * dl_handle_; + std::string last_error_msg_; +}; + + +class UnixLibCrypto : public UnixLibraryLoader { +public: + UnixLibCrypto(); + virtual ~UnixLibCrypto() = default; + + // _new & _free are ssl 1.1, replacing 1.0 _create & _destroy + using EVP_MD_CTX_new_t = EVP_MD_CTX * (*)(void); + using EVP_DigestInit_ex_t = int (*)(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl); + using EVP_sha1_t = const EVP_MD * (*)(void); + using EVP_DigestUpdate_t = int (*)(EVP_MD_CTX *ctx, const void *d, size_t cnt); + using EVP_DigestFinal_ex_t = int (*)(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s); + using EVP_MD_CTX_free_t = void (*)(EVP_MD_CTX *ctx); + + EVP_MD_CTX * EVP_MD_CTX_new() const {return ctx_new_();}; + + int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { + return digest_init_(ctx, type, impl); + } + + const EVP_MD * EVP_sha1() {return sha1_();} + + int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt) { + return digest_update_(ctx, d, cnt); + } + + int EVP_DigestFinal_ex(EVP_MD_CTX *ctx,
unsigned char *md, unsigned int *s) { + return digest_final_(ctx, md, s); + } + + void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { + ctx_free_(ctx); + } + + EVP_MD_CTX_free_t EVP_MD_CTX_free_ptr() { + return ctx_free_; + } + + using RAND_bytes_t = int (*)(unsigned char *buf, int num); + using RAND_poll_t = int (*)(); + + int RAND_bytes(unsigned char *buf, int num) { + return rand_bytes_(buf, num); + } + + int RAND_poll() { + return rand_poll_(); + } + + using EVP_CIPHER_CTX_new_t = EVP_CIPHER_CTX * (*)(void); + using EVP_CIPHER_CTX_free_t = void (*)(EVP_CIPHER_CTX *ctx); + using EVP_EncryptInit_ex_t = int (*)(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *type, + ENGINE *impl, const unsigned char *key, const unsigned char *iv); + using EVP_aes_256_ctr_t = const EVP_CIPHER * (*)(void); + using EVP_EncryptUpdate_t = int (*)(EVP_CIPHER_CTX *ctx, unsigned char *out, + int *outl, const unsigned char *in, int inl); + + EVP_CIPHER_CTX *EVP_CIPHER_CTX_new(void) const {return cipher_new_();}; + + void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) { + cipher_free_(ctx); + } + + EVP_CIPHER_CTX_free_t EVP_CIPHER_CTX_free_ptr() { + return cipher_free_; + } + + int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *type, + ENGINE *impl, const unsigned char *key, const unsigned char *iv) { + return encrypt_init_(ctx, type, impl, key, iv); + } + + const EVP_CIPHER * EVP_aes_256_ctr() { + return aes_256_ctr_(); + } + + int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, + int *outl, const unsigned char *in, int inl) { + return encrypt_update_(ctx, out, outl, in, inl); + } + + static const char * crypto_lib_name_; + +protected: + std::map functions_ { + {"EVP_MD_CTX_new", nullptr}, {"EVP_MD_CTX_create", nullptr}, + {"EVP_DigestInit_ex", nullptr}, + {"EVP_sha1", nullptr}, + {"EVP_DigestUpdate", nullptr}, + {"EVP_DigestFinal_ex", nullptr}, + {"EVP_MD_CTX_free", nullptr}, {"EVP_MD_CTX_destroy", nullptr}, + + {"RAND_bytes", nullptr}, + {"RAND_poll", nullptr}, + + {"EVP_CIPHER_CTX_new", 
nullptr}, + {"EVP_CIPHER_CTX_free", nullptr}, + {"EVP_EncryptInit_ex", nullptr}, + {"EVP_aes_256_ctr", nullptr}, + {"EVP_EncryptUpdate", nullptr}, + + }; + + EVP_MD_CTX_new_t ctx_new_; + EVP_DigestInit_ex_t digest_init_; + EVP_sha1_t sha1_; + EVP_DigestUpdate_t digest_update_; + EVP_DigestFinal_ex_t digest_final_; + EVP_MD_CTX_free_t ctx_free_; + + RAND_bytes_t rand_bytes_; + RAND_poll_t rand_poll_; + + EVP_CIPHER_CTX_new_t cipher_new_; + EVP_CIPHER_CTX_free_t cipher_free_; + EVP_EncryptInit_ex_t encrypt_init_; + EVP_aes_256_ctr_t aes_256_ctr_; + EVP_EncryptUpdate_t encrypt_update_; + +}; + +} // namespace rocksdb diff --git a/util/library_loader_test.cc b/util/library_loader_test.cc new file mode 100644 index 000000000..cb8e094bd --- /dev/null +++ b/util/library_loader_test.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include + +#include "util/library_loader.h" + +namespace rocksdb { + +#ifdef OS_MACOSX + static const char * LIB_M_NAME = "libm.dylib"; + static const char * LIB_BAD_NAME = "libbubbagump.dylib"; + static const char * LIB_SSL_NAME = "libssl.dylib"; +#else + static const char * LIB_M_NAME = "libm.so.6"; + static const char * LIB_BAD_NAME = "libbubbagump.so"; + static const char * LIB_SSL_NAME = "libssl.so"; +#endif + + +class UnixLibraryLoaderTest {}; + +TEST(UnixLibraryLoaderTest, Simple) { + + UnixLibraryLoader works(LIB_M_NAME); + UnixLibraryLoader fails(LIB_BAD_NAME); + + ASSERT_TRUE(works.IsValid()); + ASSERT_FALSE(fails.IsValid()); + + double (*floor)(double); + + floor = (double (*)(double))works.GetEntryPoint("floor"); + ASSERT_TRUE(nullptr != floor); + ASSERT_TRUE(2.0 == (*floor)(2.2)); + +} + +TEST(UnixLibraryLoaderTest, SSL) { + UnixLibraryLoader ssl(LIB_SSL_NAME); + UnixLibraryLoader crypto(UnixLibCrypto::crypto_lib_name_); + + ASSERT_TRUE(ssl.IsValid()); + ASSERT_TRUE(crypto.IsValid()); + +} + +TEST(UnixLibraryLoaderTest, Crypto) { + UnixLibCrypto crypto; + uint8_t desc[EVP_MAX_MD_SIZE]; + EVP_MD_CTX * context; + int ret_val; + unsigned len; + + ASSERT_TRUE(crypto.IsValid()); + + context = crypto.EVP_MD_CTX_create(); + ASSERT_TRUE(nullptr != context); + + ret_val = crypto.EVP_DigestInit_ex(context, crypto.EVP_sha1(), nullptr); + ASSERT_TRUE(1 == ret_val); + + ret_val = crypto.EVP_DigestUpdate(context, "1", 1); + ASSERT_TRUE(1 == ret_val); + + ret_val = crypto.EVP_DigestFinal_ex(context, desc, &len); + ASSERT_TRUE(1 == ret_val); + ASSERT_TRUE(20 == len); + + uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 0x79, 0x13, 0xb0, 0x4c, + 0x54, 0x57, 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, + 0x39, 0x54, 0x28, 0xab}; + ASSERT_TRUE(0 == memcmp(md2, desc, sizeof(md2))); + + crypto.EVP_MD_CTX_free(context); + +} + + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git 
a/util/sync_point.h b/util/sync_point.h index 7aa114f2f..b9767d3d3 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -130,12 +130,12 @@ class SyncPoint { // utilized to re-produce race conditions between threads. // See TransactionLogIteratorRace in db_test.cc for an example use case. // TEST_SYNC_POINT is no op in release build. -#define TEST_SYNC_POINT(x) - //rocksdb::SyncPoint::GetInstance()->Process(x) +#define TEST_SYNC_POINT(x) \ + rocksdb::SyncPoint::GetInstance()->Process(x) #define TEST_IDX_SYNC_POINT(x, index) \ - //rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index)) + rocksdb::SyncPoint::GetInstance()->Process(x + std::to_string(index)) #define TEST_SYNC_POINT_CALLBACK(x, y) \ - //rocksdb::SyncPoint::GetInstance()->Process(x, y) + rocksdb::SyncPoint::GetInstance()->Process(x, y) #define INIT_SYNC_POINT_SINGLETONS() \ (void)rocksdb::SyncPoint::GetInstance(); #endif // NDEBUG From a7d2e8d154d250532b924dd264d7a190e3103169 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Fri, 5 Jun 2020 10:23:40 -0400 Subject: [PATCH 46/57] fill in missing CompactionJobInfo member. (#8) Co-authored-by: matthewvon --- db/db_impl_compaction_flush.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/db/db_impl_compaction_flush.cc b/db/db_impl_compaction_flush.cc index a42e60f85..53907d178 100644 --- a/db/db_impl_compaction_flush.cc +++ b/db/db_impl_compaction_flush.cc @@ -1057,6 +1057,7 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info; + info.cf_id = cfd->GetID(); info.cf_name = cfd->GetName(); info.status = st; info.thread_id = env_->GetThreadID(); From 5cb15956a2e646b6b407a476168a04dfe993ade2 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Fri, 12 Jun 2020 10:36:33 -0400 Subject: [PATCH 47/57] default skiplist height same as original leveldb which had 4M write buffers. Raising to 17 to match Basho. 
Have seen 5% improvement in bsbm100m create. (#9) Co-authored-by: MatthewVon --- memtable/inlineskiplist.h | 2 +- memtable/skiplist.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/memtable/inlineskiplist.h b/memtable/inlineskiplist.h index 1ef8f2b6d..3b12a42af 100644 --- a/memtable/inlineskiplist.h +++ b/memtable/inlineskiplist.h @@ -72,7 +72,7 @@ class InlineSkipList { // in the allocator must remain allocated for the lifetime of the // skiplist object. explicit InlineSkipList(Comparator cmp, Allocator* allocator, - int32_t max_height = 12, + int32_t max_height = 17, int32_t branching_factor = 4); // Allocates a key and a skip-list node, returning a pointer to the key diff --git a/memtable/skiplist.h b/memtable/skiplist.h index 47a89034e..22c9690c3 100644 --- a/memtable/skiplist.h +++ b/memtable/skiplist.h @@ -50,7 +50,7 @@ class SkipList { // and will allocate memory using "*allocator". Objects allocated in the // allocator must remain allocated for the lifetime of the skiplist object. explicit SkipList(Comparator cmp, Allocator* allocator, - int32_t max_height = 12, int32_t branching_factor = 4); + int32_t max_height = 17, int32_t branching_factor = 4); // Insert key into the list. // REQUIRES: nothing that compares equal to key is currently in the list. 
From 844661eeea4259a5e782437eb55221cd8cd51c4a Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Tue, 23 Jun 2020 14:16:14 -0400 Subject: [PATCH 48/57] Porting Facebook required changes back to Stardog (#10) * add ROCKSDB_LITE ifdef around encryption * adjust code per Facebook PR comments * adjust code per Facebook PR comments Co-authored-by: matthewvon --- env/env_basic_test.cc | 8 +++---- env/env_encryption.cc | 14 +++++------ include/rocksdb/env_encryption.h | 40 +++++++++++++++++++------------- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 42a6a98ef..54a492939 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -12,8 +12,8 @@ #include "env/mock_env.h" #include "rocksdb/env.h" #include "rocksdb/env_encryption.h" -#include "rocksdb/utilities/object_registry.h" #include "util/testharness.h" + namespace rocksdb { // Normalizes trivial differences across Envs such that these test cases can @@ -91,6 +91,7 @@ static std::unique_ptr mock_env(new MockEnv(Env::Default())); INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, ::testing::Values(mock_env.get())); +#ifndef ROCKSDB_LITE // next statements run env test against default encryption code. static ROT13BlockCipher encrypt_block_rot13(32); @@ -102,6 +103,7 @@ INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, ::testing::Values(encrypt_env.get())); INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, ::testing::Values(encrypt_env.get())); +#endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE static std::unique_ptr mem_env(NewMemEnv(Env::Default())); @@ -117,13 +119,11 @@ namespace { // ValuesIn() will skip running tests when given an empty collection. 
std::vector GetCustomEnvs() { static Env* custom_env; - static std::unique_ptr custom_env_guard; static bool init = false; if (!init) { init = true; const char* uri = getenv("TEST_ENV_URI"); if (uri != nullptr) { - custom_env = NewCustomObject(uri, &custom_env_guard); // Env::LoadEnv(uri, &custom_env); } } @@ -363,7 +363,7 @@ TEST_P(EnvMoreTestWithParam, GetChildren) { ASSERT_EQ(0U, children.size()); } -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/env/env_encryption.cc b/env/env_encryption.cc index dfbcccaf4..df7cf504e 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -405,7 +405,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedSequentialFile( - underlying.release(), stream.release(), prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -446,7 +446,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedRandomAccessFile( - underlying.release(), stream.release(), prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -489,7 +489,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -538,7 +538,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -583,7 +583,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), 
prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -644,7 +644,7 @@ class EncryptedEnv : public EnvWrapper { return status; } (*result) = std::unique_ptr(new EncryptedRandomRWFile( - underlying.release(), stream.release(), prefixLength)); + std::move(underlying), std::move(stream), prefixLength)); return Status::OK(); } @@ -961,4 +961,4 @@ Status CTREncryptionProvider::CreateCipherStreamFromPrefix( #endif // ROCKSDB_LITE -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index ff2898f4f..f8c763b83 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -205,7 +205,7 @@ class CTREncryptionProvider : public EncryptionProvider { }; class EncryptedSequentialFile : public SequentialFile { - private: + protected: std::unique_ptr file_; std::unique_ptr stream_; uint64_t offset_; @@ -214,10 +214,11 @@ class EncryptedSequentialFile : public SequentialFile { public: // Default ctor. Given underlying sequential file is supposed to be at // offset == prefixLength. - EncryptedSequentialFile(SequentialFile* f, BlockAccessCipherStream* s, + EncryptedSequentialFile(std::unique_ptr&& f, + std::unique_ptr&& s, size_t prefixLength) - : file_(f), - stream_(s), + : file_(std::move(f)), + stream_(std::move(s)), offset_(prefixLength), prefixLength_(prefixLength) {} @@ -261,15 +262,18 @@ class EncryptedSequentialFile : public SequentialFile { // A file abstraction for randomly reading the contents of a file. 
class EncryptedRandomAccessFile : public RandomAccessFile { - private: + protected: std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomAccessFile(RandomAccessFile* f, BlockAccessCipherStream* s, + EncryptedRandomAccessFile(std::unique_ptr&& f, + std::unique_ptr&& s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) {} + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} // Read up to "n" bytes from the file starting at "offset". // "scratch[0..n-1]" may be written by this routine. Sets "*result" @@ -324,18 +328,19 @@ class EncryptedRandomAccessFile : public RandomAccessFile { // must provide buffering since callers may append small fragments // at a time to the file. class EncryptedWritableFile : public WritableFileWrapper { - private: + protected: std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFile(WritableFile* f, BlockAccessCipherStream* s, + EncryptedWritableFile(std::unique_ptr&& f, + std::unique_ptr&& s, size_t prefixLength) - : WritableFileWrapper(f), - file_(f), - stream_(s), + : WritableFileWrapper(f.get()), + file_(std::move(f)), + stream_(std::move(s)), prefixLength_(prefixLength) {} Status Append(const Slice& data) override; @@ -388,15 +393,18 @@ class EncryptedWritableFile : public WritableFileWrapper { // A file abstraction for random reading and writing. 
class EncryptedRandomRWFile : public RandomRWFile { - private: + protected: std::unique_ptr file_; std::unique_ptr stream_; size_t prefixLength_; public: - EncryptedRandomRWFile(RandomRWFile* f, BlockAccessCipherStream* s, + EncryptedRandomRWFile(std::unique_ptr&& f, + std::unique_ptr&& s, size_t prefixLength) - : file_(f), stream_(s), prefixLength_(prefixLength) {} + : file_(std::move(f)), + stream_(std::move(s)), + prefixLength_(prefixLength) {} // Indicates if the class makes use of direct I/O // If false you must pass aligned buffer to Write() @@ -425,6 +433,6 @@ class EncryptedRandomRWFile : public RandomRWFile { virtual Status Close() override; }; -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // !defined(ROCKSDB_LITE) From 2c35685e2e3b830880e054236f46ad0db0ba5df6 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Tue, 30 Jun 2020 13:41:02 -0400 Subject: [PATCH 49/57] previous PR missed ifdefs within library_loader stuff and switch to unique_ptr in env_encryption declarations. added change to build_detect_platform that only benefits developer builds of rocksdb, i.e. Alex and me (#11) Co-authored-by: matthewvon --- build_tools/build_detect_platform | 12 ++++++++++++ env/env_encrypt2.cc | 12 ++++++------ util/library_loader.cc | 11 ++++++++++- util/library_loader.h | 13 +++++++++++-- util/library_loader_test.cc | 9 +++++++-- 5 files changed, 46 insertions(+), 11 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index f237fa4ee..dd66d6f37 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -485,6 +485,18 @@ EOF PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -faligned-new -DHAVE_ALIGNED_NEW" fi fi + + if ! test $ROCKSDB_DISABLE_OPENSSL; then + # Test whether openssl is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + int main() {} +EOF + if [ "$?" 
= 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_OPENSSL_AES_CTR" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ldl" + fi + fi fi # TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning. diff --git a/env/env_encrypt2.cc b/env/env_encrypt2.cc index a2af1087d..7f72ad821 100644 --- a/env/env_encrypt2.cc +++ b/env/env_encrypt2.cc @@ -252,7 +252,7 @@ Status EncryptedEnv2::NewSequentialFile(const std::string& fname, if (status.ok()) { if (provider) { (*result) = std::unique_ptr( - new EncryptedSequentialFile(underlying.release(), stream.release(), + new EncryptedSequentialFile(std::move(underlying), std::move(stream), provider->GetPrefixLength())); } else { @@ -290,7 +290,7 @@ Status EncryptedEnv2::NewRandomAccessFile( if (provider) { (*result) = std::unique_ptr(new EncryptedRandomAccessFile( - underlying.release(), stream.release(), + std::move(underlying), std::move(stream), provider->GetPrefixLength())); } else { @@ -322,7 +322,7 @@ Status EncryptedEnv2::NewWritableFile(const std::string& fname, if (status.ok()) { (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), + std::move(underlying), std::move(stream), encrypt_write_.second->GetPrefixLength())); } } else { @@ -362,7 +362,7 @@ Status EncryptedEnv2::ReopenWritableFile(const std::string& fname, if (status.ok()) { (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), + std::move(underlying), std::move(stream), encrypt_write_.second->GetPrefixLength())); } } else { @@ -398,7 +398,7 @@ Status EncryptedEnv2::ReuseWritableFile(const std::string& fname, if (status.ok()) { (*result) = std::unique_ptr(new EncryptedWritableFile( - underlying.release(), stream.release(), + std::move(underlying), std::move(stream), encrypt_write_.second->GetPrefixLength())); } } else { @@ -452,7 +452,7 @@ Status EncryptedEnv2::NewRandomRWFile(const std::string& fname, if (status.ok()) { if (provider) { (*result) = std::unique_ptr( - new 
EncryptedRandomRWFile(underlying.release(), stream.release(), + new EncryptedRandomRWFile(std::move(underlying), std::move(stream), provider->GetPrefixLength())); } else { (*result).reset(underlying.release()); diff --git a/util/library_loader.cc b/util/library_loader.cc index 081a929a2..a69f0326c 100644 --- a/util/library_loader.cc +++ b/util/library_loader.cc @@ -3,9 +3,15 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + #include "util/library_loader.h" #include +#include + +#include "rocksdb/status.h" // link with -ldl @@ -16,7 +22,7 @@ namespace rocksdb { #else const char * UnixLibCrypto::crypto_lib_name_ = "libcrypto.so"; #endif - + UnixLibraryLoader::UnixLibraryLoader(const char * library_name) : dl_handle_(nullptr) { @@ -112,3 +118,6 @@ UnixLibCrypto::UnixLibCrypto() } } // namespace rocksdb + +#endif // ROCKSDB_LITE +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/util/library_loader.h b/util/library_loader.h index dd9cffd94..a9dacaabc 100644 --- a/util/library_loader.h +++ b/util/library_loader.h @@ -5,9 +5,16 @@ #pragma once +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include + #include +#include #include -#include + +#include "rocksdb/env.h" namespace rocksdb { @@ -127,7 +134,7 @@ class UnixLibCrypto : public UnixLibraryLoader { } static const char * crypto_lib_name_; - + protected: std::map functions_ { {"EVP_MD_CTX_new", nullptr}, {"EVP_MD_CTX_create", nullptr}, @@ -167,3 +174,5 @@ class UnixLibCrypto : public UnixLibraryLoader { }; } // namespace rocksdb +#endif // ROCKSDB_LITE +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/util/library_loader_test.cc b/util/library_loader_test.cc index cb8e094bd..fcbe6c380 100644 --- a/util/library_loader_test.cc +++ b/util/library_loader_test.cc @@ -5,6 +5,9 @@ #include +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + #include "util/library_loader.h" namespace 
rocksdb { @@ -18,7 +21,7 @@ namespace rocksdb { static const char * LIB_BAD_NAME = "libbubbagump.so"; static const char * LIB_SSL_NAME = "libssl.so"; #endif - + class UnixLibraryLoaderTest {}; @@ -78,9 +81,11 @@ TEST(UnixLibraryLoaderTest, Crypto) { } - } // namespace rocksdb +#endif // ROCKSDB_LITE +#endif // ROCKSDB_OPENSSL_AES_CTR + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From 4569cef318dac46412d86b1ee4e14d8d4ebc852d Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Thu, 9 Jul 2020 12:16:29 -0400 Subject: [PATCH 50/57] Backport changes to encryption that Facebook required. (#12) * this is a current file in facebook/rocksdb. copying it into our older code to simplify backporting encryption. * backport DynamicLibrary and LoadLibrary from rocksdb master * add missing line for CTR AES build * copy encryption updates required by Facebook * add new include file to stardog bazel build file * there is a likely code path where valid_ is not set in EncryptedEnvV2. this corrects. 
* correct code and usage of EncryptedRandomRWFileV2 * fix formating and grammar in some logging * per man page, switch to EVP_EncryptFinal_ex as soft reset of context object for reuse Co-authored-by: matthewvon --- build_tools/build_detect_platform | 1 + env/BUILD | 2 + env/env_encrypt2.cc | 747 ++++++++++++++++++++++------ env/env_encrypt2_impl.h | 81 +++ env/env_encrypt2_test.cc | 227 +++++---- env/env_encryption.cc | 206 ++++---- env/env_posix.cc | 97 ++++ include/rocksdb/env.h | 34 ++ include/rocksdb/env_encrypt2.h | 433 ++++++---------- include/rocksdb/env_encryption.h | 13 +- include/rocksdb/rocksdb_namespace.h | 10 + table/block_based_table_reader.cc | 18 +- util/library_loader.cc | 105 ++-- util/library_loader.h | 156 +++--- util/library_loader_test.cc | 41 +- 15 files changed, 1341 insertions(+), 830 deletions(-) create mode 100644 env/env_encrypt2_impl.h create mode 100644 include/rocksdb/rocksdb_namespace.h diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index dd66d6f37..e5c37a439 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -624,6 +624,7 @@ echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT" echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" +echo "ROCKSDB_OPENSSL_AES_CTR=$ROCKSDB_OPENSSL_AES_CTR" >> "$OUTPUT" echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT" diff --git a/env/BUILD b/env/BUILD index 22176e63d..aaa39fcb8 100644 --- a/env/BUILD +++ b/env/BUILD @@ -5,11 +5,13 @@ COMMON_HDRS = [] PLATFORM_HDRS = select({ "//:linux" : [ "env_chroot.h", + "env_encrypt2_impl.h", "io_posix.h", "posix_logger.h", ], "//:osx" : [ "env_chroot.h", + "env_encrypt2_impl.h", "io_posix.h", "posix_logger.h", ], diff --git a/env/env_encrypt2.cc b/env/env_encrypt2.cc index 7f72ad821..290a649a2 
100644 --- a/env/env_encrypt2.cc +++ b/env/env_encrypt2.cc @@ -15,48 +15,45 @@ #include #include +#include "env/env_encrypt2_impl.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" -#endif +namespace ROCKSDB_NAMESPACE { -namespace rocksdb { +static port::RWMutex key_lock; -// following define block from page 70: -// https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf -#if !defined(ALIGN16) -#if defined(__GNUC__) -#define ALIGN16 __attribute__((aligned(16))) -#else -#define ALIGN16 __declspec(align(16)) -#endif -#endif +// reuse cipher context between calls to Encrypt & Decrypt +static void do_nothing(EVP_CIPHER_CTX*){}; +thread_local static std::unique_ptr + aes_context(nullptr, &do_nothing); -#ifndef ROCKSDB_LITE - -Sha1Description_t::Sha1Description_t(const std::string& key_desc_str) { +Sha1Description::Sha1Description(const std::string& key_desc_str) { bool good = {true}; int ret_val; unsigned len; memset(desc, 0, EVP_MAX_MD_SIZE); - if (0 != key_desc_str.length() && EncryptedEnv2::crypto_.IsValid()) { + if (0 != key_desc_str.length() && EncryptedEnvV2::crypto_.IsValid()) { std::unique_ptr context( - EncryptedEnv2::crypto_.EVP_MD_CTX_new(), - EncryptedEnv2::crypto_.EVP_MD_CTX_free_ptr()); + EncryptedEnvV2::crypto_.EVP_MD_CTX_new(), + EncryptedEnvV2::crypto_.EVP_MD_CTX_free_ptr()); - ret_val = EncryptedEnv2::crypto_.EVP_DigestInit_ex( - context.get(), EncryptedEnv2::crypto_.EVP_sha1(), nullptr); + ret_val = EncryptedEnvV2::crypto_.EVP_DigestInit_ex( + context.get(), EncryptedEnvV2::crypto_.EVP_sha1(), nullptr); good = (1 == ret_val); if (good) { - ret_val = EncryptedEnv2::crypto_.EVP_DigestUpdate( + ret_val = EncryptedEnvV2::crypto_.EVP_DigestUpdate( context.get(), key_desc_str.c_str(), key_desc_str.length()); good = (1 == ret_val); } if (good) { ret_val = - 
EncryptedEnv2::crypto_.EVP_DigestFinal_ex(context.get(), desc, &len); + EncryptedEnvV2::crypto_.EVP_DigestFinal_ex(context.get(), desc, &len); good = (1 == ret_val); } } else { @@ -66,7 +63,7 @@ Sha1Description_t::Sha1Description_t(const std::string& key_desc_str) { valid = good; } -AesCtrKey_t::AesCtrKey_t(const std::string& key_str) : valid(false) { +AesCtrKey::AesCtrKey(const std::string& key_str) : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); // simple parse: must be 64 characters long and hexadecimal values @@ -82,61 +79,185 @@ AesCtrKey_t::AesCtrKey_t(const std::string& key_str) : valid(false) { } } + +void AESBlockAccessCipherStream::BigEndianAdd128(uint8_t* buf, + uint64_t value) { + uint8_t *sum, *addend, *carry, pre, post; + + sum = buf + 15; + + if (port::kLittleEndian) { + addend = (uint8_t*)&value; + } else { + addend = (uint8_t*)&value + 7; + } + + // future: big endian could be written as uint64_t add + for (int loop = 0; loop < 8 && value; ++loop) { + pre = *sum; + *sum += *addend; + post = *sum; + --sum; + value >>= 8; + + carry = sum; + // carry? 
+ while (post < pre && buf <= carry) { + pre = *carry; + *carry += 1; + post = *carry; + --carry; + } + } // for +} + // // AES_BLOCK_SIZE assumed to be 16 // typedef union { uint64_t nums[2]; uint8_t bytes[AES_BLOCK_SIZE]; -} AesAlignedBlock_t; +} AesAlignedBlock; + +// "data" is assumed to be aligned at AES_BLOCK_SIZE or greater +Status AESBlockAccessCipherStream::Encrypt(uint64_t file_offset, char* data, + size_t data_size) { + Status status; + if (0 < data_size) { + if (EncryptedEnvV2::crypto_.IsValid()) { + int ret_val, out_len; + ALIGN16 AesAlignedBlock iv; + uint64_t block_index = file_offset / BlockSize(); + + // make a context once per thread + if (!aes_context) { + aes_context = + std::unique_ptr( + EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_new(), + EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_free_ptr()); + } + + memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE); + BigEndianAdd128(iv.bytes, block_index); + + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptInit_ex( + aes_context.get(), EncryptedEnvV2::crypto_.EVP_aes_256_ctr(), + nullptr, key_.key, iv.bytes); + if (1 == ret_val) { + out_len = 0; + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + aes_context.get(), (unsigned char*)data, &out_len, + (unsigned char*)data, (int)data_size); + + if (1 != ret_val || (int)data_size != out_len) { + status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", + (int)data_size == out_len + ? 
"bad return value" + : "output length short"); + } + // this is a soft reset of aes_context per man pages + uint8_t temp_buf[AES_BLOCK_SIZE]; + out_len = 0; + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptFinal_ex( + aes_context.get(), temp_buf, &out_len); + + if (1 != ret_val || 0 != out_len) { + status = Status::InvalidArgument("EVP_EncryptFinal_ex failed."); + } + } else { + status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); + } + } else { + status = Status::NotSupported( + "libcrypto not available for encryption/decryption."); + } + } -Status AESBlockAccessCipherStream::EncryptBlock(uint64_t blockIndex, char* data, - char* /*scratch*/) { - // - // AES_BLOCK_SIZE assumed to be 16 - // - assert(AES_BLOCK_SIZE == 16); - assert(sizeof(AesAlignedBlock_t) == AES_BLOCK_SIZE); + return status; +} + +// Decrypt one or more (partial) blocks of data at the file offset. +// Length of data is given in data_size. +// CTR Encrypt and Decrypt are synonyms. Using Encrypt calls here to reduce +// count of symbols loaded from libcrypto. 
+Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, + size_t data_size) { + // Calculate block index + size_t block_size = BlockSize(); + uint64_t block_index = file_offset / block_size; + size_t block_offset = file_offset % block_size; + size_t remaining = data_size; + size_t prefix_size = 0; + uint8_t temp_buf[block_size]; Status status; - ALIGN16 AesAlignedBlock_t block_in, block_out, iv; - int out_len = 0, in_len = {AES_BLOCK_SIZE}, ret_val; - - if (EncryptedEnv2::crypto_.IsValid()) { - std::unique_ptr context( - EncryptedEnv2::crypto_.EVP_CIPHER_CTX_new(), - EncryptedEnv2::crypto_.EVP_CIPHER_CTX_free_ptr()); - - // https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf - memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE / 2); - EncodeFixed64((char*)&iv.bytes[AES_BLOCK_SIZE / 2], - blockIndex); // this will be little endian - block_in.nums[0] = 0; - block_in.nums[1] = 0; - - ret_val = EncryptedEnv2::crypto_.EVP_EncryptInit_ex( - context.get(), EncryptedEnv2::crypto_.EVP_aes_256_ctr(), nullptr, + ALIGN16 AesAlignedBlock iv; + int out_len = 0, ret_val; + + if (EncryptedEnvV2::crypto_.IsValid()) { + // make a context once per thread + if (!aes_context) { + aes_context = std::unique_ptr( + EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_new(), + EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_free_ptr()); + } + + memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE); + BigEndianAdd128(iv.bytes, block_index); + + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptInit_ex( + aes_context.get(), EncryptedEnvV2::crypto_.EVP_aes_256_ctr(), nullptr, key_.key, iv.bytes); if (1 == ret_val) { - ret_val = EncryptedEnv2::crypto_.EVP_EncryptUpdate( - context.get(), block_out.bytes, &out_len, block_in.bytes, in_len); - - if (1 != ret_val || AES_BLOCK_SIZE != out_len) { - status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", - AES_BLOCK_SIZE == out_len - ? 
"bad return value" - : "output length short"); + // handle uneven block start + if (0 != block_offset) { + prefix_size = block_size - block_offset; + if (data_size < prefix_size) { + prefix_size = data_size; + } + + memcpy(temp_buf + block_offset, data, prefix_size); + out_len = 0; + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + aes_context.get(), temp_buf, &out_len, temp_buf, (int)block_size); + + if (1 != ret_val || (int)block_size != out_len) { + status = Status::InvalidArgument("EVP_EncryptUpdate failed 1: ", + (int)block_size == out_len + ? "bad return value" + : "output length short"); + } else { + memcpy(data, temp_buf + block_offset, prefix_size); + } + } + + // all remaining data, even block size not required + remaining -= prefix_size; + if (status.ok() && remaining) { + out_len = 0; + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + aes_context.get(), (uint8_t*)data + prefix_size, &out_len, + (uint8_t*)data + prefix_size, (int)remaining); + + if (1 != ret_val || (int)remaining != out_len) { + status = Status::InvalidArgument("EVP_EncryptUpdate failed 2: ", + (int)remaining == out_len + ? "bad return value" + : "output length short"); + } + } + + // this is a soft reset of aes_context per man pages + out_len = 0; + ret_val = EncryptedEnvV2::crypto_.EVP_EncryptFinal_ex( + aes_context.get(), temp_buf, &out_len); + + if (1 != ret_val || 0 != out_len) { + status = Status::InvalidArgument("EVP_EncryptFinal_ex failed."); } } else { status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); } - - // XOR data with ciphertext. 
- uint64_t* data_ptr; - data_ptr = (uint64_t*)data; - *data_ptr ^= block_out.nums[0]; - data_ptr = (uint64_t*)(data + 8); - *data_ptr ^= block_out.nums[1]; } else { status = Status::NotSupported( "libcrypto not available for encryption/decryption."); @@ -145,24 +266,18 @@ Status AESBlockAccessCipherStream::EncryptBlock(uint64_t blockIndex, char* data, return status; } -Status AESBlockAccessCipherStream::DecryptBlock(uint64_t blockIndex, char* data, - char* scratch) { - return EncryptBlock(blockIndex, data, scratch); -} - -Status CTREncryptionProvider2::CreateNewPrefix(const std::string& /*fname*/, - char* prefix, - size_t prefixLength) { +Status CTREncryptionProviderV2::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) const { Status s; - if (EncryptedEnv2::crypto_.IsValid()) { - if (sizeof(Prefix0_t) <= prefixLength) { + if (EncryptedEnvV2::crypto_.IsValid()) { + if (sizeof(PrefixVersion0) <= prefixLength) { int ret_val; - Prefix0_t* pf = {(Prefix0_t*)prefix}; + PrefixVersion0* pf = {(PrefixVersion0*)prefix}; memcpy(pf->key_description_, key_desc_.desc, sizeof(key_desc_.desc)); - ret_val = EncryptedEnv2::crypto_.RAND_bytes( - (unsigned char*)&pf->nonce_, - AES_BLOCK_SIZE / 2); // RAND_poll() to initialize + ret_val = EncryptedEnvV2::crypto_.RAND_bytes((unsigned char*)&pf->nonce_, + AES_BLOCK_SIZE); if (1 != ret_val) { s = Status::NotSupported("RAND_bytes failed"); } @@ -176,19 +291,128 @@ Status CTREncryptionProvider2::CreateNewPrefix(const std::string& /*fname*/, return s; } +size_t CTREncryptionProviderV2::GetPrefixLength() const { + return sizeof(PrefixVersion0) + sizeof(EncryptMarker); +} + +BlockAccessCipherStream* CTREncryptionProviderV2::CreateCipherStream2( + uint8_t code_version, const uint8_t nonce[]) const { + return new AESBlockAccessCipherStream(key_, code_version, nonce); +} + +Status EncryptedWritableFileV2::Append(const Slice& data) { + AlignedBuffer buf; + Status status; + Slice dataToAppend(data); + if 
(data.size() > 0) { + size_t block_size = stream_->BlockSize(); + uint64_t offset = file_->GetFileSize(); // size including prefix + uint64_t block_offset = offset % block_size; + + // Encrypt in cloned buffer + buf.Alignment(block_size); + // worst case is one byte only in first and in last block, + // so 2*block_size-2 might be needed (simplified to 2*block_size) + buf.AllocateNewBuffer(data.size() + 2 * block_size); + memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); + buf.Size(data.size() + block_offset); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), + buf.CurrentSize()); + } + if (status.ok()) { + dataToAppend = Slice(buf.BufferStart() + block_offset, data.size()); + } + } + + if (status.ok()) { + status = file_->Append(dataToAppend); + } + + return status; +} + +Status EncryptedWritableFileV2::PositionedAppend(const Slice& data, + uint64_t offset) { + AlignedBuffer buf; + Status status; + Slice dataToAppend(data); + offset += prefixLength_; + if (data.size() > 0) { + size_t block_size = stream_->BlockSize(); + uint64_t block_offset = offset % block_size; + + // Encrypt in cloned buffer + buf.Alignment(block_size); + // worst case is one byte only in first and in last block, + // so 2*block_size-2 might be needed (simplified to 2*block_size) + buf.AllocateNewBuffer(data.size() + 2 * block_size); + memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); + buf.Size(data.size() + block_offset); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), + buf.CurrentSize()); + } + if (status.ok()) { + dataToAppend = Slice(buf.BufferStart() + block_offset, data.size()); + } + } + + if (status.ok()) { + status = file_->PositionedAppend(dataToAppend, offset); + } + + return status; +} + +Status EncryptedRandomRWFileV2::Write(uint64_t offset, const Slice& data) { + AlignedBuffer buf; + Status status; + Slice 
dataToWrite(data); + offset += prefixLength_; + if (data.size() > 0) { + size_t block_size = stream_->BlockSize(); + uint64_t block_offset = offset % block_size; + + // Encrypt in cloned buffer + buf.Alignment(block_size); + // worst case is one byte only in first and in last block, + // so 2*block_size-2 might be needed (simplified to 2*block_size) + buf.AllocateNewBuffer(data.size() + 2 * block_size); + memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); + buf.Size(data.size() + block_offset); + { + PERF_TIMER_GUARD(encrypt_data_nanos); + status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), + buf.CurrentSize()); + } + if (status.ok()) { + dataToWrite = Slice(buf.BufferStart()+block_offset, data.size()); + } + } + + if (status.ok()) { + status = file_->Write(offset, dataToWrite); + } + + return status; +} + // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv2(Env* base_env, EncryptedEnv2::ReadKeys_t encrypt_read, - EncryptedEnv2::WriteKey_t encrypt_write) { +Env* NewEncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, + EncryptedEnvV2::WriteKey encrypt_write) { Env* ret_env{base_env}; - EncryptedEnv2* new_env{nullptr}; + EncryptedEnvV2* new_env{nullptr}; if (Env::Default() == base_env) { // use safer static construction so libcrypto is synchronously loaded new_env = - (EncryptedEnv2*)EncryptedEnv2::Default(encrypt_read, encrypt_write); + (EncryptedEnvV2*)EncryptedEnvV2::Default(encrypt_read, encrypt_write); } else if (nullptr != base_env) { - new_env = new EncryptedEnv2(base_env, encrypt_read, encrypt_write); + new_env = new EncryptedEnvV2(base_env, encrypt_read, encrypt_write); } // warning, dynamic loading of libcrypto could be delayed ... 
making this @@ -200,13 +424,12 @@ Env* NewEncryptedEnv2(Env* base_env, EncryptedEnv2::ReadKeys_t encrypt_read, return ret_env; } -EncryptedEnv2::EncryptedEnv2(Env* base_env, - EncryptedEnv2::ReadKeys_t encrypt_read, - EncryptedEnv2::WriteKey_t encrypt_write) - : EnvWrapper(base_env), - encrypt_read_(encrypt_read), - encrypt_write_(encrypt_write), - valid_(false) { +EncryptedEnvV2::EncryptedEnvV2(Env* base_env, + EncryptedEnvV2::ReadKeys encrypt_read, + EncryptedEnvV2::WriteKey encrypt_write) + : EnvWrapper(base_env), valid_(false) { + SetKeys(encrypt_read, encrypt_write); + valid_ = crypto_.IsValid(); // warning, dynamic loading of libcrypto could be delayed ... making this @@ -216,27 +439,211 @@ EncryptedEnv2::EncryptedEnv2(Env* base_env, } } -EncryptedEnv2::EncryptedEnv2(Env* base_env) - : EnvWrapper(base_env), valid_(false) {} +EncryptedEnvV2::EncryptedEnvV2(Env* base_env) + : EnvWrapper(base_env), valid_(false) { -void EncryptedEnv2::SetKeys(EncryptedEnv2::ReadKeys_t encrypt_read, - EncryptedEnv2::WriteKey_t encrypt_write) { + valid_ = crypto_.IsValid(); + if (IsValid()) { + crypto_.RAND_poll(); + } +} + +void EncryptedEnvV2::SetKeys(EncryptedEnvV2::ReadKeys encrypt_read, + EncryptedEnvV2::WriteKey encrypt_write) { + key_lock.WriteLock(); encrypt_read_ = encrypt_read; encrypt_write_ = encrypt_write; + key_lock.WriteUnlock(); - valid_ = crypto_.IsValid(); +} - if (IsValid()) { - crypto_.RAND_poll(); +bool EncryptedEnvV2::IsWriteEncrypted() const { + key_lock.ReadLock(); + bool ret_flag = (nullptr != encrypt_write_.second); + key_lock.ReadUnlock(); + return ret_flag; +} + +// +// common functions used with different file types +// (because there is no common base class for the file types) +// +template +Status EncryptedEnvV2::ReadSeqEncryptionPrefix( + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream) { + Status status; + + provider.reset(); // nullptr for provider implies "no encryption" + stream.reset(); // destroy any stream the caller passed in; release() would leak it + + // Look for encryption marker +
EncryptMarker marker; + Slice marker_slice; + status = f->Read(sizeof(marker), &marker_slice, marker); + if (status.ok()) { + if (sizeof(marker) == marker_slice.size() && + marker_slice.starts_with(kEncryptMarker)) { + // last byte of the marker carries the code version + uint8_t code_version = (uint8_t)marker_slice[7]; + + if (kEncryptCodeVersion0 == code_version) { + Slice prefix_slice; + PrefixVersion0 prefix_buffer; + status = f->Read(sizeof(PrefixVersion0), &prefix_slice, + (char*)&prefix_buffer); + if (status.ok() && sizeof(PrefixVersion0) == prefix_slice.size()) { + Sha1Description desc(prefix_buffer.key_description_, + sizeof(prefix_buffer.key_description_)); + + key_lock.ReadLock(); + auto it = encrypt_read_.find(desc); + if (encrypt_read_.end() != it) { + provider = it->second; + stream.reset(new AESBlockAccessCipherStream( + provider->key(), code_version, prefix_buffer.nonce_)); + + } else { + status = Status::NotSupported( + "No encryption key found to match input file"); + } + key_lock.ReadUnlock(); + } + } else { + status = + Status::NotSupported("Unknown encryption code version required."); + } + } + } + return status; +} + +template +Status EncryptedEnvV2::ReadRandEncryptionPrefix( + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream) { + Status status; + + provider.reset(); // nullptr for provider implies "no encryption" + stream.reset(); // destroy any stream the caller passed in; release() would leak it + + // Look for encryption marker + EncryptMarker marker; + Slice marker_slice; + status = f->Read(0, sizeof(marker), &marker_slice, marker); + if (status.ok()) { + if (sizeof(marker) == marker_slice.size() && + marker_slice.starts_with(kEncryptMarker)) { + uint8_t code_version = (uint8_t)marker_slice[7]; + + if (kEncryptCodeVersion0 == code_version) { + Slice prefix_slice; + PrefixVersion0 prefix_buffer; + status = f->Read(sizeof(marker), sizeof(PrefixVersion0), &prefix_slice, + (char*)&prefix_buffer); + if (status.ok() && sizeof(PrefixVersion0) == prefix_slice.size()) { + Sha1Description
desc(prefix_buffer.key_description_, + sizeof(prefix_buffer.key_description_)); + + key_lock.ReadLock(); + auto it = encrypt_read_.find(desc); + if (encrypt_read_.end() != it) { + provider = it->second; + stream.reset(new AESBlockAccessCipherStream( + provider->key(), code_version, prefix_buffer.nonce_)); + } else { + status = Status::NotSupported( + "No encryption key found to match input file"); + } + key_lock.ReadUnlock(); + } + } else { + status = + Status::NotSupported("Unknown encryption code version required."); + } + } } + return status; +} + +template +Status EncryptedEnvV2::WriteSeqEncryptionPrefix( + TypeFile* f, std::shared_ptr provider, + std::unique_ptr& stream) { + Status status; + + // set up Encryption marker, code version '0' + uint8_t code_version = {kEncryptCodeVersion0}; + PrefixVersion0 prefix; + EncryptMarker marker; + strncpy(marker, kEncryptMarker, sizeof(kEncryptMarker)); + marker[sizeof(EncryptMarker) - 1] = code_version; + + Slice marker_slice(marker, sizeof(EncryptMarker)); + status = f->Append(marker_slice); + + if (status.ok()) { + // create nonce, then write it and key description + Slice prefix_slice((char*)&prefix, sizeof(prefix)); + + status = provider->CreateNewPrefix(std::string(), (char*)&prefix, + sizeof(prefix)); // true capacity of the buffer; GetPrefixLength() also counts the marker + + if (status.ok()) { + status = f->Append(prefix_slice); + } + } + + if (status.ok()) { + stream.reset(new AESBlockAccessCipherStream(provider->key(), code_version, + prefix.nonce_)); + } + + return status; +} + +template +Status EncryptedEnvV2::WriteRandEncryptionPrefix( + TypeFile* f, std::shared_ptr provider, + std::unique_ptr& stream) { + Status status; + + // set up Encryption marker, code version '0' + uint8_t code_version = {kEncryptCodeVersion0}; + PrefixVersion0 prefix; + EncryptMarker marker; + strncpy(marker, kEncryptMarker, sizeof(kEncryptMarker)); + marker[sizeof(EncryptMarker) - 1] = code_version; + + Slice marker_slice(marker, sizeof(EncryptMarker)); + status = f->Write(0, marker_slice); +
+ if (status.ok()) { + // create nonce, then write it and key description + Slice prefix_slice((char*)&prefix, sizeof(prefix)); + + status = provider->CreateNewPrefix(std::string(), (char*)&prefix, + sizeof(prefix)); // true capacity of the buffer; GetPrefixLength() also counts the marker + + if (status.ok()) { + status = f->Write(sizeof(EncryptMarker), prefix_slice); + } + } + + if (status.ok()) { + stream.reset(new AESBlockAccessCipherStream(provider->key(), code_version, + prefix.nonce_)); + } + + return status; } // NewSequentialFile opens a file for sequential reading. -Status EncryptedEnv2::NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +Status EncryptedEnvV2::NewSequentialFile( + const std::string& fname, std::unique_ptr* result, + const EnvOptions& options) { result->reset(); - if (options.use_mmap_reads) { + if (options.use_mmap_reads || options.use_direct_reads) { return Status::InvalidArgument(); } @@ -244,16 +651,16 @@ Status EncryptedEnv2::NewSequentialFile(const std::string& fname, std::unique_ptr underlying; auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); if (status.ok()) { - std::shared_ptr provider; + std::shared_ptr provider; std::unique_ptr stream; status = ReadSeqEncryptionPrefix(underlying.get(), provider, stream); if (status.ok()) { if (provider) { - (*result) = std::unique_ptr( - new EncryptedSequentialFile(std::move(underlying), std::move(stream), - provider->GetPrefixLength())); + (*result) = std::unique_ptr(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), + provider->GetPrefixLength())); } else { // normal file, not encrypted @@ -269,11 +676,11 @@ Status EncryptedEnv2::NewSequentialFile(const std::string& fname, } // NewRandomAccessFile opens a file for random read access.
-Status EncryptedEnv2::NewRandomAccessFile( +Status EncryptedEnvV2::NewRandomAccessFile( const std::string& fname, std::unique_ptr* result, const EnvOptions& options) { result->reset(); - if (options.use_mmap_reads) { + if (options.use_mmap_reads || options.use_direct_reads) { return Status::InvalidArgument(); } @@ -281,7 +688,7 @@ Status EncryptedEnv2::NewRandomAccessFile( std::unique_ptr underlying; auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); if (status.ok()) { - std::shared_ptr provider; + std::shared_ptr provider; std::unique_ptr stream; status = ReadRandEncryptionPrefix(underlying.get(), provider, stream); @@ -303,27 +710,33 @@ Status EncryptedEnv2::NewRandomAccessFile( } // NewWritableFile opens a file for sequential writing. -Status EncryptedEnv2::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +Status EncryptedEnvV2::NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { Status status; result->reset(); - if (!options.use_mmap_writes) { + if (!options.use_mmap_writes && !options.use_direct_writes) { // Open file using underlying Env implementation std::unique_ptr underlying; status = EnvWrapper::NewWritableFile(fname, &underlying, options); if (status.ok()) { - if (IsWriteEncrypted()) { + std::shared_ptr provider; + + key_lock.ReadLock(); + provider = encrypt_write_.second; + key_lock.ReadUnlock(); + + if (provider) { std::unique_ptr stream; - status = WriteSeqEncryptionPrefix(underlying.get(), stream); + status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); if (status.ok()) { - (*result) = std::unique_ptr(new EncryptedWritableFile( + (*result) = std::unique_ptr(new EncryptedWritableFileV2( std::move(underlying), std::move(stream), - encrypt_write_.second->GetPrefixLength())); + provider->GetPrefixLength())); } } else { (*result).reset(underlying.release()); @@ -343,27 +756,33 @@ Status 
EncryptedEnv2::NewWritableFile(const std::string& fname, // returns non-OK. // // The returned file will only be accessed by one thread at a time. -Status EncryptedEnv2::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +Status EncryptedEnvV2::ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { Status status; result->reset(); - if (!options.use_mmap_writes) { + if (!options.use_mmap_writes && !options.use_direct_writes) { // Open file using underlying Env implementation std::unique_ptr underlying; status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); if (status.ok()) { - if (IsWriteEncrypted()) { + std::shared_ptr provider; + + key_lock.ReadLock(); + provider = encrypt_write_.second; + key_lock.ReadUnlock(); + + if (provider) { std::unique_ptr stream; - status = WriteSeqEncryptionPrefix(underlying.get(), stream); + status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); if (status.ok()) { (*result) = std::unique_ptr(new EncryptedWritableFile( std::move(underlying), std::move(stream), - encrypt_write_.second->GetPrefixLength())); + provider->GetPrefixLength())); } } else { (*result).reset(underlying.release()); @@ -377,29 +796,35 @@ Status EncryptedEnv2::ReopenWritableFile(const std::string& fname, } // Reuse an existing file by renaming it and opening it as writable. 
-Status EncryptedEnv2::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { +Status EncryptedEnvV2::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) { Status status; result->reset(); - if (!options.use_mmap_writes) { + if (!options.use_mmap_writes && !options.use_direct_writes) { // Open file using underlying Env implementation std::unique_ptr underlying; status = EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); if (status.ok()) { - if (IsWriteEncrypted()) { + std::shared_ptr provider; + + key_lock.ReadLock(); + provider = encrypt_write_.second; + key_lock.ReadUnlock(); + + if (provider) { std::unique_ptr stream; - status = WriteSeqEncryptionPrefix(underlying.get(), stream); + status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); if (status.ok()) { (*result) = std::unique_ptr(new EncryptedWritableFile( std::move(underlying), std::move(stream), - encrypt_write_.second->GetPrefixLength())); + provider->GetPrefixLength())); } } else { (*result).reset(underlying.release()); @@ -417,43 +842,47 @@ Status EncryptedEnv2::ReuseWritableFile(const std::string& fname, // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. 
-Status EncryptedEnv2::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { +Status EncryptedEnvV2::NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) { Status status; result->reset(); // Check file exists bool isNewFile = !FileExists(fname).ok(); - if (!options.use_mmap_writes && !options.use_mmap_reads) { + if (!options.use_mmap_writes && !options.use_mmap_reads && + !options.use_direct_writes && !options.use_direct_reads) { // Open file using underlying Env implementation std::unique_ptr underlying; status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); if (status.ok()) { - std::shared_ptr provider; + std::shared_ptr provider; std::unique_ptr stream; if (!isNewFile) { // file exists, get existing crypto info status = ReadRandEncryptionPrefix(underlying.get(), provider, stream); - } else { // new file - if (IsWriteEncrypted()) { - status = WriteRandEncryptionPrefix(underlying.get(), stream); - provider = encrypt_write_.second; + key_lock.ReadLock(); + provider = encrypt_write_.second; + key_lock.ReadUnlock(); + + if (provider) { + status = + WriteRandEncryptionPrefix(underlying.get(), provider, stream); } } // establish encrypt or not, finalize file object if (status.ok()) { if (provider) { - (*result) = std::unique_ptr( - new EncryptedRandomRWFile(std::move(underlying), std::move(stream), - provider->GetPrefixLength())); + (*result) = std::unique_ptr(new EncryptedRandomRWFileV2( + std::move(underlying), std::move(stream), + provider->GetPrefixLength())); } else { (*result).reset(underlying.release()); } @@ -476,12 +905,12 @@ Status EncryptedEnv2::NewRandomRWFile(const std::string& fname, // NotFound if "dir" does not exist, the calling process does not have // permission to access "dir", or if "dir" is invalid. 
// IOError if an IO Error was encountered -Status EncryptedEnv2::GetChildrenFileAttributes( +Status EncryptedEnvV2::GetChildrenFileAttributes( const std::string& dir, std::vector* result) { auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); if (status.ok()) { // this is slightly expensive, but fortunately not used heavily - std::shared_ptr provider; + std::shared_ptr provider; for (auto it = std::begin(*result); it != std::end(*result); ++it) { status = GetEncryptionProvider(it->name, provider); @@ -498,14 +927,14 @@ Status EncryptedEnv2::GetChildrenFileAttributes( } // Store the size of fname in *file_size. -Status EncryptedEnv2::GetFileSize(const std::string& fname, - uint64_t* file_size) { +Status EncryptedEnvV2::GetFileSize(const std::string& fname, + uint64_t* file_size) { Status status; status = EnvWrapper::GetFileSize(fname, file_size); if (status.ok()) { // this is slightly expensive, but fortunately not used heavily - std::shared_ptr provider; + std::shared_ptr provider; status = GetEncryptionProvider(fname, provider); if (status.ok() && provider) { size_t prefixLength = provider->GetPrefixLength(); @@ -516,8 +945,9 @@ Status EncryptedEnv2::GetFileSize(const std::string& fname, return status; } -Status EncryptedEnv2::GetEncryptionProvider( - const std::string& fname, std::shared_ptr& provider) { +Status EncryptedEnvV2::GetEncryptionProvider( + const std::string& fname, + std::shared_ptr& provider) { std::unique_ptr underlying; EnvOptions options; Status status; @@ -527,32 +957,31 @@ Status EncryptedEnv2::GetEncryptionProvider( if (status.ok()) { std::unique_ptr stream; - status = EncryptedEnv2::ReadSeqEncryptionPrefix(underlying.get(), provider, - stream); + status = EncryptedEnvV2::ReadSeqEncryptionPrefix(underlying.get(), provider, + stream); } return status; } -UnixLibCrypto EncryptedEnv2::crypto_; +UnixLibCrypto EncryptedEnvV2::crypto_; -Env* EncryptedEnv2::Default() { +Env* EncryptedEnvV2::Default() { // the rational for this routine is 
to help force the static // loading of UnixLibCrypto before other routines start // using the encryption code. - static EncryptedEnv2 default_env(Env::Default()); + static EncryptedEnvV2 default_env(Env::Default()); return &default_env; } -Env* EncryptedEnv2::Default(EncryptedEnv2::ReadKeys_t encrypt_read, - EncryptedEnv2::WriteKey_t encrypt_write) { - EncryptedEnv2* default_env = (EncryptedEnv2*)Default(); +Env* EncryptedEnvV2::Default(EncryptedEnvV2::ReadKeys encrypt_read, + EncryptedEnvV2::WriteKey encrypt_write) { + EncryptedEnvV2* default_env = (EncryptedEnvV2*)Default(); default_env->SetKeys(encrypt_read, encrypt_write); return default_env; } -#endif // ROCKSDB_LITE - -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE +#endif // ROCKSDB_LITE #endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/env/env_encrypt2_impl.h b/env/env_encrypt2_impl.h new file mode 100644 index 000000000..943143e64 --- /dev/null +++ b/env/env_encrypt2_impl.h @@ -0,0 +1,81 @@ +// copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once + +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include "openssl/aes.h" +#include "openssl/evp.h" +#include "rocksdb/env_encrypt2.h" + +namespace ROCKSDB_NAMESPACE { + +// following define block from page 70: +// https://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf +#if !defined(ALIGN16) +#if defined(__GNUC__) +#define ALIGN16 __attribute__((aligned(16))) +#else +#define ALIGN16 __declspec(align(16)) +#endif +#endif + +constexpr uint8_t kEncryptCodeVersion0{'0'}; + +typedef char EncryptMarker[8]; +static EncryptMarker kEncryptMarker = "Encrypt"; + +// long term: code_version could be used in a switch statement or factory +// prefix version 0 is an EVP_MAX_MD_SIZE byte key description (sha1 hash) field, +// then a 128 bit (16 byte) nonce (assumed to be packed/byte aligned) +typedef struct { + uint8_t key_description_[EVP_MAX_MD_SIZE]; // max md is 64 + uint8_t nonce_[AES_BLOCK_SIZE]; // block size is 16 +} PrefixVersion0; + +class AESBlockAccessCipherStream : public BlockAccessCipherStream { + public: + AESBlockAccessCipherStream(const AesCtrKey& key, uint8_t code_version, + const uint8_t nonce[]) + : key_(key), code_version_(code_version) { + memcpy(&nonce_, nonce, AES_BLOCK_SIZE); + } + + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return AES_BLOCK_SIZE; }; + + // Encrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in data_size. + Status Encrypt(uint64_t file_offset, char* data, size_t data_size) override; + + // Decrypt one or more (partial) blocks of data at the file offset. + // Length of data is given in data_size.
+ Status Decrypt(uint64_t file_offset, char* data, size_t data_size) override; + + // helper routine to combine 128 bit nonce_ with offset + static void BigEndianAdd128(uint8_t* buf, uint64_t value); + + protected: + void AllocateScratch(std::string&) override{}; + + Status EncryptBlock(uint64_t, char*, char*) override { + return Status::NotSupported("Wrong EncryptionProvider assumed"); + }; + + Status DecryptBlock(uint64_t, char*, char*) override { + return Status::NotSupported("Wrong EncryptionProvider assumed"); + }; + + AesCtrKey key_; + uint8_t code_version_; + uint8_t nonce_[AES_BLOCK_SIZE]; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/env/env_encrypt2_test.cc b/env/env_encrypt2_test.cc index ca27b4754..5f6f05609 100644 --- a/env/env_encrypt2_test.cc +++ b/env/env_encrypt2_test.cc @@ -2,20 +2,20 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "rocksdb/env_encrypt2.h" - +#include "env/env_encrypt2_impl.h" #include "rocksdb/options.h" #include "rocksdb/sst_file_writer.h" #include "util/testharness.h" +#ifndef ROCKSDB_LITE #ifdef ROCKSDB_OPENSSL_AES_CTR -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { class EnvEncrypt2_Sha1 {}; TEST(EnvEncrypt2_Sha1, Default) { - Sha1Description_t desc; + Sha1Description desc; ASSERT_FALSE(desc.IsValid()); for (size_t idx = 0; idx < sizeof(desc.desc); ++idx) { @@ -24,7 +24,7 @@ TEST(EnvEncrypt2_Sha1, Default) { } TEST(EnvEncrypt2_Sha1, Constructors) { - Sha1Description_t desc; + Sha1Description desc; // verify we know size of desc.desc ASSERT_TRUE(64 == sizeof(desc.desc)); @@ -34,38 +34,38 @@ TEST(EnvEncrypt2_Sha1, Constructors) { bytes[idx] = idx + 1; } - Sha1Description_t desc_bad1(bytes, 128); + Sha1Description desc_bad1(bytes, 128); ASSERT_FALSE(desc_bad1.IsValid()); - Sha1Description_t desc_bad2(bytes, 65); + Sha1Description desc_bad2(bytes, 65); ASSERT_FALSE(desc_bad2.IsValid()); - Sha1Description_t desc_good1(bytes, 64); + Sha1Description desc_good1(bytes, 64); ASSERT_TRUE(desc_good1.IsValid()); ptr = (uint8_t*)memchr(desc_good1.desc, 0, 64); ASSERT_TRUE(nullptr == ptr); - Sha1Description_t desc_good2(bytes, 63); + Sha1Description desc_good2(bytes, 63); ASSERT_TRUE(desc_good2.IsValid()); ptr = (uint8_t*)memchr(desc_good2.desc, 0, 64); ASSERT_TRUE(&desc_good2.desc[63] == ptr); - Sha1Description_t desc_good3(bytes, 1); + Sha1Description desc_good3(bytes, 1); ASSERT_TRUE(desc_good3.IsValid()); ptr = (uint8_t*)memchr(desc_good3.desc, 0, 64); ASSERT_TRUE(&desc_good3.desc[1] == ptr); - Sha1Description_t desc_good4(bytes, 0); + Sha1Description desc_good4(bytes, 0); ASSERT_TRUE(desc_good4.IsValid()); ptr = (uint8_t*)memchr(desc_good4.desc, 0, 64); ASSERT_TRUE(&desc_good4.desc[0] == ptr); - Sha1Description_t desc_str1(""); + Sha1Description desc_str1(""); ASSERT_FALSE(desc_str1.IsValid()); uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 0x79, 0x13, 0xb0, 0x4c, 0x54, 
0x57, 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, 0x39, 0x54, 0x28, 0xab}; - Sha1Description_t desc_str2("1"); + Sha1Description desc_str2("1"); ASSERT_TRUE(desc_str2.IsValid()); ASSERT_TRUE(0 == memcmp(md2, desc_str2.desc, sizeof(md2))); for (size_t idx = sizeof(md2); idx < sizeof(desc_str2.desc); ++idx) { @@ -74,7 +74,7 @@ TEST(EnvEncrypt2_Sha1, Constructors) { uint8_t md3[] = {0x7b, 0x52, 0x00, 0x9b, 0x64, 0xfd, 0x0a, 0x2a, 0x49, 0xe6, 0xd8, 0xa9, 0x39, 0x75, 0x30, 0x77, 0x79, 0x2b, 0x05, 0x54}; - Sha1Description_t desc_str3("12"); + Sha1Description desc_str3("12"); ASSERT_TRUE(desc_str3.IsValid()); ASSERT_TRUE(0 == memcmp(md3, desc_str3.desc, sizeof(md3))); for (size_t idx = sizeof(md3); idx < sizeof(desc_str3.desc); ++idx) { @@ -86,7 +86,7 @@ TEST(EnvEncrypt2_Sha1, Copy) { // assignment uint8_t md1[] = {0xdb, 0x8a, 0xc1, 0xc2, 0x59, 0xeb, 0x89, 0xd4, 0xa1, 0x31, 0xb2, 0x53, 0xba, 0xcf, 0xca, 0x5f, 0x31, 0x9d, 0x54, 0xf2}; - Sha1Description_t desc1("HelloWorld"), desc2; + Sha1Description desc1("HelloWorld"), desc2; ASSERT_TRUE(desc1.IsValid()); ASSERT_FALSE(desc2.IsValid()); @@ -105,10 +105,10 @@ TEST(EnvEncrypt2_Sha1, Copy) { // copy constructor uint8_t md3[] = {0x17, 0x09, 0xcc, 0x51, 0x65, 0xf5, 0x50, 0x4d, 0x46, 0xde, 0x2f, 0x3a, 0x7a, 0xff, 0x57, 0x45, 0x20, 0x8a, 0xed, 0x44}; - Sha1Description_t desc3("A little be longer title for a key"); + Sha1Description desc3("A little be longer title for a key"); ASSERT_TRUE(desc3.IsValid()); - Sha1Description_t desc4(desc3); + Sha1Description desc4(desc3); ASSERT_TRUE(desc3.IsValid()); ASSERT_TRUE(desc4.IsValid()); ASSERT_TRUE(0 == memcmp(md3, desc3.desc, sizeof(md3))); @@ -124,7 +124,7 @@ TEST(EnvEncrypt2_Sha1, Copy) { class EnvEncrypt2_Key {}; TEST(EnvEncrypt2_Key, Default) { - AesCtrKey_t key; + AesCtrKey key; ASSERT_FALSE(key.IsValid()); for (size_t idx = 0; idx < sizeof(key.key); ++idx) { @@ -133,7 +133,7 @@ TEST(EnvEncrypt2_Key, Default) { } TEST(EnvEncrypt2_Key, Constructors) { - AesCtrKey_t key; + AesCtrKey key; // 
verify we know size of key.key ASSERT_TRUE(64 == sizeof(key.key)); @@ -143,40 +143,40 @@ TEST(EnvEncrypt2_Key, Constructors) { bytes[idx] = idx + 1; } - AesCtrKey_t key_bad1(bytes, 128); + AesCtrKey key_bad1(bytes, 128); ASSERT_FALSE(key_bad1.IsValid()); - AesCtrKey_t key_bad2(bytes, 65); + AesCtrKey key_bad2(bytes, 65); ASSERT_FALSE(key_bad2.IsValid()); - AesCtrKey_t key_good1(bytes, 64); + AesCtrKey key_good1(bytes, 64); ASSERT_TRUE(key_good1.IsValid()); ptr = (uint8_t*)memchr(key_good1.key, 0, 64); ASSERT_TRUE(nullptr == ptr); - AesCtrKey_t key_good2(bytes, 63); + AesCtrKey key_good2(bytes, 63); ASSERT_TRUE(key_good2.IsValid()); ptr = (uint8_t*)memchr(key_good2.key, 0, 64); ASSERT_TRUE(&key_good2.key[63] == ptr); - AesCtrKey_t key_good3(bytes, 1); + AesCtrKey key_good3(bytes, 1); ASSERT_TRUE(key_good3.IsValid()); ptr = (uint8_t*)memchr(key_good3.key, 0, 64); ASSERT_TRUE(&key_good3.key[1] == ptr); - AesCtrKey_t key_good4(bytes, 0); + AesCtrKey key_good4(bytes, 0); ASSERT_TRUE(key_good4.IsValid()); ptr = (uint8_t*)memchr(key_good4.key, 0, 64); ASSERT_TRUE(&key_good4.key[0] == ptr); - AesCtrKey_t key_str1(""); + AesCtrKey key_str1(""); ASSERT_FALSE(key_str1.IsValid()); - AesCtrKey_t key_str2("0x35"); + AesCtrKey key_str2("0x35"); ASSERT_FALSE(key_str2.IsValid()); // 1234567890123456789012345678901234567890123456789012345678901234 - AesCtrKey_t key_str3( + AesCtrKey key_str3( "RandomSixtyFourCharactersLaLaLaLaJust a bunch of letters, not 0x"); ASSERT_FALSE(key_str2.IsValid()); @@ -185,7 +185,7 @@ TEST(EnvEncrypt2_Key, Constructors) { 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20}; // 1234567890123456789012345678901234567890123456789012345678901234 - AesCtrKey_t key_str4( + AesCtrKey key_str4( "0102030405060708090A0B0C0D0E0F101112131415161718191a1b1c1d1e1f20"); ASSERT_TRUE(key_str4.IsValid()); ASSERT_TRUE(0 == memcmp(key4, key_str4.key, sizeof(key4))); @@ -197,7 +197,7 @@ TEST(EnvEncrypt2_Key, Copy) { 0x2b, 0x73, 0xae, 
0xf0, 0x85, 0x7d, 0x77, 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; - AesCtrKey_t key1(data1, sizeof(data1)), key2; + AesCtrKey key1(data1, sizeof(data1)), key2; ASSERT_TRUE(key1.IsValid()); ASSERT_FALSE(key2.IsValid()); @@ -212,10 +212,10 @@ TEST(EnvEncrypt2_Key, Copy) { 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0x22, 0x20, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; - AesCtrKey_t key3(data3, sizeof(data3)); + AesCtrKey key3(data3, sizeof(data3)); ASSERT_TRUE(key3.IsValid()); - AesCtrKey_t key4(key3); + AesCtrKey key4(key3); ASSERT_TRUE(key3.IsValid()); ASSERT_TRUE(key4.IsValid()); ASSERT_TRUE(0 == memcmp(data3, key3.key, sizeof(data3))); @@ -226,11 +226,11 @@ class EnvEncrypt2_Provider {}; class CipherStreamWrapper : public BlockAccessCipherStream { public: - Status TESTEncryptBlock(uint64_t blockIndex, char* data, char* scratch) { - return EncryptBlock(blockIndex, data, scratch); + Status TESTEncrypt(uint64_t blockIndex, char* data, size_t size) { + return Encrypt(blockIndex, data, size); } - Status TESTDecryptBlock(uint64_t blockIndex, char* data, char* scratch) { - return DecryptBlock(blockIndex, data, scratch); + Status TESTDecrypt(uint64_t blockIndex, char* data, size_t size) { + return Decrypt(blockIndex, data, size); } }; @@ -262,83 +262,136 @@ TEST(EnvEncrypt2_Provider, NistExamples) { uint8_t cypher4[] = {0xdf, 0xc9, 0xc5, 0x8d, 0xb6, 0x7a, 0xad, 0xa6, 0x13, 0xc2, 0xdd, 0x08, 0x45, 0x79, 0x41, 0xa6}; - CTREncryptionProvider2 provider("NistExampleKey", key, sizeof(key)); - // only first 8 bytes of init taken in next call + CTREncryptionProviderV2 provider("NistExampleKey", key, sizeof(key)); + std::unique_ptr stream( provider.CreateCipherStream2(1, init)); uint64_t offset; uint8_t block[sizeof(plain1)]; - uint8_t* patch = (uint8_t*)&offset; // little endian assumed // // forward ... 
encryption // - memcpy((void*)&offset, (void*)&init[8], 8); + // memcpy((void*)&offset, (void*)&init[8], 8); + offset = 0; memcpy((void*)block, (void*)plain1, 16); CipherStreamWrapper* wrap = (CipherStreamWrapper*)stream.get(); - Status status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + Status status = wrap->TESTEncrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(cypher1, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 16; memcpy((void*)block, (void*)plain2, 16); - *(patch + 7) = 0x00; - *(patch + 6) = 0xff; - status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTEncrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(cypher2, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 32; memcpy((void*)block, (void*)plain3, 16); - *(patch + 7) = 0x01; - *(patch + 6) = 0xff; - status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTEncrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(cypher3, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 48; memcpy((void*)block, (void*)plain4, 16); - *(patch + 7) = 0x02; - *(patch + 6) = 0xff; - status = wrap->TESTEncryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTEncrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(cypher4, block, sizeof(block))); // // backward -- decryption // - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 0; memcpy((void*)block, (void*)cypher1, 16); - status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTDecrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(plain1, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 16; memcpy((void*)block, (void*)cypher2, 16); - *(patch + 7) = 0x00; - *(patch + 6) = 0xff; - status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + status = 
wrap->TESTDecrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(plain2, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 32; memcpy((void*)block, (void*)cypher3, 16); - *(patch + 7) = 0x01; - *(patch + 6) = 0xff; - status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTDecrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(plain3, block, sizeof(block))); - memcpy((void*)&offset, (void*)&init[8], 8); + offset = 48; memcpy((void*)block, (void*)cypher4, 16); - *(patch + 7) = 0x02; - *(patch + 6) = 0xff; - status = wrap->TESTDecryptBlock(offset, (char*)block, nullptr); + status = wrap->TESTDecrypt(offset, (char*)block, sizeof(block)); ASSERT_TRUE(0 == memcmp(plain4, block, sizeof(block))); } +TEST(EnvEncrypt2_Provider, NistSingleCall) { + uint8_t key[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, + 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, + 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, + 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; + uint8_t init[] = {0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff}; + + uint8_t plain1[] = { + 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, + 0x11, 0x73, 0x93, 0x17, 0x2a, 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, + 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51, 0x30, + 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, + 0x1a, 0x0a, 0x52, 0xef, 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, + 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10}; + + uint8_t cypher1[] = { + 0x60, 0x1e, 0xc3, 0x13, 0x77, 0x57, 0x89, 0xa5, 0xb7, 0xa7, 0xf5, + 0x04, 0xbb, 0xf3, 0xd2, 0x28, 0xf4, 0x43, 0xe3, 0xca, 0x4d, 0x62, + 0xb5, 0x9a, 0xca, 0x84, 0xe9, 0x90, 0xca, 0xca, 0xf5, 0xc5, 0x2b, + 0x09, 0x30, 0xda, 0xa2, 0x3d, 0xe9, 0x4c, 0xe8, 0x70, 0x17, 0xba, + 0x2d, 0x84, 0x98, 0x8d, 0xdf, 0xc9, 0xc5, 0x8d, 0xb6, 0x7a, 0xad, + 0xa6, 0x13, 0xc2, 0xdd, 0x08, 0x45, 
0x79, 0x41, 0xa6}; + + AesCtrKey aes_key(key, sizeof(key)); + uint8_t output[sizeof(plain1)]; + AESBlockAccessCipherStream stream(aes_key, 0, init); + uint64_t offset; + + // + // forward ... encryption + // + memcpy((void*)output, (void*)plain1, sizeof(plain1)); + // memcpy((void*)&offset, (void*)&init[8], 8); + offset = 0; + + Status status = stream.Encrypt(offset, (char*)output, sizeof(plain1)); + ASSERT_TRUE(status.ok()); + ASSERT_TRUE(0 == memcmp(cypher1, output, sizeof(output))); +} + +TEST(EnvEncrypt2_Provider, BigEndianAdd) { + uint8_t nounce1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + uint8_t expect1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}; + AESBlockAccessCipherStream::BigEndianAdd128(nounce1, 1); + ASSERT_TRUE(0 == memcmp(nounce1, expect1, sizeof(nounce1))); + + uint8_t nounce2[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + uint8_t expect2[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + AESBlockAccessCipherStream::BigEndianAdd128(nounce2, 1); + ASSERT_TRUE(0 == memcmp(nounce2, expect2, sizeof(nounce2))); + + uint8_t nounce3[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + uint8_t expect3[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00}; + AESBlockAccessCipherStream::BigEndianAdd128(nounce3, 0xff01); + ASSERT_TRUE(0 == memcmp(nounce3, expect3, sizeof(nounce3))); + +} + // // The following is copied from env_basic_test.cc // @@ -410,7 +463,7 @@ class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; // next statements run env test against encrypt_2 code. 
static std::string KeyName = {"A key name"}; -static Sha1Description_t KeyDesc(KeyName); +static Sha1Description KeyDesc(KeyName); // this key is from // https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf, @@ -419,18 +472,18 @@ static uint8_t key256[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; -std::shared_ptr encrypt2_provider_ctr( - new CTREncryptionProvider2(KeyName, key256, 32)); +std::shared_ptr encrypt2_provider_ctr( + new CTREncryptionProviderV2(KeyName, key256, 32)); -static EncryptedEnv2::ReadKeys_t encrypt_readers = { +static EncryptedEnvV2::ReadKeys encrypt_readers = { {KeyDesc, encrypt2_provider_ctr}}; -static EncryptedEnv2::WriteKey_t encrypt_writer = {KeyDesc, - encrypt2_provider_ctr}; +static EncryptedEnvV2::WriteKey encrypt_writer = {KeyDesc, + encrypt2_provider_ctr}; static std::unique_ptr encrypt2_env(new NormalizingEnvWrapper( - EncryptedEnv2::Default(encrypt_readers, encrypt_writer))); + EncryptedEnvV2::Default(encrypt_readers, encrypt_writer))); -INSTANTIATE_TEST_CASE_P(EncryptedEnv2, EnvBasicTestWithParam, +INSTANTIATE_TEST_CASE_P(EncryptedEnvV2, EnvBasicTestWithParam, ::testing::Values(encrypt2_env.get())); TEST_P(EnvBasicTestWithParam, Basics) { @@ -438,6 +491,10 @@ TEST_P(EnvBasicTestWithParam, Basics) { std::unique_ptr writable_file; std::vector children; + // kill warning + std::string warn(kEncryptMarker); + warn.length(); + // Check that the directory is empty. 
ASSERT_EQ(Status::NotFound(), env_->FileExists(test_dir_ + "/non_existent")); ASSERT_TRUE(!env_->GetFileSize(test_dir_ + "/non_existent", &file_size).ok()); @@ -677,26 +734,10 @@ class SstWriterBug : public testing::Test { } }; -#if 0 -TEST(SstWriterBug, BugCheck) { - - Options sstOptions; - - sstOptions.env = encrypt2_env.get(); - - // auto* cf = reinterpret_cast(theCfHandle); - rocksdb::ColumnFamilyHandle * cf = nullptr; - // sstOptions.compression = (CompressionType)theCompression; - auto* sst_file_writer = new rocksdb::SstFileWriter(EnvOptions(), sstOptions, sstOptions.comparator, cf); - std::string path = test::PerThreadDBPath("BugCheck1"); - Status ss = sst_file_writer->Open(path); - ASSERT_OK(ss); -} -#endif - -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_OPENSSL_AES_CTR +#endif // ROCKSDB_LITE int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/env/env_encryption.cc b/env/env_encryption.cc index df7cf504e..76b7a8947 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -19,19 +19,18 @@ #endif -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE - - // Read up to "n" bytes from the file. "scratch[0..n-1]" may be - // written by this routine. Sets "*result" to the data that was - // read (including if fewer than "n" bytes were successfully read). - // May set "*result" to point at data in "scratch[0..n-1]", so - // "scratch[0..n-1]" must be live when "*result" is used. - // If an error was encountered, returns a non-OK status. - // - // REQUIRES: External synchronization +// Read up to "n" bytes from the file. "scratch[0..n-1]" may be +// written by this routine. Sets "*result" to the data that was +// read (including if fewer than "n" bytes were successfully read). +// May set "*result" to point at data in "scratch[0..n-1]", so +// "scratch[0..n-1]" must be live when "*result" is used. +// If an error was encountered, returns a non-OK status. 
+// +// REQUIRES: External synchronization Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { assert(scratch); Status status = file_->Read(n, result, scratch); @@ -75,15 +74,15 @@ size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. Status EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } - // Positioned Read for direct I/O - // If Direct I/O enabled, offset, n, and scratch should be properly aligned +// Positioned Read for direct I/O +// If Direct I/O enabled, offset, n, and scratch should be properly aligned Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, char* scratch) { assert(scratch); @@ -100,16 +99,16 @@ Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, return status; } - // Read up to "n" bytes from the file starting at "offset". - // "scratch[0..n-1]" may be written by this routine. Sets "*result" - // to the data that was read (including if fewer than "n" bytes were - // successfully read). May set "*result" to point at data in - // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when - // "*result" is used. If an error was encountered, returns a non-OK - // status. - // - // Safe for concurrent use by multiple threads. - // If Direct I/O enabled, offset, n, and scratch should be aligned properly. +// Read up to "n" bytes from the file starting at "offset". 
+// "scratch[0..n-1]" may be written by this routine. Sets "*result" +// to the data that was read (including if fewer than "n" bytes were +// successfully read). May set "*result" to point at data in +// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when +// "*result" is used. If an error was encountered, returns a non-OK +// status. +// +// Safe for concurrent use by multiple threads. +// If Direct I/O enabled, offset, n, and scratch should be aligned properly. Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { assert(scratch); @@ -125,27 +124,27 @@ Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, return status; } - // Readahead the file starting from offset by n bytes for caching. +// Readahead the file starting from offset by n bytes for caching. Status EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n) { // return Status::OK(); return file_->Prefetch(offset + prefixLength_, n); } - // Tries to get an unique ID for this file that will be the same each time - // the file is opened (and will stay the same while the file is open). - // Furthermore, it tries to make this ID at most "max_size" bytes. If such an - // ID can be created this function returns the length of the ID and places it - // in "id"; otherwise, this function returns 0, in which case "id" - // may not have been modified. - // - // This function guarantees, for IDs from a given environment, two unique ids - // cannot be made equal to each other by adding arbitrary bytes to one of - // them. That is, no unique ID is the prefix of another. - // - // This function guarantees that the returned ID will not be interpretable as - // a single varint. - // - // Note: these IDs are only valid for the duration of the process. +// Tries to get an unique ID for this file that will be the same each time +// the file is opened (and will stay the same while the file is open). 
+// Furthermore, it tries to make this ID at most "max_size" bytes. If such an +// ID can be created this function returns the length of the ID and places it +// in "id"; otherwise, this function returns 0, in which case "id" +// may not have been modified. +// +// This function guarantees, for IDs from a given environment, two unique ids +// cannot be made equal to each other by adding arbitrary bytes to one of +// them. That is, no unique ID is the prefix of another. +// +// This function guarantees that the returned ID will not be interpretable as +// a single varint. +// +// Note: these IDs are only valid for the duration of the process. size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { return file_->GetUniqueId(id, max_size); }; @@ -154,21 +153,21 @@ void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { file_->Hint(pattern); } - // Indicates the upper layers if the current RandomAccessFile implementation - // uses direct IO. +// Indicates the upper layers if the current RandomAccessFile implementation +// uses direct IO. bool EncryptedRandomAccessFile::use_direct_io() const { return file_->use_direct_io(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. +// If the system is not caching the file contents, then this is a noop. 
Status EncryptedRandomAccessFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); @@ -234,14 +233,14 @@ Status EncryptedWritableFile::PositionedAppend(const Slice& data, return status; } - // Indicates the upper layers if the current WritableFile implementation - // uses direct IO. +// Indicates the upper layers if the current WritableFile implementation +// uses direct IO. bool EncryptedWritableFile::use_direct_io() const { return file_->use_direct_io(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } @@ -253,42 +252,42 @@ uint64_t EncryptedWritableFile::GetFileSize() { return file_->GetFileSize() - prefixLength_; } - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. +// Truncate is necessary to trim the file to the correct size +// before closing. It is not always possible to keep track of the file +// size due to whole pages writes. The behavior is undefined if called +// with other writes to follow. Status EncryptedWritableFile::Truncate(uint64_t size) { return file_->Truncate(size + prefixLength_); } - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. +// Remove any kind of caching of data from the offset to offset+length +// of this file. If the length is 0, then it refers to the end of file. 
+// If the system is not caching the file contents, then this is a noop. +// This call has no effect on dirty pages in the cache. Status EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. +// Sync a file range with disk. +// offset is the starting byte of the file range to be synchronized. +// nbytes specifies the length of the range to be synchronized. +// This asks the OS to initiate flushing the cached data to disk, +// without waiting for completion. +// Default implementation does nothing. Status EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { return file_->RangeSync(offset + prefixLength_, nbytes); } - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. +// PrepareWrite performs any necessary preparation for a write +// before the write actually occurs. This allows for pre-allocation +// of space on devices where it can result in less file +// fragmentation and/or less waste from over-zealous filesystem +// pre-allocation. void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len) { file_->PrepareWrite(offset + prefixLength_, len); } - // Pre-allocates space for a file. +// Pre-allocates space for a file. 
Status EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len) { return file_->Allocate(offset + prefixLength_, len); } @@ -301,14 +300,14 @@ bool EncryptedRandomRWFile::use_direct_io() const { return file_->use_direct_io(); } - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O +// Use the returned alignment value to allocate +// aligned buffer for Direct I/O size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } - // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. - // Pass aligned buffer when use_direct_io() returns true. +// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. +// Pass aligned buffer when use_direct_io() returns true. Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { AlignedBuffer buf; Status status; @@ -333,9 +332,9 @@ Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { return status; } - // Read up to `n` bytes starting from offset `offset` and store them in - // result, provided `scratch` size should be at least `n`. - // Returns Status::OK() on success. +// Read up to `n` bytes starting from offset `offset` and store them in +// result, provided `scratch` size should be at least `n`. +// Returns Status::OK() on success. Status EncryptedRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { assert(scratch); @@ -705,7 +704,8 @@ Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { // Encrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. 
-Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t dataSize) { +Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, + size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -717,7 +717,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // Encrypt individual blocks. while (1) { - char *block = data; + char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not encrypting a full block. @@ -750,7 +750,8 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t // Decrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. -Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t dataSize) { +Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, + size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -762,7 +763,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // Decrypt individual blocks. while (1) { - char *block = data; + char* block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not decrypting a full block. @@ -803,18 +804,16 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t // Encrypt a block of data. // Length of data is equal to BlockSize(). -Status ROT13BlockCipher::Encrypt(char *data) { +Status ROT13BlockCipher::Encrypt(char* data) { for (size_t i = 0; i < blockSize_; ++i) { - data[i] += 13; + data[i] += 13; } return Status::OK(); } // Decrypt a block of data. // Length of data is equal to BlockSize(). 
-Status ROT13BlockCipher::Decrypt(char *data) { - return Encrypt(data); -} +Status ROT13BlockCipher::Decrypt(char* data) { return Encrypt(data); } // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. void CTRCipherStream::AllocateScratch(std::string& scratch) { @@ -824,8 +823,8 @@ void CTRCipherStream::AllocateScratch(std::string& scratch) { // Encrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scratch) { - +Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // Create nonce + counter auto blockSize = cipher_.BlockSize(); memmove(scratch, iv_.data(), blockSize); @@ -846,7 +845,8 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char *data, char* scra // Decrypt a block of data at the given block index. // Length of data is equal to BlockSize(); -Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scratch) { +Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, + char* scratch) { // For CTR decryption & encryption are the same return EncryptBlock(blockIndex, data, scratch); } @@ -855,13 +855,14 @@ Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char *data, char* scra // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of // the page size. -size_t CTREncryptionProvider::GetPrefixLength() { +size_t CTREncryptionProvider::GetPrefixLength() const { return defaultPrefixLength; } // decodeCTRParameters decodes the initial counter & IV from the given // (plain text) prefix. 
-static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t &initialCounter, Slice &iv) { +static void decodeCTRParameters(const char* prefix, size_t blockSize, + uint64_t& initialCounter, Slice& iv) { // First block contains 64-bit initial counter initialCounter = DecodeFixed64(prefix); // Second block contains IV @@ -872,7 +873,7 @@ static void decodeCTRParameters(const char *prefix, size_t blockSize, uint64_t & // for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, - size_t prefixLength) { + size_t prefixLength) const { // Create & seed rnd. Random rnd((uint32_t)Env::Default()->NowMicros()); // Fill entire prefix block with random values. @@ -886,7 +887,8 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV); // Now populate the rest of the prefix, starting from the third block. - PopulateSecretPrefixPart(prefix + (2 * blockSize), prefixLength - (2 * blockSize), blockSize); + PopulateSecretPrefixPart(prefix + (2 * blockSize), + prefixLength - (2 * blockSize), blockSize); // Encrypt the prefix, starting from block 2 (leave block 0, 1 with initial // counter & IV unencrypted) @@ -907,9 +909,8 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, // in plain text. // Returns the amount of space (starting from the start of the prefix) // that has been initialized. -size_t CTREncryptionProvider::PopulateSecretPrefixPart(char* /*prefix*/, - size_t /*prefixLength*/, - size_t /*blockSize*/) { +size_t CTREncryptionProvider::PopulateSecretPrefixPart( + char* /*prefix*/, size_t /*prefixLength*/, size_t /*blockSize*/) const { // Nothing to do here, put in custom data in override when needed. 
return 0; } @@ -945,11 +946,12 @@ Status CTREncryptionProvider::CreateCipherStream( } // Create cipher stream - return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, prefix, result); + return CreateCipherStreamFromPrefix(fname, options, initialCounter, iv, + prefix, result); } -// CreateCipherStreamFromPrefix creates a block access cipher stream for a file given -// given name and options. The given prefix is already decrypted. +// CreateCipherStreamFromPrefix creates a block access cipher stream for a file +// given given name and options. The given prefix is already decrypted. Status CTREncryptionProvider::CreateCipherStreamFromPrefix( const std::string& /*fname*/, const EnvOptions& /*options*/, uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, @@ -959,6 +961,6 @@ Status CTREncryptionProvider::CreateCipherStreamFromPrefix( return Status::OK(); } -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_posix.cc b/env/env_posix.cc index 1db4776a9..018da0d63 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -7,6 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors #include +#ifdef ROCKSDB_OPENSSL_AES_CTR +#include +#endif #include #include #if defined(OS_LINUX) @@ -69,6 +72,17 @@ #endif namespace rocksdb { +#if defined(OS_WIN) +static const char* kSharedLibExt = ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const char* kSharedLibExt = ".dylib"; +#else +static const char* kSharedLibExt = ".so"; +#endif +#endif namespace { @@ -76,6 +90,33 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } +#ifdef ROCKSDB_OPENSSL_AES_CTR +class PosixDynamicLibrary : public DynamicLibrary { + public: + PosixDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~PosixDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, void** func) override { + assert(nullptr != func); + dlerror(); // Clear any old error + *func = dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } else { + char* err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // ROCKSDB_OPENSSL_AES_CTR + inline mode_t GetDBFileMode(bool allow_non_owner_access) { return allow_non_owner_access ? 0644 : 0600; } @@ -731,6 +772,62 @@ class PosixEnv : public Env { return result; } +#ifdef ROCKSDB_OPENSSL_AES_CTR + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. 
If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + Status status; + std::string library_name = name; + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(library_name, hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new PosixDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: ", library_name), dlerror()); + } +#endif // ROCKSDB_OPENSSL_AES_CTR + virtual void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, void* tag = nullptr, void (*unschedFunction)(void* arg) = nullptr) override; diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bc439ac1c..b6c0317f8 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -34,6 +34,7 @@ namespace rocksdb { +class DynamicLibrary; class FileLock; class 
Logger; class RandomAccessFile; @@ -318,6 +319,18 @@ class Env { // REQUIRES: lock has not already been unlocked. virtual Status UnlockFile(FileLock* lock) = 0; + // Opens `lib_name` as a dynamic library. + // If the 'search_path' is specified, breaks the path into its components + // based on the appropriate platform separator (";" or ";") and looks for the + // library in those directories. If 'search path is not specified, uses the + // default library path search mechanism (such as LD_LIBRARY_PATH). On + // success, stores a dynamic library in `*result`. + virtual Status LoadLibrary(const std::string& /*lib_name*/, + const std::string& /*search_path */, + std::shared_ptr* /*result*/) { + return Status::NotSupported("LoadLibrary is not implemented in this Env"); + } + // Priority for scheduling job in thread pool enum Priority { BOTTOM, LOW, HIGH, TOTAL }; @@ -940,6 +953,27 @@ class FileLock { void operator=(const FileLock&); }; +class DynamicLibrary { + public: + virtual ~DynamicLibrary() {} + + // Returns the name of the dynamic library. + virtual const char* Name() const = 0; + + // Loads the symbol for sym_name from the library and updates the input + // function. Returns the loaded symbol. + template + Status LoadFunction(const std::string& sym_name, std::function* function) { + assert(nullptr != function); + void* ptr = nullptr; + Status s = LoadSymbol(sym_name, &ptr); + *function = reinterpret_cast(ptr); + return s; + } + // Loads and returns the symbol for sym_name from the library. + virtual Status LoadSymbol(const std::string& sym_name, void** func) = 0; +}; + extern void LogFlush(const std::shared_ptr& info_log); extern void Log(const InfoLogLevel log_level, diff --git a/include/rocksdb/env_encrypt2.h b/include/rocksdb/env_encrypt2.h index dc2669d94..b469f30e2 100644 --- a/include/rocksdb/env_encrypt2.h +++ b/include/rocksdb/env_encrypt2.h @@ -6,16 +6,20 @@ // // env_encryption.cc copied to this file then modified. 
+#pragma once + #ifdef ROCKSDB_OPENSSL_AES_CTR #ifndef ROCKSDB_LITE +#include +#include +#include + #include #include #include -#include "openssl/aes.h" -#include "openssl/evp.h" -#include "openssl/rand.h" +#include "env.h" #include "rocksdb/env_encryption.h" #include "util/aligned_buffer.h" #include "util/coding.h" @@ -24,83 +28,85 @@ #endif -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE -struct Sha1Description_t { +struct Sha1Description { uint8_t desc[EVP_MAX_MD_SIZE]; bool valid; - Sha1Description_t() : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); } + Sha1Description() : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); } - Sha1Description_t(const Sha1Description_t& rhs) { *this = rhs; } + Sha1Description(const Sha1Description& rhs) { *this = rhs; } - Sha1Description_t& operator=(const Sha1Description_t& rhs) { + Sha1Description& operator=(const Sha1Description& rhs) { memcpy(desc, rhs.desc, sizeof(desc)); valid = rhs.valid; return *this; } - Sha1Description_t(uint8_t* Desc, size_t DescLen) : valid(false) { + Sha1Description(uint8_t* desc_in, size_t desc_len) : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); - if (DescLen <= EVP_MAX_MD_SIZE) { - memcpy(desc, Desc, DescLen); + if (desc_len <= EVP_MAX_MD_SIZE) { + memcpy(desc, desc_in, desc_len); valid = true; } } - Sha1Description_t(const std::string& key_desc_str); + Sha1Description(const std::string& key_desc_str); - // see AesCtrKey_t destructor below. This data is not really + // see AesCtrKey destructor below. This data is not really // essential to clear, but trying to set pattern for future work. 
// goal is to explicitly remove desc from memory once no longer needed - ~Sha1Description_t() { + ~Sha1Description() { memset(desc, 0, EVP_MAX_MD_SIZE); valid = false; } - bool operator<(const Sha1Description_t& rhs) const { + bool operator<(const Sha1Description& rhs) const { return memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) < 0; } - bool operator==(const Sha1Description_t& rhs) const { + bool operator==(const Sha1Description& rhs) const { return 0 == memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) && valid == rhs.valid; } bool IsValid() const { return valid; } }; -struct AesCtrKey_t { +struct AesCtrKey { uint8_t key[EVP_MAX_KEY_LENGTH]; bool valid; - AesCtrKey_t() : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); } + AesCtrKey() : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); } - AesCtrKey_t(const uint8_t* Key, size_t KeyLen) : valid(false) { + AesCtrKey(const uint8_t* key_in, size_t key_len) : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); - if (KeyLen <= EVP_MAX_KEY_LENGTH) { - memcpy(key, Key, KeyLen); + if (key_len <= EVP_MAX_KEY_LENGTH) { + memcpy(key, key_in, key_len); valid = true; } else { valid = false; } } - AesCtrKey_t(const std::string& key_str); + AesCtrKey(const std::string& key_str); // see Writing Solid Code, 2nd edition - // Chapter 9, page 321, Managing Secrets in Memory ... bullet 4 "Scrub the memory" - // Not saying this is essential or effective in initial implementation since current - // usage model loads all keys at start and only deletes them at shutdown. But does - // establish presidence. + // Chapter 9, page 321, Managing Secrets in Memory ... bullet 4 "Scrub the + // memory" + // Not saying this is essential or effective in initial implementation since + // current + // usage model loads all keys at start and only deletes them at shutdown. But + // does establish presidence. 
// goal is to explicitly remove key from memory once no longer needed - ~AesCtrKey_t() { + ~AesCtrKey() { memset(key, 0, EVP_MAX_KEY_LENGTH); valid = false; } - bool operator==(const AesCtrKey_t& rhs) const { + bool operator==(const AesCtrKey& rhs) const { return (0 == memcmp(key, rhs.key, EVP_MAX_KEY_LENGTH)) && (valid == rhs.valid); } @@ -108,130 +114,117 @@ struct AesCtrKey_t { bool IsValid() const { return valid; } }; -typedef char EncryptMarker_t[8]; -static EncryptMarker_t Marker = "Encrypt"; - -// long term: code_version could be used in a switch statement or factory -// parameter version 0 is 12 byte sha1 description hash, 128 bit (16 byte) -// nounce (assumed to be packed/byte aligned) -typedef struct { - uint8_t key_description_[EVP_MAX_MD_SIZE]; - uint8_t nonce_[AES_BLOCK_SIZE / 2]; // block size is 16 -} Prefix0_t; - -class AESBlockAccessCipherStream : public BlockAccessCipherStream { - public: - AESBlockAccessCipherStream(const AesCtrKey_t& key, uint8_t code_version, - uint8_t nonce[]) - : key_(key), code_version_(code_version) { - memcpy(&nonce_, nonce, AES_BLOCK_SIZE / 2); - } - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return AES_BLOCK_SIZE; }; - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override{}; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; - - // Decrypt a block of data at the given block index. 
- // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; - - AesCtrKey_t key_; - uint8_t code_version_; - uint8_t nonce_[AES_BLOCK_SIZE / 2]; -}; - -class CTREncryptionProvider2 : public EncryptionProvider { +class CTREncryptionProviderV2 : public EncryptionProvider { public: - CTREncryptionProvider2() = delete; + CTREncryptionProviderV2() = delete; - CTREncryptionProvider2(const CTREncryptionProvider&&) = delete; + CTREncryptionProviderV2(const CTREncryptionProvider&&) = delete; - CTREncryptionProvider2(const Sha1Description_t& key_desc, - const AesCtrKey_t& key) - : valid_(false), key_desc_(key_desc), key_(key) { + CTREncryptionProviderV2(const Sha1Description& key_desc_in, + const AesCtrKey& key_in) + : valid_(false), key_desc_(key_desc_in), key_(key_in) { valid_ = key_desc_.IsValid() && key_.IsValid(); } - CTREncryptionProvider2(const std::string& key_desc_str, - const uint8_t unformatted_key[], int bytes) + CTREncryptionProviderV2(const std::string& key_desc_str, + const uint8_t unformatted_key[], int bytes) : valid_(false), key_desc_(key_desc_str), key_(unformatted_key, bytes) { valid_ = key_desc_.IsValid() && key_.IsValid(); } - virtual size_t GetPrefixLength() override { - return sizeof(Prefix0_t) + sizeof(EncryptMarker_t); - } + size_t GetPrefixLength() const override; - virtual Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, - size_t prefixLength) override; + Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, + size_t prefixLength) const override; - virtual Status CreateCipherStream( + Status CreateCipherStream( const std::string& /*fname*/, const EnvOptions& /*options*/, Slice& /*prefix*/, std::unique_ptr* /*result*/) override { return Status::NotSupported("Wrong EncryptionProvider assumed"); } - virtual BlockAccessCipherStream* CreateCipherStream2(uint8_t code_version, - uint8_t nonce[]) { - return new AESBlockAccessCipherStream(key_, 
code_version, nonce); - } + virtual BlockAccessCipherStream* CreateCipherStream2( + uint8_t code_version, const uint8_t nonce[]) const; bool Valid() const { return valid_; }; - const Sha1Description_t& key_desc() const { return key_desc_; }; - const AesCtrKey_t& key() const { return key_; }; + const Sha1Description& key_desc() const { return key_desc_; }; + const AesCtrKey& key() const { return key_; }; protected: bool valid_; - Sha1Description_t key_desc_; - AesCtrKey_t key_; + Sha1Description key_desc_; + AesCtrKey key_; }; -// EncryptedEnv2 implements an Env wrapper that adds encryption to files stored -// on disk. +class EncryptedWritableFileV2 : public EncryptedWritableFile { + public: + // Default ctor. Prefix is assumed to be written already. + EncryptedWritableFileV2(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefix_length) + : EncryptedWritableFile(std::move(f), std::move(s), prefix_length) {} + + Status Append(const Slice& data) override; + + Status PositionedAppend(const Slice& data, uint64_t offset) override; + + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. + bool use_direct_io() const override { return false; }; +}; + +// A file abstraction for random reading and writing. +class EncryptedRandomRWFileV2 : public EncryptedRandomRWFile { + protected: -class EncryptedEnv2 : public EnvWrapper { public: - using WriteKey_t = - std::pair>; - using ReadKeys_t = - std::map>; + EncryptedRandomRWFileV2(std::unique_ptr&& f, + std::unique_ptr&& s, + size_t prefixLength) + : EncryptedRandomRWFile(std::move(f), std::move(s), prefixLength) {} + + // Indicates if the class makes use of direct I/OF + // If false you must pass aligned buffer to Write() + bool use_direct_io() const override {return false;}; + + // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. 
+ Status Write(uint64_t offset, const Slice& data) override; +}; - static Env* Default(); - static Env* Default(ReadKeys_t encrypt_read, WriteKey_t encrypt_write); +// EncryptedEnvV2 implements an Env wrapper that adds encryption to files stored +// on disk. +class EncryptedEnvV2 : public EnvWrapper { + public: + using WriteKey = std::pair>; + using ReadKeys = + std::map>; - EncryptedEnv2(Env* base_env); + static Env* Default(); + static Env* Default(ReadKeys encrypt_read, WriteKey encrypt_write); - EncryptedEnv2(Env* base_env, ReadKeys_t encrypt_read, - WriteKey_t encrypt_write); + EncryptedEnvV2(Env* base_env); - void SetKeys(ReadKeys_t encrypt_read, WriteKey_t encrypt_write); + EncryptedEnvV2(Env* base_env, ReadKeys encrypt_read, WriteKey encrypt_write); - bool IsWriteEncrypted() const { return nullptr != encrypt_write_.second; } + bool IsWriteEncrypted() const; // NewSequentialFile opens a file for sequential reading. - virtual Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status NewSequentialFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; // NewRandomAccessFile opens a file for random read access. - virtual Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status NewRandomAccessFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; // NewWritableFile opens a file for sequential writing. - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status NewWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; // Create an object that writes to a new file with the specified // name. 
Deletes any existing file with the same name and creates a @@ -240,24 +233,24 @@ class EncryptedEnv2 : public EnvWrapper { // returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status ReopenWritableFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; // Reuse an existing file by renaming it and opening it as writable. - virtual Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + std::unique_ptr* result, + const EnvOptions& options) override; // Open `fname` for random read and write, if file doesn't exist the file // will be created. On success, stores a pointer to the new file in // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. - virtual Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; + Status NewRandomRWFile(const std::string& fname, + std::unique_ptr* result, + const EnvOptions& options) override; // Store in *result the attributes of the children of the specified directory. // In case the implementation lists the directory prior to iterating the files @@ -269,201 +262,61 @@ class EncryptedEnv2 : public EnvWrapper { // NotFound if "dir" does not exist, the calling process does not have // permission to access "dir", or if "dir" is invalid. // IOError if an IO Error was encountered - virtual Status GetChildrenFileAttributes( + Status GetChildrenFileAttributes( const std::string& dir, std::vector* result) override; // Store the size of fname in *file_size. 
- virtual Status GetFileSize(const std::string& fname, - uint64_t* file_size) override; + Status GetFileSize(const std::string& fname, uint64_t* file_size) override; // only needed for GetChildrenFileAttributes & GetFileSize virtual Status GetEncryptionProvider( - const std::string& fname, std::shared_ptr& provider); + const std::string& fname, + std::shared_ptr& provider); + + bool IsValid() const { return valid_; } + + protected: + // following is not thread safe, intended for constuction + // and unit test only + void SetKeys(ReadKeys encrypt_read, WriteKey encrypt_write); template Status ReadSeqEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - std::unique_ptr& stream) { - Status status; - - provider.reset(); // nullptr for provider implies "no encryption" - stream.release(); - - // Look for encryption marker - EncryptMarker_t marker; - Slice marker_slice; - status = f->Read(sizeof(marker), &marker_slice, marker); - if (status.ok()) { - if (sizeof(marker) == marker_slice.size() && - marker_slice.starts_with(Marker)) { - // code_version currently unused - uint8_t code_version = (uint8_t)marker_slice[7]; - - Slice prefix_slice; - Prefix0_t prefix_buffer; - status = - f->Read(sizeof(Prefix0_t), &prefix_slice, (char*)&prefix_buffer); - if (status.ok() && sizeof(Prefix0_t) == prefix_slice.size()) { - Sha1Description_t desc(prefix_buffer.key_description_, - sizeof(prefix_buffer.key_description_)); - - auto it = encrypt_read_.find(desc); - if (encrypt_read_.end() != it) { - CTREncryptionProvider2* ptr = - (CTREncryptionProvider2*)it->second.get(); - provider = it->second; - stream.reset(new AESBlockAccessCipherStream( - ptr->key(), code_version, prefix_buffer.nonce_)); - } else { - status = Status::NotSupported( - "No encryption key found to match input file"); - } - } - } - } - return status; - } + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream); template Status ReadRandEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - 
std::unique_ptr& stream) { - Status status; - - provider.reset(); // nullptr for provider implies "no encryption" - stream.release(); - - // Look for encryption marker - EncryptMarker_t marker; - Slice marker_slice; - status = f->Read(0, sizeof(marker), &marker_slice, marker); - if (status.ok()) { - if (sizeof(marker) == marker_slice.size() && - marker_slice.starts_with(Marker)) { - // code_version currently unused - uint8_t code_version = (uint8_t)marker_slice[7]; - - Slice prefix_slice; - Prefix0_t prefix_buffer; - status = f->Read(sizeof(marker), sizeof(Prefix0_t), &prefix_slice, - (char*)&prefix_buffer); - if (status.ok() && sizeof(Prefix0_t) == prefix_slice.size()) { - Sha1Description_t desc(prefix_buffer.key_description_, - sizeof(prefix_buffer.key_description_)); - - auto it = encrypt_read_.find(desc); - if (encrypt_read_.end() != it) { - CTREncryptionProvider2* ptr = - (CTREncryptionProvider2*)it->second.get(); - provider = it->second; - stream.reset(new AESBlockAccessCipherStream( - ptr->key(), code_version, prefix_buffer.nonce_)); - } else { - status = Status::NotSupported( - "No encryption key found to match input file"); - } - } - } - } - return status; - } + TypeFile* f, std::shared_ptr& provider, + std::unique_ptr& stream); template Status WriteSeqEncryptionPrefix( - TypeFile* f, std::unique_ptr& stream) { - Status status; - - // set up Encryption maker, code version '0' - uint8_t code_version = {'0'}; - Prefix0_t prefix; - EncryptMarker_t marker; - strncpy(marker, Marker, sizeof(Marker)); - marker[sizeof(EncryptMarker_t) - 1] = code_version; - - Slice marker_slice(marker, sizeof(EncryptMarker_t)); - status = f->Append(marker_slice); - - if (status.ok()) { - // create nonce, then write it and key description - Slice prefix_slice((char*)&prefix, sizeof(prefix)); - - status = encrypt_write_.second->CreateNewPrefix( - std::string(), (char*)&prefix, - encrypt_write_.second->GetPrefixLength()); - - if (status.ok()) { - status = f->Append(prefix_slice); - } 
- } - - if (status.ok()) { - CTREncryptionProvider2* ptr = - (CTREncryptionProvider2*)encrypt_write_.second.get(); - stream.reset(new AESBlockAccessCipherStream(ptr->key(), code_version, - prefix.nonce_)); - } - - return status; - } + TypeFile* f, std::shared_ptr provider, + std::unique_ptr& stream); template Status WriteRandEncryptionPrefix( - TypeFile* f, std::unique_ptr& stream) { - Status status; - - // set up Encryption maker, code version '0' - uint8_t code_version = {'0'}; - Prefix0_t prefix; - EncryptMarker_t marker; - strncpy(marker, Marker, sizeof(Marker)); - marker[sizeof(EncryptMarker_t) - 1] = code_version; - - Slice marker_slice(marker, sizeof(EncryptMarker_t)); - status = f->Write(0, marker_slice); - - if (status.ok()) { - // create nonce, then write it and key description - Slice prefix_slice((char*)&prefix, sizeof(prefix)); - - status = encrypt_write_.second->CreateNewPrefix( - std::string(), (char*)&prefix, - encrypt_write_.second->GetPrefixLength()); - - if (status.ok()) { - status = f->Write(sizeof(EncryptMarker_t), prefix_slice); - } - } - - if (status.ok()) { - CTREncryptionProvider2* ptr = - (CTREncryptionProvider2*)encrypt_write_.second.get(); - stream.reset(new AESBlockAccessCipherStream(ptr->key(), code_version, - prefix.nonce_)); - } - - return status; - } - - bool IsValid() const { return valid_; } + TypeFile* f, std::shared_ptr provider, + std::unique_ptr& stream); + public: static UnixLibCrypto crypto_; protected: - std::map> - encrypt_read_; - std::pair> - encrypt_write_; + ReadKeys encrypt_read_; + WriteKey encrypt_write_; bool valid_; }; // Returns an Env that encrypts data when stored on disk and decrypts data when -// read from disk. Prefer EncryptedEnv2::Default(). -Env* NewEncryptedEnv2(Env* base_env, EncryptedEnv2::ReadKeys_t encrypt_read, - EncryptedEnv2::WriteKey_t encrypt_write); +// read from disk. Prefer EncryptedEnvV2::Default(). 
+Env* NewEncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, + EncryptedEnvV2::WriteKey encrypt_write); #endif // ROCKSDB_LITE -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index f8c763b83..25832e92d 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -10,8 +10,9 @@ #include #include "env.h" +#include "rocksdb_namespace.h" -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { class EncryptionProvider; @@ -138,12 +139,12 @@ class EncryptionProvider { // GetPrefixLength returns the length of the prefix that is added to every // file and used for storing encryption options. For optimal performance, the // prefix length should be a multiple of the page size. - virtual size_t GetPrefixLength() = 0; + virtual size_t GetPrefixLength() const = 0; // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) = 0; + size_t prefixLength) const = 0; // CreateCipherStream creates a block access cipher stream for a file given // given name and options. @@ -173,12 +174,12 @@ class CTREncryptionProvider : public EncryptionProvider { // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of // the page size. - virtual size_t GetPrefixLength() override; + virtual size_t GetPrefixLength() const override; // CreateNewPrefix initialized an allocated block of prefix memory // for a new file. virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) override; + size_t prefixLength) const override; // CreateCipherStream creates a block access cipher stream for a file given // given name and options. 
@@ -193,7 +194,7 @@ class CTREncryptionProvider : public EncryptionProvider { // Returns the amount of space (starting from the start of the prefix) // that has been initialized. virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, - size_t blockSize); + size_t blockSize) const; // CreateCipherStreamFromPrefix creates a block access cipher stream for a // file given diff --git a/include/rocksdb/rocksdb_namespace.h b/include/rocksdb/rocksdb_namespace.h new file mode 100644 index 000000000..e9f8620d0 --- /dev/null +++ b/include/rocksdb/rocksdb_namespace.h @@ -0,0 +1,10 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#ifndef ROCKSDB_NAMESPACE +#define ROCKSDB_NAMESPACE rocksdb +#endif diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a126de88c..842d3aee7 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -700,8 +700,8 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, // This is not an external sst file, global_seqno is not supported. snprintf( msg_buf.data(), msg_buf.max_size(), - "A non-external sst file have global seqno property with value %s", - seqno_pos->second.c_str()); + "A non-external sst file have global seqno property with value %lu", + DecodeFixed64(seqno_pos->second.c_str())); return Status::Corruption(msg_buf.data()); } return Status::OK(); @@ -713,9 +713,9 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, std::array msg_buf; // This is a v1 external sst file, global_seqno is not supported. 
snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno " - "property with value %s", - version, seqno_pos->second.c_str()); + "An external sst file with version %u has global seqno " + "property with value %lu", + version, DecodeFixed64(seqno_pos->second.c_str())); return Status::Corruption(msg_buf.data()); } return Status::OK(); @@ -731,9 +731,9 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, if (global_seqno != 0 && global_seqno != largest_seqno) { std::array msg_buf; snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " - "with value %s, while largest seqno in the file is %llu", - version, seqno_pos->second.c_str(), + "An external sst file with version %u has global seqno property " + "with value %llu, while largest seqno in the file is %llu", + version, static_cast(global_seqno), static_cast(largest_seqno)); return Status::Corruption(msg_buf.data()); } @@ -743,7 +743,7 @@ Status GetGlobalSequenceNumber(const TableProperties& table_properties, if (global_seqno > kMaxSequenceNumber) { std::array msg_buf; snprintf(msg_buf.data(), msg_buf.max_size(), - "An external sst file with version %u have global seqno property " + "An external sst file with version %u has global seqno property " "with value %llu, which is greater than kMaxSequenceNumber", version, static_cast(global_seqno)); return Status::Corruption(msg_buf.data()); diff --git a/util/library_loader.cc b/util/library_loader.cc index a69f0326c..837795e11 100644 --- a/util/library_loader.cc +++ b/util/library_loader.cc @@ -8,69 +8,36 @@ #include "util/library_loader.h" -#include #include #include "rocksdb/status.h" -// link with -ldl +namespace ROCKSDB_NAMESPACE { -namespace rocksdb { +const char* UnixLibCrypto::crypto_lib_name_ = "crypto"; -#ifdef OS_MACOSX - const char * UnixLibCrypto::crypto_lib_name_ = "libcrypto.dylib"; -#else - const char * UnixLibCrypto::crypto_lib_name_ 
= "libcrypto.so"; -#endif - -UnixLibraryLoader::UnixLibraryLoader(const char * library_name) - : dl_handle_(nullptr) { - - if (nullptr != library_name && '\0' != *library_name) { - dl_handle_ = dlopen(library_name, RTLD_NOW | RTLD_GLOBAL); - - is_valid_ = (nullptr != dl_handle_); - - if (!is_valid_) { - last_error_msg_ = dlerror(); - } - } +LibraryLoader::LibraryLoader(const char* library_name) : is_valid_(false) { + Status stat = Env::Default()->LoadLibrary(library_name, std::string(), &lib_); + is_valid_ = stat.ok(); // two lines to pass 5.4.0 compile unit test + is_valid_ = is_valid_ && nullptr != lib_.get(); } - -UnixLibraryLoader::~UnixLibraryLoader() { - if (nullptr != dl_handle_ ) { - int ret_val = dlclose(dl_handle_); - dl_handle_ = nullptr; - is_valid_ = false; - - if (0 != ret_val) { - last_error_msg_ = dlerror(); - } - } -} - - -void * UnixLibraryLoader::GetEntryPoint(const char * function_name) { - void * ret_ptr = {nullptr}; +void* LibraryLoader::GetEntryPoint(const char* function_name) { + void* ret_ptr = {nullptr}; if (is_valid_) { - ret_ptr = dlsym(dl_handle_, function_name); - if (nullptr == ret_ptr) { - last_error_msg_ = dlerror(); - } + Status stat = lib_->LoadSymbol(function_name, &ret_ptr); } return ret_ptr; } - -size_t UnixLibraryLoader::GetEntryPoints(std::map & functions) { - size_t num_found {0}; +size_t LibraryLoader::GetEntryPoints(std::map& functions) { + size_t num_found{0}; if (is_valid_) { for (auto& func : functions) { - void * tmp_ptr; + void* tmp_ptr; tmp_ptr = GetEntryPoint(func.first.c_str()); if (nullptr != tmp_ptr) { @@ -83,41 +50,45 @@ size_t UnixLibraryLoader::GetEntryPoints(std::map & functio return num_found; } -UnixLibCrypto::UnixLibCrypto() - : UnixLibraryLoader(crypto_lib_name_) { +UnixLibCrypto::UnixLibCrypto() : LibraryLoader(crypto_lib_name_) { if (is_valid_) { - // size of map minus two since _new/_create and _free/_destroy - // only resolve one of the two. 
- is_valid_ = ((functions_.size()-2) == GetEntryPoints(functions_)); + // size of map minus three since _new/_create, _free/_destroy + // and _reset/_cleanup only resolve one of the two. + is_valid_ = ((functions_.size() - 3) == GetEntryPoints(functions_)); - ctx_new_ = (EVP_MD_CTX_new_t) functions_["EVP_MD_CTX_new"]; + ctx_new_ = (EVP_MD_CTX_new_t)functions_["EVP_MD_CTX_new"]; if (nullptr == ctx_new_) { - ctx_new_ = (EVP_MD_CTX_new_t) functions_["EVP_MD_CTX_create"]; + ctx_new_ = (EVP_MD_CTX_new_t)functions_["EVP_MD_CTX_create"]; } - digest_init_ = (EVP_DigestInit_ex_t) functions_["EVP_DigestInit_ex"]; - sha1_ = (EVP_sha1_t) functions_["EVP_sha1"]; - digest_update_ = (EVP_DigestUpdate_t) functions_["EVP_DigestUpdate"]; - digest_final_ = (EVP_DigestFinal_ex_t) functions_["EVP_DigestFinal_ex"]; + digest_init_ = (EVP_DigestInit_ex_t)functions_["EVP_DigestInit_ex"]; + sha1_ = (EVP_sha1_t)functions_["EVP_sha1"]; + digest_update_ = (EVP_DigestUpdate_t)functions_["EVP_DigestUpdate"]; + digest_final_ = (EVP_DigestFinal_ex_t)functions_["EVP_DigestFinal_ex"]; - ctx_free_ = (EVP_MD_CTX_free_t) functions_["EVP_MD_CTX_free"]; + ctx_free_ = (EVP_MD_CTX_free_t)functions_["EVP_MD_CTX_free"]; if (nullptr == ctx_free_) { - ctx_free_ = (EVP_MD_CTX_free_t) functions_["EVP_MD_CTX_destroy"]; + ctx_free_ = (EVP_MD_CTX_free_t)functions_["EVP_MD_CTX_destroy"]; } - rand_bytes_ = (RAND_bytes_t) functions_["RAND_bytes"]; - rand_poll_ = (RAND_poll_t) functions_["RAND_poll"]; - - cipher_new_ = (EVP_CIPHER_CTX_new_t) functions_["EVP_CIPHER_CTX_new"]; - cipher_free_ = (EVP_CIPHER_CTX_free_t) functions_["EVP_CIPHER_CTX_free"]; - encrypt_init_ = (EVP_EncryptInit_ex_t) functions_["EVP_EncryptInit_ex"]; - aes_256_ctr_ = (EVP_aes_256_ctr_t) functions_["EVP_aes_256_ctr"]; - encrypt_update_ = (EVP_EncryptUpdate_t) functions_["EVP_EncryptUpdate"]; + rand_bytes_ = (RAND_bytes_t)functions_["RAND_bytes"]; + rand_poll_ = (RAND_poll_t)functions_["RAND_poll"]; + cipher_new_ = 
(EVP_CIPHER_CTX_new_t)functions_["EVP_CIPHER_CTX_new"]; + cipher_reset_ = (EVP_CIPHER_CTX_reset_t)functions_["EVP_CIPHER_CTX_reset"]; + if (nullptr == cipher_reset_) { + cipher_reset_ = + (EVP_CIPHER_CTX_reset_t)functions_["EVP_CIPHER_CTX_cleanup"]; + } + cipher_free_ = (EVP_CIPHER_CTX_free_t)functions_["EVP_CIPHER_CTX_free"]; + encrypt_init_ = (EVP_EncryptInit_ex_t)functions_["EVP_EncryptInit_ex"]; + aes_256_ctr_ = (EVP_aes_256_ctr_t)functions_["EVP_aes_256_ctr"]; + encrypt_update_ = (EVP_EncryptUpdate_t)functions_["EVP_EncryptUpdate"]; + encrypt_final_ = (EVP_EncryptFinal_ex_t)functions_["EVP_EncryptFinal_ex"]; } } -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE #endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/util/library_loader.h b/util/library_loader.h index a9dacaabc..38f9e66d4 100644 --- a/util/library_loader.h +++ b/util/library_loader.h @@ -15,64 +15,53 @@ #include #include "rocksdb/env.h" +#include "rocksdb/rocksdb_namespace.h" -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { // Base class / interface // expectation is to derive one class for unux and one for Windows // class LibraryLoader { public: - LibraryLoader() : is_valid_(false) {}; + LibraryLoader() = delete; + LibraryLoader(const char *library_name); virtual ~LibraryLoader() = default; - bool IsValid() const {return is_valid_;} + bool IsValid() const { return is_valid_; } - virtual void * GetEntryPoint(const char * function_name) = 0; + virtual void *GetEntryPoint(const char *function_name); + + virtual size_t GetEntryPoints(std::map &functions); protected: bool is_valid_; + std::shared_ptr lib_; }; - -class UnixLibraryLoader : public LibraryLoader { +class UnixLibCrypto : public LibraryLoader { public: - UnixLibraryLoader() = delete; - - UnixLibraryLoader(const char * library_name); - - virtual ~UnixLibraryLoader(); - - virtual void * GetEntryPoint(const char * function_name) override; - - virtual size_t GetEntryPoints(std::map & functions); - -protected: - void 
* dl_handle_; - std::string last_error_msg_; -}; - - -class UnixLibCrypto : public UnixLibraryLoader { -public: UnixLibCrypto(); virtual ~UnixLibCrypto() = default; // _new & _free are ssl 1.1, replacing 1.0 _create & _destroy - using EVP_MD_CTX_new_t = EVP_MD_CTX * (*)(void); - using EVP_DigestInit_ex_t = int (*)(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl); - using EVP_sha1_t = const EVP_MD * (*)(void); - using EVP_DigestUpdate_t = int (*)(EVP_MD_CTX *ctx, const void *d, size_t cnt); - using EVP_DigestFinal_ex_t = int (*)(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *s); + using EVP_MD_CTX_new_t = EVP_MD_CTX *(*)(void); + using EVP_DigestInit_ex_t = int (*)(EVP_MD_CTX *ctx, const EVP_MD *type, + ENGINE *impl); + using EVP_sha1_t = const EVP_MD *(*)(void); + using EVP_DigestUpdate_t = int (*)(EVP_MD_CTX *ctx, const void *d, + size_t cnt); + using EVP_DigestFinal_ex_t = int (*)(EVP_MD_CTX *ctx, unsigned char *md, + unsigned int *s); using EVP_MD_CTX_free_t = void (*)(EVP_MD_CTX *ctx); - EVP_MD_CTX * EVP_MD_CTX_new() const {return ctx_new_();}; + EVP_MD_CTX *EVP_MD_CTX_new() const { return ctx_new_(); }; int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { return digest_init_(ctx, type, impl); } - const EVP_MD * EVP_sha1() {return sha1_();} + const EVP_MD *EVP_sha1() { return sha1_(); } int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *d, size_t cnt) { return digest_update_(ctx, d, cnt); @@ -82,76 +71,80 @@ class UnixLibCrypto : public UnixLibraryLoader { return digest_final_(ctx, md, s); } - void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { - ctx_free_(ctx); - } + void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { ctx_free_(ctx); } - EVP_MD_CTX_free_t EVP_MD_CTX_free_ptr() { - return ctx_free_; - } + EVP_MD_CTX_free_t EVP_MD_CTX_free_ptr() { return ctx_free_; } using RAND_bytes_t = int (*)(unsigned char *buf, int num); using RAND_poll_t = int (*)(); - int RAND_bytes(unsigned char *buf, int num) { - return rand_bytes_(buf, num); - } + int 
RAND_bytes(unsigned char *buf, int num) { return rand_bytes_(buf, num); } - int RAND_poll() { - return rand_poll_(); - } + int RAND_poll() { return rand_poll_(); } - using EVP_CIPHER_CTX_new_t = EVP_CIPHER_CTX * (*)(void); + using EVP_CIPHER_CTX_reset_t = int (*)(EVP_CIPHER_CTX *ctx); + using EVP_CIPHER_CTX_new_t = EVP_CIPHER_CTX *(*)(void); using EVP_CIPHER_CTX_free_t = void (*)(EVP_CIPHER_CTX *ctx); - using EVP_EncryptInit_ex_t = int (*)(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *type, - ENGINE *impl, const unsigned char *key, const unsigned char *iv); - using EVP_aes_256_ctr_t = const EVP_CIPHER * (*)(void); + using EVP_EncryptInit_ex_t = int (*)(EVP_CIPHER_CTX *ctx, + const EVP_CIPHER *type, ENGINE *impl, + const unsigned char *key, + const unsigned char *iv); + using EVP_aes_256_ctr_t = const EVP_CIPHER *(*)(void); using EVP_EncryptUpdate_t = int (*)(EVP_CIPHER_CTX *ctx, unsigned char *out, - int *outl, const unsigned char *in, int inl); + int *outl, const unsigned char *in, + int inl); + using EVP_EncryptFinal_ex_t = int (*)(EVP_CIPHER_CTX *ctx, unsigned char *out, + int *outl); - EVP_CIPHER_CTX *EVP_CIPHER_CTX_new(void) const {return cipher_new_();}; - void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) { - cipher_free_(ctx); - } + int EVP_CIPHER_CTX_reset(EVP_CIPHER_CTX *ctx) { return cipher_reset_(ctx); } - EVP_CIPHER_CTX_free_t EVP_CIPHER_CTX_free_ptr() { - return cipher_free_; - } + EVP_CIPHER_CTX *EVP_CIPHER_CTX_new(void) const { return cipher_new_(); } + + void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) { cipher_free_(ctx); } + + EVP_CIPHER_CTX_free_t EVP_CIPHER_CTX_free_ptr() { return cipher_free_; } int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *type, - ENGINE *impl, const unsigned char *key, const unsigned char *iv) { + ENGINE *impl, const unsigned char *key, + const unsigned char *iv) { return encrypt_init_(ctx, type, impl, key, iv); } - const EVP_CIPHER * EVP_aes_256_ctr() { - return aes_256_ctr_(); - } + const EVP_CIPHER *EVP_aes_256_ctr() { 
return aes_256_ctr_(); } - int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, - int *outl, const unsigned char *in, int inl) { + int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, + const unsigned char *in, int inl) { return encrypt_update_(ctx, out, outl, in, inl); } - static const char * crypto_lib_name_; - -protected: - std::map functions_ { - {"EVP_MD_CTX_new", nullptr}, {"EVP_MD_CTX_create", nullptr}, - {"EVP_DigestInit_ex", nullptr}, - {"EVP_sha1", nullptr}, - {"EVP_DigestUpdate", nullptr}, - {"EVP_DigestFinal_ex", nullptr}, - {"EVP_MD_CTX_free", nullptr}, {"EVP_MD_CTX_destroy", nullptr}, - - {"RAND_bytes", nullptr}, - {"RAND_poll", nullptr}, + int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) { + return encrypt_final_(ctx, out, outl); + } + static const char *crypto_lib_name_; - {"EVP_CIPHER_CTX_new", nullptr}, - {"EVP_CIPHER_CTX_free", nullptr}, - {"EVP_EncryptInit_ex", nullptr}, - {"EVP_aes_256_ctr", nullptr}, - {"EVP_EncryptUpdate", nullptr}, + protected: + std::map functions_{ + {"EVP_MD_CTX_new", nullptr}, + {"EVP_MD_CTX_create", nullptr}, + {"EVP_DigestInit_ex", nullptr}, + {"EVP_sha1", nullptr}, + {"EVP_DigestUpdate", nullptr}, + {"EVP_DigestFinal_ex", nullptr}, + {"EVP_MD_CTX_free", nullptr}, + {"EVP_MD_CTX_destroy", nullptr}, + + {"RAND_bytes", nullptr}, + {"RAND_poll", nullptr}, + + {"EVP_CIPHER_CTX_new", nullptr}, + {"EVP_CIPHER_CTX_free", nullptr}, + {"EVP_EncryptInit_ex", nullptr}, + {"EVP_aes_256_ctr", nullptr}, + {"EVP_EncryptUpdate", nullptr}, + {"EVP_EncryptFinal_ex", nullptr}, + {"EVP_CIPHER_CTX_reset", nullptr}, + {"EVP_CIPHER_CTX_cleanup", nullptr}, }; @@ -165,14 +158,15 @@ class UnixLibCrypto : public UnixLibraryLoader { RAND_bytes_t rand_bytes_; RAND_poll_t rand_poll_; + EVP_CIPHER_CTX_reset_t cipher_reset_; EVP_CIPHER_CTX_new_t cipher_new_; EVP_CIPHER_CTX_free_t cipher_free_; EVP_EncryptInit_ex_t encrypt_init_; EVP_aes_256_ctr_t aes_256_ctr_; EVP_EncryptUpdate_t 
encrypt_update_; - + EVP_EncryptFinal_ex_t encrypt_final_; }; -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE #endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/util/library_loader_test.cc b/util/library_loader_test.cc index fcbe6c380..90a930dc4 100644 --- a/util/library_loader_test.cc +++ b/util/library_loader_test.cc @@ -10,25 +10,23 @@ #include "util/library_loader.h" -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { #ifdef OS_MACOSX - static const char * LIB_M_NAME = "libm.dylib"; - static const char * LIB_BAD_NAME = "libbubbagump.dylib"; - static const char * LIB_SSL_NAME = "libssl.dylib"; +static const char* LIB_M_NAME = "libm"; +static const char* LIB_BAD_NAME = "libbubbagump.dylib"; +static const char* LIB_SSL_NAME = "ssl"; #else - static const char * LIB_M_NAME = "libm.so.6"; - static const char * LIB_BAD_NAME = "libbubbagump.so"; - static const char * LIB_SSL_NAME = "libssl.so"; +static const char* LIB_M_NAME = "libm.so.6"; +static const char* LIB_BAD_NAME = "libbubbagump.so"; +static const char* LIB_SSL_NAME = "ssl"; #endif - class UnixLibraryLoaderTest {}; TEST(UnixLibraryLoaderTest, Simple) { - - UnixLibraryLoader works(LIB_M_NAME); - UnixLibraryLoader fails(LIB_BAD_NAME); + LibraryLoader works(LIB_M_NAME); + LibraryLoader fails(LIB_BAD_NAME); ASSERT_TRUE(works.IsValid()); ASSERT_FALSE(fails.IsValid()); @@ -38,28 +36,27 @@ TEST(UnixLibraryLoaderTest, Simple) { floor = (double (*)(double))works.GetEntryPoint("floor"); ASSERT_TRUE(nullptr != floor); ASSERT_TRUE(2.0 == (*floor)(2.2)); - } TEST(UnixLibraryLoaderTest, SSL) { - UnixLibraryLoader ssl(LIB_SSL_NAME); - UnixLibraryLoader crypto(UnixLibCrypto::crypto_lib_name_); + LibraryLoader ssl(LIB_SSL_NAME); + LibraryLoader crypto(UnixLibCrypto::crypto_lib_name_); ASSERT_TRUE(ssl.IsValid()); ASSERT_TRUE(crypto.IsValid()); - } TEST(UnixLibraryLoaderTest, Crypto) { UnixLibCrypto crypto; uint8_t desc[EVP_MAX_MD_SIZE]; - EVP_MD_CTX * context; + EVP_MD_CTX* context; int ret_val; 
unsigned len; ASSERT_TRUE(crypto.IsValid()); - context = crypto.EVP_MD_CTX_create(); + // context = crypto.EVP_MD_CTX_create(); ... old call + context = crypto.EVP_MD_CTX_new(); // new call ASSERT_TRUE(nullptr != context); ret_val = crypto.EVP_DigestInit_ex(context, crypto.EVP_sha1(), nullptr); @@ -72,21 +69,19 @@ TEST(UnixLibraryLoaderTest, Crypto) { ASSERT_TRUE(1 == ret_val); ASSERT_TRUE(20 == len); - uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 0x79, 0x13, 0xb0, 0x4c, - 0x54, 0x57, 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, - 0x39, 0x54, 0x28, 0xab}; + uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 0x79, 0x13, 0xb0, 0x4c, 0x54, 0x57, + 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, 0x39, 0x54, 0x28, 0xab}; ASSERT_TRUE(0 == memcmp(md2, desc, sizeof(md2))); crypto.EVP_MD_CTX_free(context); - } -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE #endif // ROCKSDB_OPENSSL_AES_CTR int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); + return RUN_ALL_TESTS(); } From 6acb712fa26edc473af04691a956b151dee4db75 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Fri, 21 Aug 2020 13:37:03 -0400 Subject: [PATCH 51/57] Feature/mv encrypt 2 (#15) * this is a current file in facebook/rocksdb. copying it into our older code to simplify backporting encryption. * backport DynamicLibrary and LoadLibrary from rocksdb master * add missing line for CTR AES build * copy encryption updates required by Facebook * add new include file to stardog bazel build file * there is a likely code path where valid_ is not set in EncryptedEnvV2. this corrects. * correct code and usage of EncryptedRandomRWFileV2 * fix formating and grammar in some logging * per man page, switch to EVP_EncryptFinal_ex as soft reset of context object for reuse * add test for RAND_poll and RAND_bytes loading * change method for making sure libcrypto is available ... 
matches changes in facebook code * add ToString to Sha Aes and provider * backport two changes from Facebook encryption PR: make key_lock_ a member variable instead of static to fix Mac OSX crash, and switch to ReadLock() and WriteLock() wrapper objects previously suggested by Alex. Co-authored-by: matthewvon --- env/env_encrypt2.cc | 181 ++++++++++++++++----------------- include/rocksdb/env_encrypt2.h | 44 +++++++- util/library_loader_test.cc | 11 ++ 3 files changed, 142 insertions(+), 94 deletions(-) diff --git a/env/env_encrypt2.cc b/env/env_encrypt2.cc index 290a649a2..0583c3300 100644 --- a/env/env_encrypt2.cc +++ b/env/env_encrypt2.cc @@ -14,17 +14,26 @@ #include #include #include +#include #include "env/env_encrypt2_impl.h" #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "util/aligned_buffer.h" #include "util/coding.h" +#include "util/mutexlock.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { -static port::RWMutex key_lock; +static std::once_flag crypto_loaded; +static std::shared_ptr crypto_shared; + +std::shared_ptr GetCrypto() { + std::call_once(crypto_loaded, + []() { crypto_shared = std::make_shared(); }); + return crypto_shared; +} // reuse cipher context between calls to Encrypt & Decrypt static void do_nothing(EVP_CIPHER_CTX*){}; @@ -32,28 +41,29 @@ thread_local static std::unique_ptr aes_context(nullptr, &do_nothing); Sha1Description::Sha1Description(const std::string& key_desc_str) { + GetCrypto(); // ensure libcryto available bool good = {true}; int ret_val; unsigned len; memset(desc, 0, EVP_MAX_MD_SIZE); - if (0 != key_desc_str.length() && EncryptedEnvV2::crypto_.IsValid()) { + if (0 != key_desc_str.length() && crypto_shared->IsValid()) { std::unique_ptr context( - EncryptedEnvV2::crypto_.EVP_MD_CTX_new(), - EncryptedEnvV2::crypto_.EVP_MD_CTX_free_ptr()); + crypto_shared->EVP_MD_CTX_new(), + crypto_shared->EVP_MD_CTX_free_ptr()); - ret_val = EncryptedEnvV2::crypto_.EVP_DigestInit_ex( - context.get(), 
EncryptedEnvV2::crypto_.EVP_sha1(), nullptr); + ret_val = crypto_shared->EVP_DigestInit_ex( + context.get(), crypto_shared->EVP_sha1(), nullptr); good = (1 == ret_val); if (good) { - ret_val = EncryptedEnvV2::crypto_.EVP_DigestUpdate( + ret_val = crypto_shared->EVP_DigestUpdate( context.get(), key_desc_str.c_str(), key_desc_str.length()); good = (1 == ret_val); } if (good) { ret_val = - EncryptedEnvV2::crypto_.EVP_DigestFinal_ex(context.get(), desc, &len); + crypto_shared->EVP_DigestFinal_ex(context.get(), desc, &len); good = (1 == ret_val); } } else { @@ -64,6 +74,7 @@ Sha1Description::Sha1Description(const std::string& key_desc_str) { } AesCtrKey::AesCtrKey(const std::string& key_str) : valid(false) { + GetCrypto(); // ensure libcryto available memset(key, 0, EVP_MAX_KEY_LENGTH); // simple parse: must be 64 characters long and hexadecimal values @@ -111,58 +122,53 @@ void AESBlockAccessCipherStream::BigEndianAdd128(uint8_t* buf, } // for } -// -// AES_BLOCK_SIZE assumed to be 16 -// -typedef union { - uint64_t nums[2]; - uint8_t bytes[AES_BLOCK_SIZE]; -} AesAlignedBlock; - // "data" is assumed to be aligned at AES_BLOCK_SIZE or greater Status AESBlockAccessCipherStream::Encrypt(uint64_t file_offset, char* data, size_t data_size) { Status status; if (0 < data_size) { - if (EncryptedEnvV2::crypto_.IsValid()) { + if (crypto_shared->IsValid()) { int ret_val, out_len; - ALIGN16 AesAlignedBlock iv; + ALIGN16 uint8_t iv[AES_BLOCK_SIZE]; uint64_t block_index = file_offset / BlockSize(); // make a context once per thread if (!aes_context) { aes_context = std::unique_ptr( - EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_new(), - EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_free_ptr()); + crypto_shared->EVP_CIPHER_CTX_new(), + crypto_shared->EVP_CIPHER_CTX_free_ptr()); } - memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE); - BigEndianAdd128(iv.bytes, block_index); + memcpy(iv, nonce_, AES_BLOCK_SIZE); + BigEndianAdd128(iv, block_index); - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptInit_ex( - 
aes_context.get(), EncryptedEnvV2::crypto_.EVP_aes_256_ctr(), - nullptr, key_.key, iv.bytes); + ret_val = crypto_shared->EVP_EncryptInit_ex( + aes_context.get(), crypto_shared->EVP_aes_256_ctr(), nullptr, + key_.key, iv); if (1 == ret_val) { out_len = 0; - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + ret_val = crypto_shared->EVP_EncryptUpdate( aes_context.get(), (unsigned char*)data, &out_len, (unsigned char*)data, (int)data_size); - if (1 != ret_val || (int)data_size != out_len) { + if (1 == ret_val && (int)data_size == out_len) { + // this is a soft reset of aes_context per man pages + uint8_t temp_buf[AES_BLOCK_SIZE]; + out_len = 0; + ret_val = crypto_shared->EVP_EncryptFinal_ex(aes_context.get(), + temp_buf, &out_len); + + if (1 != ret_val || 0 != out_len) { + status = Status::InvalidArgument( + "EVP_EncryptFinal_ex failed: ", + (1 != ret_val) ? "bad return value" : "output length short"); + } + } else { status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", (int)data_size == out_len - ? "bad return value" - : "output length short"); - } - // this is a soft reset of aes_context per man pages - uint8_t temp_buf[AES_BLOCK_SIZE]; - out_len = 0; - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptFinal_ex( - aes_context.get(), temp_buf, &out_len); - - if (1 != ret_val || 0 != out_len) { - status = Status::InvalidArgument("EVP_EncryptFinal_ex failed."); + ? 
"bad return value" + : "output length short"); } } else { status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); @@ -191,23 +197,23 @@ Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, uint8_t temp_buf[block_size]; Status status; - ALIGN16 AesAlignedBlock iv; + ALIGN16 uint8_t iv[AES_BLOCK_SIZE]; int out_len = 0, ret_val; - if (EncryptedEnvV2::crypto_.IsValid()) { + if (crypto_shared->IsValid()) { // make a context once per thread if (!aes_context) { aes_context = std::unique_ptr( - EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_new(), - EncryptedEnvV2::crypto_.EVP_CIPHER_CTX_free_ptr()); + crypto_shared->EVP_CIPHER_CTX_new(), + crypto_shared->EVP_CIPHER_CTX_free_ptr()); } - memcpy(iv.bytes, nonce_, AES_BLOCK_SIZE); - BigEndianAdd128(iv.bytes, block_index); + memcpy(iv, nonce_, AES_BLOCK_SIZE); + BigEndianAdd128(iv, block_index); - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptInit_ex( - aes_context.get(), EncryptedEnvV2::crypto_.EVP_aes_256_ctr(), nullptr, - key_.key, iv.bytes); + ret_val = crypto_shared->EVP_EncryptInit_ex( + aes_context.get(), crypto_shared->EVP_aes_256_ctr(), nullptr, key_.key, + iv); if (1 == ret_val) { // handle uneven block start if (0 != block_offset) { @@ -218,7 +224,7 @@ Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, memcpy(temp_buf + block_offset, data, prefix_size); out_len = 0; - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + ret_val = crypto_shared->EVP_EncryptUpdate( aes_context.get(), temp_buf, &out_len, temp_buf, (int)block_size); if (1 != ret_val || (int)block_size != out_len) { @@ -235,7 +241,7 @@ Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, remaining -= prefix_size; if (status.ok() && remaining) { out_len = 0; - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptUpdate( + ret_val = crypto_shared->EVP_EncryptUpdate( aes_context.get(), (uint8_t*)data + prefix_size, &out_len, (uint8_t*)data + prefix_size, (int)remaining); @@ -249,7 +255,7 @@ 
Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, // this is a soft reset of aes_context per man pages out_len = 0; - ret_val = EncryptedEnvV2::crypto_.EVP_EncryptFinal_ex( + ret_val = crypto_shared->EVP_EncryptFinal_ex( aes_context.get(), temp_buf, &out_len); if (1 != ret_val || 0 != out_len) { @@ -269,14 +275,15 @@ Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, Status CTREncryptionProviderV2::CreateNewPrefix(const std::string& /*fname*/, char* prefix, size_t prefixLength) const { + GetCrypto(); // ensure libcryto available Status s; - if (EncryptedEnvV2::crypto_.IsValid()) { + if (crypto_shared->IsValid()) { if (sizeof(PrefixVersion0) <= prefixLength) { int ret_val; PrefixVersion0* pf = {(PrefixVersion0*)prefix}; memcpy(pf->key_description_, key_desc_.desc, sizeof(key_desc_.desc)); - ret_val = EncryptedEnvV2::crypto_.RAND_bytes((unsigned char*)&pf->nonce_, + ret_val = crypto_shared->RAND_bytes((unsigned char*)&pf->nonce_, AES_BLOCK_SIZE); if (1 != ret_val) { s = Status::NotSupported("RAND_bytes failed"); @@ -407,11 +414,7 @@ Env* NewEncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, Env* ret_env{base_env}; EncryptedEnvV2* new_env{nullptr}; - if (Env::Default() == base_env) { - // use safer static construction so libcrypto is synchronously loaded - new_env = - (EncryptedEnvV2*)EncryptedEnvV2::Default(encrypt_read, encrypt_write); - } else if (nullptr != base_env) { + if (nullptr != base_env) { new_env = new EncryptedEnvV2(base_env, encrypt_read, encrypt_write); } @@ -428,39 +431,35 @@ EncryptedEnvV2::EncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, EncryptedEnvV2::WriteKey encrypt_write) : EnvWrapper(base_env), valid_(false) { + init(); SetKeys(encrypt_read, encrypt_write); - - valid_ = crypto_.IsValid(); - - // warning, dynamic loading of libcrypto could be delayed ... 
making this - // false - if (IsValid()) { - crypto_.RAND_poll(); - } } + EncryptedEnvV2::EncryptedEnvV2(Env* base_env) : EnvWrapper(base_env), valid_(false) { + init(); +} - valid_ = crypto_.IsValid(); +void EncryptedEnvV2::init() { + crypto_ = GetCrypto(); + + valid_ = crypto_->IsValid(); if (IsValid()) { - crypto_.RAND_poll(); + crypto_->RAND_poll(); } } void EncryptedEnvV2::SetKeys(EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write) { - key_lock.WriteLock(); + EncryptedEnvV2::WriteKey encrypt_write) { + WriteLock lock(&key_lock_); encrypt_read_ = encrypt_read; encrypt_write_ = encrypt_write; - key_lock.WriteUnlock(); - } bool EncryptedEnvV2::IsWriteEncrypted() const { - key_lock.ReadLock(); + ReadLock lock(&key_lock_); bool ret_flag = (nullptr != encrypt_write_.second); - key_lock.ReadUnlock(); return ret_flag; } @@ -496,7 +495,7 @@ Status EncryptedEnvV2::ReadSeqEncryptionPrefix( Sha1Description desc(prefix_buffer.key_description_, sizeof(prefix_buffer.key_description_)); - key_lock.ReadLock(); + ReadLock lock(&key_lock_); auto it = encrypt_read_.find(desc); if (encrypt_read_.end() != it) { provider = it->second; @@ -507,7 +506,6 @@ Status EncryptedEnvV2::ReadSeqEncryptionPrefix( status = Status::NotSupported( "No encryption key found to match input file"); } - key_lock.ReadUnlock(); } } else { status = @@ -545,7 +543,7 @@ Status EncryptedEnvV2::ReadRandEncryptionPrefix( Sha1Description desc(prefix_buffer.key_description_, sizeof(prefix_buffer.key_description_)); - key_lock.ReadLock(); + ReadLock lock(&key_lock_); auto it = encrypt_read_.find(desc); if (encrypt_read_.end() != it) { provider = it->second; @@ -555,7 +553,6 @@ Status EncryptedEnvV2::ReadRandEncryptionPrefix( status = Status::NotSupported( "No encryption key found to match input file"); } - key_lock.ReadUnlock(); } } else { status = @@ -724,9 +721,10 @@ Status EncryptedEnvV2::NewWritableFile(const std::string& fname, if (status.ok()) { std::shared_ptr provider; - 
key_lock.ReadLock(); - provider = encrypt_write_.second; - key_lock.ReadUnlock(); + { + ReadLock lock(&key_lock_); + provider = encrypt_write_.second; + } if (provider) { std::unique_ptr stream; @@ -770,9 +768,10 @@ Status EncryptedEnvV2::ReopenWritableFile(const std::string& fname, if (status.ok()) { std::shared_ptr provider; - key_lock.ReadLock(); - provider = encrypt_write_.second; - key_lock.ReadUnlock(); + { + ReadLock lock(&key_lock_); + provider = encrypt_write_.second; + } if (provider) { std::unique_ptr stream; @@ -812,9 +811,10 @@ Status EncryptedEnvV2::ReuseWritableFile(const std::string& fname, if (status.ok()) { std::shared_ptr provider; - key_lock.ReadLock(); - provider = encrypt_write_.second; - key_lock.ReadUnlock(); + { + ReadLock lock(&key_lock_); + provider = encrypt_write_.second; + } if (provider) { std::unique_ptr stream; @@ -867,9 +867,10 @@ Status EncryptedEnvV2::NewRandomRWFile(const std::string& fname, provider, stream); } else { // new file - key_lock.ReadLock(); - provider = encrypt_write_.second; - key_lock.ReadUnlock(); + { + ReadLock lock(&key_lock_); + provider = encrypt_write_.second; + } if (provider) { status = @@ -964,8 +965,6 @@ Status EncryptedEnvV2::GetEncryptionProvider( return status; } -UnixLibCrypto EncryptedEnvV2::crypto_; - Env* EncryptedEnvV2::Default() { // the rational for this routine is to help force the static // loading of UnixLibCrypto before other routines start diff --git a/include/rocksdb/env_encrypt2.h b/include/rocksdb/env_encrypt2.h index b469f30e2..68b8199a5 100644 --- a/include/rocksdb/env_encrypt2.h +++ b/include/rocksdb/env_encrypt2.h @@ -73,6 +73,18 @@ struct Sha1Description { } bool IsValid() const { return valid; } + + std::string ToString(size_t byte_count = 20) const { + if (IsValid()) { + if (EVP_MAX_MD_SIZE < byte_count) { + byte_count = EVP_MAX_MD_SIZE; + } + rocksdb::Slice to_hex((const char *)desc, byte_count); + return to_hex.ToString(true); + } else { + return std::string(); + } + } }; 
struct AesCtrKey { @@ -112,6 +124,18 @@ struct AesCtrKey { } bool IsValid() const { return valid; } + + std::string ToString(size_t byte_count = 32) const { + if (IsValid()) { + if (EVP_MAX_KEY_LENGTH < byte_count) { + byte_count = EVP_MAX_KEY_LENGTH; + } + rocksdb::Slice to_hex((const char *)key, byte_count); + return to_hex.ToString(true); + } else { + return std::string(); + } + } }; class CTREncryptionProviderV2 : public EncryptionProvider { @@ -147,10 +171,22 @@ class CTREncryptionProviderV2 : public EncryptionProvider { virtual BlockAccessCipherStream* CreateCipherStream2( uint8_t code_version, const uint8_t nonce[]) const; - bool Valid() const { return valid_; }; + bool IsValid() const { return valid_; }; const Sha1Description& key_desc() const { return key_desc_; }; const AesCtrKey& key() const { return key_; }; + std::string ToString() const { + std::string result; + if (IsValid()) { + result = key_desc_.ToString(); + result += " : "; + result += key_.ToString(); + } else { + result = " : "; + } + return result; + } + protected: bool valid_; Sha1Description key_desc_; @@ -276,6 +312,8 @@ class EncryptedEnvV2 : public EnvWrapper { bool IsValid() const { return valid_; } protected: + void init(); + // following is not thread safe, intended for constuction // and unit test only void SetKeys(ReadKeys encrypt_read, WriteKey encrypt_write); @@ -301,12 +339,12 @@ class EncryptedEnvV2 : public EnvWrapper { std::unique_ptr& stream); public: - static UnixLibCrypto crypto_; + std::shared_ptr crypto_; protected: ReadKeys encrypt_read_; WriteKey encrypt_write_; - + mutable port::RWMutex key_lock_; bool valid_; }; diff --git a/util/library_loader_test.cc b/util/library_loader_test.cc index 90a930dc4..2aeb7bba5 100644 --- a/util/library_loader_test.cc +++ b/util/library_loader_test.cc @@ -76,6 +76,17 @@ TEST(UnixLibraryLoaderTest, Crypto) { crypto.EVP_MD_CTX_free(context); } +TEST(UnixLibraryLoaderTest, Random) { + UnixLibCrypto crypto; + + ASSERT_TRUE(crypto.IsValid()); 
+ ASSERT_TRUE(1==crypto.RAND_poll()); + + uint8_t buffer[512]; + ASSERT_TRUE(1==crypto.RAND_bytes(buffer, sizeof(buffer))); + +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE From c46e40df82228ad1690f8d88718e083bd6271f9a Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Fri, 28 Aug 2020 13:40:08 -0400 Subject: [PATCH 52/57] make SetKeys public since now rwlock protected. (#16) Co-authored-by: matthewvon --- include/rocksdb/env_encrypt2.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/rocksdb/env_encrypt2.h b/include/rocksdb/env_encrypt2.h index 68b8199a5..22d0279ff 100644 --- a/include/rocksdb/env_encrypt2.h +++ b/include/rocksdb/env_encrypt2.h @@ -245,6 +245,8 @@ class EncryptedEnvV2 : public EnvWrapper { EncryptedEnvV2(Env* base_env, ReadKeys encrypt_read, WriteKey encrypt_write); + void SetKeys(ReadKeys encrypt_read, WriteKey encrypt_write); + bool IsWriteEncrypted() const; // NewSequentialFile opens a file for sequential reading. @@ -314,10 +316,6 @@ class EncryptedEnvV2 : public EnvWrapper { protected: void init(); - // following is not thread safe, intended for constuction - // and unit test only - void SetKeys(ReadKeys encrypt_read, WriteKey encrypt_write); - template Status ReadSeqEncryptionPrefix( TypeFile* f, std::shared_ptr& provider, From 529487e41fde25d7d792aeac5b77e577be10e5d3 Mon Sep 17 00:00:00 2001 From: Matthew Von-Maszewski Date: Tue, 1 Sep 2020 11:31:02 -0400 Subject: [PATCH 53/57] make mutex mutable for const function I confirm facebook/rocksdb has made this same change. The size_t function is const. It was trying to use lock_ and a recent compiler update spit that out as "bad". 
Must make the lock mutable so it can change state in a const function --- util/channel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/channel.h b/util/channel.h index 0225482c0..a8a47680a 100644 --- a/util/channel.h +++ b/util/channel.h @@ -60,7 +60,7 @@ class channel { private: std::condition_variable cv_; - std::mutex lock_; + mutable std::mutex lock_; std::queue buffer_; bool eof_; }; From 7ecb24640179d6d9305e8cdf25676169f5a43ce8 Mon Sep 17 00:00:00 2001 From: matthewvon Date: Sun, 6 Sep 2020 15:01:09 -0400 Subject: [PATCH 54/57] backport Facebook changes to env_encryption ... then update Stardog openssl encryption --- CMakeLists.txt | 3 +- Makefile | 1 - TARGETS | 2 +- db/db_test_util.cc | 5 +- env/env_basic_test.cc | 25 +- env/env_encrypt2.cc | 986 ------------------ env/env_encryption.cc | 636 ++++++----- env/env_encryption_ctr.h | 137 +++ env/env_openssl.cc | 312 ++++++ ...env_encrypt2_impl.h => env_openssl_impl.h} | 11 +- include/rocksdb/convenience.h | 62 ++ include/rocksdb/env_encrypt2.h | 358 ------- include/rocksdb/env_encryption.h | 184 ++-- include/rocksdb/env_openssl.h | 188 ++++ port/port_posix.h | 1 + port/win/port_win.h | 1 + src.mk | 3 +- util/build_version.cc | 4 +- util/string_util.cc | 71 +- util/string_util.h | 20 +- 20 files changed, 1271 insertions(+), 1739 deletions(-) delete mode 100644 env/env_encrypt2.cc create mode 100644 env/env_encryption_ctr.h create mode 100644 env/env_openssl.cc rename env/{env_encrypt2_impl.h => env_openssl_impl.h} (89%) delete mode 100644 include/rocksdb/env_encrypt2.h create mode 100644 include/rocksdb/env_openssl.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 92011100e..a5a518f00 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,8 +522,8 @@ set(SOURCES env/env.cc env/env_chroot.cc env/env_encryption.cc - env/env_encrypt2.cc env/env_hdfs.cc + env/env_openssl.cc env/mock_env.cc memtable/alloc_tracker.cc memtable/hash_cuckoo_rep.cc @@ -919,7 +919,6 @@ if(WITH_TESTS) 
db/write_callback_test.cc db/write_controller_test.cc env/env_basic_test.cc - env/env_encrypt2_test.cc env/env_test.cc env/mock_env_test.cc memtable/inlineskiplist_test.cc diff --git a/Makefile b/Makefile index b6b0abf46..97db07077 100644 --- a/Makefile +++ b/Makefile @@ -419,7 +419,6 @@ TESTS = \ coding_test \ inlineskiplist_test \ env_basic_test \ - env_encrypt2_test \ env_test \ hash_test \ library_loader_test \ diff --git a/TARGETS b/TARGETS index 8a34df4ac..3c55db41c 100644 --- a/TARGETS +++ b/TARGETS @@ -139,8 +139,8 @@ cpp_library( "env/env.cc", "env/env_chroot.cc", "env/env_encryption.cc", - "env/env_encrypt2.cc", "env/env_hdfs.cc", + "env/env_openssl.cc", "env/env_posix.cc", "env/io_posix.cc", "env/mock_env.cc", diff --git a/db/db_test_util.cc b/db/db_test_util.cc index de096d254..31a03dc2e 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -43,7 +43,7 @@ SpecialEnv::SpecialEnv(Env* base) table_write_callback_ = nullptr; } #ifndef ROCKSDB_LITE -ROT13BlockCipher rot13Cipher_(16); +//ROT13BlockCipher rot13Cipher_(16); #endif // ROCKSDB_LITE DBTestBase::DBTestBase(const std::string path) @@ -53,7 +53,8 @@ DBTestBase::DBTestBase(const std::string path) !getenv("ENCRYPTED_ENV") ? nullptr : NewEncryptedEnv(mem_env_ ? mem_env_ : Env::Default(), - new CTREncryptionProvider(rot13Cipher_))), + // new CTREncryptionProvider(rot13Cipher_))), + nullptr)), #else encrypted_env_(nullptr), #endif // ROCKSDB_LITE diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 54a492939..019851f03 100644 --- a/env/env_basic_test.cc +++ b/env/env_basic_test.cc @@ -10,6 +10,7 @@ #include #include "env/mock_env.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/env_encryption.h" #include "util/testharness.h" @@ -19,7 +20,12 @@ namespace rocksdb { // Normalizes trivial differences across Envs such that these test cases can // run on all Envs. 
class NormalizingEnvWrapper : public EnvWrapper { + private: + std::unique_ptr base_; + public: + explicit NormalizingEnvWrapper(std::unique_ptr&& base) + : EnvWrapper(base.get()), base_(std::move(base)) {} explicit NormalizingEnvWrapper(Env* base) : EnvWrapper(base) {} // Removes . and .. from directory listing @@ -93,16 +99,21 @@ INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, #ifndef ROCKSDB_LITE // next statements run env test against default encryption code. -static ROT13BlockCipher encrypt_block_rot13(32); - -static CTREncryptionProvider encrypt_provider_ctr(encrypt_block_rot13); +static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { + std::shared_ptr provider; + EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id, + &provider)); + std::unique_ptr encrypted(NewEncryptedEnv(base, provider)); + return new NormalizingEnvWrapper(std::move(encrypted)); +} -static std::unique_ptr encrypt_env(new NormalizingEnvWrapper( - NewEncryptedEnv(Env::Default(), &encrypt_provider_ctr))); +// next statements run env test against default encryption code. +static std::unique_ptr ctr_encrypt_env(NewTestEncryptedEnv(Env::Default(), + "test://CTR")); INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, - ::testing::Values(encrypt_env.get())); + ::testing::Values(ctr_encrypt_env.get())); INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvMoreTestWithParam, - ::testing::Values(encrypt_env.get())); + ::testing::Values(ctr_encrypt_env.get())); #endif // ROCKSDB_LITE #ifndef ROCKSDB_LITE diff --git a/env/env_encrypt2.cc b/env/env_encrypt2.cc deleted file mode 100644 index 0583c3300..000000000 --- a/env/env_encrypt2.cc +++ /dev/null @@ -1,986 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -// -// env_encryption.cc copied to this file then modified. - -#ifdef ROCKSDB_OPENSSL_AES_CTR -#ifndef ROCKSDB_LITE - -#include "rocksdb/env_encrypt2.h" - -#include -#include -#include -#include - -#include "env/env_encrypt2_impl.h" -#include "monitoring/perf_context_imp.h" -#include "port/port.h" -#include "util/aligned_buffer.h" -#include "util/coding.h" -#include "util/mutexlock.h" -#include "util/random.h" - -namespace ROCKSDB_NAMESPACE { - -static std::once_flag crypto_loaded; -static std::shared_ptr crypto_shared; - -std::shared_ptr GetCrypto() { - std::call_once(crypto_loaded, - []() { crypto_shared = std::make_shared(); }); - return crypto_shared; -} - -// reuse cipher context between calls to Encrypt & Decrypt -static void do_nothing(EVP_CIPHER_CTX*){}; -thread_local static std::unique_ptr - aes_context(nullptr, &do_nothing); - -Sha1Description::Sha1Description(const std::string& key_desc_str) { - GetCrypto(); // ensure libcryto available - bool good = {true}; - int ret_val; - unsigned len; - - memset(desc, 0, EVP_MAX_MD_SIZE); - if (0 != key_desc_str.length() && crypto_shared->IsValid()) { - std::unique_ptr context( - crypto_shared->EVP_MD_CTX_new(), - crypto_shared->EVP_MD_CTX_free_ptr()); - - ret_val = crypto_shared->EVP_DigestInit_ex( - context.get(), crypto_shared->EVP_sha1(), nullptr); - good = (1 == ret_val); - if (good) { - ret_val = crypto_shared->EVP_DigestUpdate( - context.get(), key_desc_str.c_str(), key_desc_str.length()); - good = (1 == ret_val); - } - - if (good) { - ret_val = - crypto_shared->EVP_DigestFinal_ex(context.get(), desc, &len); - good = (1 == ret_val); - } - } else { - good = false; - } - - valid = good; -} - -AesCtrKey::AesCtrKey(const std::string& key_str) : valid(false) { - GetCrypto(); // ensure libcryto available - memset(key, 0, EVP_MAX_KEY_LENGTH); - - // simple parse: must be 64 characters long and hexadecimal values - if (64 == key_str.length()) { - auto bad_pos = key_str.find_first_not_of("abcdefABCDEF0123456789"); - 
if (std::string::npos == bad_pos) { - for (size_t idx = 0, idx2 = 0; idx < key_str.length(); idx += 2, ++idx2) { - std::string hex_string(key_str.substr(idx, 2)); - key[idx2] = std::stoul(hex_string, 0, 16); - } - valid = true; - } - } -} - - -void AESBlockAccessCipherStream::BigEndianAdd128(uint8_t* buf, - uint64_t value) { - uint8_t *sum, *addend, *carry, pre, post; - - sum = buf + 15; - - if (port::kLittleEndian) { - addend = (uint8_t*)&value; - } else { - addend = (uint8_t*)&value + 7; - } - - // future: big endian could be written as uint64_t add - for (int loop = 0; loop < 8 && value; ++loop) { - pre = *sum; - *sum += *addend; - post = *sum; - --sum; - value >>= 8; - - carry = sum; - // carry? - while (post < pre && buf <= carry) { - pre = *carry; - *carry += 1; - post = *carry; - --carry; - } - } // for -} - -// "data" is assumed to be aligned at AES_BLOCK_SIZE or greater -Status AESBlockAccessCipherStream::Encrypt(uint64_t file_offset, char* data, - size_t data_size) { - Status status; - if (0 < data_size) { - if (crypto_shared->IsValid()) { - int ret_val, out_len; - ALIGN16 uint8_t iv[AES_BLOCK_SIZE]; - uint64_t block_index = file_offset / BlockSize(); - - // make a context once per thread - if (!aes_context) { - aes_context = - std::unique_ptr( - crypto_shared->EVP_CIPHER_CTX_new(), - crypto_shared->EVP_CIPHER_CTX_free_ptr()); - } - - memcpy(iv, nonce_, AES_BLOCK_SIZE); - BigEndianAdd128(iv, block_index); - - ret_val = crypto_shared->EVP_EncryptInit_ex( - aes_context.get(), crypto_shared->EVP_aes_256_ctr(), nullptr, - key_.key, iv); - if (1 == ret_val) { - out_len = 0; - ret_val = crypto_shared->EVP_EncryptUpdate( - aes_context.get(), (unsigned char*)data, &out_len, - (unsigned char*)data, (int)data_size); - - if (1 == ret_val && (int)data_size == out_len) { - // this is a soft reset of aes_context per man pages - uint8_t temp_buf[AES_BLOCK_SIZE]; - out_len = 0; - ret_val = crypto_shared->EVP_EncryptFinal_ex(aes_context.get(), - temp_buf, &out_len); - - 
if (1 != ret_val || 0 != out_len) { - status = Status::InvalidArgument( - "EVP_EncryptFinal_ex failed: ", - (1 != ret_val) ? "bad return value" : "output length short"); - } - } else { - status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", - (int)data_size == out_len - ? "bad return value" - : "output length short"); - } - } else { - status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); - } - } else { - status = Status::NotSupported( - "libcrypto not available for encryption/decryption."); - } - } - - return status; -} - -// Decrypt one or more (partial) blocks of data at the file offset. -// Length of data is given in data_size. -// CTR Encrypt and Decrypt are synonyms. Using Encrypt calls here to reduce -// count of symbols loaded from libcrypto. -Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, - size_t data_size) { - // Calculate block index - size_t block_size = BlockSize(); - uint64_t block_index = file_offset / block_size; - size_t block_offset = file_offset % block_size; - size_t remaining = data_size; - size_t prefix_size = 0; - uint8_t temp_buf[block_size]; - - Status status; - ALIGN16 uint8_t iv[AES_BLOCK_SIZE]; - int out_len = 0, ret_val; - - if (crypto_shared->IsValid()) { - // make a context once per thread - if (!aes_context) { - aes_context = std::unique_ptr( - crypto_shared->EVP_CIPHER_CTX_new(), - crypto_shared->EVP_CIPHER_CTX_free_ptr()); - } - - memcpy(iv, nonce_, AES_BLOCK_SIZE); - BigEndianAdd128(iv, block_index); - - ret_val = crypto_shared->EVP_EncryptInit_ex( - aes_context.get(), crypto_shared->EVP_aes_256_ctr(), nullptr, key_.key, - iv); - if (1 == ret_val) { - // handle uneven block start - if (0 != block_offset) { - prefix_size = block_size - block_offset; - if (data_size < prefix_size) { - prefix_size = data_size; - } - - memcpy(temp_buf + block_offset, data, prefix_size); - out_len = 0; - ret_val = crypto_shared->EVP_EncryptUpdate( - aes_context.get(), temp_buf, &out_len, temp_buf, 
(int)block_size); - - if (1 != ret_val || (int)block_size != out_len) { - status = Status::InvalidArgument("EVP_EncryptUpdate failed 1: ", - (int)block_size == out_len - ? "bad return value" - : "output length short"); - } else { - memcpy(data, temp_buf + block_offset, prefix_size); - } - } - - // all remaining data, even block size not required - remaining -= prefix_size; - if (status.ok() && remaining) { - out_len = 0; - ret_val = crypto_shared->EVP_EncryptUpdate( - aes_context.get(), (uint8_t*)data + prefix_size, &out_len, - (uint8_t*)data + prefix_size, (int)remaining); - - if (1 != ret_val || (int)remaining != out_len) { - status = Status::InvalidArgument("EVP_EncryptUpdate failed 2: ", - (int)remaining == out_len - ? "bad return value" - : "output length short"); - } - } - - // this is a soft reset of aes_context per man pages - out_len = 0; - ret_val = crypto_shared->EVP_EncryptFinal_ex( - aes_context.get(), temp_buf, &out_len); - - if (1 != ret_val || 0 != out_len) { - status = Status::InvalidArgument("EVP_EncryptFinal_ex failed."); - } - } else { - status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); - } - } else { - status = Status::NotSupported( - "libcrypto not available for encryption/decryption."); - } - - return status; -} - -Status CTREncryptionProviderV2::CreateNewPrefix(const std::string& /*fname*/, - char* prefix, - size_t prefixLength) const { - GetCrypto(); // ensure libcryto available - Status s; - if (crypto_shared->IsValid()) { - if (sizeof(PrefixVersion0) <= prefixLength) { - int ret_val; - - PrefixVersion0* pf = {(PrefixVersion0*)prefix}; - memcpy(pf->key_description_, key_desc_.desc, sizeof(key_desc_.desc)); - ret_val = crypto_shared->RAND_bytes((unsigned char*)&pf->nonce_, - AES_BLOCK_SIZE); - if (1 != ret_val) { - s = Status::NotSupported("RAND_bytes failed"); - } - } else { - s = Status::NotSupported("Prefix size needs to be 28 or more"); - } - } else { - s = Status::NotSupported("RAND_bytes() from libcrypto not 
available."); - } - - return s; -} - -size_t CTREncryptionProviderV2::GetPrefixLength() const { - return sizeof(PrefixVersion0) + sizeof(EncryptMarker); -} - -BlockAccessCipherStream* CTREncryptionProviderV2::CreateCipherStream2( - uint8_t code_version, const uint8_t nonce[]) const { - return new AESBlockAccessCipherStream(key_, code_version, nonce); -} - -Status EncryptedWritableFileV2::Append(const Slice& data) { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - if (data.size() > 0) { - size_t block_size = stream_->BlockSize(); - uint64_t offset = file_->GetFileSize(); // size including prefix - uint64_t block_offset = offset % block_size; - - // Encrypt in cloned buffer - buf.Alignment(block_size); - // worst case is one byte only in first and in last block, - // so 2*block_size-2 might be needed (simplified to 2*block_size) - buf.AllocateNewBuffer(data.size() + 2 * block_size); - memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); - buf.Size(data.size() + block_offset); - { - PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), - buf.CurrentSize()); - } - if (status.ok()) { - dataToAppend = Slice(buf.BufferStart() + block_offset, data.size()); - } - } - - if (status.ok()) { - status = file_->Append(dataToAppend); - } - - return status; -} - -Status EncryptedWritableFileV2::PositionedAppend(const Slice& data, - uint64_t offset) { - AlignedBuffer buf; - Status status; - Slice dataToAppend(data); - offset += prefixLength_; - if (data.size() > 0) { - size_t block_size = stream_->BlockSize(); - uint64_t block_offset = offset % block_size; - - // Encrypt in cloned buffer - buf.Alignment(block_size); - // worst case is one byte only in first and in last block, - // so 2*block_size-2 might be needed (simplified to 2*block_size) - buf.AllocateNewBuffer(data.size() + 2 * block_size); - memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); - buf.Size(data.size() + 
block_offset); - { - PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), - buf.CurrentSize()); - } - if (status.ok()) { - dataToAppend = Slice(buf.BufferStart() + block_offset, data.size()); - } - } - - if (status.ok()) { - status = file_->PositionedAppend(dataToAppend, offset); - } - - return status; -} - -Status EncryptedRandomRWFileV2::Write(uint64_t offset, const Slice& data) { - AlignedBuffer buf; - Status status; - Slice dataToWrite(data); - offset += prefixLength_; - if (data.size() > 0) { - size_t block_size = stream_->BlockSize(); - uint64_t block_offset = offset % block_size; - - // Encrypt in cloned buffer - buf.Alignment(block_size); - // worst case is one byte only in first and in last block, - // so 2*block_size-2 might be needed (simplified to 2*block_size) - buf.AllocateNewBuffer(data.size() + 2 * block_size); - memcpy(buf.BufferStart() + block_offset, data.data(), data.size()); - buf.Size(data.size() + block_offset); - { - PERF_TIMER_GUARD(encrypt_data_nanos); - status = stream_->Encrypt(offset - block_offset, buf.BufferStart(), - buf.CurrentSize()); - } - if (status.ok()) { - dataToWrite = Slice(buf.BufferStart()+block_offset, data.size()); - } - } - - if (status.ok()) { - status = file_->Write(offset, dataToWrite); - } - - return status; -} - -// Returns an Env that encrypts data when stored on disk and decrypts data when -// read from disk. -Env* NewEncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write) { - Env* ret_env{base_env}; - EncryptedEnvV2* new_env{nullptr}; - - if (nullptr != base_env) { - new_env = new EncryptedEnvV2(base_env, encrypt_read, encrypt_write); - } - - // warning, dynamic loading of libcrypto could be delayed ... 
making this - // false - if (nullptr != new_env && new_env->IsValid()) { - ret_env = new_env; - } - - return ret_env; -} - -EncryptedEnvV2::EncryptedEnvV2(Env* base_env, - EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write) - : EnvWrapper(base_env), valid_(false) { - init(); - SetKeys(encrypt_read, encrypt_write); -} - - -EncryptedEnvV2::EncryptedEnvV2(Env* base_env) - : EnvWrapper(base_env), valid_(false) { - init(); -} - -void EncryptedEnvV2::init() { - crypto_ = GetCrypto(); - - valid_ = crypto_->IsValid(); - if (IsValid()) { - crypto_->RAND_poll(); - } -} - -void EncryptedEnvV2::SetKeys(EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write) { - WriteLock lock(&key_lock_); - encrypt_read_ = encrypt_read; - encrypt_write_ = encrypt_write; -} - -bool EncryptedEnvV2::IsWriteEncrypted() const { - ReadLock lock(&key_lock_); - bool ret_flag = (nullptr != encrypt_write_.second); - return ret_flag; -} - -// -// common functions used with different file types -// (because there is not common base class for the file types -// -template -Status EncryptedEnvV2::ReadSeqEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - std::unique_ptr& stream) { - Status status; - - provider.reset(); // nullptr for provider implies "no encryption" - stream.release(); - - // Look for encryption marker - EncryptMarker marker; - Slice marker_slice; - status = f->Read(sizeof(marker), &marker_slice, marker); - if (status.ok()) { - if (sizeof(marker) == marker_slice.size() && - marker_slice.starts_with(kEncryptMarker)) { - // code_version currently unused - uint8_t code_version = (uint8_t)marker_slice[7]; - - if (kEncryptCodeVersion0 == code_version) { - Slice prefix_slice; - PrefixVersion0 prefix_buffer; - status = f->Read(sizeof(PrefixVersion0), &prefix_slice, - (char*)&prefix_buffer); - if (status.ok() && sizeof(PrefixVersion0) == prefix_slice.size()) { - Sha1Description desc(prefix_buffer.key_description_, - 
sizeof(prefix_buffer.key_description_)); - - ReadLock lock(&key_lock_); - auto it = encrypt_read_.find(desc); - if (encrypt_read_.end() != it) { - provider = it->second; - stream.reset(new AESBlockAccessCipherStream( - provider->key(), code_version, prefix_buffer.nonce_)); - - } else { - status = Status::NotSupported( - "No encryption key found to match input file"); - } - } - } else { - status = - Status::NotSupported("Unknown encryption code version required."); - } - } - } - return status; -} - -template -Status EncryptedEnvV2::ReadRandEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - std::unique_ptr& stream) { - Status status; - - provider.reset(); // nullptr for provider implies "no encryption" - stream.release(); - - // Look for encryption marker - EncryptMarker marker; - Slice marker_slice; - status = f->Read(0, sizeof(marker), &marker_slice, marker); - if (status.ok()) { - if (sizeof(marker) == marker_slice.size() && - marker_slice.starts_with(kEncryptMarker)) { - uint8_t code_version = (uint8_t)marker_slice[7]; - - if (kEncryptCodeVersion0 == code_version) { - Slice prefix_slice; - PrefixVersion0 prefix_buffer; - status = f->Read(sizeof(marker), sizeof(PrefixVersion0), &prefix_slice, - (char*)&prefix_buffer); - if (status.ok() && sizeof(PrefixVersion0) == prefix_slice.size()) { - Sha1Description desc(prefix_buffer.key_description_, - sizeof(prefix_buffer.key_description_)); - - ReadLock lock(&key_lock_); - auto it = encrypt_read_.find(desc); - if (encrypt_read_.end() != it) { - provider = it->second; - stream.reset(new AESBlockAccessCipherStream( - provider->key(), code_version, prefix_buffer.nonce_)); - } else { - status = Status::NotSupported( - "No encryption key found to match input file"); - } - } - } else { - status = - Status::NotSupported("Unknown encryption code version required."); - } - } - } - return status; -} - -template -Status EncryptedEnvV2::WriteSeqEncryptionPrefix( - TypeFile* f, std::shared_ptr provider, - std::unique_ptr& 
stream) { - Status status; - - // set up Encryption maker, code version '0' - uint8_t code_version = {kEncryptCodeVersion0}; - PrefixVersion0 prefix; - EncryptMarker marker; - strncpy(marker, kEncryptMarker, sizeof(kEncryptMarker)); - marker[sizeof(EncryptMarker) - 1] = code_version; - - Slice marker_slice(marker, sizeof(EncryptMarker)); - status = f->Append(marker_slice); - - if (status.ok()) { - // create nonce, then write it and key description - Slice prefix_slice((char*)&prefix, sizeof(prefix)); - - status = provider->CreateNewPrefix(std::string(), (char*)&prefix, - provider->GetPrefixLength()); - - if (status.ok()) { - status = f->Append(prefix_slice); - } - } - - if (status.ok()) { - stream.reset(new AESBlockAccessCipherStream(provider->key(), code_version, - prefix.nonce_)); - } - - return status; -} - -template -Status EncryptedEnvV2::WriteRandEncryptionPrefix( - TypeFile* f, std::shared_ptr provider, - std::unique_ptr& stream) { - Status status; - - // set up Encryption maker, code version '0' - uint8_t code_version = {kEncryptCodeVersion0}; - PrefixVersion0 prefix; - EncryptMarker marker; - strncpy(marker, kEncryptMarker, sizeof(kEncryptMarker)); - marker[sizeof(EncryptMarker) - 1] = code_version; - - Slice marker_slice(marker, sizeof(EncryptMarker)); - status = f->Write(0, marker_slice); - - if (status.ok()) { - // create nonce, then write it and key description - Slice prefix_slice((char*)&prefix, sizeof(prefix)); - - status = provider->CreateNewPrefix(std::string(), (char*)&prefix, - provider->GetPrefixLength()); - - if (status.ok()) { - status = f->Write(sizeof(EncryptMarker), prefix_slice); - } - } - - if (status.ok()) { - stream.reset(new AESBlockAccessCipherStream(provider->key(), code_version, - prefix.nonce_)); - } - - return status; -} - -// NewSequentialFile opens a file for sequential reading. 
-Status EncryptedEnvV2::NewSequentialFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) { - result->reset(); - if (options.use_mmap_reads || options.use_direct_reads) { - return Status::InvalidArgument(); - } - - // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewSequentialFile(fname, &underlying, options); - if (status.ok()) { - std::shared_ptr provider; - std::unique_ptr stream; - status = ReadSeqEncryptionPrefix(underlying.get(), provider, - stream); - - if (status.ok()) { - if (provider) { - (*result) = std::unique_ptr(new EncryptedSequentialFile( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - - } else { - // normal file, not encrypted - // sequential file might not allow backing up to begining, close and - // reopen - underlying.reset(nullptr); - status = EnvWrapper::NewSequentialFile(fname, result, options); - } - } - } - - return status; -} - -// NewRandomAccessFile opens a file for random read access. -Status EncryptedEnvV2::NewRandomAccessFile( - const std::string& fname, std::unique_ptr* result, - const EnvOptions& options) { - result->reset(); - if (options.use_mmap_reads || options.use_direct_reads) { - return Status::InvalidArgument(); - } - - // Open file using underlying Env implementation - std::unique_ptr underlying; - auto status = EnvWrapper::NewRandomAccessFile(fname, &underlying, options); - if (status.ok()) { - std::shared_ptr provider; - std::unique_ptr stream; - status = ReadRandEncryptionPrefix(underlying.get(), - provider, stream); - - if (status.ok()) { - if (provider) { - (*result) = - std::unique_ptr(new EncryptedRandomAccessFile( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - - } else { - // normal file, not encrypted - (*result).reset(underlying.release()); - } - } - } - return status; -} - -// NewWritableFile opens a file for sequential writing. 
-Status EncryptedEnvV2::NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status status; - result->reset(); - - if (!options.use_mmap_writes && !options.use_direct_writes) { - // Open file using underlying Env implementation - std::unique_ptr underlying; - status = EnvWrapper::NewWritableFile(fname, &underlying, options); - - if (status.ok()) { - std::shared_ptr provider; - - { - ReadLock lock(&key_lock_); - provider = encrypt_write_.second; - } - - if (provider) { - std::unique_ptr stream; - - status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); - - if (status.ok()) { - (*result) = std::unique_ptr(new EncryptedWritableFileV2( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - } - } else { - (*result).reset(underlying.release()); - } - } - } else { - status = Status::InvalidArgument(); - } - - return status; -} - -// Create an object that writes to a new file with the specified -// name. Deletes any existing file with the same name and creates a -// new file. On success, stores a pointer to the new file in -// *result and returns OK. On failure stores nullptr in *result and -// returns non-OK. -// -// The returned file will only be accessed by one thread at a time. 
-Status EncryptedEnvV2::ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status status; - result->reset(); - - if (!options.use_mmap_writes && !options.use_direct_writes) { - // Open file using underlying Env implementation - std::unique_ptr underlying; - status = EnvWrapper::ReopenWritableFile(fname, &underlying, options); - - if (status.ok()) { - std::shared_ptr provider; - - { - ReadLock lock(&key_lock_); - provider = encrypt_write_.second; - } - - if (provider) { - std::unique_ptr stream; - - status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); - - if (status.ok()) { - (*result) = std::unique_ptr(new EncryptedWritableFile( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - } - } else { - (*result).reset(underlying.release()); - } - } - } else { - status = Status::InvalidArgument(); - } - - return status; -} - -// Reuse an existing file by renaming it and opening it as writable. -Status EncryptedEnvV2::ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status status; - result->reset(); - - if (!options.use_mmap_writes && !options.use_direct_writes) { - // Open file using underlying Env implementation - std::unique_ptr underlying; - status = - EnvWrapper::ReuseWritableFile(fname, old_fname, &underlying, options); - - if (status.ok()) { - std::shared_ptr provider; - - { - ReadLock lock(&key_lock_); - provider = encrypt_write_.second; - } - - if (provider) { - std::unique_ptr stream; - - status = WriteSeqEncryptionPrefix(underlying.get(), provider, stream); - - if (status.ok()) { - (*result) = std::unique_ptr(new EncryptedWritableFile( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - } - } else { - (*result).reset(underlying.release()); - } - } - } else { - status = Status::InvalidArgument(); - } - - return status; -} - -// Open `fname` for random read 
and write, if file doesn't exist the file -// will be created. On success, stores a pointer to the new file in -// *result and returns OK. On failure returns non-OK. -// -// The returned file will only be accessed by one thread at a time. -Status EncryptedEnvV2::NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) { - Status status; - result->reset(); - - // Check file exists - bool isNewFile = !FileExists(fname).ok(); - - if (!options.use_mmap_writes && !options.use_mmap_reads && - !options.use_direct_writes && !options.use_direct_reads) { - // Open file using underlying Env implementation - std::unique_ptr underlying; - status = EnvWrapper::NewRandomRWFile(fname, &underlying, options); - - if (status.ok()) { - std::shared_ptr provider; - std::unique_ptr stream; - - if (!isNewFile) { - // file exists, get existing crypto info - status = ReadRandEncryptionPrefix(underlying.get(), - provider, stream); - } else { - // new file - { - ReadLock lock(&key_lock_); - provider = encrypt_write_.second; - } - - if (provider) { - status = - WriteRandEncryptionPrefix(underlying.get(), provider, stream); - } - } - - // establish encrypt or not, finalize file object - if (status.ok()) { - if (provider) { - (*result) = std::unique_ptr(new EncryptedRandomRWFileV2( - std::move(underlying), std::move(stream), - provider->GetPrefixLength())); - } else { - (*result).reset(underlying.release()); - } - } - } - } else { - status = Status::InvalidArgument(); - } - - return status; -} - -// Store in *result the attributes of the children of the specified directory. -// In case the implementation lists the directory prior to iterating the files -// and files are concurrently deleted, the deleted files will be omitted from -// result. -// The name attributes are relative to "dir". -// Original contents of *results are dropped. -// Returns OK if "dir" exists and "*result" contains its children. 
-// NotFound if "dir" does not exist, the calling process does not have -// permission to access "dir", or if "dir" is invalid. -// IOError if an IO Error was encountered -Status EncryptedEnvV2::GetChildrenFileAttributes( - const std::string& dir, std::vector* result) { - auto status = EnvWrapper::GetChildrenFileAttributes(dir, result); - if (status.ok()) { - // this is slightly expensive, but fortunately not used heavily - std::shared_ptr provider; - - for (auto it = std::begin(*result); it != std::end(*result); ++it) { - status = GetEncryptionProvider(it->name, provider); - - if (status.ok() && provider) { - size_t prefixLength = provider->GetPrefixLength(); - - if (prefixLength <= it->size_bytes) it->size_bytes -= prefixLength; - } - } - } - - return status; -} - -// Store the size of fname in *file_size. -Status EncryptedEnvV2::GetFileSize(const std::string& fname, - uint64_t* file_size) { - Status status; - status = EnvWrapper::GetFileSize(fname, file_size); - - if (status.ok()) { - // this is slightly expensive, but fortunately not used heavily - std::shared_ptr provider; - status = GetEncryptionProvider(fname, provider); - if (status.ok() && provider) { - size_t prefixLength = provider->GetPrefixLength(); - if (prefixLength <= *file_size) *file_size -= prefixLength; - } - } - - return status; -} - -Status EncryptedEnvV2::GetEncryptionProvider( - const std::string& fname, - std::shared_ptr& provider) { - std::unique_ptr underlying; - EnvOptions options; - Status status; - - provider.reset(); - status = Env::Default()->NewSequentialFile(fname, &underlying, options); - - if (status.ok()) { - std::unique_ptr stream; - status = EncryptedEnvV2::ReadSeqEncryptionPrefix(underlying.get(), provider, - stream); - } - - return status; -} - -Env* EncryptedEnvV2::Default() { - // the rational for this routine is to help force the static - // loading of UnixLibCrypto before other routines start - // using the encryption code. 
- static EncryptedEnvV2 default_env(Env::Default()); - return &default_env; -} - -Env* EncryptedEnvV2::Default(EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write) { - EncryptedEnvV2* default_env = (EncryptedEnvV2*)Default(); - default_env->SetKeys(encrypt_read, encrypt_write); - return default_env; -} - -} // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE -#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/env/env_encryption.cc b/env/env_encryption.cc index 76b7a8947..56a17667c 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -12,25 +12,78 @@ #include #include +#include "env/env_encryption_ctr.h" #include "monitoring/perf_context_imp.h" +#include "rocksdb/convenience.h" #include "util/aligned_buffer.h" #include "util/coding.h" #include "util/random.h" +#include "util/string_util.h" #endif namespace ROCKSDB_NAMESPACE { #ifndef ROCKSDB_LITE +static constexpr char kROT13CipherName[] = "ROT13"; +static constexpr char kCTRProviderName[] = "CTR"; + +Status BlockCipher::CreateFromString(const ConfigOptions& /*config_options*/, + const std::string& value, + std::shared_ptr* result) { + std::string id = value; + size_t colon = value.find(':'); + if (colon != std::string::npos) { + id = value.substr(0, colon); + } + if (id == kROT13CipherName) { + if (colon != std::string::npos) { + size_t block_size = ParseSizeT(value.substr(colon + 1)); + result->reset(new ROT13BlockCipher(block_size)); + } else { + result->reset(new ROT13BlockCipher(32)); + } + return Status::OK(); + } else { + return Status::NotSupported("Could not find cipher ", value); + } +} -// Read up to "n" bytes from the file. "scratch[0..n-1]" may be -// written by this routine. Sets "*result" to the data that was -// read (including if fewer than "n" bytes were successfully read). -// May set "*result" to point at data in "scratch[0..n-1]", so -// "scratch[0..n-1]" must be live when "*result" is used. -// If an error was encountered, returns a non-OK status. 
-// -// REQUIRES: External synchronization +Status EncryptionProvider::CreateFromString( + const ConfigOptions& /*config_options*/, const std::string& value, + std::shared_ptr* result) { + std::string id = value; + bool is_test = StartsWith(value, "test://"); + Status status = Status::OK(); + if (is_test) { + id = value.substr(strlen("test://")); + } + if (id == kCTRProviderName) { + result->reset(new CTREncryptionProvider()); + } else if (is_test) { + result->reset(new CTREncryptionProvider()); + } else { + return Status::NotSupported("Could not find provider ", value); + } + if (status.ok() && is_test) { + status = result->get()->TEST_Initialize(); + } + return status; +} + +std::shared_ptr EncryptionProvider::NewCTRProvider( + const std::shared_ptr& cipher) { + return std::make_shared(cipher); +} + + // Read up to "n" bytes from the file. "scratch[0..n-1]" may be + // written by this routine. Sets "*result" to the data that was + // read (including if fewer than "n" bytes were successfully read). + // May set "*result" to point at data in "scratch[0..n-1]", so + // "scratch[0..n-1]" must be live when "*result" is used. + // If an error was encountered, returns a non-OK status. + // + // REQUIRES: External synchronization Status EncryptedSequentialFile::Read(size_t n, Slice* result, char* scratch) { assert(scratch); Status status = file_->Read(n, result, scratch); @@ -74,15 +127,15 @@ size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. 
Status EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } -// Positioned Read for direct I/O -// If Direct I/O enabled, offset, n, and scratch should be properly aligned + // Positioned Read for direct I/O + // If Direct I/O enabled, offset, n, and scratch should be properly aligned Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result, char* scratch) { assert(scratch); @@ -99,16 +152,16 @@ Status EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, return status; } -// Read up to "n" bytes from the file starting at "offset". -// "scratch[0..n-1]" may be written by this routine. Sets "*result" -// to the data that was read (including if fewer than "n" bytes were -// successfully read). May set "*result" to point at data in -// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when -// "*result" is used. If an error was encountered, returns a non-OK -// status. -// -// Safe for concurrent use by multiple threads. -// If Direct I/O enabled, offset, n, and scratch should be aligned properly. + // Read up to "n" bytes from the file starting at "offset". + // "scratch[0..n-1]" may be written by this routine. Sets "*result" + // to the data that was read (including if fewer than "n" bytes were + // successfully read). May set "*result" to point at data in + // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when + // "*result" is used. If an error was encountered, returns a non-OK + // status. + // + // Safe for concurrent use by multiple threads. + // If Direct I/O enabled, offset, n, and scratch should be aligned properly. Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { assert(scratch); @@ -124,27 +177,27 @@ Status EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, return status; } -// Readahead the file starting from offset by n bytes for caching. 
+ // Readahead the file starting from offset by n bytes for caching. Status EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n) { // return Status::OK(); return file_->Prefetch(offset + prefixLength_, n); } -// Tries to get an unique ID for this file that will be the same each time -// the file is opened (and will stay the same while the file is open). -// Furthermore, it tries to make this ID at most "max_size" bytes. If such an -// ID can be created this function returns the length of the ID and places it -// in "id"; otherwise, this function returns 0, in which case "id" -// may not have been modified. -// -// This function guarantees, for IDs from a given environment, two unique ids -// cannot be made equal to each other by adding arbitrary bytes to one of -// them. That is, no unique ID is the prefix of another. -// -// This function guarantees that the returned ID will not be interpretable as -// a single varint. -// -// Note: these IDs are only valid for the duration of the process. + // Tries to get an unique ID for this file that will be the same each time + // the file is opened (and will stay the same while the file is open). + // Furthermore, it tries to make this ID at most "max_size" bytes. If such an + // ID can be created this function returns the length of the ID and places it + // in "id"; otherwise, this function returns 0, in which case "id" + // may not have been modified. + // + // This function guarantees, for IDs from a given environment, two unique ids + // cannot be made equal to each other by adding arbitrary bytes to one of + // them. That is, no unique ID is the prefix of another. + // + // This function guarantees that the returned ID will not be interpretable as + // a single varint. + // + // Note: these IDs are only valid for the duration of the process. 
size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { return file_->GetUniqueId(id, max_size); }; @@ -153,21 +206,21 @@ void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { file_->Hint(pattern); } -// Indicates the upper layers if the current RandomAccessFile implementation -// uses direct IO. + // Indicates the upper layers if the current RandomAccessFile implementation + // uses direct IO. bool EncryptedRandomAccessFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. Status EncryptedRandomAccessFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); @@ -233,14 +286,14 @@ Status EncryptedWritableFile::PositionedAppend(const Slice& data, return status; } -// Indicates the upper layers if the current WritableFile implementation -// uses direct IO. + // Indicates the upper layers if the current WritableFile implementation + // uses direct IO. 
bool EncryptedWritableFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } @@ -252,42 +305,42 @@ uint64_t EncryptedWritableFile::GetFileSize() { return file_->GetFileSize() - prefixLength_; } -// Truncate is necessary to trim the file to the correct size -// before closing. It is not always possible to keep track of the file -// size due to whole pages writes. The behavior is undefined if called -// with other writes to follow. + // Truncate is necessary to trim the file to the correct size + // before closing. It is not always possible to keep track of the file + // size due to whole pages writes. The behavior is undefined if called + // with other writes to follow. Status EncryptedWritableFile::Truncate(uint64_t size) { return file_->Truncate(size + prefixLength_); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. -// This call has no effect on dirty pages in the cache. + // Remove any kind of caching of data from the offset to offset+length + // of this file. If the length is 0, then it refers to the end of file. + // If the system is not caching the file contents, then this is a noop. + // This call has no effect on dirty pages in the cache. Status EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } -// Sync a file range with disk. -// offset is the starting byte of the file range to be synchronized. -// nbytes specifies the length of the range to be synchronized. 
-// This asks the OS to initiate flushing the cached data to disk, -// without waiting for completion. -// Default implementation does nothing. + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. Status EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) { return file_->RangeSync(offset + prefixLength_, nbytes); } -// PrepareWrite performs any necessary preparation for a write -// before the write actually occurs. This allows for pre-allocation -// of space on devices where it can result in less file -// fragmentation and/or less waste from over-zealous filesystem -// pre-allocation. + // PrepareWrite performs any necessary preparation for a write + // before the write actually occurs. This allows for pre-allocation + // of space on devices where it can result in less file + // fragmentation and/or less waste from over-zealous filesystem + // pre-allocation. void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len) { file_->PrepareWrite(offset + prefixLength_, len); } -// Pre-allocates space for a file. + // Pre-allocates space for a file. Status EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len) { return file_->Allocate(offset + prefixLength_, len); } @@ -300,14 +353,14 @@ bool EncryptedRandomRWFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O + // Use the returned alignment value to allocate + // aligned buffer for Direct I/O size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. -// Pass aligned buffer when use_direct_io() returns true. 
+ // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. + // Pass aligned buffer when use_direct_io() returns true. Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { AlignedBuffer buf; Status status; @@ -332,9 +385,9 @@ Status EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data) { return status; } -// Read up to `n` bytes starting from offset `offset` and store them in -// result, provided `scratch` size should be at least `n`. -// Returns Status::OK() on success. + // Read up to `n` bytes starting from offset `offset` and store them in + // result, provided `scratch` size should be at least `n`. + // Returns Status::OK() on success. Status EncryptedRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { assert(scratch); @@ -360,9 +413,152 @@ Status EncryptedRandomRWFile::Close() { return file_->Close(); } // EncryptedEnv implements an Env wrapper that adds encryption to files stored // on disk. -class EncryptedEnv : public EnvWrapper { +class EncryptedEnvImpl : public EnvWrapper { + Status GetWritableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return Status::OK(); + } else { + *result = nullptr; + return Status::NotFound("No WriteProvider specified"); + } + } + + Status GetReadableProvider(const std::string& /*fname*/, + EncryptionProvider** result) { + if (provider_) { + *result = provider_.get(); + return Status::OK(); + } else { + *result = nullptr; + return Status::NotFound("No Provider specified"); + } + } + + template + Status CreateWritableCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const EnvOptions& options, size_t* prefix_length, + std::unique_ptr* stream) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + Status status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // 
Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = provider->CreateNewPrefix(fname, buffer.BufferStart(), + *prefix_length); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Append(prefix); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = provider->CreateCipherStream(fname, options, prefix, stream); + } + return status; + } + + template + Status CreateRandomWriteCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const EnvOptions& options, size_t* prefix_length, + std::unique_ptr* stream) { + EncryptionProvider* provider = nullptr; + *prefix_length = 0; + Status status = GetWritableProvider(fname, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + // Initialize & write prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider->GetPrefixLength(); + if (*prefix_length > 0) { + // Initialize prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + status = provider->CreateNewPrefix(fname, buffer.BufferStart(), + *prefix_length); + if (status.ok()) { + buffer.Size(*prefix_length); + prefix = Slice(buffer.BufferStart(), buffer.CurrentSize()); + // Write prefix + status = underlying->Write(0, prefix); + } + if (!status.ok()) { + return status; + } + } + // Create cipher stream + status = provider->CreateCipherStream(fname, options, prefix, stream); + } + return status; + } + + template + Status CreateSequentialCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const EnvOptions& options, size_t* prefix_length, + 
std::unique_ptr* stream) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + Status status = + underlying->Read(*prefix_length, &prefix, buffer.BufferStart()); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return provider_->CreateCipherStream(fname, options, prefix, stream); + } + + template + Status CreateRandomReadCipherStream( + const std::string& fname, const std::unique_ptr& underlying, + const EnvOptions& options, size_t* prefix_length, + std::unique_ptr* stream) { + // Read prefix (if needed) + AlignedBuffer buffer; + Slice prefix; + *prefix_length = provider_->GetPrefixLength(); + if (*prefix_length > 0) { + // Read prefix + buffer.Alignment(underlying->GetRequiredBufferAlignment()); + buffer.AllocateNewBuffer(*prefix_length); + Status status = + underlying->Read(0, *prefix_length, &prefix, buffer.BufferStart()); + if (!status.ok()) { + return status; + } + buffer.Size(*prefix_length); + } + return provider_->CreateCipherStream(fname, options, prefix, stream); + } + public: - EncryptedEnv(Env* base_env, EncryptionProvider* provider) + EncryptedEnvImpl(Env* base_env, + const std::shared_ptr& provider) : EnvWrapper(base_env) { provider_ = provider; } @@ -381,31 +577,16 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = - underlying->Read(prefixLength, &prefixSlice, prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } // Create cipher 
stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateSequentialCipherStream(fname, underlying, options, + &prefix_length, &stream); + if (status.ok()) { + result->reset(new EncryptedSequentialFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedSequentialFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + return status; } // NewRandomAccessFile opens a file for random read access. @@ -422,31 +603,15 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Read prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Read prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - status = underlying->Read(0, prefixLength, &prefixSlice, - prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } - // Create cipher stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream); + if (status.ok()) { + result->reset(new EncryptedRandomAccessFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedRandomAccessFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + return status; } // NewWritableFile opens a file for sequential writing. 
@@ -463,33 +628,16 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } // Create cipher stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream); + if (status.ok()) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedWritableFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + return status; } // Create an object that writes to a new file with the specified @@ -512,33 +660,16 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) 
{ - return status; - } - } // Create cipher stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream); + if (status.ok()) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedWritableFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + return status; } // Reuse an existing file by renaming it and opening it as writable. @@ -557,33 +688,16 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - // Initialize prefix - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Append(prefixSlice); - if (!status.ok()) { - return status; - } - } // Create cipher stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length; + status = CreateWritableCipherStream(fname, underlying, options, + &prefix_length, &stream); + if (status.ok()) { + result->reset(new EncryptedWritableFile( + std::move(underlying), std::move(stream), prefix_length)); } - (*result) = std::unique_ptr(new EncryptedWritableFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + return status; } // Open `fname` for random read and write, if file doesn't 
exist the file @@ -607,44 +721,22 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - // Read or Initialize & write prefix (if needed) - AlignedBuffer prefixBuf; - Slice prefixSlice; - size_t prefixLength = provider_->GetPrefixLength(); - if (prefixLength > 0) { - prefixBuf.Alignment(underlying->GetRequiredBufferAlignment()); - prefixBuf.AllocateNewBuffer(prefixLength); - if (!isNewFile) { - // File already exists, read prefix - status = underlying->Read(0, prefixLength, &prefixSlice, - prefixBuf.BufferStart()); - if (!status.ok()) { - return status; - } - prefixBuf.Size(prefixLength); - } else { - // File is new, initialize & write prefix - provider_->CreateNewPrefix(fname, prefixBuf.BufferStart(), - prefixLength); - prefixBuf.Size(prefixLength); - prefixSlice = Slice(prefixBuf.BufferStart(), prefixBuf.CurrentSize()); - // Write prefix - status = underlying->Write(0, prefixSlice); - if (!status.ok()) { - return status; - } - } - } // Create cipher stream std::unique_ptr stream; - status = - provider_->CreateCipherStream(fname, options, prefixSlice, &stream); - if (!status.ok()) { - return status; + size_t prefix_length = 0; + if (!isNewFile) { + // File already exists, read prefix + status = CreateRandomReadCipherStream(fname, underlying, options, + &prefix_length, &stream); + } else { + status = CreateRandomWriteCipherStream(fname, underlying, options, + &prefix_length, &stream); } - (*result) = std::unique_ptr(new EncryptedRandomRWFile( - std::move(underlying), std::move(stream), prefixLength)); - return Status::OK(); + if (status.ok()) { + result->reset(new EncryptedRandomRWFile( + std::move(underlying), std::move(stream), prefix_length)); + } + return status; } // Store in *result the attributes of the children of the specified @@ -667,14 +759,19 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); for (auto it = std::begin(*result); it != 
std::end(*result); ++it) { // assert(it->size_bytes >= prefixLength); // breaks env_basic_test when called on directory containing // directories // which makes subtraction of prefixLength worrisome since // FileAttributes does not identify directories - it->size_bytes -= prefixLength; + EncryptionProvider* provider; + status = GetReadableProvider(it->name, &provider); + if (!status.ok()) { + return status; + } else if (provider != nullptr) { + it->size_bytes -= provider->GetPrefixLength(); + } } return Status::OK(); } @@ -686,26 +783,30 @@ class EncryptedEnv : public EnvWrapper { if (!status.ok()) { return status; } - size_t prefixLength = provider_->GetPrefixLength(); - assert(*file_size >= prefixLength); - *file_size -= prefixLength; - return Status::OK(); + EncryptionProvider* provider; + status = GetReadableProvider(fname, &provider); + if (provider != nullptr && status.ok()) { + size_t prefixLength = provider->GetPrefixLength(); + assert(*file_size >= prefixLength); + *file_size -= prefixLength; + } + return status; } private: - EncryptionProvider* provider_; + std::shared_ptr provider_; }; // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider) { - return new EncryptedEnv(base_env, provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider) { + return new EncryptedEnvImpl(base_env, provider); } // Encrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. -Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, - size_t dataSize) { +Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char *data, size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -717,7 +818,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, // Encrypt individual blocks. 
while (1) { - char* block = data; + char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not encrypting a full block. @@ -750,8 +851,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, // Decrypt one or more (partial) blocks of data at the file offset. // Length of data is given in dataSize. -Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, - size_t dataSize) { +Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char *data, size_t dataSize) { // Calculate block index auto blockSize = BlockSize(); uint64_t blockIndex = fileOffset / blockSize; @@ -763,7 +863,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, // Decrypt individual blocks. while (1) { - char* block = data; + char *block = data; size_t n = std::min(dataSize, blockSize - blockOffset); if (n != blockSize) { // We're not decrypting a full block. @@ -802,6 +902,8 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, } } +const char* ROT13BlockCipher::Name() const { return kROT13CipherName; } + // Encrypt a block of data. // Length of data is equal to BlockSize(). Status ROT13BlockCipher::Encrypt(char* data) { @@ -817,7 +919,7 @@ Status ROT13BlockCipher::Decrypt(char* data) { return Encrypt(data); } // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. 
void CTRCipherStream::AllocateScratch(std::string& scratch) { - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); scratch.reserve(blockSize); } @@ -826,12 +928,12 @@ void CTRCipherStream::AllocateScratch(std::string& scratch) { Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, char* scratch) { // Create nonce + counter - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); // Encrypt nonce+counter - auto status = cipher_.Encrypt(scratch); + auto status = cipher_->Encrypt(scratch); if (!status.ok()) { return status; } @@ -851,6 +953,8 @@ Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, return EncryptBlock(blockIndex, data, scratch); } +const char* CTREncryptionProvider::Name() const { return kCTRProviderName; } + // GetPrefixLength returns the length of the prefix that is added to every file // and used for storing encryption options. // For optimal performance, the prefix length should be a multiple of @@ -859,6 +963,28 @@ size_t CTREncryptionProvider::GetPrefixLength() const { return defaultPrefixLength; } +Status CTREncryptionProvider::TEST_Initialize() { + if (!cipher_) { + return BlockCipher::CreateFromString( + ConfigOptions(), std::string(kROT13CipherName) + ":32", &cipher_); + } + return Status::OK(); +} + +Status CTREncryptionProvider::AddCipher(const std::string& /*descriptor*/, + const char* cipher, size_t len, + bool /*for_write*/) { + if (cipher_) { + return Status::NotSupported("Cannot add keys to CTREncryptionProvider"); + } else if (strcmp(kROT13CipherName, cipher) == 0) { + cipher_.reset(new ROT13BlockCipher(len)); + return Status::OK(); + } else { + return BlockCipher::CreateFromString(ConfigOptions(), std::string(cipher), + &cipher_); + } +} + // decodeCTRParameters decodes the initial counter & IV from the given // (plain text) prefix. 
static void decodeCTRParameters(const char* prefix, size_t blockSize, @@ -874,6 +1000,9 @@ static void decodeCTRParameters(const char* prefix, size_t blockSize, Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, size_t prefixLength) const { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Create & seed rnd. Random rnd((uint32_t)Env::Default()->NowMicros()); // Fill entire prefix block with random values. @@ -881,7 +1010,7 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, prefix[i] = rnd.Uniform(256) & 0xFF; } // Take random data to extract initial counter & IV - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice prefixIV; decodeCTRParameters(prefix, blockSize, initialCounter, prefixIV); @@ -918,8 +1047,11 @@ size_t CTREncryptionProvider::PopulateSecretPrefixPart( Status CTREncryptionProvider::CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) { + if (!cipher_) { + return Status::InvalidArgument("Encryption Cipher is missing"); + } // Read plain text part of prefix. - auto blockSize = cipher_.BlockSize(); + auto blockSize = cipher_->BlockSize(); uint64_t initialCounter; Slice iv; decodeCTRParameters(prefix.data(), blockSize, initialCounter, iv); @@ -961,6 +1093,6 @@ Status CTREncryptionProvider::CreateCipherStreamFromPrefix( return Status::OK(); } -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_encryption_ctr.h b/env/env_encryption_ctr.h new file mode 100644 index 000000000..b22d7e45f --- /dev/null +++ b/env/env_encryption_ctr.h @@ -0,0 +1,137 @@ +// Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#if !defined(ROCKSDB_LITE) + +#include "rocksdb/env_encryption.h" + +namespace ROCKSDB_NAMESPACE { + +// Implements a BlockCipher using ROT13. +// +// Note: This is a sample implementation of BlockCipher, +// it is NOT considered safe and should NOT be used in production. +class ROT13BlockCipher : public BlockCipher { + private: + size_t blockSize_; + + public: + ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} + virtual ~ROT13BlockCipher(){}; + const char* Name() const override; + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return blockSize_; } + + // Encrypt a block of data. + // Length of data is equal to BlockSize(). + Status Encrypt(char* data) override; + + // Decrypt a block of data. + // Length of data is equal to BlockSize(). + Status Decrypt(char* data) override; +}; + +// CTRCipherStream implements BlockAccessCipherStream using an +// Counter operations mode. +// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation +// +// Note: This is a possible implementation of BlockAccessCipherStream, +// it is considered suitable for use. +class CTRCipherStream final : public BlockAccessCipherStream { + private: + std::shared_ptr cipher_; + std::string iv_; + uint64_t initialCounter_; + + public: + CTRCipherStream(const std::shared_ptr& c, const char* iv, + uint64_t initialCounter) + : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; + virtual ~CTRCipherStream(){}; + + // BlockSize returns the size of each block supported by this cipher stream. + size_t BlockSize() override { return cipher_->BlockSize(); } + + protected: + // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. 
+ void AllocateScratch(std::string&) override; + + // Encrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override; + + // Decrypt a block of data at the given block index. + // Length of data is equal to BlockSize(); + Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override; +}; + +// This encryption provider uses a CTR cipher stream, with a given block cipher +// and IV. +// +// Note: This is a possible implementation of EncryptionProvider, +// it is considered suitable for use, provided a safe BlockCipher is used. +class CTREncryptionProvider : public EncryptionProvider { + private: + std::shared_ptr cipher_; + + protected: + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. This size is to ensure the first real data byte + // is placed at largest known alignment point for direct io. + const static size_t defaultPrefixLength = 4096; + + public: + explicit CTREncryptionProvider( + const std::shared_ptr& c = nullptr) + : cipher_(c){}; + virtual ~CTREncryptionProvider() {} + + const char* Name() const override; + + // GetPrefixLength returns the length of the prefix that is added to every + // file + // and used for storing encryption options. + // For optimal performance when using direct IO, the prefix length should be a + // multiple of the page size. + size_t GetPrefixLength() const override; + + // CreateNewPrefix initialized an allocated block of prefix memory + // for a new file. + Status CreateNewPrefix(const std::string& fname, char* prefix, + size_t prefixLength) const override; + + // CreateCipherStream creates a block access cipher stream for a file given + // given name and options. 
+ Status CreateCipherStream( + const std::string& fname, const EnvOptions& options, Slice& prefix, + std::unique_ptr* result) override; + + Status AddCipher(const std::string& descriptor, const char* /*cipher*/, + size_t /*len*/, bool /*for_write*/) override; + + protected: + Status TEST_Initialize() override; + + // PopulateSecretPrefixPart initializes the data into a new prefix block + // that will be encrypted. This function will store the data in plain text. + // It will be encrypted later (before written to disk). + // Returns the amount of space (starting from the start of the prefix) + // that has been initialized. + virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, + size_t blockSize) const; + + // CreateCipherStreamFromPrefix creates a block access cipher stream for a + // file given + // given name and options. The given prefix is already decrypted. + virtual Status CreateCipherStreamFromPrefix( + const std::string& fname, const EnvOptions& options, + uint64_t initialCounter, const Slice& iv, const Slice& prefix, + std::unique_ptr* result); +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // !defined(ROCKSDB_LITE) diff --git a/env/env_openssl.cc b/env/env_openssl.cc new file mode 100644 index 000000000..81822ea9f --- /dev/null +++ b/env/env_openssl.cc @@ -0,0 +1,312 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// +// env_encryption.cc copied to this file then modified. 
+ +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include "rocksdb/env_openssl.h" + +#include +#include +#include +#include + +#include "env/env_openssl_impl.h" +#include "monitoring/perf_context_imp.h" +#include "port/port.h" +#include "util/aligned_buffer.h" +#include "util/coding.h" +#include "util/library_loader.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +static std::once_flag crypto_loaded; +static std::shared_ptr crypto_shared; + +std::shared_ptr GetCrypto() { + std::call_once(crypto_loaded, + []() { crypto_shared = std::make_shared(); }); + return crypto_shared; +} + +// reuse cipher context between calls to Encrypt & Decrypt +static void do_nothing(EVP_CIPHER_CTX*){}; +thread_local static std::unique_ptr + aes_context(nullptr, &do_nothing); + +ShaDescription::ShaDescription(const std::string& key_desc_str) { + GetCrypto(); // ensure libcryto available + bool good = {true}; + int ret_val; + unsigned len; + + memset(desc, 0, EVP_MAX_MD_SIZE); + if (0 != key_desc_str.length() && crypto_shared->IsValid()) { + std::unique_ptr context( + crypto_shared->EVP_MD_CTX_new(), + crypto_shared->EVP_MD_CTX_free_ptr()); + + ret_val = crypto_shared->EVP_DigestInit_ex( + context.get(), crypto_shared->EVP_sha1(), nullptr); + good = (1 == ret_val); + if (good) { + ret_val = crypto_shared->EVP_DigestUpdate( + context.get(), key_desc_str.c_str(), key_desc_str.length()); + good = (1 == ret_val); + } + + if (good) { + ret_val = + crypto_shared->EVP_DigestFinal_ex(context.get(), desc, &len); + good = (1 == ret_val); + } + } else { + good = false; + } + + valid = good; +} + +AesCtrKey::AesCtrKey(const std::string& key_str) : valid(false) { + GetCrypto(); // ensure libcryto available + memset(key, 0, EVP_MAX_KEY_LENGTH); + + // simple parse: must be 64 characters long and hexadecimal values + if (64 == key_str.length()) { + auto bad_pos = key_str.find_first_not_of("abcdefABCDEF0123456789"); + if (std::string::npos == 
bad_pos) { + for (size_t idx = 0, idx2 = 0; idx < key_str.length(); idx += 2, ++idx2) { + std::string hex_string(key_str.substr(idx, 2)); + key[idx2] = std::stoul(hex_string, 0, 16); + } + valid = true; + } + } +} + + +void AESBlockAccessCipherStream::BigEndianAdd128(uint8_t* buf, + uint64_t value) { + uint8_t *sum, *addend, *carry, pre, post; + + sum = buf + 15; + + if (port::kLittleEndian) { + addend = (uint8_t*)&value; + } else { + addend = (uint8_t*)&value + 7; + } + + // future: big endian could be written as uint64_t add + for (int loop = 0; loop < 8 && value; ++loop) { + pre = *sum; + *sum += *addend; + post = *sum; + --sum; + value >>= 8; + + carry = sum; + // carry? + while (post < pre && buf <= carry) { + pre = *carry; + *carry += 1; + post = *carry; + --carry; + } + } // for +} + +// "data" is assumed to be aligned at AES_BLOCK_SIZE or greater +Status AESBlockAccessCipherStream::Encrypt(uint64_t file_offset, char* data, + size_t data_size) { + Status status; + if (0 < data_size) { + if (crypto_shared->IsValid()) { + int ret_val, out_len; + ALIGN16 uint8_t iv[AES_BLOCK_SIZE]; + uint64_t block_index = file_offset / BlockSize(); + uint64_t remainder = file_offset % BlockSize(); + + // make a context once per thread + if (!aes_context) { + aes_context = + std::unique_ptr( + crypto_shared->EVP_CIPHER_CTX_new(), + crypto_shared->EVP_CIPHER_CTX_free_ptr()); + } + + memcpy(iv, nonce_, AES_BLOCK_SIZE); + BigEndianAdd128(iv, block_index); + ret_val = crypto_shared->EVP_EncryptInit_ex( + aes_context.get(), crypto_shared->EVP_aes_256_ctr(), nullptr, + key_.key, iv); + if (1 != ret_val) { + status = Status::InvalidArgument("EVP_EncryptInit_ex failed."); + } + + // if start not aligned to block size, do partial + if (1 == ret_val && 0 != remainder) { + size_t partial_len; + ALIGN16 uint8_t partial[AES_BLOCK_SIZE]; + memset(partial, 0, sizeof(partial)); + + partial_len = AES_BLOCK_SIZE - remainder; + if (data_size < partial_len) { + partial_len = data_size; + } + 
out_len = 0; + ret_val = crypto_shared->EVP_EncryptUpdate( + aes_context.get(), (unsigned char*)partial, &out_len, + (unsigned char*)partial, (int)sizeof(partial)); + + if (1 == ret_val && out_len == AES_BLOCK_SIZE) { + // xor against real data + for (size_t pos = 0; pos < partial_len; ++pos) { + *(data + pos) ^= partial[remainder + pos]; + } + } else { + status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", + (int)data_size == AES_BLOCK_SIZE + ? "bad return value" + : "output length short"); + } + + data += partial_len; + BigEndianAdd128(iv, 1); + if (partial_len < data_size) { + data_size -= partial_len; + } else { + data_size = 0; + } + } + + // do remaining data: starts on boundry but may or may not end on one. + if (1 == ret_val && data_size) { + out_len = 0; + ret_val = crypto_shared->EVP_EncryptUpdate( + aes_context.get(), (unsigned char*)data, &out_len, + (unsigned char*)data, (int)data_size); + if (1 != ret_val || out_len != (int)data_size) { + status = Status::InvalidArgument("EVP_EncryptUpdate failed: ", + (int)data_size == out_len + ? "bad return value" + : "output length short"); + } + } + + // clean up + if (1 == ret_val) { + // this is a soft reset of aes_context per man pages + uint8_t temp_buf[AES_BLOCK_SIZE]; + out_len = 0; + ret_val = crypto_shared->EVP_EncryptFinal_ex(aes_context.get(), + temp_buf, &out_len); + + if (1 != ret_val || 0 != out_len) { + status = Status::InvalidArgument( + "EVP_EncryptFinal_ex failed: ", + (1 != ret_val) ? "bad return value" : "output length short"); + } + } + } else { + status = Status::NotSupported( + "libcrypto not available for encryption/decryption."); + } + } + + return status; +} + +// Decrypt one or more (partial) blocks of data at the file offset. +// Length of data is given in data_size. +// CTR Encrypt and Decrypt are synonyms. Using Encrypt calls here to reduce +// count of symbols loaded from libcrypto. 
+Status AESBlockAccessCipherStream::Decrypt(uint64_t file_offset, char* data, + size_t data_size) { + + return Encrypt(file_offset, data, data_size); +} + +Status EncryptionProviderOpenSSL::CreateNewPrefix(const std::string& /*fname*/, + char* prefix, + size_t prefixLength) const { + GetCrypto(); // ensure libcryto available + Status s; + if (crypto_shared->IsValid()) { + if (sizeof(PrefixVersion0) + sizeof(OpenSSLEncryptMarker) <= prefixLength) { + int ret_val; + + memcpy(prefix, kOpenSSLEncryptMarker, sizeof(kOpenSSLEncryptMarker)); + *(prefix + sizeof(kOpenSSLEncryptMarker)) = kOpenSSLEncryptCodeVersion1; + + PrefixVersion0* pf = {(PrefixVersion0*)(prefix + sizeof(OpenSSLEncryptMarker))}; + memcpy(pf->key_description_, encrypt_write_.first.desc, sizeof(encrypt_write_.first.desc)); + ret_val = crypto_shared->RAND_bytes((unsigned char*)&pf->nonce_, + AES_BLOCK_SIZE); + if (1 != ret_val) { + s = Status::NotSupported("RAND_bytes failed"); + } + } else { + s = Status::NotSupported("Prefix size needs to be 28 or more"); + } + } else { + s = Status::NotSupported("RAND_bytes() from libcrypto not available."); + } + + return s; +} + +size_t EncryptionProviderOpenSSL::GetPrefixLength() const { + return kDefaultPageSize; // for direct io alignment +} + +Status EncryptionProviderOpenSSL::CreateCipherStream( + const std::string& /*fname*/, const EnvOptions& /*options*/, + Slice& prefix, + std::unique_ptr* result) { + Status stat; + + // for direct io, prefix size matched to one page to keep file contents aligned. 
+ if (kDefaultPageSize == prefix.size()) { + if (prefix.starts_with(kOpenSSLEncryptMarker)) { + uint8_t code_version = (uint8_t)prefix[sizeof(kOpenSSLEncryptMarker)]; + switch (code_version) { + case kOpenSSLEncryptCodeVersion1: { + PrefixVersion0 * prefix_struct = (PrefixVersion0 *)(prefix.data() + sizeof(OpenSSLEncryptMarker)); + ShaDescription desc(prefix_struct->key_description_, sizeof(PrefixVersion0::key_description_)); + auto read_key = encrypt_read_.find(desc); + + if (encrypt_read_.end() != read_key) { + (*result).reset(new AESBlockAccessCipherStream(read_key->second, code_version, prefix_struct->nonce_)); + } else { + stat = Status::NotSupported("File requires unknown encryption key"); + } + break; + } + + default: { + stat = Status::NotSupported("Unknown code version for this encryption provider"); + break; + } + } + } else { + stat = Status::NotSupported("Prefix marker wrong for this encryption provider"); + } + } else { + stat = Status::NotSupported("Prefix wrong size for this encryption provider"); + } + + return stat; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/env/env_encrypt2_impl.h b/env/env_openssl_impl.h similarity index 89% rename from env/env_encrypt2_impl.h rename to env/env_openssl_impl.h index 943143e64..18b59b751 100644 --- a/env/env_encrypt2_impl.h +++ b/env/env_openssl_impl.h @@ -10,7 +10,9 @@ #include "openssl/aes.h" #include "openssl/evp.h" -#include "rocksdb/env_encrypt2.h" +#include + +#include "rocksdb/env_openssl.h" namespace ROCKSDB_NAMESPACE { @@ -24,10 +26,10 @@ namespace ROCKSDB_NAMESPACE { #endif #endif -constexpr uint8_t kEncryptCodeVersion0{'0'}; +constexpr uint8_t kOpenSSLEncryptCodeVersion1{'1'}; -typedef char EncryptMarker[8]; -static EncryptMarker kEncryptMarker = "Encrypt"; +typedef char OpenSSLEncryptMarker[8]; +static OpenSSLEncryptMarker kOpenSSLEncryptMarker = "Encrypt"; // long term: code_version could be used in a switch statement or factory // 
prefix version 0 is 12 byte sha1 description hash, 128 bit (16 byte) @@ -70,6 +72,7 @@ class AESBlockAccessCipherStream : public BlockAccessCipherStream { return Status::NotSupported("Wrong EncryptionProvider assumed"); }; + const std::string fname_; // saving this for debug logging as needed AesCtrKey key_; uint8_t code_version_; uint8_t nonce_[AES_BLOCK_SIZE]; diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index c6b11d032..4e4d1da03 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -15,6 +15,68 @@ namespace rocksdb { +class Env; +struct ColumnFamilyOptions; +struct DBOptions; +struct Options; + +// ConfigOptions containing the parameters/controls for +// comparing objects and converting to/from strings. +// These settings control how the methods +// treat errors (e.g. ignore_unknown_objects), the format +// of the serialization (e.g. delimiter), and how to compare +// options (sanity_level). +struct ConfigOptions { + // This enum defines the RocksDB options sanity level. + enum SanityLevel : unsigned char { + kSanityLevelNone = 0x01, // Performs no sanity check at all. + // Performs minimum check to ensure the RocksDB instance can be + // opened without corrupting / mis-interpreting the data. + kSanityLevelLooselyCompatible = 0x02, + // Perform exact match sanity check. + kSanityLevelExactMatch = 0xFF, + }; + + enum Depth { + kDepthDefault, // Traverse nested options that are not flagged as "shallow" + kDepthShallow, // Do not traverse into any nested options + kDepthDetailed, // Traverse nested options, overriding the options shallow + // setting + }; + + // When true, any unused options will be ignored and OK will be returned + bool ignore_unknown_options = false; + + // If the strings are escaped (old-style?) 
+ bool input_strings_escaped = true; + + // The separator between options when converting to a string + std::string delimiter = ";"; + + // Controls how to traverse options during print/match stages + Depth depth = Depth::kDepthDefault; + + // Controls how options are serialized + // Controls how pedantic the comparison must be for equivalency + SanityLevel sanity_level = SanityLevel::kSanityLevelExactMatch; + // `file_readahead_size` is used for readahead for the option file. + size_t file_readahead_size = 512 * 1024; + + // The environment to use for this option + Env* env = Env::Default(); + + bool IsShallow() const { return depth == Depth::kDepthShallow; } + bool IsDetailed() const { return depth == Depth::kDepthDetailed; } + + bool IsCheckDisabled() const { + return sanity_level == SanityLevel::kSanityLevelNone; + } + + bool IsCheckEnabled(SanityLevel level) const { + return (level > SanityLevel::kSanityLevelNone && level <= sanity_level); + } +}; + #ifndef ROCKSDB_LITE // The following set of functions provide a way to construct RocksDB Options // from a string or a string-to-string map. Here're the general rule of diff --git a/include/rocksdb/env_encrypt2.h b/include/rocksdb/env_encrypt2.h deleted file mode 100644 index 22d0279ff..000000000 --- a/include/rocksdb/env_encrypt2.h +++ /dev/null @@ -1,358 +0,0 @@ -// copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). - -// -// env_encryption.cc copied to this file then modified. 
- -#pragma once - -#ifdef ROCKSDB_OPENSSL_AES_CTR -#ifndef ROCKSDB_LITE - -#include -#include -#include - -#include -#include -#include - -#include "env.h" -#include "rocksdb/env_encryption.h" -#include "util/aligned_buffer.h" -#include "util/coding.h" -#include "util/library_loader.h" -#include "util/random.h" - -#endif - -namespace ROCKSDB_NAMESPACE { - -#ifndef ROCKSDB_LITE - -struct Sha1Description { - uint8_t desc[EVP_MAX_MD_SIZE]; - bool valid; - - Sha1Description() : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); } - - Sha1Description(const Sha1Description& rhs) { *this = rhs; } - - Sha1Description& operator=(const Sha1Description& rhs) { - memcpy(desc, rhs.desc, sizeof(desc)); - valid = rhs.valid; - return *this; - } - - Sha1Description(uint8_t* desc_in, size_t desc_len) : valid(false) { - memset(desc, 0, EVP_MAX_MD_SIZE); - if (desc_len <= EVP_MAX_MD_SIZE) { - memcpy(desc, desc_in, desc_len); - valid = true; - } - } - - Sha1Description(const std::string& key_desc_str); - - // see AesCtrKey destructor below. This data is not really - // essential to clear, but trying to set pattern for future work. 
- // goal is to explicitly remove desc from memory once no longer needed - ~Sha1Description() { - memset(desc, 0, EVP_MAX_MD_SIZE); - valid = false; - } - - bool operator<(const Sha1Description& rhs) const { - return memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) < 0; - } - - bool operator==(const Sha1Description& rhs) const { - return 0 == memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) && valid == rhs.valid; - } - - bool IsValid() const { return valid; } - - std::string ToString(size_t byte_count = 20) const { - if (IsValid()) { - if (EVP_MAX_MD_SIZE < byte_count) { - byte_count = EVP_MAX_MD_SIZE; - } - rocksdb::Slice to_hex((const char *)desc, byte_count); - return to_hex.ToString(true); - } else { - return std::string(); - } - } -}; - -struct AesCtrKey { - uint8_t key[EVP_MAX_KEY_LENGTH]; - bool valid; - - AesCtrKey() : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); } - - AesCtrKey(const uint8_t* key_in, size_t key_len) : valid(false) { - memset(key, 0, EVP_MAX_KEY_LENGTH); - if (key_len <= EVP_MAX_KEY_LENGTH) { - memcpy(key, key_in, key_len); - valid = true; - } else { - valid = false; - } - } - - AesCtrKey(const std::string& key_str); - - // see Writing Solid Code, 2nd edition - // Chapter 9, page 321, Managing Secrets in Memory ... bullet 4 "Scrub the - // memory" - // Not saying this is essential or effective in initial implementation since - // current - // usage model loads all keys at start and only deletes them at shutdown. But - // does establish presidence. 
- // goal is to explicitly remove key from memory once no longer needed - ~AesCtrKey() { - memset(key, 0, EVP_MAX_KEY_LENGTH); - valid = false; - } - - bool operator==(const AesCtrKey& rhs) const { - return (0 == memcmp(key, rhs.key, EVP_MAX_KEY_LENGTH)) && - (valid == rhs.valid); - } - - bool IsValid() const { return valid; } - - std::string ToString(size_t byte_count = 32) const { - if (IsValid()) { - if (EVP_MAX_KEY_LENGTH < byte_count) { - byte_count = EVP_MAX_KEY_LENGTH; - } - rocksdb::Slice to_hex((const char *)key, byte_count); - return to_hex.ToString(true); - } else { - return std::string(); - } - } -}; - -class CTREncryptionProviderV2 : public EncryptionProvider { - public: - CTREncryptionProviderV2() = delete; - - CTREncryptionProviderV2(const CTREncryptionProvider&&) = delete; - - CTREncryptionProviderV2(const Sha1Description& key_desc_in, - const AesCtrKey& key_in) - : valid_(false), key_desc_(key_desc_in), key_(key_in) { - valid_ = key_desc_.IsValid() && key_.IsValid(); - } - - CTREncryptionProviderV2(const std::string& key_desc_str, - const uint8_t unformatted_key[], int bytes) - : valid_(false), key_desc_(key_desc_str), key_(unformatted_key, bytes) { - valid_ = key_desc_.IsValid() && key_.IsValid(); - } - - size_t GetPrefixLength() const override; - - Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, - size_t prefixLength) const override; - - Status CreateCipherStream( - const std::string& /*fname*/, const EnvOptions& /*options*/, - Slice& /*prefix*/, - std::unique_ptr* /*result*/) override { - return Status::NotSupported("Wrong EncryptionProvider assumed"); - } - - virtual BlockAccessCipherStream* CreateCipherStream2( - uint8_t code_version, const uint8_t nonce[]) const; - - bool IsValid() const { return valid_; }; - const Sha1Description& key_desc() const { return key_desc_; }; - const AesCtrKey& key() const { return key_; }; - - std::string ToString() const { - std::string result; - if (IsValid()) { - result = 
key_desc_.ToString(); - result += " : "; - result += key_.ToString(); - } else { - result = " : "; - } - return result; - } - - protected: - bool valid_; - Sha1Description key_desc_; - AesCtrKey key_; -}; - -class EncryptedWritableFileV2 : public EncryptedWritableFile { - public: - // Default ctor. Prefix is assumed to be written already. - EncryptedWritableFileV2(std::unique_ptr&& f, - std::unique_ptr&& s, - size_t prefix_length) - : EncryptedWritableFile(std::move(f), std::move(s), prefix_length) {} - - Status Append(const Slice& data) override; - - Status PositionedAppend(const Slice& data, uint64_t offset) override; - - // Indicates the upper layers if the current WritableFile implementation - // uses direct IO. - bool use_direct_io() const override { return false; }; -}; - -// A file abstraction for random reading and writing. -class EncryptedRandomRWFileV2 : public EncryptedRandomRWFile { - protected: - - public: - EncryptedRandomRWFileV2(std::unique_ptr&& f, - std::unique_ptr&& s, - size_t prefixLength) - : EncryptedRandomRWFile(std::move(f), std::move(s), prefixLength) {} - - // Indicates if the class makes use of direct I/OF - // If false you must pass aligned buffer to Write() - bool use_direct_io() const override {return false;}; - - // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. - // Pass aligned buffer when use_direct_io() returns true. - Status Write(uint64_t offset, const Slice& data) override; -}; - -// EncryptedEnvV2 implements an Env wrapper that adds encryption to files stored -// on disk. 
-class EncryptedEnvV2 : public EnvWrapper { - public: - using WriteKey = std::pair>; - using ReadKeys = - std::map>; - - static Env* Default(); - static Env* Default(ReadKeys encrypt_read, WriteKey encrypt_write); - - EncryptedEnvV2(Env* base_env); - - EncryptedEnvV2(Env* base_env, ReadKeys encrypt_read, WriteKey encrypt_write); - - void SetKeys(ReadKeys encrypt_read, WriteKey encrypt_write); - - bool IsWriteEncrypted() const; - - // NewSequentialFile opens a file for sequential reading. - Status NewSequentialFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // NewRandomAccessFile opens a file for random read access. - Status NewRandomAccessFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // NewWritableFile opens a file for sequential writing. - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. - Status ReopenWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Reuse an existing file by renaming it and opening it as writable. - Status ReuseWritableFile(const std::string& fname, - const std::string& old_fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Open `fname` for random read and write, if file doesn't exist the file - // will be created. On success, stores a pointer to the new file in - // *result and returns OK. On failure returns non-OK. - // - // The returned file will only be accessed by one thread at a time. 
- Status NewRandomRWFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override; - - // Store in *result the attributes of the children of the specified directory. - // In case the implementation lists the directory prior to iterating the files - // and files are concurrently deleted, the deleted files will be omitted from - // result. - // The name attributes are relative to "dir". - // Original contents of *results are dropped. - // Returns OK if "dir" exists and "*result" contains its children. - // NotFound if "dir" does not exist, the calling process does not have - // permission to access "dir", or if "dir" is invalid. - // IOError if an IO Error was encountered - Status GetChildrenFileAttributes( - const std::string& dir, std::vector* result) override; - - // Store the size of fname in *file_size. - Status GetFileSize(const std::string& fname, uint64_t* file_size) override; - - // only needed for GetChildrenFileAttributes & GetFileSize - virtual Status GetEncryptionProvider( - const std::string& fname, - std::shared_ptr& provider); - - bool IsValid() const { return valid_; } - - protected: - void init(); - - template - Status ReadSeqEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - std::unique_ptr& stream); - - template - Status ReadRandEncryptionPrefix( - TypeFile* f, std::shared_ptr& provider, - std::unique_ptr& stream); - - template - Status WriteSeqEncryptionPrefix( - TypeFile* f, std::shared_ptr provider, - std::unique_ptr& stream); - - template - Status WriteRandEncryptionPrefix( - TypeFile* f, std::shared_ptr provider, - std::unique_ptr& stream); - - public: - std::shared_ptr crypto_; - - protected: - ReadKeys encrypt_read_; - WriteKey encrypt_write_; - mutable port::RWMutex key_lock_; - bool valid_; -}; - -// Returns an Env that encrypts data when stored on disk and decrypts data when -// read from disk. Prefer EncryptedEnvV2::Default(). 
-Env* NewEncryptedEnvV2(Env* base_env, EncryptedEnvV2::ReadKeys encrypt_read, - EncryptedEnvV2::WriteKey encrypt_write); - -#endif // ROCKSDB_LITE - -} // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 25832e92d..cc475769e 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -9,16 +9,19 @@ #include -#include "env.h" -#include "rocksdb_namespace.h" +#include "rocksdb/env.h" +#include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { class EncryptionProvider; +struct ConfigOptions; + // Returns an Env that encrypts data when stored on disk and decrypts data when // read from disk. -Env* NewEncryptedEnv(Env* base_env, EncryptionProvider* provider); +Env* NewEncryptedEnv(Env* base_env, + const std::shared_ptr& provider); // BlockAccessCipherStream is the base class for any cipher stream that // supports random access at block level (without requiring data from other @@ -58,6 +61,30 @@ class BlockCipher { public: virtual ~BlockCipher(){}; + // Creates a new BlockCipher from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "ROT13", a ROT13BlockCipher is created. + // + // @param config_options Options to control how this cipher is created + // and initialized. 
+ // @param value The value might be: + // - ROT13 Create a ROT13 Cipher + // - ROT13:nn Create a ROT13 Cipher with block size of nn + // @param result The new cipher object + // @return OK if the cipher was sucessfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + // Short-cut method to create a ROT13 BlockCipher. + // This cipher is only suitable for test purposes and should not be used in + // production!!! + static std::shared_ptr NewROT13Cipher(size_t block_size); + + virtual const char* Name() const = 0; // BlockSize returns the size of each block supported by this cipher stream. virtual size_t BlockSize() = 0; @@ -70,65 +97,6 @@ class BlockCipher { virtual Status Decrypt(char* data) = 0; }; -// Implements a BlockCipher using ROT13. -// -// Note: This is a sample implementation of BlockCipher, -// it is NOT considered safe and should NOT be used in production. -class ROT13BlockCipher : public BlockCipher { - private: - size_t blockSize_; - - public: - ROT13BlockCipher(size_t blockSize) : blockSize_(blockSize) {} - virtual ~ROT13BlockCipher(){}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return blockSize_; } - - // Encrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Encrypt(char* data) override; - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). - virtual Status Decrypt(char* data) override; -}; - -// CTRCipherStream implements BlockAccessCipherStream using an -// Counter operations mode. -// See https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation -// -// Note: This is a possible implementation of BlockAccessCipherStream, -// it is considered suitable for use. 
-class CTRCipherStream final : public BlockAccessCipherStream { - private: - BlockCipher& cipher_; - std::string iv_; - uint64_t initialCounter_; - - public: - CTRCipherStream(BlockCipher& c, const char* iv, uint64_t initialCounter) - : cipher_(c), iv_(iv, c.BlockSize()), initialCounter_(initialCounter){}; - virtual ~CTRCipherStream(){}; - - // BlockSize returns the size of each block supported by this cipher stream. - virtual size_t BlockSize() override { return cipher_.BlockSize(); } - - protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. - virtual void AllocateScratch(std::string&) override; - - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status EncryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; - - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); - virtual Status DecryptBlock(uint64_t blockIndex, char* data, - char* scratch) override; -}; - // The encryption provider is used to create a cipher stream for a specific // file. The returned cipher stream will be used for actual // encryption/decryption actions. @@ -136,6 +104,33 @@ class EncryptionProvider { public: virtual ~EncryptionProvider(){}; + // Creates a new EncryptionProvider from the input config_options and value + // The value describes the type of provider (and potentially optional + // configuration parameters) used to create this provider. + // For example, if the value is "CTR", a CTREncryptionProvider will be + // created. If the value is preceded by "test://" (e.g test://CTR"), the + // TEST_Initialize method will be invoked prior to returning the provider. + // + // @param config_options Options to control how this provider is created + // and initialized. + // @param value The value might be: + // - CTR Create a CTR provider + // - test://CTR Create a CTR provider and initialize it for tests. 
+ // @param result The new provider object + // @return OK if the provider was sucessfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + // Short-cut method to create a CTR-provider + static std::shared_ptr NewCTRProvider( + const std::shared_ptr& cipher); + + // Returns the name of this EncryptionProvider + virtual const char* Name() const = 0; + // GetPrefixLength returns the length of the prefix that is added to every // file and used for storing encryption options. For optimal performance, the // prefix length should be a multiple of the page size. @@ -146,63 +141,28 @@ class EncryptionProvider { virtual Status CreateNewPrefix(const std::string& fname, char* prefix, size_t prefixLength) const = 0; - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. - virtual Status CreateCipherStream( - const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) = 0; -}; - -// This encryption provider uses a CTR cipher stream, with a given block cipher -// and IV. -// -// Note: This is a possible implementation of EncryptionProvider, -// it is considered suitable for use, provided a safe BlockCipher is used. -class CTREncryptionProvider : public EncryptionProvider { - private: - BlockCipher& cipher_; - - protected: - const static size_t defaultPrefixLength = 4096; - - public: - CTREncryptionProvider(BlockCipher& c) : cipher_(c){}; - virtual ~CTREncryptionProvider() {} - - // GetPrefixLength returns the length of the prefix that is added to every - // file - // and used for storing encryption options. - // For optimal performance, the prefix length should be a multiple of - // the page size. 
- virtual size_t GetPrefixLength() const override; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. - virtual Status CreateNewPrefix(const std::string& fname, char* prefix, - size_t prefixLength) const override; + // Method to add a new cipher key for use by the EncryptionProvider. + // @param description Descriptor for this key. + // @param cipher The cryptographic key to use + // @param len The length of the cipher key + // @param for_write If true, this cipher should be used for writing files. + // If false, this cipher should only be used for reading + // files + // @return OK if the cipher was successfully added to the provider, non-OK + // otherwise + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; // CreateCipherStream creates a block access cipher stream for a file given // given name and options. virtual Status CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, - std::unique_ptr* result) override; + std::unique_ptr* result) = 0; protected: - // PopulateSecretPrefixPart initializes the data into a new prefix block - // that will be encrypted. This function will store the data in plain text. - // It will be encrypted later (before written to disk). - // Returns the amount of space (starting from the start of the prefix) - // that has been initialized. - virtual size_t PopulateSecretPrefixPart(char* prefix, size_t prefixLength, - size_t blockSize) const; - - // CreateCipherStreamFromPrefix creates a block access cipher stream for a - // file given - // given name and options. The given prefix is already decrypted. - virtual Status CreateCipherStreamFromPrefix( - const std::string& fname, const EnvOptions& options, - uint64_t initialCounter, const Slice& iv, const Slice& prefix, - std::unique_ptr* result); + // Optional method to initialize an EncryptionProvider in the TEST + // environment. 
+ virtual Status TEST_Initialize() { return Status::OK(); } }; class EncryptedSequentialFile : public SequentialFile { diff --git a/include/rocksdb/env_openssl.h b/include/rocksdb/env_openssl.h new file mode 100644 index 000000000..99948fdfa --- /dev/null +++ b/include/rocksdb/env_openssl.h @@ -0,0 +1,188 @@ +// copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// +// env_encryption.cc copied to this file then modified. + +#pragma once + +#ifdef ROCKSDB_OPENSSL_AES_CTR +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/env_encryption.h" +#include "rocksdb/slice.h" + +namespace ROCKSDB_NAMESPACE { + +struct ShaDescription { + uint8_t desc[EVP_MAX_MD_SIZE]; + bool valid; + + ShaDescription() : valid(false) { memset(desc, 0, EVP_MAX_MD_SIZE); } + + ShaDescription(const ShaDescription& rhs) { *this = rhs; } + + ShaDescription& operator=(const ShaDescription& rhs) { + memcpy(desc, rhs.desc, sizeof(desc)); + valid = rhs.valid; + return *this; + } + + ShaDescription(uint8_t* desc_in, size_t desc_len) : valid(false) { + memset(desc, 0, EVP_MAX_MD_SIZE); + if (desc_len <= EVP_MAX_MD_SIZE) { + memcpy(desc, desc_in, desc_len); + valid = true; + } + } + + ShaDescription(const std::string& key_desc_str); + + // see AesCtrKey destructor below. This data is not really + // essential to clear, but trying to set pattern for future work. 
+ // goal is to explicitly remove desc from memory once no longer needed + ~ShaDescription() { + memset(desc, 0, EVP_MAX_MD_SIZE); + valid = false; + } + + bool operator<(const ShaDescription& rhs) const { + return memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) < 0; + } + + bool operator==(const ShaDescription& rhs) const { + return 0 == memcmp(desc, rhs.desc, EVP_MAX_MD_SIZE) && valid == rhs.valid; + } + + bool IsValid() const { return valid; } + + std::string ToString(size_t byte_count = 20) const { + if (IsValid()) { + if (EVP_MAX_MD_SIZE < byte_count) { + byte_count = EVP_MAX_MD_SIZE; + } + rocksdb::Slice to_hex((const char *)desc, byte_count); + return to_hex.ToString(true); + } else { + return std::string(); + } + } +}; + +struct AesCtrKey { + uint8_t key[EVP_MAX_KEY_LENGTH]; + bool valid; + + AesCtrKey() : valid(false) { memset(key, 0, EVP_MAX_KEY_LENGTH); } + + AesCtrKey(const uint8_t* key_in, size_t key_len) : valid(false) { + memset(key, 0, EVP_MAX_KEY_LENGTH); + if (key_len <= EVP_MAX_KEY_LENGTH) { + memcpy(key, key_in, key_len); + valid = true; + } else { + valid = false; + } + } + + AesCtrKey(const std::string& key_str); + + // see Writing Solid Code, 2nd edition + // Chapter 9, page 321, Managing Secrets in Memory ... bullet 4 "Scrub the + // memory" + // Not saying this is essential or effective in initial implementation since + // current + // usage model loads all keys at start and only deletes them at shutdown. But + // does establish presidence. 
+ // goal is to explicitly remove key from memory once no longer needed + ~AesCtrKey() { + memset(key, 0, EVP_MAX_KEY_LENGTH); + valid = false; + } + + bool operator==(const AesCtrKey& rhs) const { + return (0 == memcmp(key, rhs.key, EVP_MAX_KEY_LENGTH)) && + (valid == rhs.valid); + } + + bool IsValid() const { return valid; } + + std::string ToString(size_t byte_count = 32) const { + if (IsValid()) { + if (EVP_MAX_KEY_LENGTH < byte_count) { + byte_count = EVP_MAX_KEY_LENGTH; + } + rocksdb::Slice to_hex((const char *)key, byte_count); + return to_hex.ToString(true); + } else { + return std::string(); + } + } +}; + + +class EncryptionProviderOpenSSL : public EncryptionProvider { + public: + EncryptionProviderOpenSSL() = delete; + + EncryptionProviderOpenSSL(const EncryptionProviderOpenSSL&&) = delete; + + EncryptionProviderOpenSSL(const ShaDescription& key_desc_in, + const AesCtrKey& key_in) + : encrypt_read_({{key_desc_in, key_in}}), encrypt_write_({key_desc_in, key_in}) { + valid_ = key_desc_in.IsValid() && key_in.IsValid(); + } + + EncryptionProviderOpenSSL(const std::string& key_desc_str, + const uint8_t unformatted_key[], int bytes) + : valid_(false) { + ShaDescription desc(key_desc_str); + AesCtrKey aes(unformatted_key, bytes); + + encrypt_write_ = std::pair(desc, aes); + encrypt_read_.insert(std::pair(desc, aes)); + valid_ = desc.IsValid() && aes.IsValid(); + } + + const char * Name() const override {return "EncryptionProviderOpenSSL";} + + size_t GetPrefixLength() const override; + + Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, + size_t prefixLength) const override; + + virtual Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) = 0; + + Status CreateCipherStream( + const std::string& /*fname*/, const EnvOptions& /*options*/, + Slice& /*prefix*/, + std::unique_ptr* /*result*/) override; + + bool IsValid() const { return valid_; }; + + protected: + using WriteKey = std::pair; + using ReadKeys 
= std::map; + + ReadKeys encrypt_read_; + WriteKey encrypt_write_; + mutable port::RWMutex key_lock_; + bool valid_; +}; + +} // namespace ROCKSDB_NAMESPACE + +#endif // ROCKSDB_LITE + +#endif // ROCKSDB_OPENSSL_AES_CTR diff --git a/port/port_posix.h b/port/port_posix.h index 2d2a7a79c..88f58d5d4 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -87,6 +87,7 @@ namespace port { // For use at db/file_indexer.h kLevelMaxIndex const uint32_t kMaxUint32 = std::numeric_limits::max(); const int kMaxInt32 = std::numeric_limits::max(); +const int kMinInt32 = std::numeric_limits::min(); const uint64_t kMaxUint64 = std::numeric_limits::max(); const int64_t kMaxInt64 = std::numeric_limits::max(); const size_t kMaxSizet = std::numeric_limits::max(); diff --git a/port/win/port_win.h b/port/win/port_win.h index 9b8ba9ff8..f6bbc5f14 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -93,6 +93,7 @@ namespace port { // For use at db/file_indexer.h kLevelMaxIndex const uint32_t kMaxUint32 = UINT32_MAX; const int kMaxInt32 = INT32_MAX; +const int kMinInt32 = INT32_MIN; const int64_t kMaxInt64 = INT64_MAX; const uint64_t kMaxUint64 = UINT64_MAX; diff --git a/src.mk b/src.mk index f32bf6945..4b671c7b8 100644 --- a/src.mk +++ b/src.mk @@ -61,8 +61,8 @@ LIB_SOURCES = \ env/env.cc \ env/env_chroot.cc \ env/env_encryption.cc \ - env/env_encrypt2.cc \ env/env_hdfs.cc \ + env/env_openssl.cc \ env/env_posix.cc \ env/io_posix.cc \ env/mock_env.cc \ @@ -346,7 +346,6 @@ MAIN_SOURCES = \ db/write_callback_test.cc \ db/write_controller_test.cc \ env/env_basic_test.cc \ - env/env_encrypt2_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ memtable/inlineskiplist_test.cc \ diff --git a/util/build_version.cc b/util/build_version.cc index 8465b15ea..244273e55 100644 --- a/util/build_version.cc +++ b/util/build_version.cc @@ -1,4 +1,4 @@ #include "build_version.h" -const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:42f9f56c6fc201804d3905deb5f6337669578220"; -const char* 
rocksdb_build_git_date = "rocksdb_build_git_date:2020-04-22"; +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:174ccbda9a8f5a64afc6f976676322f7ad332b85"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:2020-09-06"; const char* rocksdb_build_compile_date = __DATE__; diff --git a/util/string_util.cc b/util/string_util.cc index f3581105e..d098ecb0b 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -5,25 +5,22 @@ // #include "util/string_util.h" -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - #include -#include #include #include #include #include +#include #include #include #include #include #include -#include "rocksdb/env.h" +#include "port/port.h" +#include "port/sys_time.h" #include "rocksdb/slice.h" -namespace rocksdb { +namespace ROCKSDB_NAMESPACE { const std::string kNullptrString = "nullptr"; @@ -142,6 +139,16 @@ std::string BytesToHumanString(uint64_t bytes) { return std::string(buf); } +std::string TimeToHumanString(int unixtime) { + char time_buffer[80]; + time_t rawtime = unixtime; + struct tm tInfo; + struct tm* timeinfo = localtime_r(&rawtime, &tInfo); + assert(timeinfo == &tInfo); + strftime(time_buffer, 80, "%c", timeinfo); + return std::string(time_buffer); +} + std::string EscapeString(const Slice& value) { std::string r; AppendEscapedStringTo(&r, value); @@ -256,6 +263,20 @@ std::string trim(const std::string& str) { return std::string(); } +bool EndsWith(const std::string& string, const std::string& pattern) { + size_t plen = pattern.size(); + size_t slen = string.size(); + if (plen <= slen) { + return string.compare(slen - plen, plen, pattern) == 0; + } else { + return false; + } +} + +bool StartsWith(const std::string& string, const std::string& pattern) { + return string.compare(0, pattern.size(), pattern) == 0; +} + #ifndef ROCKSDB_LITE bool ParseBoolean(const std::string& type, const std::string& value) { @@ -276,6 +297,15 @@ uint32_t ParseUint32(const std::string& value) { } } +int32_t 
ParseInt32(const std::string& value) { + int64_t num = ParseInt64(value); + if (num <= port::kMaxInt32 && num >= port::kMinInt32) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + #endif uint64_t ParseUint64(const std::string& value) { @@ -303,6 +333,31 @@ uint64_t ParseUint64(const std::string& value) { return num; } +int64_t ParseInt64(const std::string& value) { + size_t endchar; +#ifndef CYGWIN + int64_t num = std::stoll(value.c_str(), &endchar); +#else + char* endptr; + int64_t num = std::strtoll(value.c_str(), &endptr, 0); + endchar = endptr - value.c_str(); +#endif + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + int ParseInt(const std::string& value) { size_t endchar; #ifndef CYGWIN @@ -365,4 +420,4 @@ bool SerializeIntVector(const std::vector& vec, std::string* value) { return true; } -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE diff --git a/util/string_util.h b/util/string_util.h index b2bca40ac..5ff516cac 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -11,7 +11,9 @@ #include #include -namespace rocksdb { +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { class Slice; @@ -50,6 +52,10 @@ extern std::string NumberToHumanString(int64_t num); // ex: 1048576 -> 1.00 GB extern std::string BytesToHumanString(uint64_t bytes); +// Return a human-readable version of unix time +// ex: 1562116015 -> "Tue Jul 2 18:06:55 2019" +extern std::string TimeToHumanString(int unixtime); + // Append a human-readable time in micros. 
int AppendHumanMicros(uint64_t micros, char* output, int len, bool fixed_format); @@ -105,16 +111,26 @@ std::string UnescapeOptionString(const std::string& escaped_string); std::string trim(const std::string& str); +// Returns true if "string" ends with "pattern" +bool EndsWith(const std::string& string, const std::string& pattern); + +// Returns true if "string" starts with "pattern" +bool StartsWith(const std::string& string, const std::string& pattern); + #ifndef ROCKSDB_LITE bool ParseBoolean(const std::string& type, const std::string& value); uint32_t ParseUint32(const std::string& value); + +int32_t ParseInt32(const std::string& value); #endif uint64_t ParseUint64(const std::string& value); int ParseInt(const std::string& value); +int64_t ParseInt64(const std::string& value); + double ParseDouble(const std::string& value); size_t ParseSizeT(const std::string& value); @@ -125,4 +141,4 @@ bool SerializeIntVector(const std::vector& vec, std::string* value); extern const std::string kNullptrString; -} // namespace rocksdb +} // namespace ROCKSDB_NAMESPACE From ff46be73e3af0202825ffae743d590b688f5fbad Mon Sep 17 00:00:00 2001 From: matthewvon Date: Mon, 7 Sep 2020 12:34:35 -0400 Subject: [PATCH 55/57] update unit test (and fix what it found broken) --- CMakeLists.txt | 1 + Makefile | 3 +- TARGETS | 4 +- env/env_openssl.cc | 29 ++++++- ...v_encrypt2_test.cc => env_openssl_test.cc} | 86 +++++++++++-------- include/rocksdb/env_openssl.h | 6 +- src.mk | 1 + util/build_version.cc | 4 +- 8 files changed, 87 insertions(+), 47 deletions(-) rename env/{env_encrypt2_test.cc => env_openssl_test.cc} (92%) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5a518f00..5fadcfa14 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -919,6 +919,7 @@ if(WITH_TESTS) db/write_callback_test.cc db/write_controller_test.cc env/env_basic_test.cc + env/env_openssl_test.cc env/env_test.cc env/mock_env_test.cc memtable/inlineskiplist_test.cc diff --git a/Makefile b/Makefile index 
97db07077..045219a9a 100644 --- a/Makefile +++ b/Makefile @@ -419,6 +419,7 @@ TESTS = \ coding_test \ inlineskiplist_test \ env_basic_test \ + env_openssl_test \ env_test \ hash_test \ library_loader_test \ @@ -1310,7 +1311,7 @@ sim_cache_test: utilities/simulator_cache/sim_cache_test.o db/db_test_util.o $(L spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -env_encrypt2_test: env/env_encrypt2_test.o $(LIBOBJECTS) $(TESTHARNESS) +env_openssl_test: env/env_openssl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) env_mirror_test: utilities/env_mirror_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/TARGETS b/TARGETS index 3c55db41c..bace70b65 100644 --- a/TARGETS +++ b/TARGETS @@ -705,8 +705,8 @@ ROCKS_TESTS = [ "serial", ], [ - "env_encrypt2_test", - "env/env_encrypt2_test.cc", + "env_openssl_test", + "env/env_openssl_test.cc", "serial", ], [ diff --git a/env/env_openssl.cc b/env/env_openssl.cc index 81822ea9f..1a2075cc5 100644 --- a/env/env_openssl.cc +++ b/env/env_openssl.cc @@ -244,9 +244,10 @@ Status EncryptionProviderOpenSSL::CreateNewPrefix(const std::string& /*fname*/, int ret_val; memcpy(prefix, kOpenSSLEncryptMarker, sizeof(kOpenSSLEncryptMarker)); - *(prefix + sizeof(kOpenSSLEncryptMarker)) = kOpenSSLEncryptCodeVersion1; + *(prefix + sizeof(kOpenSSLEncryptMarker) - 1) = kOpenSSLEncryptCodeVersion1; PrefixVersion0* pf = {(PrefixVersion0*)(prefix + sizeof(OpenSSLEncryptMarker))}; + ReadLock lock(&key_lock_); memcpy(pf->key_description_, encrypt_write_.first.desc, sizeof(encrypt_write_.first.desc)); ret_val = crypto_shared->RAND_bytes((unsigned char*)&pf->nonce_, AES_BLOCK_SIZE); @@ -276,11 +277,12 @@ Status EncryptionProviderOpenSSL::CreateCipherStream( // for direct io, prefix size matched to one page to keep file contents aligned. 
if (kDefaultPageSize == prefix.size()) { if (prefix.starts_with(kOpenSSLEncryptMarker)) { - uint8_t code_version = (uint8_t)prefix[sizeof(kOpenSSLEncryptMarker)]; + uint8_t code_version = (uint8_t)*(prefix.data()+sizeof(kOpenSSLEncryptMarker)-1); switch (code_version) { case kOpenSSLEncryptCodeVersion1: { PrefixVersion0 * prefix_struct = (PrefixVersion0 *)(prefix.data() + sizeof(OpenSSLEncryptMarker)); ShaDescription desc(prefix_struct->key_description_, sizeof(PrefixVersion0::key_description_)); + ReadLock lock(&key_lock_); auto read_key = encrypt_read_.find(desc); if (encrypt_read_.end() != read_key) { @@ -306,6 +308,29 @@ Status EncryptionProviderOpenSSL::CreateCipherStream( return stat; } +Status EncryptionProviderOpenSSL::AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) { + Status stat; + + // it is possible for one or both to be invalid implying ... unencrypted writes + ShaDescription desc(descriptor); + AesCtrKey aes((const uint8_t*)cipher, len); + + WriteLock lock(&key_lock_); + + auto it = encrypt_read_.insert(std::pair(desc, aes)); + if (it.second) { + if (for_write) { + encrypt_write_ = std::pair(desc, aes); + } + } else { + stat = Status::InvalidArgument("Duplicate encryption key"); + } + + return stat; +} + + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/env/env_encrypt2_test.cc b/env/env_openssl_test.cc similarity index 92% rename from env/env_encrypt2_test.cc rename to env/env_openssl_test.cc index 5f6f05609..b96d77b3b 100644 --- a/env/env_encrypt2_test.cc +++ b/env/env_openssl_test.cc @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include "env/env_encrypt2_impl.h" +#include "env/env_openssl_impl.h" #include "rocksdb/options.h" #include "rocksdb/sst_file_writer.h" #include "util/testharness.h" @@ -12,10 +12,10 @@ namespace ROCKSDB_NAMESPACE { -class EnvEncrypt2_Sha1 {}; +class EncryptOpenSSL_Sha {}; -TEST(EnvEncrypt2_Sha1, Default) { - Sha1Description desc; +TEST(EncryptOpenSSL_Sha, Default) { + ShaDescription desc; ASSERT_FALSE(desc.IsValid()); for (size_t idx = 0; idx < sizeof(desc.desc); ++idx) { @@ -23,8 +23,8 @@ TEST(EnvEncrypt2_Sha1, Default) { } } -TEST(EnvEncrypt2_Sha1, Constructors) { - Sha1Description desc; +TEST(EncryptOpenSSL_Sha, Constructors) { + ShaDescription desc; // verify we know size of desc.desc ASSERT_TRUE(64 == sizeof(desc.desc)); @@ -34,38 +34,38 @@ TEST(EnvEncrypt2_Sha1, Constructors) { bytes[idx] = idx + 1; } - Sha1Description desc_bad1(bytes, 128); + ShaDescription desc_bad1(bytes, 128); ASSERT_FALSE(desc_bad1.IsValid()); - Sha1Description desc_bad2(bytes, 65); + ShaDescription desc_bad2(bytes, 65); ASSERT_FALSE(desc_bad2.IsValid()); - Sha1Description desc_good1(bytes, 64); + ShaDescription desc_good1(bytes, 64); ASSERT_TRUE(desc_good1.IsValid()); ptr = (uint8_t*)memchr(desc_good1.desc, 0, 64); ASSERT_TRUE(nullptr == ptr); - Sha1Description desc_good2(bytes, 63); + ShaDescription desc_good2(bytes, 63); ASSERT_TRUE(desc_good2.IsValid()); ptr = (uint8_t*)memchr(desc_good2.desc, 0, 64); ASSERT_TRUE(&desc_good2.desc[63] == ptr); - Sha1Description desc_good3(bytes, 1); + ShaDescription desc_good3(bytes, 1); ASSERT_TRUE(desc_good3.IsValid()); ptr = (uint8_t*)memchr(desc_good3.desc, 0, 64); ASSERT_TRUE(&desc_good3.desc[1] == ptr); - Sha1Description desc_good4(bytes, 0); + ShaDescription desc_good4(bytes, 0); ASSERT_TRUE(desc_good4.IsValid()); ptr = (uint8_t*)memchr(desc_good4.desc, 0, 64); ASSERT_TRUE(&desc_good4.desc[0] == ptr); - Sha1Description desc_str1(""); + ShaDescription desc_str1(""); ASSERT_FALSE(desc_str1.IsValid()); uint8_t md2[] = {0x35, 0x6a, 0x19, 0x2b, 
0x79, 0x13, 0xb0, 0x4c, 0x54, 0x57, 0x4d, 0x18, 0xc2, 0x8d, 0x46, 0xe6, 0x39, 0x54, 0x28, 0xab}; - Sha1Description desc_str2("1"); + ShaDescription desc_str2("1"); ASSERT_TRUE(desc_str2.IsValid()); ASSERT_TRUE(0 == memcmp(md2, desc_str2.desc, sizeof(md2))); for (size_t idx = sizeof(md2); idx < sizeof(desc_str2.desc); ++idx) { @@ -74,7 +74,7 @@ TEST(EnvEncrypt2_Sha1, Constructors) { uint8_t md3[] = {0x7b, 0x52, 0x00, 0x9b, 0x64, 0xfd, 0x0a, 0x2a, 0x49, 0xe6, 0xd8, 0xa9, 0x39, 0x75, 0x30, 0x77, 0x79, 0x2b, 0x05, 0x54}; - Sha1Description desc_str3("12"); + ShaDescription desc_str3("12"); ASSERT_TRUE(desc_str3.IsValid()); ASSERT_TRUE(0 == memcmp(md3, desc_str3.desc, sizeof(md3))); for (size_t idx = sizeof(md3); idx < sizeof(desc_str3.desc); ++idx) { @@ -82,11 +82,11 @@ TEST(EnvEncrypt2_Sha1, Constructors) { } } -TEST(EnvEncrypt2_Sha1, Copy) { +TEST(EncryptOpenSSL_Sha, Copy) { // assignment uint8_t md1[] = {0xdb, 0x8a, 0xc1, 0xc2, 0x59, 0xeb, 0x89, 0xd4, 0xa1, 0x31, 0xb2, 0x53, 0xba, 0xcf, 0xca, 0x5f, 0x31, 0x9d, 0x54, 0xf2}; - Sha1Description desc1("HelloWorld"), desc2; + ShaDescription desc1("HelloWorld"), desc2; ASSERT_TRUE(desc1.IsValid()); ASSERT_FALSE(desc2.IsValid()); @@ -105,10 +105,10 @@ TEST(EnvEncrypt2_Sha1, Copy) { // copy constructor uint8_t md3[] = {0x17, 0x09, 0xcc, 0x51, 0x65, 0xf5, 0x50, 0x4d, 0x46, 0xde, 0x2f, 0x3a, 0x7a, 0xff, 0x57, 0x45, 0x20, 0x8a, 0xed, 0x44}; - Sha1Description desc3("A little be longer title for a key"); + ShaDescription desc3("A little be longer title for a key"); ASSERT_TRUE(desc3.IsValid()); - Sha1Description desc4(desc3); + ShaDescription desc4(desc3); ASSERT_TRUE(desc3.IsValid()); ASSERT_TRUE(desc4.IsValid()); ASSERT_TRUE(0 == memcmp(md3, desc3.desc, sizeof(md3))); @@ -121,9 +121,9 @@ TEST(EnvEncrypt2_Sha1, Copy) { } } -class EnvEncrypt2_Key {}; +class EncryptOpenSSL_Key {}; -TEST(EnvEncrypt2_Key, Default) { +TEST(EncryptOpenSSL_Key, Default) { AesCtrKey key; ASSERT_FALSE(key.IsValid()); @@ -132,7 +132,7 @@ 
TEST(EnvEncrypt2_Key, Default) { } } -TEST(EnvEncrypt2_Key, Constructors) { +TEST(EncryptOpenSSL_Key, Constructors) { AesCtrKey key; // verify we know size of key.key @@ -191,7 +191,7 @@ TEST(EnvEncrypt2_Key, Constructors) { ASSERT_TRUE(0 == memcmp(key4, key_str4.key, sizeof(key4))); } -TEST(EnvEncrypt2_Key, Copy) { +TEST(EncryptOpenSSL_Key, Copy) { // assignment uint8_t data1[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, @@ -222,7 +222,7 @@ TEST(EnvEncrypt2_Key, Copy) { ASSERT_TRUE(0 == memcmp(data3, key4.key, sizeof(data3))); } -class EnvEncrypt2_Provider {}; +class EncryptOpenSSL_Provider {}; class CipherStreamWrapper : public BlockAccessCipherStream { public: @@ -234,7 +234,7 @@ class CipherStreamWrapper : public BlockAccessCipherStream { } }; -TEST(EnvEncrypt2_Provider, NistExamples) { +TEST(EncryptOpenSSL_Provider, NistExamples) { uint8_t key[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, @@ -262,10 +262,18 @@ TEST(EnvEncrypt2_Provider, NistExamples) { uint8_t cypher4[] = {0xdf, 0xc9, 0xc5, 0x8d, 0xb6, 0x7a, 0xad, 0xa6, 0x13, 0xc2, 0xdd, 0x08, 0x45, 0x79, 0x41, 0xa6}; - CTREncryptionProviderV2 provider("NistExampleKey", key, sizeof(key)); + EncryptionProviderOpenSSL provider("NistExampleKey", key, sizeof(key)); + ShaDescription desc("NistExampleKey"); + char prefix[4096]; + memcpy(prefix, "Encrypt1", 8); + memcpy(&prefix[8], desc.desc, EVP_MAX_MD_SIZE); + memcpy(&prefix[8+EVP_MAX_MD_SIZE], init, sizeof(init)); + rocksdb::Slice pref_slice(prefix, 4096); + std::unique_ptr stream; + rocksdb::Status stat = provider.CreateCipherStream("filename.txt", EnvOptions(), pref_slice, &stream); - std::unique_ptr stream( - provider.CreateCipherStream2(1, init)); + ASSERT_TRUE(stat.ok()); + ASSERT_TRUE(nullptr != stream.get()); uint64_t offset; uint8_t block[sizeof(plain1)]; @@ -327,7 +335,7 @@ TEST(EnvEncrypt2_Provider, 
NistExamples) { ASSERT_TRUE(0 == memcmp(plain4, block, sizeof(block))); } -TEST(EnvEncrypt2_Provider, NistSingleCall) { +TEST(EncryptOpenSSL_Provider, NistSingleCall) { uint8_t key[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, @@ -368,7 +376,7 @@ TEST(EnvEncrypt2_Provider, NistSingleCall) { ASSERT_TRUE(0 == memcmp(cypher1, output, sizeof(output))); } -TEST(EnvEncrypt2_Provider, BigEndianAdd) { +TEST(EncryptOpenSSL_Provider, BigEndianAdd) { uint8_t nounce1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; uint8_t expect1[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -463,7 +471,7 @@ class EnvMoreTestWithParam : public EnvBasicTestWithParam {}; // next statements run env test against encrypt_2 code. static std::string KeyName = {"A key name"}; -static Sha1Description KeyDesc(KeyName); +static ShaDescription KeyDesc(KeyName); // this key is from // https://nvlpubs.nist.gov/nistpubs/Legacy/SP/nistspecialpublication800-38a.pdf, @@ -472,16 +480,20 @@ static uint8_t key256[] = {0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81, 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4}; -std::shared_ptr encrypt2_provider_ctr( - new CTREncryptionProviderV2(KeyName, key256, 32)); -static EncryptedEnvV2::ReadKeys encrypt_readers = { +std::shared_ptr encrypt2_provider_ctr( + new EncryptionProviderOpenSSL(KeyName, key256, 32)); + +//encrypt2_provider_ctr->AddCipher("A key name", (char *)key256, 32, true); + +#if 0 +static Encryption::ReadKeys encrypt_readers = { {KeyDesc, encrypt2_provider_ctr}}; static EncryptedEnvV2::WriteKey encrypt_writer = {KeyDesc, encrypt2_provider_ctr}; - +#endif static std::unique_ptr encrypt2_env(new NormalizingEnvWrapper( - EncryptedEnvV2::Default(encrypt_readers, encrypt_writer))); + 
NewEncryptedEnv(Env::Default(), encrypt2_provider_ctr))); INSTANTIATE_TEST_CASE_P(EncryptedEnvV2, EnvBasicTestWithParam, ::testing::Values(encrypt2_env.get())); @@ -492,7 +504,7 @@ TEST_P(EnvBasicTestWithParam, Basics) { std::vector children; // kill warning - std::string warn(kEncryptMarker); + std::string warn(kOpenSSLEncryptMarker); warn.length(); // Check that the directory is empty. diff --git a/include/rocksdb/env_openssl.h b/include/rocksdb/env_openssl.h index 99948fdfa..dc53cb834 100644 --- a/include/rocksdb/env_openssl.h +++ b/include/rocksdb/env_openssl.h @@ -161,8 +161,8 @@ class EncryptionProviderOpenSSL : public EncryptionProvider { Status CreateNewPrefix(const std::string& /*fname*/, char* prefix, size_t prefixLength) const override; - virtual Status AddCipher(const std::string& descriptor, const char* cipher, - size_t len, bool for_write) = 0; + Status AddCipher(const std::string& descriptor, const char* cipher, + size_t len, bool for_write) override; Status CreateCipherStream( const std::string& /*fname*/, const EnvOptions& /*options*/, @@ -171,10 +171,10 @@ class EncryptionProviderOpenSSL : public EncryptionProvider { bool IsValid() const { return valid_; }; - protected: using WriteKey = std::pair; using ReadKeys = std::map; + protected: ReadKeys encrypt_read_; WriteKey encrypt_write_; mutable port::RWMutex key_lock_; diff --git a/src.mk b/src.mk index 4b671c7b8..42d6e7b48 100644 --- a/src.mk +++ b/src.mk @@ -346,6 +346,7 @@ MAIN_SOURCES = \ db/write_callback_test.cc \ db/write_controller_test.cc \ env/env_basic_test.cc \ + env/env_openssl_test.cc \ env/env_test.cc \ env/mock_env_test.cc \ memtable/inlineskiplist_test.cc \ diff --git a/util/build_version.cc b/util/build_version.cc index 244273e55..b71f16143 100644 --- a/util/build_version.cc +++ b/util/build_version.cc @@ -1,4 +1,4 @@ #include "build_version.h" -const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:174ccbda9a8f5a64afc6f976676322f7ad332b85"; -const char* rocksdb_build_git_date 
= "rocksdb_build_git_date:2020-09-06"; +const char* rocksdb_build_git_sha = "rocksdb_build_git_sha:7ecb24640179d6d9305e8cdf25676169f5a43ce8"; +const char* rocksdb_build_git_date = "rocksdb_build_git_date:2020-09-07"; const char* rocksdb_build_compile_date = __DATE__; From 743b314fdc78e55f62146328a04bf26b2290e23e Mon Sep 17 00:00:00 2001 From: matthewvon Date: Tue, 8 Sep 2020 16:01:03 -0400 Subject: [PATCH 56/57] add a paranoid test for all potential partial block starts of Encrypt --- env/env_openssl_test.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/env/env_openssl_test.cc b/env/env_openssl_test.cc index b96d77b3b..d8bccd22a 100644 --- a/env/env_openssl_test.cc +++ b/env/env_openssl_test.cc @@ -374,6 +374,30 @@ TEST(EncryptOpenSSL_Provider, NistSingleCall) { Status status = stream.Encrypt(offset, (char*)output, sizeof(plain1)); ASSERT_TRUE(status.ok()); ASSERT_TRUE(0 == memcmp(cypher1, output, sizeof(output))); + + // + // check partial blocks fore and aft + // + uint8_t scratch[sizeof(plain1) + 32]; // empty block before and after + + // outer loop is starting offset, inner loop is size of encryption + for (size_t outer = 0; outer < 16; ++outer) { + for (size_t inner = 0; inner < (64 - outer); ++ inner) { + memset(scratch, 0, sizeof(scratch)); + memcpy(&scratch[16 + outer], &plain1[outer], inner); + status = stream.Encrypt(outer, (char*)&scratch[16 + outer], inner); + ASSERT_TRUE(status.ok()); + ASSERT_TRUE(0 == memcmp(&cypher1[outer], &scratch[16 + outer], inner)); + + // the test depends upon fact that cypher1 contains no 0x00 byte + for (size_t loop=0; loop < 16 + outer; ++loop) { + ASSERT_TRUE('\0' == scratch[loop]); + } + for (size_t loop=(outer+inner+16); loop < 96; ++loop) { + ASSERT_TRUE('\0' == scratch[loop]); + } + } + } } TEST(EncryptOpenSSL_Provider, BigEndianAdd) { From a894db0d69c032434d2b811c2a487c70476985cc Mon Sep 17 00:00:00 2001 From: matthewvon Date: Tue, 8 Sep 2020 16:04:58 -0400 Subject: [PATCH 57/57] have 
paranoid test execute across two offset (32 bytes) instead of just 1 --- env/env_openssl_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/env/env_openssl_test.cc b/env/env_openssl_test.cc index d8bccd22a..fa2f60ca4 100644 --- a/env/env_openssl_test.cc +++ b/env/env_openssl_test.cc @@ -381,7 +381,7 @@ TEST(EncryptOpenSSL_Provider, NistSingleCall) { uint8_t scratch[sizeof(plain1) + 32]; // empty block before and after // outer loop is starting offset, inner loop is size of encryption - for (size_t outer = 0; outer < 16; ++outer) { + for (size_t outer = 0; outer < 32; ++outer) { for (size_t inner = 0; inner < (64 - outer); ++ inner) { memset(scratch, 0, sizeof(scratch)); memcpy(&scratch[16 + outer], &plain1[outer], inner);