Merge pull request #32210 from vespa-engine/balder/test-and-fix-repea…

…ted-repopulation - Add test for repopulation.
vespa-engine · Aug 21, 2024 · b666cc6 · b666cc6
2 parents 6b7c9d2 + 2691e70
commit b666cc6
Show file tree

Hide file tree

Showing 10 changed files with 119 additions and 49 deletions.
diff --git a/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp b/searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp
@@ -1,11 +1,15 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 #include <vespa/vespalib/testkit/test_kit.h>
 #include <vespa/searchlib/common/condensedbitvectors.h>
+#include <vespa/searchlib/common/bitvectorcache.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
 #include <vespa/log/log.h>
 
 LOG_SETUP("condensedbitvector_test");
 
 using search::CondensedBitVector;
+using search::BitVectorCache;
+using search::PopulateInterface;
 using vespalib::GenerationHolder;
 
 TEST("Verify state after init")
@@ -45,4 +49,88 @@ TEST("Verify set/get")
     EXPECT_EQUAL(1u, sum);
 }
 
+namespace {
+using DocIds = std::vector<int32_t>;
+using KeyDocIdsMap = vespalib::hash_map<uint64_t, DocIds>;
+struct DocIdIterator : public PopulateInterface::Iterator {
+    explicit DocIdIterator(const DocIds & docs) : _docs(docs), _index(0) {}
+    int32_t getNext() override {
+        return (_index < _docs.size()) ? _docs[_index++] : -1;
+    }
+
+    const DocIds & _docs;
+    uint32_t       _index;
+};
+class Populater : public PopulateInterface {
+public:
+    explicit Populater(const KeyDocIdsMap & m)
+        : _map(m),
+          _empty()
+    {}
+    Iterator::UP lookup(uint64_t key) const override {
+        return _map.contains(key)
+               ? std::make_unique<DocIdIterator>(_map[key])
+               : std::make_unique<DocIdIterator>(_empty);
+    }
+private:
+    const KeyDocIdsMap & _map;
+    DocIds _empty;
+};
+
+KeyDocIdsMap
+create(uint32_t numDocs, uint32_t numKeys, uint32_t seed) {
+    KeyDocIdsMap m;
+    std::srand(seed);
+    for (uint32_t k = 0; k < numKeys; k++) {
+        DocIds & docIds = m[k];
+        for (uint32_t d=0, count=(std::rand()%numDocs); d < count; d++) {
+            docIds.push_back(std::rand()%numDocs);
+        }
+    }
+    return m;
+}
+
+}
+
+TEST("Test repopulation of bitvector cache") {
+    GenerationHolder genHolder;
+    BitVectorCache cache(genHolder);
+    constexpr uint32_t numDocs = 100;
+    std::vector<uint8_t> countVector(numDocs);
+    EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty());
+    cache.populate(numDocs, Populater(create(numDocs, 1, 1)));
+    EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty());
+    cache.requirePopulation();
+    cache.populate(numDocs, Populater(create(numDocs, 1, 1)));
+    auto keySet = cache.lookupCachedSet({{0,5}, {1,10}});
+    EXPECT_EQUAL(1u, keySet.size());
+    EXPECT_TRUE(keySet.contains(0));
+    cache.computeCountVector(keySet, countVector);
+
+    std::vector<std::pair<uint64_t, uint64_t>> keys;
+    for (uint64_t i=0; i < 10; i++) {
+        keys.emplace_back(i, 10+i);
+    }
+    cache.lookupCachedSet(keys);
+    for (size_t i = 2; i < keys.size(); i++) {
+        cache.requirePopulation();
+        cache.populate(numDocs, Populater(create(numDocs, i, i)));
+        keySet = cache.lookupCachedSet(keys);
+        EXPECT_EQUAL(keys.size(), keySet.size());
+        cache.computeCountVector(keySet, countVector);
+    }
+
+    keySet = cache.lookupCachedSet(keys);
+    cache.computeCountVector(keySet, countVector);
+    cache.set(1, 7, false);
+    cache.computeCountVector(keySet, countVector);
+    uint8_t prev_7 = countVector[7];
+    cache.set(1, 7, true);
+    cache.computeCountVector(keySet, countVector);
+    EXPECT_EQUAL(prev_7 + 1, countVector[7]);
+    cache.set(1, 7, false);
+    cache.computeCountVector(keySet, countVector);
+    EXPECT_EQUAL(prev_7, countVector[7]);
+}
+
 TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp b/searchlib/src/vespa/searchlib/common/bitvectorcache.cpp
@@ -17,51 +17,44 @@ BitVectorCache::BitVectorCache(GenerationHolder &genHolder)
       _needPopulation(false),
       _mutex(),
       _keys(),
-      _chunks(),
+      _chunk(),
       _genHolder(genHolder)
 {
 }
 
 BitVectorCache::~BitVectorCache() = default;
 
 void
-BitVectorCache::computeCountVector(KeySet & keys, CountVector & v) const
+BitVectorCache::computeCountVector(KeySet & keys, std::span<uint8_t> v) const
 {
     std::vector<Key> notFound;
-    std::vector<CondensedBitVector::KeySet> keySets;
-    ChunkV chunks;
+    CondensedBitVector::KeySet keySet;
+    CondensedBitVector::SP chunk;
     {
         std::shared_lock guard(_mutex);
-        keySets.resize(_chunks.size());
         auto end = _keys.end();
         for (Key k : keys) {
             auto found = _keys.find(k);
             if (found != end) {
                 const KeyMeta & m = found->second;
                 if (m.isCached()) {
-                    keySets[m.chunkId()].insert(m.chunkIndex());
+                    keySet.insert(m.chunkIndex());
                 } else {
                     notFound.push_back(k);
                 }
             } else {
                 notFound.push_back(k);
             }
         }
-        chunks = _chunks;
+        chunk = _chunk;
     }
     for (Key k : notFound) {
         keys.erase(k);
     }
-    size_t index(0);
-    if (chunks.empty()) {
+    if (!chunk) {
         memset(&v[0], 0, v.size());
-    }
-    for (const auto & chunk : chunks) {
-        if (index == 0) {
-            chunk->initializeCountVector(keySets[index++], v);
-        } else {
-            chunk->addCountVector(keySets[index++], v);
-        }
+    } else {
+        chunk->initializeCountVector(keySet, v);
     }
 }
 
@@ -117,7 +110,7 @@ bool
 BitVectorCache::hasCostChanged(const std::shared_lock<std::shared_mutex> & guard)
 {
     (void) guard;
-    if ( ! _chunks.empty()) {
+    if (_chunk) {
         SortedKeyMeta sorted(getSorted(_keys));
         double oldCached(0);
         for (auto & e : sorted) {
@@ -127,7 +120,7 @@ BitVectorCache::hasCostChanged(const std::shared_lock<std::shared_mutex> & guard
             }
         }
         double newCached(0);
-        for (size_t i(0); i < sorted.size() && i < _chunks[0]->getKeyCapacity(); i++) {
+        for (size_t i(0); i < sorted.size() && i < _chunk->getKeyCapacity(); i++) {
             const KeyMeta & m = *sorted[i].second;
             newCached += m.cost();
         }
@@ -182,15 +175,15 @@ void
 BitVectorCache::populate(uint32_t sz, const PopulateInterface & lookup)
 {
     if (!needPopulation()) return;
+    CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder));
     std::unique_lock guard(_mutex);
     Key2Index newKeys(_keys);
     guard.unlock();
 
-    CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder));
     populate(newKeys, *chunk, lookup);
 
     guard.lock();
-    _chunks.push_back(std::move(chunk));
+    _chunk = std::move(chunk);
     _keys.swap(newKeys);
     _needPopulation = false;
 }
@@ -203,33 +196,27 @@ BitVectorCache::set(Key key, uint32_t index, bool v)
     if (found != _keys.end()) {
         const KeyMeta & m(found->second);
         if (m.isCached()) {
-            _chunks[m.chunkId()]->set(m.chunkIndex(), index, v);
+            _chunk->set(m.chunkIndex(), index, v);
         }
     }
 }
 
-bool
-BitVectorCache::get(Key key, uint32_t index) const
-{
-    (void) key; (void) index;
-    return false;
-}
-
 void
 BitVectorCache::removeIndex(uint32_t index)
 {
     std::unique_lock guard(_mutex);
-    for (auto & chunk : _chunks) {
-        chunk->clearIndex(index);
+    if (_chunk) {
+        _chunk->clearIndex(index);
     }
 }
 
 
 void
 BitVectorCache::adjustDocIdLimit(uint32_t docId)
 {
-    for (auto &chunk : _chunks) {
-        chunk->adjustDocIdLimit(docId);
+    std::unique_lock guard(_mutex);
+    if (_chunk) {
+        _chunk->adjustDocIdLimit(docId);
     }
 }
 

diff --git a/searchlib/src/vespa/searchlib/common/bitvectorcache.h b/searchlib/src/vespa/searchlib/common/bitvectorcache.h
@@ -27,15 +27,13 @@ class BitVectorCache
     using Key = uint64_t;
     using KeySet = vespalib::hash_set<Key>;
     using KeyAndCountSet = std::vector<std::pair<Key, size_t>>;
-    using CountVector = CondensedBitVector::CountVector;
     using GenerationHolder = vespalib::GenerationHolder;
 
     explicit BitVectorCache(GenerationHolder &genHolder);
     ~BitVectorCache();
-    void computeCountVector(KeySet & keys, CountVector & v) const;
+    void computeCountVector(KeySet & keys, std::span<uint8_t> v) const;
     KeySet lookupCachedSet(const KeyAndCountSet & keys);
     void set(Key key, uint32_t index, bool v);
-    bool get(Key key, uint32_t index) const;
     void removeIndex(uint32_t index);
     void adjustDocIdLimit(uint32_t docId);
     void populate(uint32_t count, const PopulateInterface &);
@@ -67,7 +65,6 @@ class BitVectorCache
         bool     isCached()  const { return _chunkId >= 0; }
         size_t   bitCount()  const { return _bitCount; }
         size_t chunkIndex()  const { return _chunkIndex; }
-        size_t    chunkId()  const { return _chunkId; }
         size_t lookupCount() const { return _lookupCount.load(std::memory_order_relaxed); }
         KeyMeta &  lookup() { _lookupCount++; return *this; }
         KeyMeta &   bitCount(uint32_t v) {   _bitCount = v; return *this; }
@@ -82,7 +79,6 @@ class BitVectorCache
     };
     using Key2Index = vespalib::hash_map<Key, KeyMeta>;
     using SortedKeyMeta = std::vector<std::pair<Key, KeyMeta *>>;
-    using ChunkV = std::vector<CondensedBitVector::SP>;
 
     VESPA_DLL_LOCAL static SortedKeyMeta getSorted(Key2Index & keys);
     VESPA_DLL_LOCAL static void populate(Key2Index & newKeys, CondensedBitVector & chunk, const PopulateInterface & lookup);
@@ -92,7 +88,7 @@ class BitVectorCache
     std::atomic<bool>         _needPopulation;
     mutable std::shared_mutex _mutex;
     Key2Index                 _keys;
-    ChunkV                    _chunks;
+    CondensedBitVector::SP    _chunk;
     GenerationHolder         &_genHolder;
 };
 

diff --git a/searchlib/src/vespa/searchlib/common/condensedbitvectors.h b/searchlib/src/vespa/searchlib/common/condensedbitvectors.h
@@ -14,7 +14,6 @@ class CondensedBitVector
     using SP = std::shared_ptr<CondensedBitVector>;
     using Key = uint32_t;
     using KeySet = std::set<Key>;
-    using CountVector = std::span<uint8_t>;
 
     virtual ~CondensedBitVector();
 

diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp b/searchlib/src/vespa/searchlib/predicate/predicate_index.cpp
@@ -283,7 +283,7 @@ PredicateIndex::lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) con
 }
 
 void
-PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const
+PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, std::span<uint8_t> v) const
 {
     _cache.computeCountVector(keys, v);
 }

diff --git a/searchlib/src/vespa/searchlib/predicate/predicate_index.h b/searchlib/src/vespa/searchlib/predicate/predicate_index.h
@@ -98,7 +98,7 @@ class PredicateIndex : public PopulateInterface {
 
     void populateIfNeeded(size_t doc_id_limit);
     BitVectorCache::KeySet lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) const;
-    void computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const;
+    void computeCountVector(BitVectorCache::KeySet & keys, std::span<uint8_t> v) const;
 
     /*
      * Adjust size of structures to have space for docId.

diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.cpp
@@ -265,7 +265,7 @@ PredicateBlueprint::fetchPostings(const ExecuteInfo &) {
         PredicateAttribute::MinFeatureHandle mfh = predicate_attribute().getMinFeatureVector();
         Alloc kv(Alloc::alloc(mfh.second, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE*4));
         _kVBacking.swap(kv);
-        _kV = BitVectorCache::CountVector(static_cast<uint8_t *>(_kVBacking.get()), mfh.second);
+        _kV = std::span<uint8_t>(static_cast<uint8_t *>(_kVBacking.get()), mfh.second);
         _index.computeCountVector(_cachedFeatures, _kV);
         for (const auto & entry : _bounds_dict_entries) {
             addBoundsPostingToK(entry.feature);

diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h
@@ -62,7 +62,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint {
     }
 
     // Exposed for testing
-    const BitVectorCache::CountVector &  getKV() const { return _kV; }
+    std::span<uint8_t>  getKV() const { return _kV; }
     const BitVectorCache::KeySet &       getCachedFeatures() const { return _cachedFeatures; }
 private:
     using BTreeIterator = predicate::SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator;
@@ -85,7 +85,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint {
     const PredicateAttribute        & _attribute;
     const predicate::PredicateIndex &_index;
     Alloc                            _kVBacking;
-    BitVectorCache::CountVector      _kV;
+    std::span<uint8_t>               _kV;
     BitVectorCache::KeySet           _cachedFeatures;
 
     std::vector<IntervalEntry>       _interval_dict_entries;

diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp b/searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp
@@ -132,10 +132,10 @@ SkipMinFeature::create(const uint8_t * min_feature, const uint8_t * kv, size_t s
 PredicateSearch::PredicateSearch(const uint8_t * minFeatureVector,
                                  const IntervalRange * interval_range_vector,
                                  IntervalRange max_interval_range,
-                                 CondensedBitVector::CountVector kV,
+                                 std::span<uint8_t> kV,
                                  vector<PredicatePostingList::UP> posting_lists,
                                  const fef::TermFieldMatchDataArray &tfmda)
-    : _skip(SkipMinFeature::create(minFeatureVector, &kV[0], kV.size())),
+    : _skip(SkipMinFeature::create(minFeatureVector, kV.data(), kV.size())),
       _posting_lists(std::move(posting_lists)),
       _sorted_indexes(_posting_lists.size()),
       _sorted_indexes_merge_buffer(_posting_lists.size()),

diff --git a/searchlib/src/vespa/searchlib/queryeval/predicate_search.h b/searchlib/src/vespa/searchlib/queryeval/predicate_search.h
@@ -19,7 +19,7 @@ class SkipMinFeature
 {
 public:
     using UP = std::unique_ptr<SkipMinFeature>;
-    virtual ~SkipMinFeature() { }
+    virtual ~SkipMinFeature() = default;
     VESPA_DLL_LOCAL virtual uint32_t next() = 0;
     static SkipMinFeature::UP create(const uint8_t * min_feature, const uint8_t * kv, size_t sz);
 };
@@ -54,10 +54,10 @@ class PredicateSearch : public SearchIterator {
     PredicateSearch(const uint8_t * minFeature,
                     const IntervalRange * interval_range_vector,
                     IntervalRange max_interval_range,
-                    CondensedBitVector::CountVector kV,
+                    std::span<uint8_t> kV,
                     std::vector<predicate::PredicatePostingList::UP> posting_lists,
                     const fef::TermFieldMatchDataArray &tfmda);
-    ~PredicateSearch();
+    ~PredicateSearch() override;
 
     void doSeek(uint32_t doc_id) override;
     void doUnpack(uint32_t doc_id) override;