Skip to content

Commit

Permalink
Merge pull request #32210 from vespa-engine/balder/test-and-fix-repea…
Browse files Browse the repository at this point in the history
…ted-repopulation

- Add test for repopulation.
  • Loading branch information
baldersheim authored Aug 21, 2024
2 parents 6b7c9d2 + 2691e70 commit b666cc6
Show file tree
Hide file tree
Showing 10 changed files with 119 additions and 49 deletions.
88 changes: 88 additions & 0 deletions searchlib/src/tests/common/bitvector/condensedbitvector_test.cpp
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/vespalib/testkit/test_kit.h>
#include <vespa/searchlib/common/condensedbitvectors.h>
#include <vespa/searchlib/common/bitvectorcache.h>
#include <vespa/vespalib/stllike/hash_map.hpp>
#include <vespa/log/log.h>

LOG_SETUP("condensedbitvector_test");

using search::CondensedBitVector;
using search::BitVectorCache;
using search::PopulateInterface;
using vespalib::GenerationHolder;

TEST("Verify state after init")
Expand Down Expand Up @@ -45,4 +49,88 @@ TEST("Verify set/get")
EXPECT_EQUAL(1u, sum);
}

namespace {
using DocIds = std::vector<int32_t>;
using KeyDocIdsMap = vespalib::hash_map<uint64_t, DocIds>;
struct DocIdIterator : public PopulateInterface::Iterator {
explicit DocIdIterator(const DocIds & docs) : _docs(docs), _index(0) {}
int32_t getNext() override {
return (_index < _docs.size()) ? _docs[_index++] : -1;
}

const DocIds & _docs;
uint32_t _index;
};
class Populater : public PopulateInterface {
public:
explicit Populater(const KeyDocIdsMap & m)
: _map(m),
_empty()
{}
Iterator::UP lookup(uint64_t key) const override {
return _map.contains(key)
? std::make_unique<DocIdIterator>(_map[key])
: std::make_unique<DocIdIterator>(_empty);
}
private:
const KeyDocIdsMap & _map;
DocIds _empty;
};

KeyDocIdsMap
create(uint32_t numDocs, uint32_t numKeys, uint32_t seed) {
KeyDocIdsMap m;
std::srand(seed);
for (uint32_t k = 0; k < numKeys; k++) {
DocIds & docIds = m[k];
for (uint32_t d=0, count=(std::rand()%numDocs); d < count; d++) {
docIds.push_back(std::rand()%numDocs);
}
}
return m;
}

}

TEST("Test repopulation of bitvector cache") {
GenerationHolder genHolder;
BitVectorCache cache(genHolder);
constexpr uint32_t numDocs = 100;
std::vector<uint8_t> countVector(numDocs);
EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty());
cache.populate(numDocs, Populater(create(numDocs, 1, 1)));
EXPECT_TRUE(cache.lookupCachedSet({{0,5}}).empty());
cache.requirePopulation();
cache.populate(numDocs, Populater(create(numDocs, 1, 1)));
auto keySet = cache.lookupCachedSet({{0,5}, {1,10}});
EXPECT_EQUAL(1u, keySet.size());
EXPECT_TRUE(keySet.contains(0));
cache.computeCountVector(keySet, countVector);

std::vector<std::pair<uint64_t, uint64_t>> keys;
for (uint64_t i=0; i < 10; i++) {
keys.emplace_back(i, 10+i);
}
cache.lookupCachedSet(keys);
for (size_t i = 2; i < keys.size(); i++) {
cache.requirePopulation();
cache.populate(numDocs, Populater(create(numDocs, i, i)));
keySet = cache.lookupCachedSet(keys);
EXPECT_EQUAL(keys.size(), keySet.size());
cache.computeCountVector(keySet, countVector);
}

keySet = cache.lookupCachedSet(keys);
cache.computeCountVector(keySet, countVector);
cache.set(1, 7, false);
cache.computeCountVector(keySet, countVector);
uint8_t prev_7 = countVector[7];
cache.set(1, 7, true);
cache.computeCountVector(keySet, countVector);
EXPECT_EQUAL(prev_7 + 1, countVector[7]);
cache.set(1, 7, false);
cache.computeCountVector(keySet, countVector);
EXPECT_EQUAL(prev_7, countVector[7]);
}

TEST_MAIN() { TEST_RUN_ALL(); }
51 changes: 19 additions & 32 deletions searchlib/src/vespa/searchlib/common/bitvectorcache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,44 @@ BitVectorCache::BitVectorCache(GenerationHolder &genHolder)
_needPopulation(false),
_mutex(),
_keys(),
_chunks(),
_chunk(),
_genHolder(genHolder)
{
}

BitVectorCache::~BitVectorCache() = default;

void
BitVectorCache::computeCountVector(KeySet & keys, CountVector & v) const
BitVectorCache::computeCountVector(KeySet & keys, std::span<uint8_t> v) const
{
std::vector<Key> notFound;
std::vector<CondensedBitVector::KeySet> keySets;
ChunkV chunks;
CondensedBitVector::KeySet keySet;
CondensedBitVector::SP chunk;
{
std::shared_lock guard(_mutex);
keySets.resize(_chunks.size());
auto end = _keys.end();
for (Key k : keys) {
auto found = _keys.find(k);
if (found != end) {
const KeyMeta & m = found->second;
if (m.isCached()) {
keySets[m.chunkId()].insert(m.chunkIndex());
keySet.insert(m.chunkIndex());
} else {
notFound.push_back(k);
}
} else {
notFound.push_back(k);
}
}
chunks = _chunks;
chunk = _chunk;
}
for (Key k : notFound) {
keys.erase(k);
}
size_t index(0);
if (chunks.empty()) {
if (!chunk) {
memset(&v[0], 0, v.size());
}
for (const auto & chunk : chunks) {
if (index == 0) {
chunk->initializeCountVector(keySets[index++], v);
} else {
chunk->addCountVector(keySets[index++], v);
}
} else {
chunk->initializeCountVector(keySet, v);
}
}

Expand Down Expand Up @@ -117,7 +110,7 @@ bool
BitVectorCache::hasCostChanged(const std::shared_lock<std::shared_mutex> & guard)
{
(void) guard;
if ( ! _chunks.empty()) {
if (_chunk) {
SortedKeyMeta sorted(getSorted(_keys));
double oldCached(0);
for (auto & e : sorted) {
Expand All @@ -127,7 +120,7 @@ BitVectorCache::hasCostChanged(const std::shared_lock<std::shared_mutex> & guard
}
}
double newCached(0);
for (size_t i(0); i < sorted.size() && i < _chunks[0]->getKeyCapacity(); i++) {
for (size_t i(0); i < sorted.size() && i < _chunk->getKeyCapacity(); i++) {
const KeyMeta & m = *sorted[i].second;
newCached += m.cost();
}
Expand Down Expand Up @@ -182,15 +175,15 @@ void
BitVectorCache::populate(uint32_t sz, const PopulateInterface & lookup)
{
if (!needPopulation()) return;
CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder));
std::unique_lock guard(_mutex);
Key2Index newKeys(_keys);
guard.unlock();

CondensedBitVector::UP chunk(CondensedBitVector::create(sz, _genHolder));
populate(newKeys, *chunk, lookup);

guard.lock();
_chunks.push_back(std::move(chunk));
_chunk = std::move(chunk);
_keys.swap(newKeys);
_needPopulation = false;
}
Expand All @@ -203,33 +196,27 @@ BitVectorCache::set(Key key, uint32_t index, bool v)
if (found != _keys.end()) {
const KeyMeta & m(found->second);
if (m.isCached()) {
_chunks[m.chunkId()]->set(m.chunkIndex(), index, v);
_chunk->set(m.chunkIndex(), index, v);
}
}
}

bool
BitVectorCache::get(Key key, uint32_t index) const
{
(void) key; (void) index;
return false;
}

void
BitVectorCache::removeIndex(uint32_t index)
{
std::unique_lock guard(_mutex);
for (auto & chunk : _chunks) {
chunk->clearIndex(index);
if (_chunk) {
_chunk->clearIndex(index);
}
}


void
BitVectorCache::adjustDocIdLimit(uint32_t docId)
{
for (auto &chunk : _chunks) {
chunk->adjustDocIdLimit(docId);
std::unique_lock guard(_mutex);
if (_chunk) {
_chunk->adjustDocIdLimit(docId);
}
}

Expand Down
8 changes: 2 additions & 6 deletions searchlib/src/vespa/searchlib/common/bitvectorcache.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,13 @@ class BitVectorCache
using Key = uint64_t;
using KeySet = vespalib::hash_set<Key>;
using KeyAndCountSet = std::vector<std::pair<Key, size_t>>;
using CountVector = CondensedBitVector::CountVector;
using GenerationHolder = vespalib::GenerationHolder;

explicit BitVectorCache(GenerationHolder &genHolder);
~BitVectorCache();
void computeCountVector(KeySet & keys, CountVector & v) const;
void computeCountVector(KeySet & keys, std::span<uint8_t> v) const;
KeySet lookupCachedSet(const KeyAndCountSet & keys);
void set(Key key, uint32_t index, bool v);
bool get(Key key, uint32_t index) const;
void removeIndex(uint32_t index);
void adjustDocIdLimit(uint32_t docId);
void populate(uint32_t count, const PopulateInterface &);
Expand Down Expand Up @@ -67,7 +65,6 @@ class BitVectorCache
bool isCached() const { return _chunkId >= 0; }
size_t bitCount() const { return _bitCount; }
size_t chunkIndex() const { return _chunkIndex; }
size_t chunkId() const { return _chunkId; }
size_t lookupCount() const { return _lookupCount.load(std::memory_order_relaxed); }
KeyMeta & lookup() { _lookupCount++; return *this; }
KeyMeta & bitCount(uint32_t v) { _bitCount = v; return *this; }
Expand All @@ -82,7 +79,6 @@ class BitVectorCache
};
using Key2Index = vespalib::hash_map<Key, KeyMeta>;
using SortedKeyMeta = std::vector<std::pair<Key, KeyMeta *>>;
using ChunkV = std::vector<CondensedBitVector::SP>;

VESPA_DLL_LOCAL static SortedKeyMeta getSorted(Key2Index & keys);
VESPA_DLL_LOCAL static void populate(Key2Index & newKeys, CondensedBitVector & chunk, const PopulateInterface & lookup);
Expand All @@ -92,7 +88,7 @@ class BitVectorCache
std::atomic<bool> _needPopulation;
mutable std::shared_mutex _mutex;
Key2Index _keys;
ChunkV _chunks;
CondensedBitVector::SP _chunk;
GenerationHolder &_genHolder;
};

Expand Down
1 change: 0 additions & 1 deletion searchlib/src/vespa/searchlib/common/condensedbitvectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class CondensedBitVector
using SP = std::shared_ptr<CondensedBitVector>;
using Key = uint32_t;
using KeySet = std::set<Key>;
using CountVector = std::span<uint8_t>;

virtual ~CondensedBitVector();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ PredicateIndex::lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) con
}

void
PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const
PredicateIndex::computeCountVector(BitVectorCache::KeySet & keys, std::span<uint8_t> v) const
{
_cache.computeCountVector(keys, v);
}
Expand Down
2 changes: 1 addition & 1 deletion searchlib/src/vespa/searchlib/predicate/predicate_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class PredicateIndex : public PopulateInterface {

void populateIfNeeded(size_t doc_id_limit);
BitVectorCache::KeySet lookupCachedSet(const BitVectorCache::KeyAndCountSet & keys) const;
void computeCountVector(BitVectorCache::KeySet & keys, BitVectorCache::CountVector & v) const;
void computeCountVector(BitVectorCache::KeySet & keys, std::span<uint8_t> v) const;

/*
* Adjust size of structures to have space for docId.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ PredicateBlueprint::fetchPostings(const ExecuteInfo &) {
PredicateAttribute::MinFeatureHandle mfh = predicate_attribute().getMinFeatureVector();
Alloc kv(Alloc::alloc(mfh.second, vespalib::alloc::MemoryAllocator::HUGEPAGE_SIZE*4));
_kVBacking.swap(kv);
_kV = BitVectorCache::CountVector(static_cast<uint8_t *>(_kVBacking.get()), mfh.second);
_kV = std::span<uint8_t>(static_cast<uint8_t *>(_kVBacking.get()), mfh.second);
_index.computeCountVector(_cachedFeatures, _kV);
for (const auto & entry : _bounds_dict_entries) {
addBoundsPostingToK(entry.feature);
Expand Down
4 changes: 2 additions & 2 deletions searchlib/src/vespa/searchlib/queryeval/predicate_blueprint.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint {
}

// Exposed for testing
const BitVectorCache::CountVector & getKV() const { return _kV; }
std::span<uint8_t> getKV() const { return _kV; }
const BitVectorCache::KeySet & getCachedFeatures() const { return _cachedFeatures; }
private:
using BTreeIterator = predicate::SimpleIndex<vespalib::datastore::EntryRef>::BTreeIterator;
Expand All @@ -85,7 +85,7 @@ class PredicateBlueprint : public ComplexLeafBlueprint {
const PredicateAttribute & _attribute;
const predicate::PredicateIndex &_index;
Alloc _kVBacking;
BitVectorCache::CountVector _kV;
std::span<uint8_t> _kV;
BitVectorCache::KeySet _cachedFeatures;

std::vector<IntervalEntry> _interval_dict_entries;
Expand Down
4 changes: 2 additions & 2 deletions searchlib/src/vespa/searchlib/queryeval/predicate_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,10 @@ SkipMinFeature::create(const uint8_t * min_feature, const uint8_t * kv, size_t s
PredicateSearch::PredicateSearch(const uint8_t * minFeatureVector,
const IntervalRange * interval_range_vector,
IntervalRange max_interval_range,
CondensedBitVector::CountVector kV,
std::span<uint8_t> kV,
vector<PredicatePostingList::UP> posting_lists,
const fef::TermFieldMatchDataArray &tfmda)
: _skip(SkipMinFeature::create(minFeatureVector, &kV[0], kV.size())),
: _skip(SkipMinFeature::create(minFeatureVector, kV.data(), kV.size())),
_posting_lists(std::move(posting_lists)),
_sorted_indexes(_posting_lists.size()),
_sorted_indexes_merge_buffer(_posting_lists.size()),
Expand Down
6 changes: 3 additions & 3 deletions searchlib/src/vespa/searchlib/queryeval/predicate_search.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class SkipMinFeature
{
public:
using UP = std::unique_ptr<SkipMinFeature>;
virtual ~SkipMinFeature() { }
virtual ~SkipMinFeature() = default;
VESPA_DLL_LOCAL virtual uint32_t next() = 0;
static SkipMinFeature::UP create(const uint8_t * min_feature, const uint8_t * kv, size_t sz);
};
Expand Down Expand Up @@ -54,10 +54,10 @@ class PredicateSearch : public SearchIterator {
PredicateSearch(const uint8_t * minFeature,
const IntervalRange * interval_range_vector,
IntervalRange max_interval_range,
CondensedBitVector::CountVector kV,
std::span<uint8_t> kV,
std::vector<predicate::PredicatePostingList::UP> posting_lists,
const fef::TermFieldMatchDataArray &tfmda);
~PredicateSearch();
~PredicateSearch() override;

void doSeek(uint32_t doc_id) override;
void doUnpack(uint32_t doc_id) override;
Expand Down

0 comments on commit b666cc6

Please sign in to comment.