-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathdocset_iterators.h
512 lines (408 loc) · 21.6 KB
/
docset_iterators.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
// ## Important impl. requirement for all Iterators
// Iterators may not advance themselves (i.e via next() or advance() methods), or any of its sub-iterators they keep track of
// in their constructors. Doing so would cause all kinds of issues with Docsets Spans.
#pragma once
#include "docset_iterators_base.h"
#include <prioqueue.h>
#ifdef __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wundefined-inline"
#endif
namespace Trinity {
struct queryexec_ctx;
struct candidate_document;
namespace Codecs {
struct PostingsListIterator;
}
namespace DocsSetIterators {
uint64_t cost(const Iterator *);
// This provides a DEFAULT scorer based on the iterator type
// in the future, you should be able to create your own wrappers that provide a score()
// based on the wrapped/owned iterator.
//
// For example, you may want a different IteratorScorer for all Disjunction-based iterators, where
// you multiply the accumulated score of their matched sub-iterators by a coefficient based on how many
// sub-iterators matched.
//
// It is not supported yet, but the ground-work to supporting has been laid out so all that's left is a way
// to perhaps augment ast_node with data that can be used by the exec.engine to materialize those wrappers.
//
// Besides, we do not expect most Trinity applications will use the ExecFlags::AccumulatedScoreScheme mode.
Iterator *wrap_iterator(queryexec_ctx *, Iterator *);
// This is Lucene's MinShouldMatchSumScorer.java port
// It's a clever design. From their description:
// It tracks iterators in:
// - lead: a linked list of iterators that are positioned on the desired docID, i.t (iterator->current() == curDocument.id)
//
// - tail: a heap/prioqueue that contains at most (matchThreshold - 1) iterators
// that are behind the desired docID. Those iterators are ordered by COST
// so that we can advance the least costly ones first. i.e it tracks
// the top-k most expensive trackers ordered by cost ASC, i.e top() is the
// least expensive tracker to advance.
//
// - head: a heap that tracks iterators that are past the desired documentID
// ordered by docID in order to advance quickly to the next candidate.
// i.e it tracks the top-k trackers ordered by docID ASC
//
// Finding the next match comes down to first setting the desired docID to the
// least entry in head, and then advancing tail until there's a match
//
// This is suitable for when you need to match between 2 and the total number of iterators
//
// It's not that fast though -- over 35% of run-time is spent manipulating head and tail priority queues.
// push/pop is fairly expensive, and unless we can figure out a way for this to work with sentinels or some other scheme, it's going to
// be faster to use a much simpler design that doesn't take into account costs etc, as long as it doesn't need to hammer those queues.
class DisjunctionSome final
: public Iterator {
friend uint64_t cost(const Iterator *);
friend struct IteratorScorer;
public:
struct it_tracker final {
Iterator * it;
uint64_t cost;
isrc_docid_t id; // we could just use it->current(), but this simplifies processing somewhat
it_tracker * next;
} * lead{nullptr};
bool allPLI;
public:
const uint16_t matchThreshold;
protected:
uint16_t curDocMatchedItsCnt{0};
it_tracker *trackersStorage{nullptr};
protected:
uint64_t cost_;
private:
struct Compare {
inline bool operator()(const it_tracker *const a, const it_tracker *const b) const noexcept {
return a->id < b->id;
}
};
struct CompareCost {
inline bool operator()(const it_tracker *const a, const it_tracker *const b) const noexcept {
return a->cost < b->cost;
}
};
Switch::priority_queue<it_tracker *, Compare> head;
Switch::priority_queue<it_tracker *, CompareCost> tail;
private:
void update_current();
void advance_tail(it_tracker *const top);
inline void advance_tail() {
advance_tail(tail.pop());
}
isrc_docid_t next_impl();
inline void add_lead(it_tracker *const t) {
t->next = lead;
lead = t;
++curDocMatchedItsCnt;
}
public:
DisjunctionSome(Iterator **const iterators, const uint16_t cnt, const uint16_t minMatch);
~DisjunctionSome() noexcept {
if (trackersStorage)
std::free(trackersStorage);
}
// Advancing to the next document. Iterators in the lead LL need to migrate to // tail. If tail is full, we take the least costly iterators and advance them.
isrc_docid_t next() override final;
isrc_docid_t advance(const isrc_docid_t target) override final;
// advance all entries from the tail to know about all matches on the current document
void update_matched_cnt();
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
update_matched_cnt();
return curDocMatchedItsCnt;
}
#endif
};
// This is very handy, and we can use it to model NOT clauses
// e.g (foo NOT bar)
// so if you want to model e.g
// (a NOT b)
// you can use Filter(Iterator(a), Iterator(b))
struct Filter final
: public Iterator {
friend uint64_t cost(const Iterator *);
public:
// public so that build_span() can access them without going through accessors
Iterator *const req, *const filter;
private:
bool matches(const isrc_docid_t);
public:
Filter(Iterator *const r, Iterator *const f) noexcept
: Iterator{Type::Filter}, req{r}, filter{f} {
}
isrc_docid_t next() override final;
isrc_docid_t advance(const isrc_docid_t target) override final;
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return req->total_matches();
}
#endif
};
struct Optional final
: public Iterator {
public:
Iterator *const main, *const opt;
public:
Optional(Iterator *const m, Iterator *const o)
: Iterator{Type::Optional}, main{m}, opt{o} {
}
inline isrc_docid_t next() override final {
return curDocument.id = main->next();
}
inline isrc_docid_t advance(const isrc_docid_t target) override final {
return curDocument.id = main->advance(target);
}
#ifdef RDP_NEED_TOTAL_MATCHES
uint32_t total_matches() override final {
auto freq = main->total_matches();
const auto id = main->current();
auto optId = opt->current();
if (optId < id)
optId = opt->advance(id);
if (optId == id)
freq += opt->total_matches();
return freq;
}
#endif
};
struct idx_stack final {
uint16_t *const data;
uint16_t cnt;
idx_stack(const uint16_t capacity)
: data((uint16_t *)malloc(sizeof(uint16_t) * capacity)) {
}
~idx_stack() noexcept {
std::free(data);
}
};
struct DisjunctionAllPLI final
: public Iterator {
private:
struct Compare {
inline bool operator()(const Iterator *const a, const Iterator *const b) const noexcept {
return a->current() < b->current();
}
};
public:
idx_stack istack;
public:
Switch::priority_queue<Codecs::PostingsListIterator *, Compare> pq;
public:
DisjunctionAllPLI(Iterator **iterators, const isrc_docid_t cnt)
: Iterator{Type::DisjunctionAllPLI}, istack(cnt), pq{cnt} {
for (uint32_t i{0}; i != cnt; ++i)
pq.push((Codecs::PostingsListIterator *)iterators[i]);
}
isrc_docid_t next() override final;
isrc_docid_t advance(const isrc_docid_t target) override final;
#ifdef RDP_NEED_TOTAL_MATCHES
uint32_t total_matches() override final {
uint32_t cnt{0};
pq.for_each_top([&cnt](const auto it) { ++cnt; },
[](const auto a, const auto b) noexcept {
return a->current() == b->current();
});
return cnt;
}
#endif
};
// Will only match one, and will ignore the rest
// for 759721, std::priority_queue<> based requires 186ms
// while this based on Switch::priority_queue<> requires 128ms (32% faster)
//
// This is almost identifical to Disjunction, except it doesn't manipulate CDS
struct Disjunction final
: public Iterator {
private:
struct Compare final {
inline bool operator()(const Iterator *const a, const Iterator *const b) const noexcept {
return a->current() < b->current();
}
};
public:
idx_stack istack;
public:
Switch::priority_queue<Iterator *, Compare> pq;
public:
Disjunction(Iterator **iterators, const isrc_docid_t cnt)
: Iterator{Type::Disjunction}, istack(cnt), pq{cnt} {
for (uint32_t i{0}; i != cnt; ++i)
pq.push(iterators[i]);
}
isrc_docid_t next() override final;
isrc_docid_t advance(const isrc_docid_t target) override final;
#ifdef RDP_NEED_TOTAL_MATCHES
uint32_t total_matches() override final {
uint32_t cnt{0};
pq.for_each_top([&cnt](const auto it) { ++cnt; },
[](const auto a, const auto b) noexcept {
return a->current() == b->current();
});
return cnt;
}
#endif
};
struct ConjuctionAllPLI final
: public Iterator {
public:
Codecs::PostingsListIterator **const its;
uint16_t size;
private:
isrc_docid_t next_impl(isrc_docid_t id);
public:
ConjuctionAllPLI(Iterator **iterators, const uint16_t cnt)
: Iterator{Type::ConjuctionAllPLI}, size{cnt}, its((Codecs::PostingsListIterator **)malloc(sizeof(Codecs::PostingsListIterator *) * cnt)) {
require(cnt);
memcpy(its, iterators, cnt * sizeof(Codecs::PostingsListIterator *));
}
~ConjuctionAllPLI() noexcept {
std::free(its);
}
isrc_docid_t advance(const isrc_docid_t target) override final;
isrc_docid_t next() override final;
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return size;
}
#endif
};
// If we can, and we can, identify the set of leaders that are required like we do now for leadersAndSet
// then we won't need to use next(), but to use advance, then we can use
// the next_impl() design to skip ahead to the first likely common among on required decoders, instead of advancing in parallel and in-lock step
// because e.g in the case of phrases, this can be expensive.
struct Conjuction final
: public Iterator {
public:
Iterator **const its;
uint16_t size;
private:
isrc_docid_t next_impl(isrc_docid_t id);
public:
Conjuction(Iterator **iterators, const uint16_t cnt)
: Iterator{Type::Conjuction}, size{cnt}, its((Iterator **)malloc(sizeof(Iterator *) * cnt)) {
require(cnt);
memcpy(its, iterators, cnt * sizeof(Iterator *));
}
~Conjuction() noexcept {
std::free(its);
}
isrc_docid_t advance(const isrc_docid_t target) override final;
isrc_docid_t next() override final;
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return size;
}
#endif
};
// No longer inherring from Conjuction; some optimization opportunities open up when not doing so
// also important because Codecs::Decoder's don't capture_matched_term()
struct Phrase final
: public Iterator {
public:
bool consider_phrase_match(); // use by exec() / root iterations
public:
Codecs::PostingsListIterator **const its;
uint16_t size;
const uint16_t maxMatchCnt;
uint16_t matchCnt{0};
const bool release_docrefs;
private:
queryexec_ctx *const rctxRef;
isrc_docid_t next_impl(isrc_docid_t id);
public:
isrc_docid_t lastUncofirmedDID{DocIDsEND};
public:
Phrase(queryexec_ctx *r, Codecs::PostingsListIterator **iterators, const uint16_t cnt, const bool trackCnt, const bool docsOnly_)
: Iterator{Type::Phrase}, its((Codecs::PostingsListIterator **)malloc(sizeof(Codecs::PostingsListIterator *) * cnt)), size{cnt}, maxMatchCnt{uint16_t(trackCnt ? std::numeric_limits<uint16_t>::max() : 1)}, rctxRef{r}, release_docrefs{docsOnly_} {
require(cnt);
memcpy(its, iterators, sizeof(iterators[0]) * cnt);
}
~Phrase() noexcept {
std::free(its);
}
isrc_docid_t advance(const isrc_docid_t target) override final;
isrc_docid_t next() override final;
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return matchCnt;
}
#endif
};
struct VectorIDs final
: public Iterator {
friend uint64_t cost(const Iterator *);
protected:
std::vector<isrc_docid_t> ids;
private:
uint32_t idx{0};
public:
VectorIDs(const std::initializer_list<isrc_docid_t> list)
: Iterator{Type::VectorIDs} {
for (const auto id : list)
ids.push_back(id);
std::sort(ids.begin(), ids.end()); // TODO: use boost::spreadsort
}
inline isrc_docid_t next() override {
return idx == ids.size() ? curDocument.id = DocIDsEND : curDocument.id = ids[idx++];
}
isrc_docid_t advance(isrc_docid_t target) override {
isrc_docid_t id;
while ((id = next()) < target)
continue;
return id;
}
#ifdef RDP_NEED_TOTAL_MATCHES
inline uint32_t total_matches() override final {
return 1;
}
#endif
};
} // namespace DocsSetIterators
// See comments about it i relevant_document.h
struct relevant_document final
: public IteratorScorer {
double score_;
uint32_t matchesCnt;
struct dummy_iterator final
: public DocsSetIterators::Iterator {
isrc_docid_t advance(const isrc_docid_t target) override final {
return 0;
}
inline isrc_docid_t next() override final {
return 0;
}
dummy_iterator()
: Iterator{DocsSetIterators::Type::Dummy} {
}
} dummyIt;
relevant_document()
: IteratorScorer{&dummyIt} {
}
inline void set_document(const isrc_docid_t id) {
dummyIt.curDocument.id = id;
}
inline double iterator_score() override final {
return score_;
}
#ifdef RDP_NEED_TOTAL_MATCHES
// e.g for a single term, number of times matched in a document, for
// a conjuction the total number of operands, for a disjunction the total
// number of operands on document() etc.
inline uint32_t total_matches() override final {
return matchesCnt;
}
#endif
};
} // namespace Trinity
#ifdef RDP_NEED_TOTAL_MATCHES
uint32_t Trinity::IteratorScorer::total_matches() {
return it->total_matches();
}
#endif
Trinity::isrc_docid_t Trinity::IteratorScorer::document() const noexcept {
return it->current();
}
#ifdef __clang__
#pragma GCC diagnostic pop
#endif