Skip to content

Commit

Permalink
Add case for no deleted docs
Browse files Browse the repository at this point in the history
Signed-off-by: Sandesh Kumar <[email protected]>
  • Loading branch information
sandeshkr419 committed Jan 24, 2024
1 parent ce1082c commit a5b3baa
Showing 1 changed file with 86 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
Expand All @@ -49,6 +50,7 @@
import org.opensearch.common.util.LongHash;
import org.opensearch.core.common.io.stream.StreamOutput;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.mapper.DocCountFieldMapper;
import org.opensearch.search.DocValueFormat;
import org.opensearch.search.aggregations.AggregationExecutionException;
import org.opensearch.search.aggregations.Aggregator;
Expand All @@ -64,7 +66,6 @@
import org.opensearch.search.aggregations.bucket.terms.SignificanceLookup.BackgroundFrequencyForBytes;
import org.opensearch.search.aggregations.bucket.terms.heuristic.SignificanceHeuristic;
import org.opensearch.search.aggregations.support.ValuesSource;
import org.opensearch.search.aggregations.support.ValuesSource.Bytes.WithOrdinals;
import org.opensearch.search.internal.SearchContext;

import java.io.IOException;
Expand All @@ -78,6 +79,7 @@

import static org.opensearch.search.aggregations.InternalOrder.isKeyOrder;
import static org.apache.lucene.index.SortedSetDocValues.NO_MORE_ORDS;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

/**
* An aggregator of string values that relies on global ordinals in order to build buckets.
Expand All @@ -94,6 +96,8 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr
private final LongPredicate acceptedGlobalOrdinals;
private final long valueCount;

private final String fieldName;

private Weight weight;
private final GlobalOrdLookupFunction lookupGlobalOrd;
protected final CollectionStrategy collectionStrategy;
Expand Down Expand Up @@ -146,6 +150,7 @@ public GlobalOrdinalsStringTermsAggregator(
return new DenseGlobalOrds();
});
}
this.fieldName = ((ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName();
}

String descriptCollectionStrategy() {
Expand All @@ -156,22 +161,80 @@ public void setWeight(Weight weight) {
this.weight = weight;
}

@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
if (weight != null && weight.getQuery() instanceof MatchAllDocsQuery) {
if ((weight.count(ctx) == 0)
&& Terms.getTerms(ctx.reader(), String.valueOf(((WithOrdinals.FieldData) valuesSource).indexFieldData.getFieldName()))
.size() == 0) {
return LeafBucketCollector.NO_OP_COLLECTOR;
// } else if (weight.count(ctx) == ctx.reader().maxDoc() && weight.getQuery() instanceof MatchAllDocsQuery) {
// no deleted documents & top level query matches everything
// iterate over the terms - doc frequency for each termsEnum directly
// return appropriate LeafCollector
/**
 * Attempts to resolve per-term bucket counts directly from the segment's terms
 * dictionary (each term's {@code docFreq}) instead of visiting every document.
 * <p>
 * The shortcut is only valid when the top-level query matches every live document in
 * the segment, the field is indexed, and no document overrides its count via the
 * {@code _doc_count} field — otherwise {@code docFreq} would not equal the bucket count.
 *
 * @param ctx the segment to read terms from
 * @param ords the sorted-set doc values whose term ordinals identify the buckets
 * @param ordCountConsumer callback receiving each accepted (ordinal, docFreq) pair
 * @return a no-op collector when counts were fully accumulated here, or {@code null}
 *         when the optimization does not apply and normal per-document collection must run
 * @throws IOException if reading terms or doc values fails
 */
LeafBucketCollector termDocFreqCollector(LeafReaderContext ctx, SortedSetDocValues ords, BiConsumer<Long, Integer> ordCountConsumer)
    throws IOException {
    // Guard against setWeight() never having been called; the pre-change code path
    // checked weight for null before using it.
    if (weight == null || weight.count(ctx) != ctx.reader().maxDoc()) {
        // Top-level query does not (provably) match all docs in this segment.
        return null;
    }

    Terms aggTerms = ctx.reader().terms(this.fieldName);
    if (aggTerms == null) {
        // Field is not indexed in this segment.
        return null;
    }

    NumericDocValues docCountValues = DocValues.getNumeric(ctx.reader(), DocCountFieldMapper.NAME);
    if (docCountValues.nextDoc() != NO_MORE_DOCS) {
        // At least one document carries _doc_count, so docFreq no longer equals the
        // aggregated count for its terms.
        return null;
    }

    // Merge-join the (sorted) index terms with the (sorted) doc-values terms,
    // crediting each accepted ordinal with the term's document frequency.
    TermsEnum indexTermsEnum = aggTerms.iterator();
    BytesRef indexTerm = indexTermsEnum.next();
    TermsEnum ordinalTermsEnum = ords.termsEnum();
    BytesRef ordinalTerm = ordinalTermsEnum.next();
    while (indexTerm != null && ordinalTerm != null) {
        int compare = indexTerm.compareTo(ordinalTerm);
        if (compare == 0) {
            if (acceptedGlobalOrdinals.test(ordinalTermsEnum.ord())) {
                ordCountConsumer.accept(ordinalTermsEnum.ord(), indexTermsEnum.docFreq());
            }
            indexTerm = indexTermsEnum.next();
            ordinalTerm = ordinalTermsEnum.next();
        } else if (compare < 0) {
            indexTerm = indexTermsEnum.next();
        } else {
            ordinalTerm = ordinalTermsEnum.next();
        }
    }
    // All counts are accumulated; per-document collection can be skipped entirely.
    return LeafBucketCollector.NO_OP_COLLECTOR;
}

@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCollector sub) throws IOException {
SortedSetDocValues globalOrds = valuesSource.globalOrdinalsValues(ctx);
collectionStrategy.globalOrdsReady(globalOrds);

if (collectionStrategy instanceof DenseGlobalOrds && sub == LeafBucketCollector.NO_OP_COLLECTOR) {
LeafBucketCollector termDocFreqCollector = termDocFreqCollector(
ctx,
globalOrds,
(o, c) -> incrementBucketDocCount(collectionStrategy.globalOrdToBucketOrd(0, o), c)
);
if (termDocFreqCollector != null) {
return termDocFreqCollector;
}
}

SortedDocValues singleValues = DocValues.unwrapSingleton(globalOrds);
if (singleValues != null) {
segmentsWithSingleValuedOrds++;
Expand Down Expand Up @@ -369,6 +432,16 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, LeafBucketCol
final SortedSetDocValues segmentOrds = valuesSource.ordinalsValues(ctx);
segmentDocCounts = context.bigArrays().grow(segmentDocCounts, 1 + segmentOrds.getValueCount());
assert sub == LeafBucketCollector.NO_OP_COLLECTOR;

LeafBucketCollector termDocFreqCollector = this.termDocFreqCollector(
ctx,
segmentOrds,
(o, c) -> segmentDocCounts.increment(o + 1, c)
);
if (termDocFreqCollector != null) {
return termDocFreqCollector;
}

final SortedDocValues singleValues = DocValues.unwrapSingleton(segmentOrds);
mapping = valuesSource.globalOrdinalsMapping(ctx);
// Dense mode doesn't support include/exclude so we don't have to check it here.
Expand Down

0 comments on commit a5b3baa

Please sign in to comment.