initial support for remap
jakelandis committed Feb 8, 2024
1 parent c6db792 commit 99efbbe
Showing 2 changed files with 85 additions and 7 deletions.
@@ -477,11 +477,12 @@ long globalOrdToBucketOrd(long owningBucketOrd, long globalOrd) {
return globalOrd;
}

- boolean EXCLUDE_DELETE_DOCS = true; //TODO: model this as part of the agg itself
+ boolean EXCLUDE_DELETE_DOCS = true; // TODO: model this as part of the agg itself

@Override
void forEach(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {
assert owningBucketOrd == 0;
- if(EXCLUDE_DELETE_DOCS) {
+ if (EXCLUDE_DELETE_DOCS) {
forEachExcludeDeletedDocs(consumer);
} else {
forEachIgnoreDeletedDocs(consumer);
@@ -491,7 +492,7 @@ void forEach(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {
/**
* Allows deleted docs in the results by ignoring the associated liveDocs. More performant than excluding them.
*/
- private void forEachIgnoreDeletedDocs(BucketInfoConsumer consumer) throws IOException{
+ private void forEachIgnoreDeletedDocs(BucketInfoConsumer consumer) throws IOException {
for (long globalOrd = 0; globalOrd < valueCount; globalOrd++) {
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
continue;
@@ -506,7 +507,7 @@ private void forEachExcludeDeletedDocs(BucketInfoConsumer consumer) throws IOException {
/**
* Excludes deleted docs in the results by cross-checking with liveDocs. Less performant than ignoring liveDocs.
*/
- private void forEachExcludeDeletedDocs(BucketInfoConsumer consumer) throws IOException{
+ private void forEachExcludeDeletedDocs(BucketInfoConsumer consumer) throws IOException {
LongHash accepted = null;
boolean acceptedAllGlobalOrdinals = false;
for (LeafReaderContext ctx : searcher().getTopReaderContext().leaves()) {
@@ -598,9 +599,21 @@ long globalOrdToBucketOrd(long owningBucketOrd, long globalOrd) {
return bucketOrds.find(owningBucketOrd, globalOrd);
}

boolean EXCLUDE_DELETE_DOCS = true; // TODO: model this as part of the agg itself

@Override
void forEach(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {
// TODO: fix here too?
if (EXCLUDE_DELETE_DOCS && bucketCountThresholds.getMinDocCount() == 0) {
forEachExcludeDeletedDocs(owningBucketOrd, consumer);
} else {
forEachIgnoreDeletedDocs(owningBucketOrd, consumer);
}
}

/**
* Allows deleted docs in the results by ignoring the associated liveDocs. More performant than excluding them.
*/
void forEachIgnoreDeletedDocs(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {
if (bucketCountThresholds.getMinDocCount() == 0) {
for (long globalOrd = 0; globalOrd < valueCount; globalOrd++) {
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
@@ -634,6 +647,71 @@ void forEach(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {
}
}

// FIXME: check this logic; it was added in haste and needs review

/**
* Excludes deleted docs in the results by cross-checking with liveDocs. Less performant than ignoring liveDocs.
*/
void forEachExcludeDeletedDocs(long owningBucketOrd, BucketInfoConsumer consumer) throws IOException {

assert bucketCountThresholds.getMinDocCount() == 0;
LongHash accepted = null;
boolean acceptedAllGlobalOrdinals = false;
for (LeafReaderContext ctx : searcher().getTopReaderContext().leaves()) {
if (acceptedAllGlobalOrdinals) {
break;
}
LeafReader reader = ctx.reader();
Bits liveDocs = reader.getLiveDocs();
SortedSetDocValues globalOrds = null;
for (int docId = 0; docId < reader.maxDoc(); ++docId) {
if (acceptedAllGlobalOrdinals) {
break;
}
if (liveDocs == null || liveDocs.get(docId)) { // document is not deleted
globalOrds = globalOrds == null ? valuesSource.globalOrdinalsValues(ctx) : globalOrds;
if (globalOrds.advanceExact(docId)) {
long maxDocOrdinals = globalOrds.getValueCount(); // total number of distinct global ordinals
for (long globalOrd = globalOrds.nextOrd(); globalOrd != NO_MORE_ORDS; globalOrd = globalOrds.nextOrd()) {
if (accepted != null && accepted.find(globalOrd) >= 0) {
continue;
}
if (false == acceptedGlobalOrdinals.test(globalOrd)) {
continue;
}
/*
* Use `add` instead of `find` here to assign an ordinal
* even if the global ord wasn't found so we can build
* sub-aggregations without trouble even though we haven't
* hit any documents for them. This is wasteful, but
* setting minDocCount == 0 is wasteful in general.
*/
long bucketOrd = bucketOrds.add(owningBucketOrd, globalOrd);
long docCount;
if (bucketOrd < 0) {
bucketOrd = -1 - bucketOrd;
docCount = bucketDocCount(bucketOrd);
} else {
docCount = 0;
}
accepted = accepted == null ? new LongHash(maxDocOrdinals / 2, NON_RECYCLING_INSTANCE) : accepted;
assert globalOrd >= 0;
consumer.accept(globalOrd, bucketOrd, docCount);
accepted.add(globalOrd);
if (accepted.size() == maxDocOrdinals) { // stop looking if all of them have been accepted
acceptedAllGlobalOrdinals = true;
break;
}
}
}
}
}
}
}

@Override
public void close() {
bucketOrds.close();
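
The bucketOrd handling in the hunk above relies on the add/find contract of Elasticsearch's LongHash, which the surrounding aggregator code already follows: add returns a fresh ordinal for a new key and -1 - existingOrdinal when the key is already present, while find only looks up and returns -1 for a missing key. A minimal standalone sketch of that contract, assuming org.elasticsearch.common.util.LongHash and BigArrays.NON_RECYCLING_INSTANCE; the class name is illustrative and nothing below is part of the commit:

// Editorial sketch, not part of this commit: the LongHash add/find contract.
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.LongHash;

class LongHashContractSketch {
    public static void main(String[] args) {
        try (LongHash hash = new LongHash(16, BigArrays.NON_RECYCLING_INSTANCE)) {
            long first = hash.add(42L);       // new key: returns its ordinal (>= 0)
            long second = hash.add(42L);      // existing key: returns -1 - ordinal
            long recovered = -1 - second;     // decode back to the ordinal, as the diff does for bucketOrd
            assert first == recovered;
            assert hash.find(42L) == first;   // find never inserts; it returns -1 when the key is absent
            assert hash.find(7L) == -1;
        }
    }
}
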
@@ -667,7 +745,7 @@ private InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws
// if minDocCount == 0 then we can end up with more buckets than maxBucketOrd() returns
size = (int) Math.min(valueCount, bucketCountThresholds.getShardSize());

- //TODO: remove all of this
+ // TODO: remove all of this
for (LeafReaderContext ctx : searcher().getIndexReader().leaves()) {
Bits liveDocs = ctx.reader().getLiveDocs();
if (liveDocs == null) { // all documents are live - no need to check per doc
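
For orientation, the exclude-deleted-docs path added above boils down to: walk each segment, skip documents that liveDocs marks as deleted, read the ordinals of the surviving documents, and stop early once every ordinal has been seen. A minimal sketch of that idea against plain Lucene APIs, using per-segment ordinals and a HashSet where the aggregator uses global ordinals and a LongHash; the class and method names below are illustrative, not part of the commit:

// Editorial sketch, not part of this commit.
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.Bits;

class LiveOrdinalsSketch {
    /** Returns the segment-local ordinals of `field` used by at least one live (non-deleted) document. */
    static Set<Long> liveOrdinals(LeafReaderContext ctx, String field) throws IOException {
        LeafReader leaf = ctx.reader();
        Bits liveDocs = leaf.getLiveDocs();            // null when the segment has no deletions
        SortedSetDocValues ords = DocValues.getSortedSet(leaf, field);
        long valueCount = ords.getValueCount();
        Set<Long> accepted = new HashSet<>();
        for (int docId = 0; docId < leaf.maxDoc(); docId++) {
            if (accepted.size() == valueCount) {
                break;                                 // every ordinal already seen: stop early
            }
            if (liveDocs != null && liveDocs.get(docId) == false) {
                continue;                              // document is deleted: skip it
            }
            if (ords.advanceExact(docId)) {
                for (long ord = ords.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = ords.nextOrd()) {
                    accepted.add(ord);
                }
            }
        }
        return accepted;
    }
}

The commit's version differs in that it reads global ordinals via valuesSource.globalOrdinalsValues(ctx), shares the accepted set across all segments, and emits each accepted ordinal to the BucketInfoConsumer as it is found.
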
@@ -82,7 +82,7 @@ static void registerAggregators(ValuesSourceRegistry.Builder builder) {
* to take up nearly as much memory anyway it might be worth it to use
* filters. More experiment is required.
*/
- static final long MAX_ORDS_TO_TRY_FILTERS = 1; //FIXME: put this back to 1000 you big dumb ape
+ static final long MAX_ORDS_TO_TRY_FILTERS = 1;

/**
* This supplier is used for all the field types that should be aggregated as bytes/strings,
