Adding aggregations in hybrid query
Signed-off-by: Martin Gaievski <[email protected]>
martin-gaievski committed Mar 12, 2024
1 parent c9cdcc1 commit fe4dfae
Showing 12 changed files with 888 additions and 75 deletions.
4 changes: 1 addition & 3 deletions CHANGELOG.md
@@ -8,19 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Enhancements
### Bug Fixes
- Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438))
- Fixed exception for case when Hybrid query being wrapped into bool query ([#490](https://github.com/opensearch-project/neural-search/pull/490))
- Hybrid query and nested type fields ([#498](https://github.com/opensearch-project/neural-search/pull/498))
- Fix typo for sparse encoding processor factory ([#578](https://github.com/opensearch-project/neural-search/pull/578))
- Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615))
### Infrastructure
### Documentation
### Maintenance
- Added support for jdk-21 ([#500](https://github.com/opensearch-project/neural-search/pull/500))
### Refactoring

## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.12...2.x)
### Features
### Enhancements
- Adding aggregations in hybrid query ([#630](https://github.com/opensearch-project/neural-search/pull/630))
### Bug Fixes
- Fix runtime exceptions in hybrid query for the case when a sub-query scorer returns a TwoPhase iterator that is incompatible with the DISI iterator ([#624](https://github.com/opensearch-project/neural-search/pull/624))
### Infrastructure
@@ -10,6 +10,7 @@
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.search.ScoreDoc;
@@ -131,13 +132,18 @@ private void updateQueryTopDocsWithCombinedScores(
compoundQueryTopDocs.setTotalHits(getTotalHits(topDocsPerSubQuery, maxHits));
}

/**
* Get max hits as number of unique doc ids from results of all sub-queries
* @param topDocsPerSubQuery list of topDocs objects for one shard
* @return number of unique doc ids
*/
protected int getMaxHits(final List<TopDocs> topDocsPerSubQuery) {
int maxHits = 0;
for (TopDocs topDocs : topDocsPerSubQuery) {
int hits = topDocs.scoreDocs.length;
maxHits = Math.max(maxHits, hits);
}
return maxHits;
Set<Integer> docIds = topDocsPerSubQuery.stream()
.filter(topDocs -> Objects.nonNull(topDocs.scoreDocs))
.flatMap(topDocs -> Arrays.stream(topDocs.scoreDocs))
.map(scoreDoc -> scoreDoc.doc)
.collect(Collectors.toSet());
return docIds.size();
}

private TotalHits getTotalHits(final List<TopDocs> topDocsPerSubQuery, int maxHits) {
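The updated getMaxHits above counts unique document ids across all sub-query results instead of taking the largest single sub-query hit count. Below is a minimal, hypothetical sketch of that behavior (not part of this commit), using only Lucene's TopDocs, ScoreDoc, and TotalHits classes; the class name MaxHitsSketch and the sample doc ids are made up for illustration.

// Hypothetical example, not part of the plugin: two sub-queries with overlapping matches.
// Sub-query A matches docs {0, 1, 2}; sub-query B matches docs {1, 2, 3, 4}.
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;

public class MaxHitsSketch {
    public static void main(String[] args) {
        TopDocs subQueryA = new TopDocs(
            new TotalHits(3, TotalHits.Relation.EQUAL_TO),
            new ScoreDoc[] { new ScoreDoc(0, 0.9f), new ScoreDoc(1, 0.8f), new ScoreDoc(2, 0.7f) }
        );
        TopDocs subQueryB = new TopDocs(
            new TotalHits(4, TotalHits.Relation.EQUAL_TO),
            new ScoreDoc[] { new ScoreDoc(1, 0.6f), new ScoreDoc(2, 0.5f), new ScoreDoc(3, 0.4f), new ScoreDoc(4, 0.3f) }
        );
        // Same unique-doc-id counting as the updated getMaxHits.
        Set<Integer> docIds = List.of(subQueryA, subQueryB)
            .stream()
            .flatMap(topDocs -> Arrays.stream(topDocs.scoreDocs))
            .map(scoreDoc -> scoreDoc.doc)
            .collect(Collectors.toSet());
        System.out.println(docIds.size()); // prints 5; the previous implementation would have returned max(3, 4) = 4
    }
}

Counting unique ids matters when sub-queries match mostly disjoint documents: the old maximum-per-sub-query value could under-report how many combined hits a shard actually produced.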
@@ -34,7 +34,7 @@ protected boolean isThresholdReached() {
return hitCount >= getTotalHitsThreshold();
}

protected ScoreMode scoreMode() {
public ScoreMode scoreMode() {
return ScoreMode.TOP_SCORES;
}
}
@@ -6,6 +6,7 @@

import lombok.AllArgsConstructor;
import org.apache.lucene.search.CollectorManager;
import org.opensearch.neuralsearch.util.HybridQueryUtil;
import org.opensearch.search.aggregations.AggregationInitializationException;
import org.opensearch.search.aggregations.AggregationProcessor;
import org.opensearch.search.internal.SearchContext;
@@ -16,8 +17,6 @@
import java.io.IOException;
import java.util.List;

import static org.opensearch.neuralsearch.search.query.HybridQueryPhaseSearcher.isHybridQuery;

/**
* Defines logic for pre- and post-phases of document scores collection. Responsible for registering custom
* collector manager for hybrid query (pre phase) and reducing results (post phase)
@@ -31,7 +30,7 @@ public class HybridAggregationProcessor implements AggregationProcessor {
public void preProcess(SearchContext context) {
delegateAggsProcessor.preProcess(context);

if (isHybridQuery(context.query(), context)) {
if (HybridQueryUtil.isHybridQuery(context.query(), context)) {
// adding collector manager for hybrid query
CollectorManager collectorManager;
try {
@@ -45,7 +44,7 @@

@Override
public void postProcess(SearchContext context) {
if (isHybridQuery(context.query(), context)) {
if (HybridQueryUtil.isHybridQuery(context.query(), context)) {
// for the case when concurrent search is not enabled (default as of the 2.12 release), reduce for collector
// managers is not called
// (https://github.com/opensearch-project/OpenSearch/blob/2.12/server/src/main/java/org/opensearch/search/query/QueryPhase.java#L333-L373)
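For reference, here is a stripped-down, hypothetical sketch of the delegation pattern this processor follows (not part of this commit); it assumes AggregationProcessor exposes only the preProcess and postProcess hooks that appear in this diff, and the class name is made up for illustration.

import org.opensearch.search.aggregations.AggregationProcessor;
import org.opensearch.search.internal.SearchContext;

// Hypothetical sketch: wrap the default aggregation processor and add
// hybrid-specific work around the delegated calls.
public class DelegatingAggregationProcessorSketch implements AggregationProcessor {

    private final AggregationProcessor delegateAggsProcessor;

    public DelegatingAggregationProcessorSketch(final AggregationProcessor delegateAggsProcessor) {
        this.delegateAggsProcessor = delegateAggsProcessor;
    }

    @Override
    public void preProcess(final SearchContext context) {
        delegateAggsProcessor.preProcess(context);
        // hybrid-only setup, such as registering a custom CollectorManager, would go here
    }

    @Override
    public void postProcess(final SearchContext context) {
        // hybrid-only reduce of collector results would go here, since reduce is not
        // invoked by the non-concurrent search path
        delegateAggsProcessor.postProcess(context);
    }
}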
@@ -11,13 +11,12 @@
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.mapper.MapperService;
import org.opensearch.index.mapper.SeqNoFieldMapper;
import org.opensearch.index.search.NestedHelper;
import org.opensearch.neuralsearch.query.HybridQuery;
import org.opensearch.neuralsearch.util.HybridQueryUtil;
import org.opensearch.search.aggregations.AggregationProcessor;
import org.opensearch.search.internal.ContextIndexSearcher;
import org.opensearch.search.internal.SearchContext;
@@ -34,10 +33,6 @@
@Log4j2
public class HybridQueryPhaseSearcher extends QueryPhaseSearcherWrapper {

public HybridQueryPhaseSearcher() {
super();
}

public boolean searchWith(
final SearchContext searchContext,
final ContextIndexSearcher searcher,
@@ -46,7 +41,7 @@ public boolean searchWith(
final boolean hasFilterCollector,
final boolean hasTimeout
) throws IOException {
if (!isHybridQuery(query, searchContext)) {
if (!HybridQueryUtil.isHybridQuery(query, searchContext)) {
validateQuery(searchContext, query);
return super.searchWith(searchContext, searcher, query, collectors, hasFilterCollector, hasTimeout);
} else {
@@ -55,46 +50,6 @@ public boolean searchWith(
}
}

@VisibleForTesting
static boolean isHybridQuery(final Query query, final SearchContext searchContext) {
if (query instanceof HybridQuery) {
return true;
} else if (isWrappedHybridQuery(query) && hasNestedFieldOrNestedDocs(query, searchContext)) {
/* Checking if this is a hybrid query that is wrapped into a Bool query by core OpenSearch code
https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/search/DefaultSearchContext.java#L367-L370.
The main reason for that is performance optimization; at the time of writing we are OK with losing some performance if that unblocks
hybrid queries for indexes with nested field types.
In such a case we consider the query a valid hybrid query. Later in the code we will extract it and execute it as the main query for
this search request.
below is sample structure of such query:
Boolean {
should: {
hybrid: {
sub_query1 {}
sub_query2 {}
}
}
filter: {
exists: {
field: "_primary_term"
}
}
}
TODO Need to add logic for passing hybrid sub-queries through the same logic in core to ensure there is no latency regression */
// we have already checked whether the query is an instance of BooleanQuery in the higher-level else-if condition
return ((BooleanQuery) query).clauses()
.stream()
.filter(clause -> !(clause.getQuery() instanceof HybridQuery))
.allMatch(clause -> {
return clause.getOccur() == BooleanClause.Occur.FILTER
&& clause.getQuery() instanceof FieldExistsQuery
&& SeqNoFieldMapper.PRIMARY_TERM_NAME.equals(((FieldExistsQuery) clause.getQuery()).getField());
});
}
return false;
}

private static boolean hasNestedFieldOrNestedDocs(final Query query, final SearchContext searchContext) {
return searchContext.mapperService().hasNested() && new NestedHelper(searchContext.mapperService()).mightMatchNestedDocs(query);
}
@@ -0,0 +1,68 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
package org.opensearch.neuralsearch.util;

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.opensearch.index.mapper.SeqNoFieldMapper;
import org.opensearch.index.search.NestedHelper;
import org.opensearch.neuralsearch.query.HybridQuery;
import org.opensearch.search.internal.SearchContext;

/**
* Utility class for anything related to hybrid query
*/
public class HybridQueryUtil {

Codecov / codecov/patch check warning on line 19 in src/main/java/org/opensearch/neuralsearch/util/HybridQueryUtil.java: added line #L19 was not covered by tests.

public static boolean isHybridQuery(final Query query, final SearchContext searchContext) {
if (query instanceof HybridQuery) {
return true;
} else if (isWrappedHybridQuery(query) && hasNestedFieldOrNestedDocs(query, searchContext)) {
/* Checking if this is a hybrid query that is wrapped into a Bool query by core OpenSearch code
https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/search/DefaultSearchContext.java#L367-L370.
The main reason for that is performance optimization; at the time of writing we are OK with losing some performance if that unblocks
hybrid queries for indexes with nested field types.
In such a case we consider the query a valid hybrid query. Later in the code we will extract it and execute it as the main query for
this search request.
below is sample structure of such query:
Boolean {
should: {
hybrid: {
sub_query1 {}
sub_query2 {}
}
}
filter: {
exists: {
field: "_primary_term"
}
}
}
TODO Need to add logic for passing hybrid sub-queries through the same logic in core to ensure there is no latency regression */
// we have already checked whether the query is an instance of BooleanQuery in the higher-level else-if condition
return ((BooleanQuery) query).clauses()
.stream()
.filter(clause -> clause.getQuery() instanceof HybridQuery == false)
.allMatch(clause -> {
return clause.getOccur() == BooleanClause.Occur.FILTER
&& clause.getQuery() instanceof FieldExistsQuery
&& SeqNoFieldMapper.PRIMARY_TERM_NAME.equals(((FieldExistsQuery) clause.getQuery()).getField());
});
}
return false;
}

private static boolean hasNestedFieldOrNestedDocs(final Query query, final SearchContext searchContext) {
return searchContext.mapperService().hasNested() && new NestedHelper(searchContext.mapperService()).mightMatchNestedDocs(query);
}

private static boolean isWrappedHybridQuery(final Query query) {
return query instanceof BooleanQuery
&& ((BooleanQuery) query).clauses().stream().anyMatch(clauseQuery -> clauseQuery.getQuery() instanceof HybridQuery);
}
}
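The block comment above describes the bool-wrapped shape that isHybridQuery accepts for indexes with nested fields. The following is a hypothetical Lucene-level reconstruction of that shape (not part of this commit); a plain TermQuery stands in for the HybridQuery clause so that no plugin constructor has to be assumed, and the class name is made up for illustration.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class WrappedHybridQuerySketch {
    public static void main(String[] args) {
        // Stand-in for the hybrid query; in the real case this SHOULD clause holds a HybridQuery instance.
        Query hybridStandIn = new TermQuery(new Term("text", "example"));

        // Shape described in the comment: the original query as a SHOULD clause plus a
        // FILTER clause checking that the _primary_term field exists.
        BooleanQuery wrapped = new BooleanQuery.Builder()
            .add(hybridStandIn, BooleanClause.Occur.SHOULD)
            .add(new FieldExistsQuery("_primary_term"), BooleanClause.Occur.FILTER)
            .build();

        System.out.println(wrapped);
    }
}

HybridQueryUtil.isHybridQuery treats such a wrapper as a hybrid query only when the index has nested fields or nested docs, and only when every non-hybrid clause is exactly this FILTER on the _primary_term field.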
@@ -52,6 +52,8 @@ public class NormalizationProcessorIT extends BaseNeuralSearchIT {
private final float[] testVector2 = createRandomVector(TEST_DIMENSION);
private final float[] testVector3 = createRandomVector(TEST_DIMENSION);
private final float[] testVector4 = createRandomVector(TEST_DIMENSION);
private final float[] testVector5 = createRandomVector(TEST_DIMENSION);
private final float[] testVector6 = createRandomVector(TEST_DIMENSION);

@Before
public void setUp() throws Exception {
@@ -318,7 +320,7 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME,
"5",
Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
Collections.singletonList(Floats.asList(testVector4).toArray()),
Collections.singletonList(Floats.asList(testVector5).toArray()),
Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
Collections.singletonList(TEST_DOC_TEXT4)
);
@@ -365,15 +367,15 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
TEST_MULTI_DOC_INDEX_THREE_SHARDS_NAME,
"5",
Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
Collections.singletonList(Floats.asList(testVector4).toArray()),
Collections.singletonList(Floats.asList(testVector5).toArray()),
Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
Collections.singletonList(TEST_DOC_TEXT4)
);
addKnnDoc(
TEST_MULTI_DOC_INDEX_THREE_SHARDS_NAME,
"6",
Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
Collections.singletonList(Floats.asList(testVector4).toArray()),
Collections.singletonList(Floats.asList(testVector6).toArray()),
Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
Collections.singletonList(TEST_DOC_TEXT5)
);
@@ -63,7 +63,7 @@ public void testCombination_whenMultipleSubqueriesResultsAndDefaultMethod_thenSc
assertNotNull(queryTopDocs);
assertEquals(3, queryTopDocs.size());

assertEquals(3, queryTopDocs.get(0).getScoreDocs().size());
assertEquals(5, queryTopDocs.get(0).getScoreDocs().size());
assertEquals(.5, queryTopDocs.get(0).getScoreDocs().get(0).score, DELTA_FOR_SCORE_ASSERTION);
assertEquals(1, queryTopDocs.get(0).getScoreDocs().get(0).doc);
assertEquals(.5, queryTopDocs.get(0).getScoreDocs().get(1).score, DELTA_FOR_SCORE_ASSERTION);
