Adding aggregations in hybrid query (opensearch-project#630)

* Adding aggregations in hybrid query

Signed-off-by: Martin Gaievski <[email protected]>
martin-gaievski · Mar 12, 2024 · f04c058 · f04c058
1 parent c9cdcc1
commit f04c058
Show file tree

Hide file tree

Showing 12 changed files with 1,044 additions and 67 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,19 +8,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ### Enhancements
 ### Bug Fixes
 - Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438))
-- Fixed exception for case when Hybrid query being wrapped into bool query ([#490](https://github.com/opensearch-project/neural-search/pull/490))
-- Hybrid query and nested type fields ([#498](https://github.com/opensearch-project/neural-search/pull/498))
 - Fix typo for sparse encoding processor factory([#578](https://github.com/opensearch-project/neural-search/pull/578))
 - Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615))
 ### Infrastructure
 ### Documentation
 ### Maintenance
-- Added support for jdk-21 ([#500](https://github.com/opensearch-project/neural-search/pull/500)))
 ### Refactoring
 
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.12...2.x)
 ### Features
 ### Enhancements
+- Adding aggregations in hybrid query ([#630](https://github.com/opensearch-project/neural-search/pull/630))
 ### Bug Fixes
 - Fix runtime exceptions in hybrid query for case when sub-query scorer return TwoPhase iterator that is incompatible with DISI iterator ([#624](https://github.com/opensearch-project/neural-search/pull/624))
 ### Infrastructure

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java b/src/main/java/org/opensearch/neuralsearch/processor/combination/ScoreCombiner.java
@@ -10,6 +10,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Objects;
+import java.util.Set;
 import java.util.stream.Collectors;
 
 import org.apache.lucene.search.ScoreDoc;
@@ -131,13 +132,18 @@ private void updateQueryTopDocsWithCombinedScores(
         compoundQueryTopDocs.setTotalHits(getTotalHits(topDocsPerSubQuery, maxHits));
     }
 
+    /**
+     * Get max hits as number of unique doc ids across results of all sub-queries; TopDocs with null scoreDocs are skipped
+     * @param topDocsPerSubQuery list of topDocs objects for one shard
+     * @return number of unique doc ids
+     */
     protected int getMaxHits(final List<TopDocs> topDocsPerSubQuery) {
-        int maxHits = 0;
-        for (TopDocs topDocs : topDocsPerSubQuery) {
-            int hits = topDocs.scoreDocs.length;
-            maxHits = Math.max(maxHits, hits);
-        }
-        return maxHits;
+        Set<Integer> docIds = topDocsPerSubQuery.stream()
+            .filter(topDocs -> Objects.nonNull(topDocs.scoreDocs))
+            .flatMap(topDocs -> Arrays.stream(topDocs.scoreDocs))
+            .map(scoreDoc -> scoreDoc.doc)
+            .collect(Collectors.toSet());
+        return docIds.size();
     }
 
     private TotalHits getTotalHits(final List<TopDocs> topDocsPerSubQuery, int maxHits) {

diff --git a/src/main/java/org/opensearch/neuralsearch/search/query/HybridAggregationProcessor.java b/src/main/java/org/opensearch/neuralsearch/search/query/HybridAggregationProcessor.java
@@ -16,7 +16,7 @@
 import java.io.IOException;
 import java.util.List;
 
-import static org.opensearch.neuralsearch.search.query.HybridQueryPhaseSearcher.isHybridQuery;
+import static org.opensearch.neuralsearch.util.HybridQueryUtil.isHybridQuery;
 
 /**
  * Defines logic for pre- and post-phases of document scores collection. Responsible for registering custom

diff --git a/src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryPhaseSearcher.java b/src/main/java/org/opensearch/neuralsearch/search/query/HybridQueryPhaseSearcher.java
@@ -11,11 +11,9 @@
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.Query;
 import org.opensearch.common.settings.Settings;
 import org.opensearch.index.mapper.MapperService;
-import org.opensearch.index.mapper.SeqNoFieldMapper;
 import org.opensearch.index.search.NestedHelper;
 import org.opensearch.neuralsearch.query.HybridQuery;
 import org.opensearch.search.aggregations.AggregationProcessor;
@@ -27,17 +25,15 @@
 
 import lombok.extern.log4j.Log4j2;
 
+import static org.opensearch.neuralsearch.util.HybridQueryUtil.isHybridQuery;
+
 /**
  * Custom search implementation to be used at {@link QueryPhase} for Hybrid Query search. For queries other than Hybrid the
  * upstream standard implementation of searcher is called.
  */
 @Log4j2
 public class HybridQueryPhaseSearcher extends QueryPhaseSearcherWrapper {
 
-    public HybridQueryPhaseSearcher() {
-        super();
-    }
-
     public boolean searchWith(
         final SearchContext searchContext,
         final ContextIndexSearcher searcher,
@@ -55,46 +51,6 @@ public boolean searchWith(
         }
     }
 
-    @VisibleForTesting
-    static boolean isHybridQuery(final Query query, final SearchContext searchContext) {
-        if (query instanceof HybridQuery) {
-            return true;
-        } else if (isWrappedHybridQuery(query) && hasNestedFieldOrNestedDocs(query, searchContext)) {
-            /* Checking if this is a hybrid query that is wrapped into a Bool query by core Opensearch code
-            https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/search/DefaultSearchContext.java#L367-L370.
-            main reason for that is performance optimization, at time of writing we are ok with loosing on performance if that's unblocks
-            hybrid query for indexes with nested field types.
-            in such case we consider query a valid hybrid query. Later in the code we will extract it and execute as a main query for
-            this search request.
-            below is sample structure of such query:
-
-            Boolean {
-               should: {
-                   hybrid: {
-                       sub_query1 {}
-                       sub_query2 {}
-                   }
-               }
-               filter: {
-                   exists: {
-                       field: "_primary_term"
-                   }
-               }
-            }
-            TODO Need to add logic for passing hybrid sub-queries through the same logic in core to ensure there is no latency regression */
-            // we have already checked if query in instance of Boolean in higher level else if condition
-            return ((BooleanQuery) query).clauses()
-                .stream()
-                .filter(clause -> !(clause.getQuery() instanceof HybridQuery))
-                .allMatch(clause -> {
-                    return clause.getOccur() == BooleanClause.Occur.FILTER
-                        && clause.getQuery() instanceof FieldExistsQuery
-                        && SeqNoFieldMapper.PRIMARY_TERM_NAME.equals(((FieldExistsQuery) clause.getQuery()).getField());
-                });
-        }
-        return false;
-    }
-
     private static boolean hasNestedFieldOrNestedDocs(final Query query, final SearchContext searchContext) {
         return searchContext.mapperService().hasNested() && new NestedHelper(searchContext.mapperService()).mightMatchNestedDocs(query);
     }

diff --git a/src/main/java/org/opensearch/neuralsearch/util/HybridQueryUtil.java b/src/main/java/org/opensearch/neuralsearch/util/HybridQueryUtil.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+package org.opensearch.neuralsearch.util;
+
+import lombok.AccessLevel;
+import lombok.NoArgsConstructor;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FieldExistsQuery;
+import org.apache.lucene.search.Query;
+import org.opensearch.index.mapper.SeqNoFieldMapper;
+import org.opensearch.index.search.NestedHelper;
+import org.opensearch.neuralsearch.query.HybridQuery;
+import org.opensearch.search.internal.SearchContext;
+
+/**
+ * Static helpers for recognizing hybrid queries; not instantiable (the no-args constructor is private)
+ */
+@NoArgsConstructor(access = AccessLevel.PRIVATE)
+public class HybridQueryUtil {
+    /** Checks whether query is a {@link HybridQuery}, either directly or wrapped in a BooleanQuery (details in the body comment). */
+    public static boolean isHybridQuery(final Query query, final SearchContext searchContext) {
+        if (query instanceof HybridQuery) {
+            return true;
+        } else if (isWrappedHybridQuery(query) && hasNestedFieldOrNestedDocs(query, searchContext)) {
+            /* Checking if this is a hybrid query that is wrapped into a Bool query by core OpenSearch code
+            https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/search/DefaultSearchContext.java#L367-L370.
+            The main reason for that wrapping is performance optimization; at the time of writing we are ok with losing some performance
+            if that unblocks hybrid query for indexes with nested field types.
+            In such a case we consider the query a valid hybrid query. Later in the code we will extract it and execute it as the main
+            query for this search request.
+            Below is a sample structure of such a query:
+
+            Boolean {
+               should: {
+                   hybrid: {
+                       sub_query1 {}
+                       sub_query2 {}
+                   }
+               }
+               filter: {
+                   exists: {
+                       field: "_primary_term"
+                   }
+               }
+            }
+            TODO Need to add logic for passing hybrid sub-queries through the same logic in core to ensure there is no latency regression */
+            // the cast is safe: isWrappedHybridQuery in the else-if condition above already verified that query is a BooleanQuery
+            return ((BooleanQuery) query).clauses()
+                .stream()
+                .filter(clause -> clause.getQuery() instanceof HybridQuery == false)
+                .allMatch(clause -> {
+                    return clause.getOccur() == BooleanClause.Occur.FILTER
+                        && clause.getQuery() instanceof FieldExistsQuery
+                        && SeqNoFieldMapper.PRIMARY_TERM_NAME.equals(((FieldExistsQuery) clause.getQuery()).getField());
+                });
+        }
+        return false;
+    }
+
+    private static boolean hasNestedFieldOrNestedDocs(final Query query, final SearchContext searchContext) {
+        return searchContext.mapperService().hasNested() && new NestedHelper(searchContext.mapperService()).mightMatchNestedDocs(query);
+    }
+
+    private static boolean isWrappedHybridQuery(final Query query) {
+        return query instanceof BooleanQuery
+            && ((BooleanQuery) query).clauses().stream().anyMatch(clauseQuery -> clauseQuery.getQuery() instanceof HybridQuery);
+    }
+}
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java b/src/test/java/org/opensearch/neuralsearch/processor/NormalizationProcessorIT.java
@@ -52,6 +52,8 @@ public class NormalizationProcessorIT extends BaseNeuralSearchIT {
     private final float[] testVector2 = createRandomVector(TEST_DIMENSION);
     private final float[] testVector3 = createRandomVector(TEST_DIMENSION);
     private final float[] testVector4 = createRandomVector(TEST_DIMENSION);
+    private final float[] testVector5 = createRandomVector(TEST_DIMENSION);
+    private final float[] testVector6 = createRandomVector(TEST_DIMENSION);
 
     @Before
     public void setUp() throws Exception {
@@ -318,7 +320,7 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
                 TEST_MULTI_DOC_INDEX_ONE_SHARD_NAME,
                 "5",
                 Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
-                Collections.singletonList(Floats.asList(testVector4).toArray()),
+                Collections.singletonList(Floats.asList(testVector5).toArray()),
                 Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
                 Collections.singletonList(TEST_DOC_TEXT4)
             );
@@ -365,15 +367,15 @@ private void initializeIndexIfNotExist(String indexName) throws IOException {
                 TEST_MULTI_DOC_INDEX_THREE_SHARDS_NAME,
                 "5",
                 Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
-                Collections.singletonList(Floats.asList(testVector4).toArray()),
+                Collections.singletonList(Floats.asList(testVector5).toArray()),
                 Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
                 Collections.singletonList(TEST_DOC_TEXT4)
             );
             addKnnDoc(
                 TEST_MULTI_DOC_INDEX_THREE_SHARDS_NAME,
                 "6",
                 Collections.singletonList(TEST_KNN_VECTOR_FIELD_NAME_1),
-                Collections.singletonList(Floats.asList(testVector4).toArray()),
+                Collections.singletonList(Floats.asList(testVector6).toArray()),
                 Collections.singletonList(TEST_TEXT_FIELD_NAME_1),
                 Collections.singletonList(TEST_DOC_TEXT5)
             );

diff --git a/src/test/java/org/opensearch/neuralsearch/processor/ScoreCombinationTechniqueTests.java b/src/test/java/org/opensearch/neuralsearch/processor/ScoreCombinationTechniqueTests.java
@@ -63,7 +63,7 @@ public void testCombination_whenMultipleSubqueriesResultsAndDefaultMethod_thenSc
         assertNotNull(queryTopDocs);
         assertEquals(3, queryTopDocs.size());
 
-        assertEquals(3, queryTopDocs.get(0).getScoreDocs().size());
+        assertEquals(5, queryTopDocs.get(0).getScoreDocs().size());
         assertEquals(.5, queryTopDocs.get(0).getScoreDocs().get(0).score, DELTA_FOR_SCORE_ASSERTION);
         assertEquals(1, queryTopDocs.get(0).getScoreDocs().get(0).doc);
         assertEquals(.5, queryTopDocs.get(0).getScoreDocs().get(1).score, DELTA_FOR_SCORE_ASSERTION);