From 8394134d99ca68b5acc58833b89c7b0ce3c44c71 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 27 Oct 2023 11:59:53 +0800 Subject: [PATCH 1/7] rm bounded linear feature query Signed-off-by: zhichao-aws --- .../lucene/BoundedLinearFeatureQuery.java | 237 ------------------ .../query/NeuralSparseQueryBuilder.java | 10 +- 2 files changed, 2 insertions(+), 245 deletions(-) delete mode 100644 src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java diff --git a/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java b/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java deleted file mode 100644 index a914f3156..000000000 --- a/src/main/java/org/apache/lucene/BoundedLinearFeatureQuery.java +++ /dev/null @@ -1,237 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. - */ - -/* - * This class is built based on lucene FeatureQuery. We use LinearFuntion to - * build the query and add an upperbound to it. - */ - -package org.apache.lucene; - -import java.io.IOException; -import java.util.Objects; - -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.ImpactsDISI; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MaxScoreCache; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryVisitor; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.similarities.Similarity.SimScorer; -import org.apache.lucene.util.BytesRef; - -/** - * The feature queries of input tokens are wrapped by lucene BooleanQuery, which use WAND algorithm - * to accelerate the execution. The WAND algorithm leverage the score upper bound of sub-queries to - * skip non-competitive tokens. However, origin lucene FeatureQuery use Float.MAX_VALUE as the score - * upper bound, and this invalidates WAND. - * - * To mitigate this issue, we rewrite the FeatureQuery to BoundedLinearFeatureQuery. The caller can - * set the token score upperbound of this query. And according to our use case, we use LinearFunction - * as the score function. - * - * This class combines both FeatureQuery - * and FeatureField together - * and will be deprecated after OpenSearch upgraded lucene to version 9.8. - */ - -public final class BoundedLinearFeatureQuery extends Query { - - private final String fieldName; - private final String featureName; - private final Float scoreUpperBound; - - public BoundedLinearFeatureQuery(String fieldName, String featureName, Float scoreUpperBound) { - this.fieldName = Objects.requireNonNull(fieldName); - this.featureName = Objects.requireNonNull(featureName); - this.scoreUpperBound = Objects.requireNonNull(scoreUpperBound); - } - - @Override - public Query rewrite(IndexSearcher indexSearcher) throws IOException { - // LinearFunction return same object for rewrite - return super.rewrite(indexSearcher); - } - - @Override - public boolean equals(Object obj) { - if (obj == null || getClass() != obj.getClass()) { - return false; - } - BoundedLinearFeatureQuery that = (BoundedLinearFeatureQuery) obj; - return Objects.equals(fieldName, that.fieldName) - && Objects.equals(featureName, that.featureName) - && Objects.equals(scoreUpperBound, that.scoreUpperBound); - } - - @Override - public int hashCode() { - int h = getClass().hashCode(); - h = 31 * h + fieldName.hashCode(); - h = 31 * h + featureName.hashCode(); - h = 31 * h + scoreUpperBound.hashCode(); - return h; - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - if (!scoreMode.needsScores()) { - // We don't need scores (e.g. for faceting), and since features are stored as terms, - // allow TermQuery to optimize in this case - TermQuery tq = new TermQuery(new Term(fieldName, featureName)); - return searcher.rewrite(tq).createWeight(searcher, scoreMode, boost); - } - - return new Weight(this) { - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return false; - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - String desc = "weight(" + getQuery() + " in " + doc + ") [\" BoundedLinearFeatureQuery \"]"; - - Terms terms = context.reader().terms(fieldName); - if (terms == null) { - return Explanation.noMatch(desc + ". Field " + fieldName + " doesn't exist."); - } - TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seekExact(new BytesRef(featureName)) == false) { - return Explanation.noMatch(desc + ". Feature " + featureName + " doesn't exist."); - } - - PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS); - if (postings.advance(doc) != doc) { - return Explanation.noMatch(desc + ". Feature " + featureName + " isn't set."); - } - - int freq = postings.freq(); - float featureValue = decodeFeatureValue(freq); - float score = boost * featureValue; - return Explanation.match( - score, - "Linear function on the " + fieldName + " field for the " + featureName + " feature, computed as w * S from:", - Explanation.match(boost, "w, weight of this function"), - Explanation.match(featureValue, "S, feature value") - ); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - Terms terms = Terms.getTerms(context.reader(), fieldName); - TermsEnum termsEnum = terms.iterator(); - if (termsEnum.seekExact(new BytesRef(featureName)) == false) { - return null; - } - - final SimScorer scorer = new SimScorer() { - @Override - public float score(float freq, long norm) { - return boost * decodeFeatureValue(freq); - } - }; - final ImpactsEnum impacts = termsEnum.impacts(PostingsEnum.FREQS); - MaxScoreCache maxScoreCache = new MaxScoreCache(impacts, scorer); - final ImpactsDISI impactsDisi = new ImpactsDISI(impacts, maxScoreCache); - - return new Scorer(this) { - - @Override - public int docID() { - return impacts.docID(); - } - - @Override - public float score() throws IOException { - return scorer.score(impacts.freq(), 1L); - } - - @Override - public DocIdSetIterator iterator() { - return impactsDisi; - } - - @Override - public int advanceShallow(int target) throws IOException { - return impactsDisi.getMaxScoreCache().advanceShallow(target); - } - - @Override - public float getMaxScore(int upTo) throws IOException { - return impactsDisi.getMaxScoreCache().getMaxScore(upTo); - } - - @Override - public void setMinCompetitiveScore(float minScore) { - impactsDisi.setMinCompetitiveScore(minScore); - } - }; - } - }; - } - - @Override - public void visit(QueryVisitor visitor) { - if (visitor.acceptField(fieldName)) { - visitor.visitLeaf(this); - } - } - - @Override - public String toString(String field) { - return "BoundedLinearFeatureQuery(field=" + fieldName + ", feature=" + featureName + ", scoreUpperBound=" + scoreUpperBound + ")"; - } - - // the field and decodeFeatureValue are modified from FeatureField.decodeFeatureValue - static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15; - - // Rewriting this function to make scoreUpperBound work. - private float decodeFeatureValue(float freq) { - if (freq > MAX_FREQ) { - return scoreUpperBound; - } - int tf = (int) freq; // lossless - int featureBits = tf << 15; - return Math.min(Float.intBitsToFloat(featureBits), scoreUpperBound); - } -} diff --git a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java index d883af23d..2373ad060 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java +++ b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java @@ -21,10 +21,9 @@ import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.builder.EqualsBuilder; import org.apache.commons.lang.builder.HashCodeBuilder; -import org.apache.lucene.BoundedLinearFeatureQuery; +import org.apache.lucene.document.FeatureField; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; import org.apache.lucene.search.Query; import org.opensearch.common.SetOnce; import org.opensearch.core.ParseField; @@ -238,14 +237,9 @@ protected Query doToQuery(QueryShardContext context) throws IOException { Map queryTokens = queryTokensSupplier.get(); validateQueryTokens(queryTokens); - final Float scoreUpperBound = maxTokenScore != null ? maxTokenScore : Float.MAX_VALUE; - BooleanQuery.Builder builder = new BooleanQuery.Builder(); for (Map.Entry entry : queryTokens.entrySet()) { - builder.add( - new BoostQuery(new BoundedLinearFeatureQuery(fieldName, entry.getKey(), scoreUpperBound), entry.getValue()), - BooleanClause.Occur.SHOULD - ); + builder.add(FeatureField.newLinearQuery(fieldName, entry.getKey(), entry.getValue()), BooleanClause.Occur.SHOULD); } return builder.build(); } From dd0b44b9ab641da630c0d6d39c6922c98aaa327a Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Mon, 30 Oct 2023 10:39:51 +0800 Subject: [PATCH 2/7] deprecate max_token_score Signed-off-by: zhichao-aws --- .../query/NeuralSparseQueryBuilder.java | 7 +++-- .../query/NeuralSparseQueryBuilderTests.java | 26 +++++++++++++++++++ .../query/NeuralSparseQueryIT.java | 11 +------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java index 2373ad060..1274107b7 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java +++ b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java @@ -61,8 +61,10 @@ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder firstInnerHit = getFirstInnerHit(searchResponseAsMap); assertEquals("1", firstInnerHit.get("_id")); - Map queryTokens = runSparseModelInference(modelId, TEST_QUERY_TEXT); - float expectedScore = 0f; - for (Map.Entry entry : queryTokens.entrySet()) { - if (testRankFeaturesDoc.containsKey(entry.getKey())) { - expectedScore += entry.getValue() * Math.min( - getFeatureFieldCompressedNumber(testRankFeaturesDoc.get(entry.getKey())), - maxTokenScore - ); - } - } + float expectedScore = computeExpectedScore(modelId, testRankFeaturesDoc, TEST_QUERY_TEXT); assertEquals(expectedScore, objectToFloat(firstInnerHit.get("_score")), DELTA); } From 354560aa7c183820a944c3ece1e616ae62fe5866 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Mon, 30 Oct 2023 10:47:40 +0800 Subject: [PATCH 3/7] add changelog Signed-off-by: zhichao-aws --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 10063185c..438c4d65a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,3 +21,4 @@ Fixed exception for case when Hybrid query being wrapped into bool query ([#490] ### Documentation ### Maintenance ### Refactoring +Deprecate the `max_token_score` field in `neural_sparse` query clause ([#478](https://github.com/opensearch-project/neural-search/pull/478)) From daef81e7f7d37808feb5ed0c114b55d6d9597a02 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Mon, 30 Oct 2023 10:52:12 +0800 Subject: [PATCH 4/7] tidy Signed-off-by: zhichao-aws --- .../query/NeuralSparseQueryBuilderTests.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java index 6588b8c77..8681deeb5 100644 --- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java +++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java @@ -129,13 +129,13 @@ public void testFromXContent_whenBuiltWithMaxTokenScore_thenThrowWarning() { } */ XContentBuilder xContentBuilder = XContentFactory.jsonBuilder() - .startObject() - .startObject(FIELD_NAME) - .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT) - .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID) - .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE) - .endObject() - .endObject(); + .startObject() + .startObject(FIELD_NAME) + .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT) + .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID) + .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE) + .endObject() + .endObject(); XContentParser contentParser = createParser(xContentBuilder); contentParser.nextToken(); From f52c4431695fa9b5df14e44c16b4396913760426 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Mon, 30 Oct 2023 11:30:49 +0800 Subject: [PATCH 5/7] fix ut Signed-off-by: zhichao-aws --- .../query/NeuralSparseQueryBuilderTests.java | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java index 8681deeb5..12e5394f9 100644 --- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java +++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java @@ -88,7 +88,6 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() { "VECTOR_FIELD": { "query_text": "string", "model_id": "string", - "max_token_score": 123.0, "boost": 10.0, "_name": "something", } @@ -99,7 +98,6 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() { .startObject(FIELD_NAME) .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT) .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID) - .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE) .field(BOOST_FIELD.getPreferredName(), BOOST) .field(NAME_FIELD.getPreferredName(), QUERY_NAME) .endObject() @@ -112,7 +110,6 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() { assertEquals(FIELD_NAME, sparseEncodingQueryBuilder.fieldName()); assertEquals(QUERY_TEXT, sparseEncodingQueryBuilder.queryText()); assertEquals(MODEL_ID, sparseEncodingQueryBuilder.modelId()); - assertEquals(MAX_TOKEN_SCORE, sparseEncodingQueryBuilder.maxTokenScore(), 0.0); assertEquals(BOOST, sparseEncodingQueryBuilder.boost(), 0.0); assertEquals(QUERY_NAME, sparseEncodingQueryBuilder.queryName()); } @@ -193,30 +190,6 @@ public void testFromXContent_whenBuildWithMissingQuery_thenFail() { expectThrows(IllegalArgumentException.class, () -> NeuralSparseQueryBuilder.fromXContent(contentParser)); } - @SneakyThrows - public void testFromXContent_whenBuildWithNegativeMaxTokenScore_thenFail() { - /* - { - "VECTOR_FIELD": { - "query_text": "string", - "model_id": "string", - "max_token_score": -1 - } - } - */ - XContentBuilder xContentBuilder = XContentFactory.jsonBuilder() - .startObject() - .startObject(FIELD_NAME) - .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID) - .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), -1f) - .endObject() - .endObject(); - - XContentParser contentParser = createParser(xContentBuilder); - contentParser.nextToken(); - expectThrows(IllegalArgumentException.class, () -> NeuralSparseQueryBuilder.fromXContent(contentParser)); - } - @SneakyThrows public void testFromXContent_whenBuildWithMissingModelId_thenFail() { /* From 79d25e349da50131666848fe5869dde3b5abc6bd Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 3 Nov 2023 13:41:08 +0800 Subject: [PATCH 6/7] add ut Signed-off-by: zhichao-aws --- .../query/NeuralSparseQueryBuilderTests.java | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java index 12e5394f9..9d1a1627b 100644 --- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java +++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java @@ -26,6 +26,9 @@ import lombok.SneakyThrows; +import org.apache.lucene.document.FeatureField; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.opensearch.client.Client; import org.opensearch.common.SetOnce; import org.opensearch.common.io.stream.BytesStreamOutput; @@ -38,9 +41,11 @@ import org.opensearch.core.xcontent.ToXContent; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.query.MatchAllQueryBuilder; import org.opensearch.index.query.QueryBuilder; import org.opensearch.index.query.QueryRewriteContext; +import org.opensearch.index.query.QueryShardContext; import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor; import org.opensearch.test.OpenSearchTestCase; @@ -497,4 +502,23 @@ public void testRewrite_whenQueryTokensSupplierSet_thenReturnSelf() { queryBuilder = sparseEncodingQueryBuilder.doRewrite(null); assertTrue(queryBuilder == sparseEncodingQueryBuilder); } + + @SneakyThrows + public void testDoToQuery_successfulDoToQuery() { + NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder().fieldName(FIELD_NAME) + .maxTokenScore(MAX_TOKEN_SCORE) + .queryText(QUERY_TEXT) + .modelId(MODEL_ID) + .queryTokensSupplier(QUERY_TOKENS_SUPPLIER); + QueryShardContext mockedQueryShardContext = mock(QueryShardContext.class); + MappedFieldType mockedMappedFieldType = mock(MappedFieldType.class); + doAnswer(invocation -> "rank_features").when(mockedMappedFieldType).typeName(); + doAnswer(invocation -> mockedMappedFieldType).when(mockedQueryShardContext).fieldMapper(any()); + + BooleanQuery.Builder targetQueryBuilder = new BooleanQuery.Builder(); + targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f), BooleanClause.Occur.SHOULD); + targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f), BooleanClause.Occur.SHOULD); + + assertEquals(sparseEncodingQueryBuilder.doToQuery(mockedQueryShardContext), targetQueryBuilder.build()); + } } From 617f1541b09769f24bd6123fbb37ae0128e4b765 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Fri, 3 Nov 2023 17:50:24 +0800 Subject: [PATCH 7/7] add deprecation annotation Signed-off-by: zhichao-aws --- .../opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java index 1274107b7..20eeb2e11 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java +++ b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java @@ -64,6 +64,7 @@ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder