From 7b0229d1887fd28c99025e661d6de428f045b704 Mon Sep 17 00:00:00 2001 From: zhichao-aws Date: Sat, 20 Apr 2024 14:28:34 +0800 Subject: [PATCH] [BUG FIX] Fix bwc failure in neural sparse search (#696) --- CHANGELOG.md | 1 + .../AbstractRestartUpgradeRestTestCase.java | 2 - .../bwc/AbstractRollingUpgradeTestCase.java | 2 - .../query/NeuralSparseQueryBuilder.java | 32 +++++-- .../query/NeuralSparseQueryBuilderTests.java | 88 +++++++++++++++++-- 5 files changed, 107 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67bdc1705..eae99470c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Fix async actions are left in neural_sparse query ([#438](https://github.com/opensearch-project/neural-search/pull/438)) - Fix typo for sparse encoding processor factory([#578](https://github.com/opensearch-project/neural-search/pull/578)) - Add non-null check for queryBuilder in NeuralQueryEnricherProcessor ([#615](https://github.com/opensearch-project/neural-search/pull/615)) +- Add max_token_score field placeholder in NeuralSparseQueryBuilder to fix the rolling-upgrade from 2.x nodes bwc tests. ([#696](https://github.com/opensearch-project/neural-search/pull/696)) ### Infrastructure - Adding integration tests for scenario of hybrid query with aggregations ([#632](https://github.com/opensearch-project/neural-search/pull/632)) ### Documentation diff --git a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java index 395573c6a..331ee714a 100644 --- a/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java +++ b/qa/restart-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRestartUpgradeRestTestCase.java @@ -4,11 +4,9 @@ */ package org.opensearch.neuralsearch.bwc; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.Locale; -import java.util.Objects; import java.util.Optional; import org.junit.Before; import org.opensearch.common.settings.Settings; diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java index ed1613e2f..489aefd6e 100644 --- a/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java +++ b/qa/rolling-upgrade/src/test/java/org/opensearch/neuralsearch/bwc/AbstractRollingUpgradeTestCase.java @@ -4,11 +4,9 @@ */ package org.opensearch.neuralsearch.bwc; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.Locale; -import java.util.Objects; import java.util.Optional; import org.junit.Before; import org.opensearch.common.settings.Settings; diff --git a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java index 58e912788..319f4b356 100644 --- a/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java +++ b/src/main/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilder.java @@ -46,7 +46,7 @@ import lombok.extern.log4j.Log4j2; /** - * SparseEncodingQueryBuilder is responsible for handling "neural_sparse" query types. It uses an ML SPARSE_ENCODING model + * SparseEncodingQueryBuilder is responsible for handling "neural_sparse" query types. It uses an ML NEURAL_SPARSE model * or SPARSE_TOKENIZE model to produce a Map with String keys and Float values for input text. Then it will be transformed * to Lucene FeatureQuery wrapped by Lucene BooleanQuery. */ @@ -63,6 +63,11 @@ public class NeuralSparseQueryBuilder extends AbstractQueryBuilder> queryTokensSupplier; private static final Version MINIMAL_SUPPORTED_VERSION_DEFAULT_MODEL_ID = Version.V_2_13_0; @@ -91,6 +97,7 @@ public NeuralSparseQueryBuilder(StreamInput in) throws IOException { } else { this.modelId = in.readString(); } + this.maxTokenScore = in.readOptionalFloat(); if (in.readBoolean()) { Map queryTokens = in.readMap(StreamInput::readString, StreamInput::readFloat); this.queryTokensSupplier = () -> queryTokens; @@ -106,6 +113,7 @@ protected void doWriteTo(StreamOutput out) throws IOException { } else { out.writeString(this.modelId); } + out.writeOptionalFloat(maxTokenScore); if (!Objects.isNull(queryTokensSupplier) && !Objects.isNull(queryTokensSupplier.get())) { out.writeBoolean(true); out.writeMap(queryTokensSupplier.get(), StreamOutput::writeString, StreamOutput::writeFloat); @@ -122,6 +130,7 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws if (Objects.nonNull(modelId)) { xContentBuilder.field(MODEL_ID_FIELD.getPreferredName(), modelId); } + if (maxTokenScore != null) xContentBuilder.field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), maxTokenScore); printBoostAndQueryName(xContentBuilder); xContentBuilder.endObject(); xContentBuilder.endObject(); @@ -131,7 +140,8 @@ protected void doXContent(XContentBuilder xContentBuilder, Params params) throws * The expected parsing form looks like: * "SAMPLE_FIELD": { * "query_text": "string", - * "model_id": "string" + * "model_id": "string", + * "max_token_score": float (optional) * } * * @param parser XContentParser @@ -189,6 +199,8 @@ private static void parseQueryParams(XContentParser parser, NeuralSparseQueryBui sparseEncodingQueryBuilder.queryText(parser.text()); } else if (MODEL_ID_FIELD.match(currentFieldName, parser.getDeprecationHandler())) { sparseEncodingQueryBuilder.modelId(parser.text()); + } else if (MAX_TOKEN_SCORE_FIELD.match(currentFieldName, parser.getDeprecationHandler())) { + sparseEncodingQueryBuilder.maxTokenScore(parser.floatValue()); } else { throw new ParsingException( parser.getTokenLocation(), @@ -227,6 +239,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext queryRewriteContext) throws return new NeuralSparseQueryBuilder().fieldName(fieldName) .queryText(queryText) .modelId(modelId) + .maxTokenScore(maxTokenScore) .queryTokensSupplier(queryTokensSetOnce::get); } @@ -280,13 +293,14 @@ private static void validateQueryTokens(Map queryTokens) { @Override protected boolean doEquals(NeuralSparseQueryBuilder obj) { if (this == obj) return true; - if (Objects.isNull(obj) || getClass() != obj.getClass()) return false; - if (Objects.isNull(queryTokensSupplier) && !Objects.isNull(obj.queryTokensSupplier)) return false; - if (!Objects.isNull(queryTokensSupplier) && Objects.isNull(obj.queryTokensSupplier)) return false; + if (obj == null || getClass() != obj.getClass()) return false; + if (queryTokensSupplier == null && obj.queryTokensSupplier != null) return false; + if (queryTokensSupplier != null && obj.queryTokensSupplier == null) return false; EqualsBuilder equalsBuilder = new EqualsBuilder().append(fieldName, obj.fieldName) .append(queryText, obj.queryText) - .append(modelId, obj.modelId); - if (!Objects.isNull(queryTokensSupplier)) { + .append(modelId, obj.modelId) + .append(maxTokenScore, obj.maxTokenScore); + if (queryTokensSupplier != null) { equalsBuilder.append(queryTokensSupplier.get(), obj.queryTokensSupplier.get()); } return equalsBuilder.isEquals(); @@ -294,8 +308,8 @@ protected boolean doEquals(NeuralSparseQueryBuilder obj) { @Override protected int doHashCode() { - HashCodeBuilder builder = new HashCodeBuilder().append(fieldName).append(queryText).append(modelId); - if (!Objects.isNull(queryTokensSupplier)) { + HashCodeBuilder builder = new HashCodeBuilder().append(fieldName).append(queryText).append(modelId).append(maxTokenScore); + if (queryTokensSupplier != null) { builder.append(queryTokensSupplier.get()); } return builder.toHashCode(); diff --git a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java index 89bcd57d7..90b635e7c 100644 --- a/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java +++ b/src/test/java/org/opensearch/neuralsearch/query/NeuralSparseQueryBuilderTests.java @@ -10,6 +10,7 @@ import static org.opensearch.index.query.AbstractQueryBuilder.BOOST_FIELD; import static org.opensearch.index.query.AbstractQueryBuilder.NAME_FIELD; import static org.opensearch.neuralsearch.TestUtils.xContentBuilderToMap; +import static org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder.MAX_TOKEN_SCORE_FIELD; import static org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder.MODEL_ID_FIELD; import static org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder.NAME; import static org.opensearch.neuralsearch.query.NeuralSparseQueryBuilder.QUERY_TEXT_FIELD; @@ -22,6 +23,9 @@ import java.util.function.BiConsumer; import java.util.function.Supplier; +import org.apache.lucene.document.FeatureField; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; import org.junit.Before; import org.opensearch.Version; import org.opensearch.client.Client; @@ -37,9 +41,11 @@ import org.opensearch.core.xcontent.ToXContent; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.xcontent.XContentParser; +import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.index.query.MatchAllQueryBuilder; import org.opensearch.index.query.QueryBuilder; import org.opensearch.index.query.QueryRewriteContext; +import org.opensearch.index.query.QueryShardContext; import org.opensearch.neuralsearch.ml.MLCommonsClientAccessor; import org.opensearch.neuralsearch.util.NeuralSearchClusterTestUtils; import org.opensearch.neuralsearch.util.NeuralSearchClusterUtil; @@ -54,6 +60,7 @@ public class NeuralSparseQueryBuilderTests extends OpenSearchTestCase { private static final String MODEL_ID = "mfgfgdsfgfdgsde"; private static final float BOOST = 1.8f; private static final String QUERY_NAME = "queryName"; + private static final Float MAX_TOKEN_SCORE = 123f; private static final Supplier> QUERY_TOKENS_SUPPLIER = () -> Map.of("hello", 1.f, "world", 2.f); @Before @@ -121,6 +128,32 @@ public void testFromXContent_whenBuiltWithOptionals_thenBuildSuccessfully() { assertEquals(QUERY_NAME, sparseEncodingQueryBuilder.queryName()); } + @SneakyThrows + public void testFromXContent_whenBuiltWithMaxTokenScore_thenThrowWarning() { + /* + { + "VECTOR_FIELD": { + "query_text": "string", + "model_id": "string", + "max_token_score": 123.0 + } + } + */ + XContentBuilder xContentBuilder = XContentFactory.jsonBuilder() + .startObject() + .startObject(FIELD_NAME) + .field(QUERY_TEXT_FIELD.getPreferredName(), QUERY_TEXT) + .field(MODEL_ID_FIELD.getPreferredName(), MODEL_ID) + .field(MAX_TOKEN_SCORE_FIELD.getPreferredName(), MAX_TOKEN_SCORE) + .endObject() + .endObject(); + + XContentParser contentParser = createParser(xContentBuilder); + contentParser.nextToken(); + NeuralSparseQueryBuilder sparseEncodingQueryBuilder = NeuralSparseQueryBuilder.fromXContent(contentParser); + assertWarnings("Deprecated field [max_token_score] used, this field is unused and will be removed entirely"); + } + @SneakyThrows public void testFromXContent_whenBuildWithMultipleRootFields_thenFail() { /* @@ -248,7 +281,8 @@ public void testFromXContent_whenBuildWithDuplicateParameters_thenFail() { public void testToXContent() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder().fieldName(FIELD_NAME) .modelId(MODEL_ID) - .queryText(QUERY_TEXT); + .queryText(QUERY_TEXT) + .maxTokenScore(MAX_TOKEN_SCORE); XContentBuilder builder = XContentFactory.jsonBuilder(); builder = sparseEncodingQueryBuilder.toXContent(builder, ToXContent.EMPTY_PARAMS); @@ -273,6 +307,7 @@ public void testToXContent() { assertEquals(MODEL_ID, secondInnerMap.get(MODEL_ID_FIELD.getPreferredName())); assertEquals(QUERY_TEXT, secondInnerMap.get(QUERY_TEXT_FIELD.getPreferredName())); + assertEquals(MAX_TOKEN_SCORE, (Double) secondInnerMap.get(MAX_TOKEN_SCORE_FIELD.getPreferredName()), 0.0); } public void testStreams_whenMinVersionIsBeforeDefaultModelId_thenSuccess() { @@ -285,6 +320,7 @@ public void testStreams() { NeuralSparseQueryBuilder original = new NeuralSparseQueryBuilder(); original.fieldName(FIELD_NAME); original.queryText(QUERY_TEXT); + original.maxTokenScore(MAX_TOKEN_SCORE); original.modelId(MODEL_ID); original.boost(BOOST); original.queryName(QUERY_NAME); @@ -306,11 +342,11 @@ public void testStreams() { queryTokensSetOnce.set(Map.of("hello", 1.0f, "world", 2.0f)); original.queryTokensSupplier(queryTokensSetOnce::get); - BytesStreamOutput streamOutput2 = new BytesStreamOutput(); - original.writeTo(streamOutput2); + streamOutput = new BytesStreamOutput(); + original.writeTo(streamOutput); filterStreamInput = new NamedWriteableAwareStreamInput( - streamOutput2.bytes().streamInput(), + streamOutput.bytes().streamInput(), new NamedWriteableRegistry( List.of(new NamedWriteableRegistry.Entry(QueryBuilder.class, MatchAllQueryBuilder.NAME, MatchAllQueryBuilder::new)) ) @@ -327,6 +363,8 @@ public void testHashAndEquals() { String queryText2 = "query text 2"; String modelId1 = "model-1"; String modelId2 = "model-2"; + float maxTokenScore1 = 1.1f; + float maxTokenScore2 = 2.2f; float boost1 = 1.8f; float boost2 = 3.8f; String queryName1 = "query-1"; @@ -337,6 +375,7 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_baseline = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1); @@ -344,18 +383,21 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_baselineCopy = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1); // Identical to sparseEncodingQueryBuilder_baseline except default boost and query name NeuralSparseQueryBuilder sparseEncodingQueryBuilder_defaultBoostAndQueryName = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) - .modelId(modelId1); + .modelId(modelId1) + .maxTokenScore(maxTokenScore1); // Identical to sparseEncodingQueryBuilder_baseline except diff field name NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffFieldName = new NeuralSparseQueryBuilder().fieldName(fieldName2) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1); @@ -363,6 +405,7 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffQueryText = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText2) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1); @@ -370,6 +413,7 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffModelId = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId2) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1); @@ -377,6 +421,7 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffBoost = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost2) .queryName(queryName1); @@ -384,13 +429,23 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffQueryName = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName2); + // Identical to sparseEncodingQueryBuilder_baseline except diff max token score + NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffMaxTokenScore = new NeuralSparseQueryBuilder().fieldName(fieldName1) + .queryText(queryText1) + .modelId(modelId1) + .maxTokenScore(maxTokenScore2) + .boost(boost1) + .queryName(queryName1); + // Identical to sparseEncodingQueryBuilder_baseline except non-null query tokens supplier NeuralSparseQueryBuilder sparseEncodingQueryBuilder_nonNullQueryTokens = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1) .queryTokensSupplier(() -> queryTokens1); @@ -399,6 +454,7 @@ public void testHashAndEquals() { NeuralSparseQueryBuilder sparseEncodingQueryBuilder_diffQueryTokens = new NeuralSparseQueryBuilder().fieldName(fieldName1) .queryText(queryText1) .modelId(modelId1) + .maxTokenScore(maxTokenScore1) .boost(boost1) .queryName(queryName1) .queryTokensSupplier(() -> queryTokens2); @@ -427,6 +483,9 @@ public void testHashAndEquals() { assertNotEquals(sparseEncodingQueryBuilder_baseline, sparseEncodingQueryBuilder_diffQueryName); assertNotEquals(sparseEncodingQueryBuilder_baseline.hashCode(), sparseEncodingQueryBuilder_diffQueryName.hashCode()); + assertNotEquals(sparseEncodingQueryBuilder_baseline, sparseEncodingQueryBuilder_diffMaxTokenScore); + assertNotEquals(sparseEncodingQueryBuilder_baseline.hashCode(), sparseEncodingQueryBuilder_diffMaxTokenScore.hashCode()); + assertNotEquals(sparseEncodingQueryBuilder_baseline, sparseEncodingQueryBuilder_nonNullQueryTokens); assertNotEquals(sparseEncodingQueryBuilder_baseline.hashCode(), sparseEncodingQueryBuilder_nonNullQueryTokens.hashCode()); @@ -486,4 +545,23 @@ private void setUpClusterService(Version version) { ClusterService clusterService = NeuralSearchClusterTestUtils.mockClusterService(version); NeuralSearchClusterUtil.instance().initialize(clusterService); } + + @SneakyThrows + public void testDoToQuery_successfulDoToQuery() { + NeuralSparseQueryBuilder sparseEncodingQueryBuilder = new NeuralSparseQueryBuilder().fieldName(FIELD_NAME) + .maxTokenScore(MAX_TOKEN_SCORE) + .queryText(QUERY_TEXT) + .modelId(MODEL_ID) + .queryTokensSupplier(QUERY_TOKENS_SUPPLIER); + QueryShardContext mockedQueryShardContext = mock(QueryShardContext.class); + MappedFieldType mockedMappedFieldType = mock(MappedFieldType.class); + doAnswer(invocation -> "rank_features").when(mockedMappedFieldType).typeName(); + doAnswer(invocation -> mockedMappedFieldType).when(mockedQueryShardContext).fieldMapper(any()); + + BooleanQuery.Builder targetQueryBuilder = new BooleanQuery.Builder(); + targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "hello", 1.f), BooleanClause.Occur.SHOULD); + targetQueryBuilder.add(FeatureField.newLinearQuery(FIELD_NAME, "world", 2.f), BooleanClause.Occur.SHOULD); + + assertEquals(sparseEncodingQueryBuilder.doToQuery(mockedQueryShardContext), targetQueryBuilder.build()); + } }