diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java index c82f287792a7c..67892dfe78624 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java @@ -33,6 +33,8 @@ public Set getFeatures() { ); } + private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter"); + @Override public Set getTestFeatures() { return Set.of( @@ -40,7 +42,8 @@ public Set getTestFeatures() { SemanticTextFieldMapper.SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX, SemanticTextFieldMapper.SEMANTIC_TEXT_DELETE_FIX, SemanticTextFieldMapper.SEMANTIC_TEXT_ZERO_SIZE_FIX, - SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX + SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX, + SEMANTIC_TEXT_HIGHLIGHTER ); } } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java index 861efeea1463a..eeff79a1a2434 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java @@ -38,6 +38,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.Locale; import java.util.Map; /** @@ -113,11 +114,25 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc for (int i = 0; i < size; i++) { var chunk = chunks.get(i); if (nestedSources.size() <= chunk.offset) { - throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]"); + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Invalid content detected for field [%s]: the chunks size is [%d], but a reference to offset [%d] was found in the result.", + fieldType.name(), + nestedSources.size(), + chunk.offset + ) + ); } String content = (String) nestedSources.get(chunk.offset).get(SemanticTextField.CHUNKED_TEXT_FIELD); if (content == null) { - throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]"); + throw new IllegalStateException( + String.format( + "Invalid content detected for field [%s]: missing text for the chunk at offset [%d].", + fieldType.name(), + chunk.offset + ) + ); } snippets[i] = new Text(content); } diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java index 0d0d4803320cf..7dc4d99e06acc 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -81,11 +81,8 @@ protected Collection getPlugins() { @Before public void setUp() throws Exception { super.setUp(); - queries = XContentHelper.convertToMap( - Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json")), - false, - XContentType.JSON - ).v2(); + var input = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json")); + this.queries = XContentHelper.convertToMap(input, false, XContentType.JSON).v2(); } @SuppressWarnings("unchecked") @@ -168,9 +165,8 @@ public void testSparseVector() throws Exception { } private MapperService createDefaultMapperService() throws IOException { - return createMapperService( - Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")).utf8ToString() - ); + var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")); + return createMapperService(mappings.utf8ToString()); } private float[] readDenseVector(Object value) { diff --git a/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml new file mode 100644 index 0000000000000..25cd1b5aec48a --- /dev/null +++ b/x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml @@ -0,0 +1,242 @@ +setup: + - requires: + cluster_features: "semantic_text.highlighter" + reason: a new highlighter for semantic text field + + - do: + inference.put: + task_type: sparse_embedding + inference_id: sparse-inference-id + body: > + { + "service": "test_service", + "service_settings": { + "model": "my_model", + "api_key": "abc64" + }, + "task_settings": { + } + } + + - do: + inference.put: + task_type: text_embedding + inference_id: dense-inference-id + body: > + { + "service": "text_embedding_test_service", + "service_settings": { + "model": "my_model", + "dimensions": 10, + "api_key": "abc64", + "similarity": "COSINE" + }, + "task_settings": { + } + } + + - do: + indices.create: + index: test-sparse-index + body: + mappings: + properties: + body: + type: semantic_text + inference_id: sparse-inference-id + + - do: + indices.create: + index: test-dense-index + body: + mappings: + properties: + body: + type: semantic_text + inference_id: dense-inference-id + +--- +"Highlighting using a sparse embedding model": + - do: + index: + index: test-sparse-index + id: doc_1 + body: + body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"] + refresh: true + + - match: { result: created } + + - do: + search: + index: test-sparse-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + number_of_fragments: 1 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 1 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + + - do: + search: + index: test-sparse-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + number_of_fragments: 2 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 2 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" } + + - do: + search: + index: test-sparse-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + order: "score" + number_of_fragments: 1 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 1 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + + - do: + search: + index: test-sparse-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + order: "score" + number_of_fragments: 2 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 2 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" } + +--- +"Highlighting using a dense embedding model": + - do: + index: + index: test-dense-index + id: doc_1 + body: + body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"] + refresh: true + + - match: { result: created } + + - do: + search: + index: test-dense-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + number_of_fragments: 1 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 1 } + - match: { hits.hits.0.highlight.body.0: "You Know, for Search!" } + + - do: + search: + index: test-dense-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + number_of_fragments: 2 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 2 } + - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" } + + - do: + search: + index: test-dense-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + order: "score" + number_of_fragments: 1 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 1 } + - match: { hits.hits.0.highlight.body.0: "You Know, for Search!" } + + - do: + search: + index: test-dense-index + body: + query: + semantic: + field: "body" + query: "What is Elasticsearch?" + highlight: + fields: + body: + type: "semantic" + order: "score" + number_of_fragments: 2 + + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "doc_1" } + - length: { hits.hits.0.highlight.body: 2 } + - match: { hits.hits.0.highlight.body.0: "You Know, for Search!" } + - match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." } + +