add yml tests and improve error message for invalid format

jimczi · Dec 6, 2024 · 4639c31 · 4639c31
1 parent a832939
commit 4639c31
Show file tree

Hide file tree

Showing 4 changed files with 267 additions and 11 deletions.
diff --git a/...k/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java b/...k/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferenceFeatures.java
@@ -33,14 +33,17 @@ public Set<NodeFeature> getFeatures() {
         );
     }
 
+    private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");
+
     @Override
     public Set<NodeFeature> getTestFeatures() {
         return Set.of(
             SemanticTextFieldMapper.SEMANTIC_TEXT_IN_OBJECT_FIELD_FIX,
             SemanticTextFieldMapper.SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX,
             SemanticTextFieldMapper.SEMANTIC_TEXT_DELETE_FIX,
             SemanticTextFieldMapper.SEMANTIC_TEXT_ZERO_SIZE_FIX,
-            SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX
+            SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX,
+            SEMANTIC_TEXT_HIGHLIGHTER
         );
     }
 }
diff --git a/...ce/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java b/...ce/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java
@@ -38,6 +38,7 @@
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 
 /**
@@ -113,11 +114,26 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
         for (int i = 0; i < size; i++) {
             var chunk = chunks.get(i);
             if (nestedSources.size() <= chunk.offset) {
-                throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]");
+                throw new IllegalStateException(
+                    String.format(
+                        Locale.ROOT,
+                        "Invalid content detected for field [%s]: the chunks size is [%d], but a reference to offset [%d] was found in the result.",
+                        fieldType.name(),
+                        nestedSources.size(),
+                        chunk.offset
+                    )
+                );
             }
             String content = (String) nestedSources.get(chunk.offset).get(SemanticTextField.CHUNKED_TEXT_FIELD);
             if (content == null) {
-                throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]");
+                throw new IllegalStateException(
+                    String.format(
+                        Locale.ROOT,
+                        "Invalid content detected for field [%s]: missing text for the chunk at offset [%d].",
+                        fieldType.name(),
+                        chunk.offset
+                    )
+                );
             }
             snippets[i] = new Text(content);
         }

diff --git a/...c/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/...c/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java
@@ -81,11 +81,8 @@ protected Collection<? extends Plugin> getPlugins() {
     @Before
     public void setUp() throws Exception {
         super.setUp();
-        queries = XContentHelper.convertToMap(
-            Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json")),
-            false,
-            XContentType.JSON
-        ).v2();
+        var input = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json"));
+        this.queries = XContentHelper.convertToMap(input, false, XContentType.JSON).v2();
     }
 
     @SuppressWarnings("unchecked")
@@ -168,9 +165,8 @@ public void testSparseVector() throws Exception {
     }
 
     private MapperService createDefaultMapperService() throws IOException {
-        return createMapperService(
-            Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")).utf8ToString()
-        );
+        var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json"));
+        return createMapperService(mappings.utf8ToString());
     }
 
     private float[] readDenseVector(Object value) {

diff --git a/.../src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml b/.../src/yamlRestTest/resources/rest-api-spec/test/inference/90_semantic_text_highlighter.yml
@@ -0,0 +1,242 @@
+setup:
+  - requires:
+      cluster_features: "semantic_text.highlighter"
+      reason: a new highlighter for semantic text field
+
+  - do:
+      inference.put:
+        task_type: sparse_embedding
+        inference_id: sparse-inference-id
+        body: >
+          {
+            "service": "test_service",
+            "service_settings": {
+              "model": "my_model",
+              "api_key": "abc64"
+            },
+            "task_settings": {
+            }
+          }
+
+  - do:
+      inference.put:
+        task_type: text_embedding
+        inference_id: dense-inference-id
+        body: >
+          {
+            "service": "text_embedding_test_service",
+            "service_settings": {
+              "model": "my_model",
+              "dimensions": 10,
+              "api_key": "abc64",
+              "similarity": "COSINE"
+            },
+            "task_settings": {
+            }
+          }
+
+  - do:
+      indices.create:
+        index: test-sparse-index
+        body:
+          mappings:
+            properties:
+              body:
+                type: semantic_text
+                inference_id: sparse-inference-id
+
+  - do:
+      indices.create:
+        index: test-dense-index
+        body:
+          mappings:
+            properties:
+              body:
+                type: semantic_text
+                inference_id: dense-inference-id
+
+---
+"Highlighting using a sparse embedding model":
+  - do:
+      index:
+        index: test-sparse-index
+        id: doc_1
+        body:
+          body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
+        refresh: true
+
+  - match: { result: created }
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 1
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 1 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match:  { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                order: "score"
+                number_of_fragments: 1
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 1 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+  - do:
+      search:
+        index: test-sparse-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                order: "score"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+---
+"Highlighting using a dense embedding model":
+  - do:
+      index:
+        index: test-dense-index
+        id: doc_1
+        body:
+          body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
+        refresh: true
+
+  - match: { result: created }
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 1
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 1 }
+  - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+  - match:  { hits.hits.0.highlight.body.1: "You Know, for Search!" }
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                order: "score"
+                number_of_fragments: 1
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 1 }
+  - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
+
+  - do:
+      search:
+        index: test-dense-index
+        body:
+          query:
+            semantic:
+              field: "body"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              body:
+                type: "semantic"
+                order: "score"
+                number_of_fragments: 2
+
+  - match:  { hits.total.value: 1 }
+  - match:  { hits.hits.0._id: "doc_1" }
+  - length: { hits.hits.0.highlight.body: 2 }
+  - match:  { hits.hits.0.highlight.body.0: "You Know, for Search!" }
+  - match:  { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+