Skip to content

Commit

Permalink
add yml tests and improve error message for invalid format
Browse files Browse the repository at this point in the history
  • Loading branch information
jimczi committed Dec 6, 2024
1 parent a832939 commit 4639c31
Show file tree
Hide file tree
Showing 4 changed files with 267 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ public Set<NodeFeature> getFeatures() {
);
}

private static final NodeFeature SEMANTIC_TEXT_HIGHLIGHTER = new NodeFeature("semantic_text.highlighter");

@Override
public Set<NodeFeature> getTestFeatures() {
return Set.of(
SemanticTextFieldMapper.SEMANTIC_TEXT_IN_OBJECT_FIELD_FIX,
SemanticTextFieldMapper.SEMANTIC_TEXT_SINGLE_FIELD_UPDATE_FIX,
SemanticTextFieldMapper.SEMANTIC_TEXT_DELETE_FIX,
SemanticTextFieldMapper.SEMANTIC_TEXT_ZERO_SIZE_FIX,
SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX
SemanticTextFieldMapper.SEMANTIC_TEXT_ALWAYS_EMIT_INFERENCE_ID_FIX,
SEMANTIC_TEXT_HIGHLIGHTER
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
Expand Down Expand Up @@ -113,11 +114,25 @@ public HighlightField highlight(FieldHighlightContext fieldContext) throws IOExc
for (int i = 0; i < size; i++) {
var chunk = chunks.get(i);
if (nestedSources.size() <= chunk.offset) {
throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]");
throw new IllegalStateException(
String.format(
Locale.ROOT,
"Invalid content detected for field [%s]: the chunks size is [%d], but a reference to offset [%d] was found in the result.",
fieldType.name(),
nestedSources.size(),
chunk.offset
)
);
}
String content = (String) nestedSources.get(chunk.offset).get(SemanticTextField.CHUNKED_TEXT_FIELD);
if (content == null) {
throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]");
// The chunk at this offset has no stored text under CHUNKED_TEXT_FIELD: report the field and the offset.
// Locale.ROOT keeps the %d rendering independent of the JVM default locale,
// consistent with the offset-bounds error raised earlier in this loop.
throw new IllegalStateException(
String.format(
Locale.ROOT,
"Invalid content detected for field [%s]: missing text for the chunk at offset [%d].",
fieldType.name(),
chunk.offset
)
);
}
snippets[i] = new Text(content);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,8 @@ protected Collection<? extends Plugin> getPlugins() {
@Before
public void setUp() throws Exception {
super.setUp();
queries = XContentHelper.convertToMap(
Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json")),
false,
XContentType.JSON
).v2();
var input = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json"));
this.queries = XContentHelper.convertToMap(input, false, XContentType.JSON).v2();
}

@SuppressWarnings("unchecked")
Expand Down Expand Up @@ -168,9 +165,8 @@ public void testSparseVector() throws Exception {
}

private MapperService createDefaultMapperService() throws IOException {
return createMapperService(
Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")).utf8ToString()
);
var mappings = Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json"));
return createMapperService(mappings.utf8ToString());
}

private float[] readDenseVector(Object value) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
setup:
- requires:
cluster_features: "semantic_text.highlighter"
reason: a new highlighter for semantic text field

- do:
inference.put:
task_type: sparse_embedding
inference_id: sparse-inference-id
body: >
{
"service": "test_service",
"service_settings": {
"model": "my_model",
"api_key": "abc64"
},
"task_settings": {
}
}
- do:
inference.put:
task_type: text_embedding
inference_id: dense-inference-id
body: >
{
"service": "text_embedding_test_service",
"service_settings": {
"model": "my_model",
"dimensions": 10,
"api_key": "abc64",
"similarity": "COSINE"
},
"task_settings": {
}
}
- do:
indices.create:
index: test-sparse-index
body:
mappings:
properties:
body:
type: semantic_text
inference_id: sparse-inference-id

- do:
indices.create:
index: test-dense-index
body:
mappings:
properties:
body:
type: semantic_text
inference_id: dense-inference-id

---
"Highlighting using a sparse embedding model":
- do:
index:
index: test-sparse-index
id: doc_1
body:
body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
refresh: true

- match: { result: created }

- do:
search:
index: test-sparse-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 1

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

- do:
search:
index: test-sparse-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

- do:
search:
index: test-sparse-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
order: "score"
number_of_fragments: 1

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }

- do:
search:
index: test-sparse-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
order: "score"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

---
"Highlighting using a dense embedding model":
- do:
index:
index: test-dense-index
id: doc_1
body:
body: ["ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides.", "You Know, for Search!"]
refresh: true

- match: { result: created }

- do:
search:
index: test-dense-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 1

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }

- do:
search:
index: test-dense-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
- match: { hits.hits.0.highlight.body.1: "You Know, for Search!" }

- do:
search:
index: test-dense-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
order: "score"
number_of_fragments: 1

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 1 }
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }

- do:
search:
index: test-dense-index
body:
query:
semantic:
field: "body"
query: "What is Elasticsearch?"
highlight:
fields:
body:
type: "semantic"
order: "score"
number_of_fragments: 2

- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "doc_1" }
- length: { hits.hits.0.highlight.body: 2 }
- match: { hits.hits.0.highlight.body.0: "You Know, for Search!" }
- match: { hits.hits.0.highlight.body.1: "ElasticSearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }


0 comments on commit 4639c31

Please sign in to comment.