diff --git a/docs/reference/mapping/types/semantic-text.asciidoc b/docs/reference/mapping/types/semantic-text.asciidoc index f76a9352c2fe8..d459e0d0eca75 100644 --- a/docs/reference/mapping/types/semantic-text.asciidoc +++ b/docs/reference/mapping/types/semantic-text.asciidoc @@ -112,50 +112,41 @@ Trying to <> that is used on a {infer-cap} endpoints have a limit on the amount of text they can process. To allow for large amounts of text to be used in semantic search, `semantic_text` automatically generates smaller passages if needed, called _chunks_. -Each chunk will include the text subpassage and the corresponding embedding generated from it. +Each chunk refers to a passage of the text and the corresponding embedding generated from it. When querying, the individual passages will be automatically searched for each document, and the most relevant passage will be used to compute a score. For more details on chunking and how to configure chunking settings, see <> in the Inference API documentation. +Refer to <> to learn more about +semantic search using `semantic_text` and the `semantic` query. [discrete] -[[semantic-text-structure]] -==== `semantic_text` structure +[[semantic-text-highlighting]] +==== Extracting Relevant Fragments from Semantic Text -Once a document is ingested, a `semantic_text` field will have the following structure: +You can extract the most relevant fragments from a semantic text field by using the <> in the <>. -[source,console-result] +[source,console] ------------------------------------------------------------ -"inference_field": { - "text": "these are not the droids you're looking for", <1> - "inference": { - "inference_id": "my-elser-endpoint", <2> - "model_settings": { <3> - "task_type": "sparse_embedding" +PUT test-index +{ + "query": { + "semantic": { + "field": "my_semantic_field" + } }, - "chunks": [ <4> - { - "text": "these are not the droids you're looking for", - "embeddings": { - (...) + "highlight": { + "fields": { + "my_semantic_field": { + "type": "semantic", + "number_of_fragments": 2 <1> + } } - } - ] - } + } } ------------------------------------------------------------ -// TEST[skip:TBD] -<1> The field will become an object structure to accommodate both the original -text and the inference results. -<2> The `inference_id` used to generate the embeddings. -<3> Model settings, including the task type and dimensions/similarity if -applicable. -<4> Inference results will be grouped in chunks, each with its corresponding -text and embeddings. - -Refer to <> to learn more about -semantic search using `semantic_text` and the `semantic` query. - +// TEST[skip:Requires inference endpoint] +<1> Specifies the maximum number of fragments to return. [discrete] [[custom-indexing]] diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java index 48458bf4f5086..dd0571c6c1855 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java @@ -37,6 +37,7 @@ import org.elasticsearch.plugins.SystemIndexPlugin; import org.elasticsearch.rest.RestController; import org.elasticsearch.rest.RestHandler; +import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; import org.elasticsearch.search.rank.RankBuilder; import org.elasticsearch.search.rank.RankDoc; import org.elasticsearch.threadpool.ExecutorBuilder; @@ -67,6 +68,7 @@ import org.elasticsearch.xpack.inference.external.http.retry.RetrySettings; import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender; import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings; +import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter; import org.elasticsearch.xpack.inference.logging.ThrottlerManager; import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; import org.elasticsearch.xpack.inference.queries.SemanticQueryBuilder; @@ -411,4 +413,9 @@ public List> getRetrievers() { new RetrieverSpec<>(new ParseField(RandomRankBuilder.NAME), RandomRankRetrieverBuilder::fromXContent) ); } + + @Override + public Map getHighlighters() { + return Map.of(SemanticTextHighlighter.NAME, new SemanticTextHighlighter()); + } } diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java new file mode 100644 index 0000000000000..861efeea1463a --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighter.java @@ -0,0 +1,208 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.highlight; + +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.KnnByteVectorQuery; +import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.support.XContentMapValues; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper.DenseVectorFieldType; +import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper.SparseVectorFieldType; +import org.elasticsearch.index.query.SearchExecutionContext; +import org.elasticsearch.search.fetch.subphase.highlight.FieldHighlightContext; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; +import org.elasticsearch.search.fetch.subphase.highlight.Highlighter; +import org.elasticsearch.search.vectors.VectorData; +import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryWrapper; +import org.elasticsearch.xpack.inference.mapper.SemanticTextField; +import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +/** + * A {@link Highlighter} designed for the {@link SemanticTextFieldMapper}. + * This highlighter extracts semantic queries and evaluates them against each chunk produced by the semantic text field. + * It returns the top-scoring chunks as snippets, optionally sorted by their scores. + */ +public class SemanticTextHighlighter implements Highlighter { + public static final String NAME = "semantic"; + + private record OffsetAndScore(int offset, float score) {} + + @Override + public boolean canHighlight(MappedFieldType fieldType) { + if (fieldType instanceof SemanticTextFieldMapper.SemanticTextFieldType) { + return true; + } + return false; + } + + @Override + public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException { + SemanticTextFieldMapper.SemanticTextFieldType fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) fieldContext.fieldType; + if (fieldType.getEmbeddingsField() == null) { + // nothing indexed yet + return null; + } + + final List queries = switch (fieldType.getModelSettings().taskType()) { + case SPARSE_EMBEDDING -> extractSparseVectorQueries( + (SparseVectorFieldType) fieldType.getEmbeddingsField().fieldType(), + fieldContext.query + ); + case TEXT_EMBEDDING -> extractDenseVectorQueries( + (DenseVectorFieldType) fieldType.getEmbeddingsField().fieldType(), + fieldContext.query + ); + default -> throw new IllegalStateException( + "Wrong task type for a semantic text field, got [" + fieldType.getModelSettings().taskType().name() + "]" + ); + }; + if (queries.isEmpty()) { + // nothing to highlight + return null; + } + + int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments() <= 0 + ? 1 // we return the best fragment by default + : fieldContext.field.fieldOptions().numberOfFragments(); + + List chunks = extractOffsetAndScores( + fieldContext.context.getSearchExecutionContext(), + fieldContext.hitContext.reader(), + fieldType, + fieldContext.hitContext.docId(), + queries + ); + if (chunks.size() == 0) { + return null; + } + + chunks.sort(Comparator.comparingDouble(OffsetAndScore::score).reversed()); + int size = Math.min(chunks.size(), numberOfFragments); + if (fieldContext.field.fieldOptions().scoreOrdered() == false) { + chunks = chunks.subList(0, size); + chunks.sort(Comparator.comparingInt(c -> c.offset)); + } + Text[] snippets = new Text[size]; + List> nestedSources = XContentMapValues.extractNestedSources( + fieldType.getChunksField().fullPath(), + fieldContext.hitContext.source().source() + ); + for (int i = 0; i < size; i++) { + var chunk = chunks.get(i); + if (nestedSources.size() <= chunk.offset) { + throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]"); + } + String content = (String) nestedSources.get(chunk.offset).get(SemanticTextField.CHUNKED_TEXT_FIELD); + if (content == null) { + throw new IllegalStateException("Invalid content for field [" + fieldType.name() + "]"); + } + snippets[i] = new Text(content); + } + return new HighlightField(fieldContext.fieldName, snippets); + } + + private List extractOffsetAndScores( + SearchExecutionContext context, + LeafReader reader, + SemanticTextFieldMapper.SemanticTextFieldType fieldType, + int docId, + List leafQueries + ) throws IOException { + var bitSet = context.bitsetFilter(fieldType.getChunksField().parentTypeFilter()).getBitSet(reader.getContext()); + int previousParent = docId > 0 ? bitSet.prevSetBit(docId - 1) : -1; + + BooleanQuery.Builder bq = new BooleanQuery.Builder().add(fieldType.getChunksField().nestedTypeFilter(), BooleanClause.Occur.FILTER); + leafQueries.stream().forEach(q -> bq.add(q, BooleanClause.Occur.SHOULD)); + Weight weight = new IndexSearcher(reader).createWeight(bq.build(), ScoreMode.COMPLETE, 1); + Scorer scorer = weight.scorer(reader.getContext()); + if (previousParent != -1) { + if (scorer.iterator().advance(previousParent) == DocIdSetIterator.NO_MORE_DOCS) { + return List.of(); + } + } else if (scorer.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + return List.of(); + } + List results = new ArrayList<>(); + int offset = 0; + while (scorer.docID() < docId) { + results.add(new OffsetAndScore(offset++, scorer.score())); + if (scorer.iterator().nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + break; + } + } + return results; + } + + private List extractDenseVectorQueries(DenseVectorFieldType fieldType, Query querySection) { + // TODO: Handle knn section when semantic text field can be used. + List queries = new ArrayList<>(); + querySection.visit(new QueryVisitor() { + @Override + public boolean acceptField(String field) { + return fieldType.name().equals(field); + } + + @Override + public void consumeTerms(Query query, Term... terms) { + super.consumeTerms(query, terms); + } + + @Override + public void visitLeaf(Query query) { + if (query instanceof KnnFloatVectorQuery knnQuery) { + queries.add(fieldType.createExactKnnQuery(VectorData.fromFloats(knnQuery.getTargetCopy()), null)); + } else if (query instanceof KnnByteVectorQuery knnQuery) { + queries.add(fieldType.createExactKnnQuery(VectorData.fromBytes(knnQuery.getTargetCopy()), null)); + } + } + }); + return queries; + } + + private List extractSparseVectorQueries(SparseVectorFieldType fieldType, Query querySection) { + List queries = new ArrayList<>(); + querySection.visit(new QueryVisitor() { + @Override + public boolean acceptField(String field) { + return fieldType.name().equals(field); + } + + @Override + public void consumeTerms(Query query, Term... terms) { + super.consumeTerms(query, terms); + } + + @Override + public QueryVisitor getSubVisitor(BooleanClause.Occur occur, Query parent) { + if (parent instanceof SparseVectorQueryWrapper sparseVectorQuery) { + queries.add(sparseVectorQuery.getTermsQuery()); + } + return this; + } + }); + return queries; + } +} diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java index e60e95b58770f..0f26f6577860f 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextField.java @@ -61,7 +61,7 @@ public record SemanticTextField(String fieldName, List originalValues, I static final String SEARCH_INFERENCE_ID_FIELD = "search_inference_id"; static final String CHUNKS_FIELD = "chunks"; static final String CHUNKED_EMBEDDINGS_FIELD = "embeddings"; - static final String CHUNKED_TEXT_FIELD = "text"; + public static final String CHUNKED_TEXT_FIELD = "text"; static final String MODEL_SETTINGS_FIELD = "model_settings"; static final String TASK_TYPE_FIELD = "task_type"; static final String DIMENSIONS_FIELD = "dimensions"; diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java index 3744bf2a6dbed..3ee78588de245 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapper.java @@ -46,7 +46,6 @@ import org.elasticsearch.index.query.MatchNoneQueryBuilder; import org.elasticsearch.index.query.NestedQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; -import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.inference.InferenceResults; import org.elasticsearch.inference.SimilarityMeasure; @@ -57,6 +56,7 @@ import org.elasticsearch.xcontent.XContentParserConfiguration; import org.elasticsearch.xpack.core.ml.inference.results.MlTextEmbeddingResults; import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults; +import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder; import java.io.IOException; import java.util.ArrayList; @@ -529,17 +529,8 @@ public QueryBuilder semanticQuery(InferenceResults inferenceResults, Integer req ); } - // TODO: Use WeightedTokensQueryBuilder TextExpansionResults textExpansionResults = (TextExpansionResults) inferenceResults; - var boolQuery = QueryBuilders.boolQuery(); - for (var weightedToken : textExpansionResults.getWeightedTokens()) { - boolQuery.should( - QueryBuilders.termQuery(inferenceResultsFieldName, weightedToken.token()).boost(weightedToken.weight()) - ); - } - boolQuery.minimumShouldMatch(1); - - yield boolQuery; + yield new SparseVectorQueryBuilder(name(), textExpansionResults.getWeightedTokens(), null, null, null, null); } case TEXT_EMBEDDING -> { if (inferenceResults instanceof MlTextEmbeddingResults == false) { diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java new file mode 100644 index 0000000000000..0d0d4803320cf --- /dev/null +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/highlight/SemanticTextHighlighterTests.java @@ -0,0 +1,292 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.highlight; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.action.OriginalIndices; +import org.elasticsearch.action.search.SearchRequest; +import org.elasticsearch.common.bytes.BytesArray; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.Streams; +import org.elasticsearch.common.lucene.search.Queries; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentHelper; +import org.elasticsearch.index.IndexVersion; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.MapperServiceTestCase; +import org.elasticsearch.index.mapper.SourceToParse; +import org.elasticsearch.index.query.NestedQueryBuilder; +import org.elasticsearch.index.query.QueryBuilder; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.builder.SearchSourceBuilder; +import org.elasticsearch.search.fetch.FetchContext; +import org.elasticsearch.search.fetch.FetchSubPhase; +import org.elasticsearch.search.fetch.subphase.highlight.FieldHighlightContext; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.elasticsearch.search.fetch.subphase.highlight.SearchHighlightContext; +import org.elasticsearch.search.internal.AliasFilter; +import org.elasticsearch.search.internal.ShardSearchRequest; +import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.search.rank.RankDoc; +import org.elasticsearch.search.vectors.KnnVectorQueryBuilder; +import org.elasticsearch.xcontent.XContentType; +import org.elasticsearch.xpack.core.ml.search.SparseVectorQueryBuilder; +import org.elasticsearch.xpack.core.ml.search.WeightedToken; +import org.elasticsearch.xpack.inference.InferencePlugin; +import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; +import org.junit.Before; +import org.mockito.Mockito; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +import static org.hamcrest.CoreMatchers.equalTo; +import static org.mockito.Mockito.mock; + +public class SemanticTextHighlighterTests extends MapperServiceTestCase { + private static final String SEMANTIC_FIELD_E5 = "body-e5"; + private static final String SEMANTIC_FIELD_ELSER = "body-elser"; + + private Map queries; + + @Override + protected Collection getPlugins() { + return List.of(new InferencePlugin(Settings.EMPTY)); + } + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + queries = XContentHelper.convertToMap( + Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("queries.json")), + false, + XContentType.JSON + ).v2(); + } + + @SuppressWarnings("unchecked") + public void testDenseVector() throws Exception { + var mapperService = createDefaultMapperService(); + Map queryMap = (Map) queries.get("dense_vector_1"); + float[] vector = readDenseVector(queryMap.get("embeddings")); + var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_E5); + KnnVectorQueryBuilder knnQuery = new KnnVectorQueryBuilder(fieldType.getEmbeddingsField().fullPath(), vector, 10, 10, null); + NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), knnQuery, ScoreMode.Max); + var shardRequest = createShardSearchRequest(nestedQueryBuilder); + var sourceToParse = new SourceToParse("0", readSampleDoc("sample-doc.json.gz"), XContentType.JSON); + + String[] expectedScorePassages = ((List) queryMap.get("expected_by_score")).toArray(String[]::new); + for (int i = 0; i < expectedScorePassages.length; i++) { + assertHighlightOneDoc( + mapperService, + shardRequest, + sourceToParse, + SEMANTIC_FIELD_E5, + i + 1, + HighlightBuilder.Order.SCORE, + Arrays.copyOfRange(expectedScorePassages, 0, i + 1) + ); + } + + String[] expectedOffsetPassages = ((List) queryMap.get("expected_by_offset")).toArray(String[]::new); + assertHighlightOneDoc( + mapperService, + shardRequest, + sourceToParse, + SEMANTIC_FIELD_E5, + expectedOffsetPassages.length, + HighlightBuilder.Order.NONE, + expectedOffsetPassages + ); + } + + @SuppressWarnings("unchecked") + public void testSparseVector() throws Exception { + var mapperService = createDefaultMapperService(); + Map queryMap = (Map) queries.get("sparse_vector_1"); + List tokens = readSparseVector(queryMap.get("embeddings")); + var fieldType = (SemanticTextFieldMapper.SemanticTextFieldType) mapperService.mappingLookup().getFieldType(SEMANTIC_FIELD_ELSER); + SparseVectorQueryBuilder sparseQuery = new SparseVectorQueryBuilder( + fieldType.getEmbeddingsField().fullPath(), + tokens, + null, + null, + null, + null + ); + NestedQueryBuilder nestedQueryBuilder = new NestedQueryBuilder(fieldType.getChunksField().fullPath(), sparseQuery, ScoreMode.Max); + var shardRequest = createShardSearchRequest(nestedQueryBuilder); + var sourceToParse = new SourceToParse("0", readSampleDoc("sample-doc.json.gz"), XContentType.JSON); + + String[] expectedScorePassages = ((List) queryMap.get("expected_by_score")).toArray(String[]::new); + for (int i = 0; i < expectedScorePassages.length; i++) { + assertHighlightOneDoc( + mapperService, + shardRequest, + sourceToParse, + SEMANTIC_FIELD_ELSER, + i + 1, + HighlightBuilder.Order.SCORE, + Arrays.copyOfRange(expectedScorePassages, 0, i + 1) + ); + } + + String[] expectedOffsetPassages = ((List) queryMap.get("expected_by_offset")).toArray(String[]::new); + assertHighlightOneDoc( + mapperService, + shardRequest, + sourceToParse, + SEMANTIC_FIELD_ELSER, + expectedOffsetPassages.length, + HighlightBuilder.Order.NONE, + expectedOffsetPassages + ); + } + + private MapperService createDefaultMapperService() throws IOException { + return createMapperService( + Streams.readFully(SemanticTextHighlighterTests.class.getResourceAsStream("mappings.json")).utf8ToString() + ); + } + + private float[] readDenseVector(Object value) { + if (value instanceof List lst) { + float[] res = new float[lst.size()]; + int pos = 0; + for (var obj : lst) { + if (obj instanceof Number number) { + res[pos++] = number.floatValue(); + } else { + throw new IllegalArgumentException("Expected number, got " + obj.getClass().getSimpleName()); + } + } + return res; + } + throw new IllegalArgumentException("Expected list, got " + value.getClass().getSimpleName()); + } + + private List readSparseVector(Object value) { + if (value instanceof Map map) { + List res = new ArrayList<>(); + for (var entry : map.entrySet()) { + if (entry.getValue() instanceof Number number) { + res.add(new WeightedToken((String) entry.getKey(), number.floatValue())); + } else { + throw new IllegalArgumentException("Expected number, got " + entry.getValue().getClass().getSimpleName()); + } + } + return res; + } + throw new IllegalArgumentException("Expected map, got " + value.getClass().getSimpleName()); + } + + private void assertHighlightOneDoc( + MapperService mapperService, + ShardSearchRequest request, + SourceToParse source, + String fieldName, + int numFragments, + HighlightBuilder.Order order, + String[] expectedPassages + ) throws Exception { + SemanticTextFieldMapper fieldMapper = (SemanticTextFieldMapper) mapperService.mappingLookup().getMapper(fieldName); + var doc = mapperService.documentMapper().parse(source); + assertNull(doc.dynamicMappingsUpdate()); + try (Directory dir = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig(new StandardAnalyzer()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + iw.addDocuments(doc.docs()); + try (DirectoryReader reader = wrapInMockESDirectoryReader(iw.getReader())) { + IndexSearcher searcher = newSearcher(reader); + iw.close(); + TopDocs topDocs = searcher.search(Queries.newNonNestedFilter(IndexVersion.current()), 1, Sort.INDEXORDER); + assertThat(topDocs.totalHits.value(), equalTo(1L)); + int docID = topDocs.scoreDocs[0].doc; + SemanticTextHighlighter highlighter = new SemanticTextHighlighter(); + var execContext = createSearchExecutionContext(mapperService); + var luceneQuery = execContext.toQuery(request.source().query()).query(); + FetchContext fetchContext = mock(FetchContext.class); + Mockito.when(fetchContext.highlight()).thenReturn(new SearchHighlightContext(Collections.emptyList())); + Mockito.when(fetchContext.query()).thenReturn(luceneQuery); + Mockito.when(fetchContext.getSearchExecutionContext()).thenReturn(execContext); + + FetchSubPhase.HitContext hitContext = new FetchSubPhase.HitContext( + new SearchHit(docID), + getOnlyLeafReader(reader).getContext(), + docID, + Map.of(), + Source.fromBytes(source.source()), + new RankDoc(docID, Float.NaN, 0) + ); + try { + var highlightContext = new HighlightBuilder().field(fieldName, 0, numFragments) + .order(order) + .highlighterType(SemanticTextHighlighter.NAME) + .build(execContext); + + for (var fieldContext : highlightContext.fields()) { + FieldHighlightContext context = new FieldHighlightContext( + fieldName, + fieldContext, + fieldMapper.fieldType(), + fetchContext, + hitContext, + luceneQuery, + new HashMap<>() + ); + var result = highlighter.highlight(context); + assertThat(result.fragments().length, equalTo(expectedPassages.length)); + for (int i = 0; i < result.fragments().length; i++) { + assertThat(result.fragments()[i].string(), equalTo(expectedPassages[i])); + } + } + } finally { + hitContext.hit().decRef(); + } + } + } + } + + private SearchRequest createSearchRequest(QueryBuilder queryBuilder) { + SearchRequest request = new SearchRequest(); + request.source(new SearchSourceBuilder()); + request.allowPartialSearchResults(false); + request.source().query(queryBuilder); + return request; + } + + private ShardSearchRequest createShardSearchRequest(QueryBuilder queryBuilder) { + SearchRequest request = createSearchRequest(queryBuilder); + return new ShardSearchRequest(OriginalIndices.NONE, request, new ShardId("index", "index", 0), 0, 1, AliasFilter.EMPTY, 1, 0, null); + } + + private BytesReference readSampleDoc(String fileName) throws IOException { + try (var in = new GZIPInputStream(SemanticTextHighlighterTests.class.getResourceAsStream(fileName))) { + return new BytesArray(new BytesRef(in.readAllBytes())); + } + } +} diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/mappings.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/mappings.json new file mode 100644 index 0000000000000..9841ee0aed6e2 --- /dev/null +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/mappings.json @@ -0,0 +1,27 @@ +{ + "_doc": { + "properties": { + "body": { + "type": "text", + "copy_to": ["body-elser", "body-e5"] + }, + "body-e5": { + "type": "semantic_text", + "inference_id": ".multilingual-e5-small-elasticsearch", + "model_settings": { + "task_type": "text_embedding", + "dimensions": 384, + "similarity": "cosine", + "element_type": "float" + } + }, + "body-elser": { + "type": "semantic_text", + "inference_id": ".elser-2-elasticsearch", + "model_settings": { + "task_type": "sparse_embedding" + } + } + } + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json new file mode 100644 index 0000000000000..6227f3f498854 --- /dev/null +++ b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/queries.json @@ -0,0 +1,467 @@ +{ + "dense_vector_1": { + "embeddings": [ + 0.09475211, + 0.044564713, + -0.04378501, + -0.07908551, + 0.04332011, + -0.03891992, + -0.0062305215, + 0.024245035, + -0.008976331, + 0.032832284, + 0.052760173, + 0.008123907, + 0.09049037, + -0.01637332, + -0.054353267, + 0.00771307, + 0.08545496, + -0.079716265, + -0.045666866, + -0.04369993, + 0.009189822, + -0.013782891, + -0.07701858, + 0.037278354, + 0.049807206, + 0.078036495, + -0.059533164, + 0.051413406, + 0.040234447, + -0.038139492, + -0.085189626, + -0.045546446, + 0.0544375, + -0.05604156, + 0.057408098, + 0.041913517, + -0.037348013, + -0.025998272, + 0.08486864, + -0.046678443, + 0.0041820924, + 0.007514462, + 0.06424746, + 0.044233218, + 0.103267275, + 0.014130771, + -0.049954403, + 0.04226959, + -0.08346965, + -0.01639249, + -0.060537644, + 0.04546336, + 0.012866155, + 0.05375096, + 0.036775924, + -0.0762226, + -0.037304543, + -0.05692274, + -0.055807598, + 0.0040082196, + 0.059259634, + 0.012022011, + -8.0863154E-4, + 0.0070405705, + 0.050255686, + 0.06810016, + 0.017190414, + 0.051975194, + -0.051436286, + 0.023408439, + -0.029802637, + 0.034137156, + -0.004660689, + -0.0442122, + 0.019065322, + 0.030806554, + 0.0064652697, + -0.066789865, + 0.057111286, + 0.009412479, + -0.041444767, + -0.06807582, + -0.085881524, + 0.04901128, + -0.047871742, + 0.06328623, + 0.040418074, + -0.081432894, + 0.058384005, + 0.006206527, + 0.045801315, + 0.037274595, + -0.054337103, + -0.06755516, + -0.07396888, + -0.043732334, + -0.052053086, + 0.03210978, + 0.048101492, + -0.083828256, + 0.05205026, + -0.048474856, + 0.029116616, + -0.10924888, + 0.003796487, + 0.030567763, + 0.026949523, + -0.052353345, + 0.043198872, + -0.09456988, + -0.05711594, + -2.2292069E-4, + 0.032972734, + 0.054394923, + -0.0767535, + -0.02710579, + -0.032135617, + -0.01732382, + 0.059442326, + -0.07686165, + 0.07104082, + -0.03090021, + -0.05450075, + -0.038997203, + -0.07045443, + 0.00483161, + 0.010933604, + 0.020874644, + 0.037941266, + 0.019729063, + 0.06178368, + 0.013503478, + -0.008584046, + 0.045592044, + 0.05528768, + 0.11568184, + 0.0041300594, + 0.015404516, + -3.8067883E-4, + -0.06365399, + -0.07826643, + 0.061575573, + -0.060548335, + 0.05706082, + 0.042301804, + 0.052173313, + 0.07193179, + -0.03839231, + 0.0734415, + -0.045380164, + 0.02832276, + 0.003745178, + 0.058844633, + 0.04307504, + 0.037800383, + -0.031050054, + -0.06856359, + -0.059114788, + -0.02148857, + 0.07854358, + -0.03253363, + -0.04566468, + -0.019933948, + -0.057993464, + -0.08677458, + -0.06626883, + 0.031657256, + 0.101128764, + -0.08050056, + -0.050226066, + -0.014335166, + 0.050344367, + -0.06851419, + 0.008698909, + -0.011893435, + 0.07741272, + -0.059579294, + 0.03250109, + 0.058700256, + 0.046834726, + -0.035081457, + -0.0043140925, + -0.09764087, + -0.0034994273, + -0.034056358, + -0.019066337, + -0.034376107, + 0.012964423, + 0.029291175, + -0.012090671, + 0.021585712, + 0.028859599, + -0.04391145, + -0.071166754, + -0.031040335, + 0.02808108, + -0.05621317, + 0.06543945, + 0.10094665, + 0.041057374, + -0.03222324, + -0.063366964, + 0.064944476, + 0.023641933, + 0.06806713, + 0.06806097, + -0.08220105, + 0.04148528, + -0.09254079, + 0.044620737, + 0.05526614, + -0.03849534, + -0.04722273, + 0.0670776, + -0.024274077, + -0.016903497, + 0.07584147, + 0.04760533, + -0.038843267, + -0.028365409, + 0.08022705, + -0.039916333, + 0.049067073, + -0.030701574, + -0.057169467, + 0.043025102, + 0.07109674, + -0.047296863, + -0.047463104, + 0.040868305, + -0.04409507, + -0.034977127, + -0.057109762, + -0.08616165, + -0.03486079, + -0.046201482, + 0.025963873, + 0.023392359, + 0.09594902, + -0.007847159, + -0.021231368, + 0.009007263, + 0.0032713825, + -0.06876065, + 0.03169641, + -7.2582875E-4, + -0.07049708, + 0.03900843, + -0.0075472407, + 0.05184822, + 0.06452079, + -0.09832754, + -0.012775799, + -0.03925948, + -0.029761659, + 0.0065437574, + 0.0815465, + 0.0411695, + -0.0702844, + -0.009533786, + 0.07024532, + 0.0098710675, + 0.09915362, + 0.0415453, + 0.050641853, + 0.047463298, + -0.058609713, + -0.029499197, + -0.05100956, + -0.03441709, + -0.06348122, + 0.014784361, + 0.056317374, + -0.10280704, + -0.04008354, + -0.018926824, + 0.08832836, + 0.124804, + -0.047645308, + -0.07122146, + -9.886527E-4, + 0.03850324, + 0.048501793, + 0.07072816, + 0.06566776, + -0.013678872, + 0.010010848, + 0.06483413, + -0.030036367, + -0.029748922, + -0.007482364, + -0.05180385, + 0.03698522, + -0.045453787, + 0.056604166, + 0.029394176, + 0.028589265, + -0.012185886, + -0.06919616, + 0.0711641, + -0.034055933, + -0.053101335, + 0.062319, + 0.021600349, + -0.038718067, + 0.060814686, + 0.05087301, + -0.020297311, + 0.016493896, + 0.032162152, + 0.046740912, + 0.05461355, + -0.07024665, + 0.025609337, + -0.02504801, + 0.06765588, + -0.032994855, + -0.037897404, + -0.045783922, + -0.05689299, + -0.040437017, + -0.07904339, + -0.031415287, + -0.029216278, + 0.017395392, + 0.03449264, + -0.025653394, + -0.06283088, + 0.049027324, + 0.016229525, + -0.00985347, + -0.053974394, + -0.030257035, + 0.04325515, + -0.012293731, + -0.002446129, + -0.05567076, + 0.06374684, + -0.03153897, + -0.04475149, + 0.018582936, + 0.025716115, + -0.061778374, + 0.04196277, + -0.04134671, + -0.07396272, + 0.05846184, + 0.006558759, + -0.09745666, + 0.07587805, + 0.0137483915, + -0.100933895, + 0.032008193, + 0.04293283, + 0.017870268, + 0.032806385, + -0.0635923, + -0.019672254, + 0.022225974, + 0.04304554, + -0.06043949, + -0.0285274, + 0.050868835, + 0.057003833, + 0.05740866, + 0.020068677, + -0.034312245, + -0.021671802, + 0.014769731, + -0.07328285, + -0.009586734, + 0.036420938, + -0.022188472, + -0.008200541, + -0.010765854, + -0.06949713, + -0.07555878, + 0.045306854, + -0.05424466, + -0.03647476, + 0.06266633, + 0.08346125, + 0.060288202, + 0.0548457 + ], + "expected_by_score": [ + "The ancient oppidum that corresponds to the modern city of Paris was first mentioned in the mid-1st century BC by Julius Caesar as Luteciam Parisiorum ('Lutetia of the Parisii') and is later attested as Parision in the 5th century AD, then as Paris in 1265. During the Roman period, it was commonly known as Lutetia or Lutecia in Latin, and as Leukotekía in Greek, which is interpreted as either stemming from the Celtic root *lukot- ('mouse'), or from *luto- ('marsh, swamp').\n\n\nThe name Paris is derived from its early inhabitants, the Parisii, a Gallic tribe from the Iron Age and the Roman period. The meaning of the Gaulish ethnonym remains debated. According to Xavier Delamarre, it may derive from the Celtic root pario- ('cauldron'). Alfred Holder interpreted the name as 'the makers' or 'the commanders', by comparing it to the Welsh peryff ('lord, commander'), both possibly descending from a Proto-Celtic form reconstructed as *kwar-is-io-. Alternatively, Pierre-Yves Lambert proposed to translate Parisii as the 'spear people', by connecting the first element to the Old Irish carr ('spear'), derived from an earlier *kwar-sā. In any case, the city's name is not related to the Paris of Greek mythology.\n\n\nResidents of the city are known in English as Parisians and in French as Parisiens ( ⓘ). They are also pejoratively called Parigots ( ⓘ).\n\n\nHistory\n\nOrigins\n\n", + "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n", + "In March 2001, Bertrand Delanoë became the first socialist mayor. He was re-elected in March 2008. In 2007, in an effort to reduce car traffic, he introduced the Vélib', a system which rents bicycles. Bertrand Delanoë also transformed a section of the highway along the Left Bank of the Seine into an urban promenade and park, the Promenade des Berges de la Seine, which he inaugurated in June 2013.\n\n\nIn 2007, President Nicolas Sarkozy launched the Grand Paris project, to integrate Paris more closely with the towns in the region around it. After many modifications, the new area, named the Metropolis of Grand Paris, with a population of 6.7 million, was created on 1 January 2016. In 2011, the City of Paris and the national government approved the plans for the Grand Paris Express, totalling 205 km (127 mi) of automated metro lines to connect Paris, the innermost three departments around Paris, airports and high-speed rail (TGV) stations, at an estimated cost of €35 billion. The system is scheduled to be completed by 2030.\n\n\nIn January 2015, Al-Qaeda in the Arabian Peninsula claimed attacks across the Paris region. 1.5 million people marched in Paris in a show of solidarity against terrorism and in support of freedom of speech. In November of the same year, terrorist attacks, claimed by ISIL, killed 130 people and injured more than 350.\n\n\n", + "\nParis (.mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}French pronunciation: ⓘ) is the capital and largest city of France. With an estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. Because of its leading role in the arts and sciences and its early adaptation of extensive street lighting, it became known as the City of Light in the 19th century.\n\n\nThe City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intelligence Unit Worldwide Cost of Living Survey, in 2022, Paris was the city with the ninth-highest cost of living in the world.\n\n\n", + "Bal-musette is a style of French music and dance that first became popular in Paris in the 1870s and 1880s; by 1880 Paris had some 150 dance halls. Patrons danced the bourrée to the accompaniment of the cabrette (a bellows-blown bagpipe locally called a \"musette\") and often the vielle à roue (hurdy-gurdy) in the cafés and bars of the city. Parisian and Italian musicians who played the accordion adopted the style and established themselves in Auvergnat bars, and Paris became a major centre for jazz and still attracts jazz musicians from all around the world to its clubs and cafés.\n\n\nParis is the spiritual home of gypsy jazz in particular, and many of the Parisian jazzmen who developed in the first half of the 20th century began by playing Bal-musette in the city. Django Reinhardt rose to fame in Paris, having moved to the 18th arrondissement in a caravan as a young boy, and performed with violinist Stéphane Grappelli and their Quintette du Hot Club de France in the 1930s and 1940s.\n\n\nImmediately after the War the Saint-Germain-des-Pres quarter and the nearby Saint-Michel quarter became home to many small jazz clubs, including the Caveau des Lorientais, the Club Saint-Germain, the Rose Rouge, the Vieux-Colombier, and the most famous, Le Tabou. They introduced Parisians to the music of Claude Luter, Boris Vian, Sydney Bechet, Mezz Mezzrow, and Henri Salvador. " + ], + "expected_by_offset": [ + "\nParis (.mw-parser-output .IPA-label-small{font-size:85%}.mw-parser-output .references .IPA-label-small,.mw-parser-output .infobox .IPA-label-small,.mw-parser-output .navbox .IPA-label-small{font-size:100%}French pronunciation: ⓘ) is the capital and largest city of France. With an estimated population of 2,102,650 residents in January 2023 in an area of more than 105 km2 (41 sq mi), Paris is the fourth-largest city in the European Union and the 30th most densely populated city in the world in 2022. Since the 17th century, Paris has been one of the world's major centres of finance, diplomacy, commerce, culture, fashion, and gastronomy. Because of its leading role in the arts and sciences and its early adaptation of extensive street lighting, it became known as the City of Light in the 19th century.\n\n\nThe City of Paris is the centre of the Île-de-France region, or Paris Region, with an official estimated population of 12,271,794 inhabitants in January 2023, or about 19% of the population of France. The Paris Region had a nominal GDP of €765 billion (US$1.064 trillion when adjusted for PPP) in 2021, the highest in the European Union. According to the Economist Intelligence Unit Worldwide Cost of Living Survey, in 2022, Paris was the city with the ninth-highest cost of living in the world.\n\n\n", + "The ancient oppidum that corresponds to the modern city of Paris was first mentioned in the mid-1st century BC by Julius Caesar as Luteciam Parisiorum ('Lutetia of the Parisii') and is later attested as Parision in the 5th century AD, then as Paris in 1265. During the Roman period, it was commonly known as Lutetia or Lutecia in Latin, and as Leukotekía in Greek, which is interpreted as either stemming from the Celtic root *lukot- ('mouse'), or from *luto- ('marsh, swamp').\n\n\nThe name Paris is derived from its early inhabitants, the Parisii, a Gallic tribe from the Iron Age and the Roman period. The meaning of the Gaulish ethnonym remains debated. According to Xavier Delamarre, it may derive from the Celtic root pario- ('cauldron'). Alfred Holder interpreted the name as 'the makers' or 'the commanders', by comparing it to the Welsh peryff ('lord, commander'), both possibly descending from a Proto-Celtic form reconstructed as *kwar-is-io-. Alternatively, Pierre-Yves Lambert proposed to translate Parisii as the 'spear people', by connecting the first element to the Old Irish carr ('spear'), derived from an earlier *kwar-sā. In any case, the city's name is not related to the Paris of Greek mythology.\n\n\nResidents of the city are known in English as Parisians and in French as Parisiens ( ⓘ). They are also pejoratively called Parigots ( ⓘ).\n\n\nHistory\n\nOrigins\n\n", + "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n", + "In March 2001, Bertrand Delanoë became the first socialist mayor. He was re-elected in March 2008. In 2007, in an effort to reduce car traffic, he introduced the Vélib', a system which rents bicycles. Bertrand Delanoë also transformed a section of the highway along the Left Bank of the Seine into an urban promenade and park, the Promenade des Berges de la Seine, which he inaugurated in June 2013.\n\n\nIn 2007, President Nicolas Sarkozy launched the Grand Paris project, to integrate Paris more closely with the towns in the region around it. After many modifications, the new area, named the Metropolis of Grand Paris, with a population of 6.7 million, was created on 1 January 2016. In 2011, the City of Paris and the national government approved the plans for the Grand Paris Express, totalling 205 km (127 mi) of automated metro lines to connect Paris, the innermost three departments around Paris, airports and high-speed rail (TGV) stations, at an estimated cost of €35 billion. The system is scheduled to be completed by 2030.\n\n\nIn January 2015, Al-Qaeda in the Arabian Peninsula claimed attacks across the Paris region. 1.5 million people marched in Paris in a show of solidarity against terrorism and in support of freedom of speech. In November of the same year, terrorist attacks, claimed by ISIL, killed 130 people and injured more than 350.\n\n\n", + "Bal-musette is a style of French music and dance that first became popular in Paris in the 1870s and 1880s; by 1880 Paris had some 150 dance halls. Patrons danced the bourrée to the accompaniment of the cabrette (a bellows-blown bagpipe locally called a \"musette\") and often the vielle à roue (hurdy-gurdy) in the cafés and bars of the city. Parisian and Italian musicians who played the accordion adopted the style and established themselves in Auvergnat bars, and Paris became a major centre for jazz and still attracts jazz musicians from all around the world to its clubs and cafés.\n\n\nParis is the spiritual home of gypsy jazz in particular, and many of the Parisian jazzmen who developed in the first half of the 20th century began by playing Bal-musette in the city. Django Reinhardt rose to fame in Paris, having moved to the 18th arrondissement in a caravan as a young boy, and performed with violinist Stéphane Grappelli and their Quintette du Hot Club de France in the 1930s and 1940s.\n\n\nImmediately after the War the Saint-Germain-des-Pres quarter and the nearby Saint-Michel quarter became home to many small jazz clubs, including the Caveau des Lorientais, the Club Saint-Germain, the Rose Rouge, the Vieux-Colombier, and the most famous, Le Tabou. They introduced Parisians to the music of Claude Luter, Boris Vian, Sydney Bechet, Mezz Mezzrow, and Henri Salvador. " + ] + }, + "sparse_vector_1": { + "embeddings": { + "paris": 2.9709616, + "date": 2.1960778, + "founded": 2.0555024, + "foundation": 1.412623, + "early": 1.2162757, + "founder": 1.1271698, + "french": 0.9213378, + "france": 0.86253893, + "city": 0.82978916, + "founding": 0.79722786, + "established": 0.7967043, + "ancient": 0.7392465, + "when": 0.71705, + "built": 0.6977878, + "treaty": 0.6846069, + "created": 0.68127465, + "century": 0.58926934, + "for": 0.55019474, + "was": 0.52475905, + "origin": 0.48785052, + "expedition": 0.48757303, + "history": 0.47960007, + "mint": 0.47878903, + "historical": 0.4714338, + "capital": 0.42984143, + "timeline": 0.4222377, + "colony": 0.3876187, + "tower": 0.3474891, + "medieval": 0.3272666, + "geography": 0.32456368, + "colonial": 0.30613664, + "location": 0.29013386, + "francisco": 0.22840048, + "orleans": 0.21971667, + "earlier": 0.20318772, + "jackson": 0.18424438, + "exact": 0.17109296, + "rome": 0.16320735, + "civilization": 0.15931238, + "spanish": 0.12759624, + "museum": 0.113024555, + "latin": 0.11201205, + "european": 0.10277243, + "architect": 0.0796932, + "united": 0.031233707 + }, + "expected_by_score": [ + "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", + "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. ", + "\nDuring the Hundred Years' War, Paris was occupied by England-friendly Burgundian forces from 1418, before being occupied outright by the English when Henry V of England entered the French capital in 1420; in spite of a 1429 effort by Joan of Arc to liberate the city, it would remain under English occupation until 1436.\n\n\nIn the late 16th-century French Wars of Religion, Paris was a stronghold of the Catholic League, the organisers of 24 August 1572 St. Bartholomew's Day massacre in which thousands of French Protestants were killed. The conflicts ended when pretender to the throne Henry IV, after converting to Catholicism to gain entry to the capital, entered the city in 1594 to claim the crown of France. This king made several improvements to the capital during his reign: he completed the construction of Paris's first uncovered, sidewalk-lined bridge, the Pont Neuf, built a Louvre extension connecting it to the Tuileries Palace, and created the first Paris residential square, the Place Royale, now Place des Vosges. In spite of Henry IV's efforts to improve city circulation, the narrowness of Paris's streets was a contributing factor in his assassination near Les Halles marketplace in 1610.\n\n\nDuring the 17th century, Cardinal Richelieu, chief minister of Louis XIII, was determined to make Paris the most beautiful city in Europe. He built five new bridges, a new chapel for the College of Sorbonne, and a palace for himself, the Palais-Cardinal. ", + "Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n", + "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n" + ], + "expected_by_offset": [ + "\nThe Parisii, a sub-tribe of the Celtic Senones, inhabited the Paris area from around the middle of the 3rd century BC. One of the area's major north–south trade routes crossed the Seine on the Île de la Cité, which gradually became an important trading centre. The Parisii traded with many river towns (some as far away as the Iberian Peninsula) and minted their own coins.\n\n\nThe Romans conquered the Paris Basin in 52 BC and began their settlement on Paris's Left Bank. The Roman town was originally called Lutetia (more fully, Lutetia Parisiorum, \"Lutetia of the Parisii\", modern French Lutèce). It became a prosperous city with a forum, baths, temples, theatres, and an amphitheatre.\n\n\nBy the end of the Western Roman Empire, the town was known as Parisius, a Latin name that would later become Paris in French. Christianity was introduced in the middle of the 3rd century AD by Saint Denis, the first Bishop of Paris: according to legend, when he refused to renounce his faith before the Roman occupiers, he was beheaded on the hill which became known as Mons Martyrum (Latin \"Hill of Martyrs\"), later \"Montmartre\", from where he walked headless to the north of the city; the place where he fell and was buried became an important religious shrine, the Basilica of Saint-Denis, and many French kings are buried there.\n\n\nClovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. ", + "Clovis the Frank, the first king of the Merovingian dynasty, made the city his capital from 508. As the Frankish domination of Gaul began, there was a gradual immigration by the Franks to Paris and the Parisian Francien dialects were born. Fortification of the Île de la Cité failed to avert sacking by Vikings in 845, but Paris's strategic importance—with its bridges preventing ships from passing—was established by successful defence in the Siege of Paris (885–886), for which the then Count of Paris (comte de Paris), Odo of France, was elected king of West Francia. From the Capetian dynasty that began with the 987 election of Hugh Capet, Count of Paris and Duke of the Franks (duc des Francs), as king of a unified West Francia, Paris gradually became the largest and most prosperous city in France.\n\n\nHigh and Late Middle Ages to Louis XIV\n\nBy the end of the 12th century, Paris had become the political, economic, religious, and cultural capital of France. The Palais de la Cité, the royal residence, was located at the western end of the Île de la Cité. In 1163, during the reign of Louis VII, Maurice de Sully, bishop of Paris, undertook the construction of the Notre Dame Cathedral at its eastern extremity.\n\n\nAfter the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. ", + "After the marshland between the river Seine and its slower 'dead arm' to its north was filled in from around the 10th century, Paris's cultural centre began to move to the Right Bank. In 1137, a new city marketplace (today's Les Halles) replaced the two smaller ones on the Île de la Cité and Place de Grève (Place de l'Hôtel de Ville). The latter location housed the headquarters of Paris's river trade corporation, an organisation that later became, unofficially (although formally in later years), Paris's first municipal government.\n\n\nIn the late 12th century, Philip Augustus extended the Louvre fortress to defend the city against river invasions from the west, gave the city its first walls between 1190 and 1215, rebuilt its bridges to either side of its central island, and paved its main thoroughfares. In 1190, he transformed Paris's former cathedral school into a student-teacher corporation that would become the University of Paris and would draw students from all of Europe.\n\n\nWith 200,000 inhabitants in 1328, Paris, then already the capital of France, was the most populous city of Europe. By comparison, London in 1300 had 80,000 inhabitants. By the early fourteenth century, so much filth had collected inside urban Europe that French and Italian cities were naming streets after human waste. In medieval Paris, several street names were inspired by merde, the French word for \"shit\".\n\n\n", + "\nDuring the Hundred Years' War, Paris was occupied by England-friendly Burgundian forces from 1418, before being occupied outright by the English when Henry V of England entered the French capital in 1420; in spite of a 1429 effort by Joan of Arc to liberate the city, it would remain under English occupation until 1436.\n\n\nIn the late 16th-century French Wars of Religion, Paris was a stronghold of the Catholic League, the organisers of 24 August 1572 St. Bartholomew's Day massacre in which thousands of French Protestants were killed. The conflicts ended when pretender to the throne Henry IV, after converting to Catholicism to gain entry to the capital, entered the city in 1594 to claim the crown of France. This king made several improvements to the capital during his reign: he completed the construction of Paris's first uncovered, sidewalk-lined bridge, the Pont Neuf, built a Louvre extension connecting it to the Tuileries Palace, and created the first Paris residential square, the Place Royale, now Place des Vosges. In spite of Henry IV's efforts to improve city circulation, the narrowness of Paris's streets was a contributing factor in his assassination near Les Halles marketplace in 1610.\n\n\nDuring the 17th century, Cardinal Richelieu, chief minister of Louis XIII, was determined to make Paris the most beautiful city in Europe. He built five new bridges, a new chapel for the College of Sorbonne, and a palace for himself, the Palais-Cardinal. ", + "Diderot and D'Alembert published their Encyclopédie in 1751, before the Montgolfier Brothers launched the first manned flight in a hot air balloon on 21 November 1783. Paris was the financial capital of continental Europe, as well the primary European centre for book publishing, fashion and the manufacture of fine furniture and luxury goods. On 22 October 1797, Paris was also the site of the first parachute jump in history, by Garnerin.\n\n\nIn the summer of 1789, Paris became the centre stage of the French Revolution. On 14 July, a mob seized the arsenal at the Invalides, acquiring thousands of guns, with which it stormed the Bastille, a principal symbol of royal authority. The first independent Paris Commune, or city council, met in the Hôtel de Ville and elected a Mayor, the astronomer Jean Sylvain Bailly, on 15 July.\n\n\nLouis XVI and the royal family were brought to Paris and incarcerated in the Tuileries Palace. In 1793, as the revolution turned increasingly radical, the king, queen and mayor were beheaded by guillotine in the Reign of Terror, along with more than 16,000 others throughout France. The property of the aristocracy and the church was nationalised, and the city's churches were closed, sold or demolished. A succession of revolutionary factions ruled Paris until 9 November 1799 (coup d'état du 18 brumaire), when Napoleon Bonaparte seized power as First Consul.\n\n\n" + ] + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json.gz b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json.gz new file mode 100644 index 0000000000000..881524e46e186 Binary files /dev/null and b/x-pack/plugin/inference/src/test/resources/org/elasticsearch/xpack/inference/highlight/sample-doc.json.gz differ