diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java
index 48458bf4f5086..45f380c0ce4fa 100644
--- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java
@@ -68,6 +68,7 @@
 import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
 import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings;
 import org.elasticsearch.xpack.inference.logging.ThrottlerManager;
+import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
 import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
 import org.elasticsearch.xpack.inference.queries.SemanticQueryBuilder;
 import org.elasticsearch.xpack.inference.rank.random.RandomRankBuilder;
@@ -392,7 +393,7 @@ public void close() {
 
     @Override
     public Map<String, Mapper.TypeParser> getMappers() {
-        return Map.of(SemanticTextFieldMapper.CONTENT_TYPE, SemanticTextFieldMapper.PARSER);
+        return Map.of(SemanticTextFieldMapper.CONTENT_TYPE, SemanticTextFieldMapper.PARSER, OffsetSourceFieldMapper.CONTENT_TYPE, OffsetSourceFieldMapper.PARSER);
     }
 
     @Override
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java
new file mode 100644
index 0000000000000..1e997a2802ede
--- /dev/null
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java
@@ -0,0 +1,151 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Represents a {@link Field} that stores a {@link Term} along with its start and end offsets.
+ * Note: The {@link Charset} used to calculate these offsets is not associated with this field.
+ * It is the responsibility of the consumer to handle the appropriate {@link Charset}.
+ */
+public final class OffsetSourceField extends Field {
+    private static final FieldType FIELD_TYPE = new FieldType();
+
+    static {
+        FIELD_TYPE.setTokenized(false);
+        FIELD_TYPE.setOmitNorms(true);
+        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    }
+
+    private int startOffset;
+    private int endOffset;
+
+    public OffsetSourceField(String fieldName, String sourceFieldName, int startOffset, int endOffset) {
+        super(fieldName, sourceFieldName, FIELD_TYPE);
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
+    }
+
+    public void setValues(String fieldName, int startOffset, int endOffset) {
+        this.fieldsData = fieldName;
+        this.startOffset = startOffset;
+        this.endOffset = endOffset;
+    }
+
+    @Override
+    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
+        OffsetTokenStream stream;
+        if (reuse instanceof OffsetTokenStream) {
+            stream = (OffsetTokenStream) reuse;
+        } else {
+            stream = new OffsetTokenStream();
+        }
+
+        stream.setValues((String) fieldsData, startOffset, endOffset);
+        return stream;
+    }
+
+    public static OffsetSourceLoader loader(Terms terms, String fieldName) throws IOException {
+        return new OffsetSourceLoader(terms, fieldName);
+    }
+
+    private static final class OffsetTokenStream extends TokenStream {
+        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+        private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+        private boolean used = true;
+        private String value = null;
+        private int startOffset = 0;
+        private int endOffset = 0;
+
+        private OffsetTokenStream() {}
+
+        /** Sets the values */
+        void setValues(String value, int startOffset, int endOffset) {
+            this.value = value;
+            this.startOffset = startOffset;
+            this.endOffset = endOffset;
+        }
+
+        @Override
+        public boolean incrementToken() {
+            if (used) {
+                return false;
+            }
+            clearAttributes();
+            termAttribute.append(value);
+            offsetAttribute.setOffset(startOffset, endOffset);
+            used = true;
+            return true;
+        }
+
+        @Override
+        public void reset() {
+            used = false;
+        }
+
+        @Override
+        public void close() {
+            value = null;
+        }
+    }
+
+    public static class OffsetSourceLoader {
+        private final Map<String, PostingsEnum> postingsEnums = new LinkedHashMap<>();
+
+        private OffsetSourceLoader(Terms terms, String fieldName) throws IOException {
+            Automaton prefixAutomaton = PrefixQuery.toAutomaton(new BytesRef(fieldName + "."));
+            var termsEnum = terms.intersect(new CompiledAutomaton(prefixAutomaton, false, true, false), null);
+            while (termsEnum.next() != null) {
+                var postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
+                if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+                    String sourceFieldName = termsEnum.term().utf8ToString().substring(fieldName.length() + 1);
+                    postingsEnums.put(sourceFieldName, postings);
+                }
+            }
+        }
+
+        public OffsetSourceFieldMapper.OffsetSource advanceTo(int doc) throws IOException {
+            for (var it = postingsEnums.entrySet().iterator(); it.hasNext();) {
+                var entry = it.next();
+                var postings = entry.getValue();
+                if (postings.docID() < doc) {
+                    if (postings.advance(doc) == DocIdSetIterator.NO_MORE_DOCS) {
+                        it.remove();
+                        continue;
+                    }
+                }
+                if (postings.docID() == doc) {
+                    assert postings.freq() == 1;
+                    postings.nextPosition();
+                    return new OffsetSourceFieldMapper.OffsetSource(entry.getKey(), postings.startOffset(), postings.endOffset());
+                }
+            }
+            return null;
+        }
+    }
+}
\ No newline at end of file
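
Usage sketch (illustrative, not part of the diff): the round trip implemented by the class above. OffsetSourceField indexes a single term (the source field name) whose postings carry the start/end offsets, and OffsetSourceLoader reads them back per document. Names such as "body.chunk" are hypothetical, and the snippet assumes it lives in the same package as OffsetSourceField:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.ByteBuffersDirectory;

    class OffsetSourceFieldUsageSketch {
        static void roundTrip() throws Exception {
            try (var dir = new ByteBuffersDirectory(); var writer = new IndexWriter(dir, new IndexWriterConfig())) {
                // Index one term "body.chunk" under the shared "_offset_source" field,
                // carrying offsets [5, 128] in its postings.
                var doc = new Document();
                doc.add(new OffsetSourceField("_offset_source", "body.chunk", 5, 128));
                writer.addDocument(doc);

                try (var reader = DirectoryReader.open(writer)) {
                    var terms = reader.leaves().get(0).reader().terms("_offset_source");
                    // The loader intersects terms prefixed with "body." and strips the prefix,
                    // so the OffsetSource it returns names the source field directly.
                    var loader = OffsetSourceField.loader(terms, "body");
                    OffsetSourceFieldMapper.OffsetSource offset = loader.advanceTo(0);
                    // offset == new OffsetSourceFieldMapper.OffsetSource("chunk", 5, 128)
                }
            }
        }
    }
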
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java
new file mode 100644
index 0000000000000..b5843bb6000ae
--- /dev/null
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java
@@ -0,0 +1,255 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.elasticsearch.index.fielddata.FieldDataContext;
+import org.elasticsearch.index.fielddata.IndexFieldData;
+import org.elasticsearch.index.mapper.DocumentParserContext;
+import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.MapperBuilderContext;
+import org.elasticsearch.index.mapper.TextSearchInfo;
+import org.elasticsearch.index.mapper.ValueFetcher;
+import org.elasticsearch.index.query.SearchExecutionContext;
+import org.elasticsearch.search.fetch.StoredFieldsSpec;
+import org.elasticsearch.search.lookup.Source;
+import org.elasticsearch.xcontent.ConstructingObjectParser;
+import org.elasticsearch.xcontent.ParseField;
+import org.elasticsearch.xcontent.ToXContentObject;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.XContentParser;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+import java.util.Map;
+
+import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg;
+
+/**
+ * A {@link FieldMapper} that records, for each document, a source field name together with
+ * the start and end offsets of a span within that field's value.
+ * The {@link CharsetFormat} used to compute the offsets is specified via the {@code charset} parameter.
+ * Currently, only {@link CharsetFormat#UTF_16} is supported, aligning with Java's {@code String} charset
+ * for simpler internal usage and integration.
+ *
+ * Each document can store at most one value in this field.
+ *
+ * Note: This mapper is not yet documented and is intended exclusively for internal use by
+ * {@link SemanticTextFieldMapper}. If exposing this mapper directly to users becomes necessary,
+ * extending charset compatibility should be considered, as the current default (and sole supported charset)
+ * was chosen for ease of Java integration.
+ */
+public class OffsetSourceFieldMapper extends FieldMapper {
+    public static final String NAME = "_offset_source";
+    public static final String CONTENT_TYPE = "offset_source";
+
+    private static final String SOURCE_NAME_FIELD = "field";
+    private static final String START_OFFSET_FIELD = "start";
+    private static final String END_OFFSET_FIELD = "end";
+
+    public record OffsetSource(String field, int start, int end) implements ToXContentObject {
+        public OffsetSource {
+            if (start < 0 || end < 0) {
+                throw new IllegalArgumentException("Illegal offsets, expected positive numbers, got: " + start + ":" + end);
+            }
+            if (start > end) {
+                throw new IllegalArgumentException("Illegal offsets, expected start <= end, got: " + start + " > " + end);
+            }
+        }
+
+        @Override
+        public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+            builder.startObject();
+            builder.field(SOURCE_NAME_FIELD, field);
+            builder.field(START_OFFSET_FIELD, start);
+            builder.field(END_OFFSET_FIELD, end);
+            return builder.endObject();
+        }
+    }
+
+    private static final ConstructingObjectParser<OffsetSource, Void> OFFSET_SOURCE_PARSER = new ConstructingObjectParser<>(
+        CONTENT_TYPE,
+        true,
+        args -> new OffsetSource((String) args[0], (int) args[1], (int) args[2])
+    );
+
+    static {
+        OFFSET_SOURCE_PARSER.declareString(constructorArg(), new ParseField(SOURCE_NAME_FIELD));
+        OFFSET_SOURCE_PARSER.declareInt(constructorArg(), new ParseField(START_OFFSET_FIELD));
+        OFFSET_SOURCE_PARSER.declareInt(constructorArg(), new ParseField(END_OFFSET_FIELD));
+    }
+
+    public enum CharsetFormat {
+        UTF_16(StandardCharsets.UTF_16);
+
+        private final Charset charSet;
+
+        CharsetFormat(Charset charSet) {
+            this.charSet = charSet;
+        }
+    }
+
+    public static class Builder extends FieldMapper.Builder {
+        private final Parameter<CharsetFormat> charset = Parameter.enumParam(
+            "charset",
+            false,
+            i -> CharsetFormat.UTF_16,
+            CharsetFormat.UTF_16,
+            CharsetFormat.class
+        );
+        private final Parameter<Map<String, String>> meta = Parameter.metaParam();
+
+        public Builder(String name) {
+            super(name);
+        }
+
+        @Override
+        protected Parameter<?>[] getParameters() {
+            return new Parameter<?>[] { meta, charset };
+        }
+
+        @Override
+        public OffsetSourceFieldMapper build(MapperBuilderContext context) {
+            return new OffsetSourceFieldMapper(
+                leafName(),
+                new OffsetSourceFieldType(context.buildFullName(leafName()), charset.get(), meta.getValue()),
+                builderParams(this, context)
+            );
+        }
+    }
+
+    public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n));
+
+    public static final class OffsetSourceFieldType extends MappedFieldType {
+        private final CharsetFormat charset;
+
+        public OffsetSourceFieldType(String name, CharsetFormat charset, Map<String, String> meta) {
+            super(name, true, false, false, TextSearchInfo.NONE, meta);
+            this.charset = charset;
+        }
+
+        public Charset getCharset() {
+            return charset.charSet;
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public Query existsQuery(SearchExecutionContext context) {
+            return new PrefixQuery(new Term(NAME, name()));
+        }
+
+        @Override
+        public boolean fieldHasValue(FieldInfos fieldInfos) {
+            return fieldInfos.fieldInfo(NAME) != null;
+        }
+
+        @Override
+        public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
+            throw new IllegalArgumentException("[offset_source] fields do not support sorting, scripting or aggregating");
+        }
+
+        @Override
+        public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
+            return new ValueFetcher() {
+                OffsetSourceField.OffsetSourceLoader offsetLoader;
+
+                @Override
+                public void setNextReader(LeafReaderContext context) {
+                    try {
+                        var terms = context.reader().terms(OffsetSourceFieldMapper.NAME);
+                        offsetLoader = terms != null ? OffsetSourceField.loader(terms, name()) : null;
+                    } catch (IOException exc) {
+                        throw new UncheckedIOException(exc);
+                    }
+                }
+
+                @Override
+                public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
+                    var offsetSource = offsetLoader != null ? offsetLoader.advanceTo(doc) : null;
+                    return offsetSource != null ? List.of(offsetSource) : null;
+                }
+
+                @Override
+                public StoredFieldsSpec storedFieldsSpec() {
+                    return StoredFieldsSpec.NO_REQUIREMENTS;
+                }
+            };
+        }
+
+        @Override
+        public Query termQuery(Object value, SearchExecutionContext context) {
+            throw new IllegalArgumentException("Queries on [offset_source] fields are not supported");
+        }
+
+        @Override
+        public boolean isSearchable() {
+            return false;
+        }
+    }
+
+    /**
+     * @param simpleName      the leaf name of the mapper
+     * @param mappedFieldType the field type
+     * @param params          initialization params for this field mapper
+     */
+    protected OffsetSourceFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams params) {
+        super(simpleName, mappedFieldType, params);
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+
+    @Override
+    protected boolean supportsParsingObject() {
+        return true;
+    }
+
+    @Override
+    protected void parseCreateField(DocumentParserContext context) throws IOException {
+        var parser = context.parser();
+        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
+            // skip
+            return;
+        }
+
+        if (context.doc().getByKey(fullPath()) != null) {
+            throw new IllegalArgumentException(
+                "[offset_source] fields do not support indexing multiple values for the same field ["
+                    + fullPath()
+                    + "] in the same document"
+            );
+        }
+
+        boolean isWithinLeafObject = context.path().isWithinLeafObject();
+        // make sure that we don't expand dots in field names while parsing
+        context.path().setWithinLeafObject(true);
+        try {
+            var offsetSource = OFFSET_SOURCE_PARSER.parse(parser, null);
+            context.doc()
+                .addWithKey(
+                    fullPath(),
+                    new OffsetSourceField(NAME, fullPath() + "." + offsetSource.field(), offsetSource.start(), offsetSource.end())
+                );
+        } finally {
+            context.path().setWithinLeafObject(isWithinLeafObject);
+        }
+    }
+
+    @Override
+    public FieldMapper.Builder getMergeBuilder() {
+        return new Builder(leafName()).init(this);
+    }
+}
\ No newline at end of file
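
Sketch of the document value the mapper above parses (illustrative, not part of the diff; the "my_offsets" field name and the offsets are hypothetical). parseCreateField accepts at most one such object per document and indexes it as a single OffsetSourceField term, "my_offsets.body.inference", under the shared "_offset_source" Lucene field:

    import org.elasticsearch.xcontent.XContentBuilder;
    import org.elasticsearch.xcontent.XContentFactory;

    class OffsetSourceDocSketch {
        // Builds {"my_offsets": {"field": "body.inference", "start": 0, "end": 128}},
        // the shape accepted by OFFSET_SOURCE_PARSER in parseCreateField.
        static XContentBuilder doc() throws Exception {
            return XContentFactory.jsonBuilder()
                .startObject()
                .startObject("my_offsets")
                .field("field", "body.inference")
                .field("start", 0)
                .field("end", 128)
                .endObject()
                .endObject();
        }
    }
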
diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java
new file mode 100644
index 0000000000000..91393087ea391
--- /dev/null
+++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.search.Query;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.MetadataFieldMapper;
+import org.elasticsearch.index.mapper.TextSearchInfo;
+import org.elasticsearch.index.mapper.ValueFetcher;
+import org.elasticsearch.index.query.SearchExecutionContext;
+
+import java.util.Collections;
+
+/**
+ * This meta field only exists because offset source fields index everything into a
+ * common _offset_source field and Elasticsearch has a custom codec that complains
+ * when fields exist in the index and not in mappings.
+ */
+public class OffsetSourceMetaFieldMapper extends MetadataFieldMapper {
+
+    public static final String NAME = "_offset_source";
+
+    public static final String CONTENT_TYPE = "_offset_source";
+
+    public static final TypeParser PARSER = new FixedTypeParser(c -> new OffsetSourceMetaFieldMapper());
+
+    public static final class OffsetSourceMetaFieldType extends MappedFieldType {
+
+        public static final OffsetSourceMetaFieldType INSTANCE = new OffsetSourceMetaFieldType();
+
+        // made visible for tests
+        OffsetSourceMetaFieldType() {
+            super(NAME, false, false, false, TextSearchInfo.NONE, Collections.emptyMap());
+        }
+
+        @Override
+        public String typeName() {
+            return CONTENT_TYPE;
+        }
+
+        @Override
+        public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
+            throw new UnsupportedOperationException("Cannot fetch values for internal field [" + typeName() + "].");
+        }
+
+        @Override
+        public Query existsQuery(SearchExecutionContext context) {
+            throw new UnsupportedOperationException("Cannot run exists query on [_offset_source]");
+        }
+
+        @Override
+        public boolean fieldHasValue(FieldInfos fieldInfos) {
+            return fieldInfos.fieldInfo(NAME) != null;
+        }
+
+        @Override
+        public Query termQuery(Object value, SearchExecutionContext context) {
+            throw new UnsupportedOperationException("The [_offset_source] field may not be queried directly");
+        }
+    }
+
+    private OffsetSourceMetaFieldMapper() {
+        super(OffsetSourceMetaFieldType.INSTANCE);
+    }
+
+    @Override
+    protected String contentType() {
+        return CONTENT_TYPE;
+    }
+}
\ No newline at end of file
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java
new file mode 100644
index 0000000000000..1350289d9796a
--- /dev/null
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java
@@ -0,0 +1,230 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.mapper.DocumentMapper;
+import org.elasticsearch.index.mapper.DocumentParsingException;
+import org.elasticsearch.index.mapper.LuceneDocument;
+import org.elasticsearch.index.mapper.MappedFieldType;
+import org.elasticsearch.index.mapper.MapperService;
+import org.elasticsearch.index.mapper.MapperTestCase;
+import org.elasticsearch.index.mapper.ParsedDocument;
+import org.elasticsearch.index.mapper.SourceToParse;
+import org.elasticsearch.index.mapper.ValueFetcher;
+import org.elasticsearch.index.query.SearchExecutionContext;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.search.lookup.Source;
+import org.elasticsearch.search.lookup.SourceProvider;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xpack.inference.InferencePlugin;
+import org.junit.AssumptionViolatedException;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class OffsetSourceFieldMapperTests extends MapperTestCase {
+    @Override
+    protected Collection<? extends Plugin> getPlugins() {
+        return List.of(new InferencePlugin(Settings.EMPTY));
+    }
+
+    @Override
+    protected void minimalMapping(XContentBuilder b) throws IOException {
+        b.field("type", "offset_source");
+    }
+
+    @Override
+    protected Object getSampleValueForDocument() {
+        return getSampleObjectForDocument();
+    }
+
+    @Override
+    protected Object getSampleObjectForDocument() {
+        return Map.of("field", "foo", "start", 100, "end", 300);
+    }
+
+    @Override
+    protected Object generateRandomInputValue(MappedFieldType ft) {
+        return new OffsetSourceFieldMapper.OffsetSource("field", randomIntBetween(0, 100), randomIntBetween(101, 1000));
+    }
+
+    @Override
+    protected IngestScriptSupport ingestScriptSupport() {
+        throw new AssumptionViolatedException("not supported");
+    }
+
+    @Override
+    protected void assertExistsQuery(MappedFieldType fieldType, Query query, LuceneDocument fields) {
+        assertThat(query, instanceOf(PrefixQuery.class));
+        PrefixQuery prefixQuery = (PrefixQuery) query;
+        assertEquals("_offset_source", prefixQuery.getField());
+        assertEquals(new Term("_offset_source", "field"), prefixQuery.getPrefix());
+        assertNotNull(fields.getField("_offset_source"));
+    }
+
+    @Override
+    protected void registerParameters(ParameterChecker checker) throws IOException {}
+
+    @Override
+    protected void assertSearchable(MappedFieldType fieldType) {
+        assertFalse(fieldType.isSearchable());
+    }
+
+    @Override
+    protected boolean supportsStoredFields() {
+        return false;
+    }
+
+    @Override
+    protected boolean supportsEmptyInputArray() {
+        return false;
+    }
+
+    @Override
+    protected boolean supportsCopyTo() {
+        return false;
+    }
+
+    @Override
+    protected boolean supportsIgnoreMalformed() {
+        return false;
+    }
+
+    @Override
+    protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) {
+        return new SyntheticSourceSupport() {
+            @Override
+            public SyntheticSourceExample example(int maxValues) {
+                return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), null, b -> minimalMapping(b));
+            }
+
+            @Override
+            public List<SyntheticSourceInvalidExample> invalidExample() {
+                return List.of();
+            }
+        };
+    }
+
+    @Override
+    public void testSyntheticSourceKeepArrays() {
+        // This mapper doesn't support multiple values (array of objects).
+    }
+
+    public void testDefaults() throws Exception {
+        DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
+        assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString());
+
+        ParsedDocument doc1 = mapper.parse(
+            source(b -> b.startObject("field").field("field", "foo").field("start", 0).field("end", 128).endObject())
+        );
+        List<IndexableField> fields = doc1.rootDoc().getFields("_offset_source");
+        assertEquals(1, fields.size());
+        assertThat(fields.get(0), instanceOf(OffsetSourceField.class));
+        OffsetSourceField offsetField1 = (OffsetSourceField) fields.get(0);
+
+        ParsedDocument doc2 = mapper.parse(
+            source(b -> b.startObject("field").field("field", "bar").field("start", 128).field("end", 512).endObject())
+        );
+        OffsetSourceField offsetField2 = (OffsetSourceField) doc2.rootDoc().getFields("_offset_source").get(0);
+
+        assertTokenStream(offsetField1.tokenStream(null, null), "field.foo", 0, 128);
+        assertTokenStream(offsetField2.tokenStream(null, null), "field.bar", 128, 512);
+    }
+
+    private void assertTokenStream(TokenStream tk, String expectedTerm, int expectedStartOffset, int expectedEndOffset) throws IOException {
+        CharTermAttribute termAttribute = tk.addAttribute(CharTermAttribute.class);
+        OffsetAttribute offsetAttribute = tk.addAttribute(OffsetAttribute.class);
+        tk.reset();
+        assertTrue(tk.incrementToken());
+        assertThat(new String(termAttribute.buffer(), 0, termAttribute.length()), equalTo(expectedTerm));
+        assertThat(offsetAttribute.startOffset(), equalTo(expectedStartOffset));
+        assertThat(offsetAttribute.endOffset(), equalTo(expectedEndOffset));
+        assertFalse(tk.incrementToken());
+    }
+
+    @Override
+    protected void assertFetch(MapperService mapperService, String field, Object value, String format) throws IOException {
+        MappedFieldType ft = mapperService.fieldType(field);
+        MappedFieldType.FielddataOperation fdt = MappedFieldType.FielddataOperation.SEARCH;
+        SourceToParse source = source(b -> b.field(ft.name(), value));
+        SearchExecutionContext searchExecutionContext = mock(SearchExecutionContext.class);
+        when(searchExecutionContext.isSourceEnabled()).thenReturn(true);
+        when(searchExecutionContext.sourcePath(field)).thenReturn(Set.of(field));
+        when(searchExecutionContext.getForField(ft, fdt)).thenAnswer(inv -> fieldDataLookup(mapperService).apply(ft, () -> {
+            throw new UnsupportedOperationException();
+        }, fdt));
+        ValueFetcher nativeFetcher = ft.valueFetcher(searchExecutionContext, format);
+        ParsedDocument doc = mapperService.documentMapper().parse(source);
+        withLuceneIndex(mapperService, iw -> iw.addDocuments(doc.docs()), ir -> {
+            Source s = SourceProvider.fromStoredFields().getSource(ir.leaves().get(0), 0);
+            nativeFetcher.setNextReader(ir.leaves().get(0));
+            List<Object> fromNative = nativeFetcher.fetchValues(s, 0, new ArrayList<>());
+            assertThat(fromNative.size(), equalTo(1));
+            assertThat("fetching " + value, fromNative.get(0), equalTo(value));
+        });
+    }
+
+    @Override
+    protected void assertFetchMany(MapperService mapperService, String field, Object value, String format, int count) throws IOException {
+        assumeTrue("[offset_source] fields do not support multiple values in the same field", false);
+    }
+
+    public void testInvalidCharset() {
+        var exc = expectThrows(Exception.class, () -> createDocumentMapper(mapping(b -> {
+            b.startObject("field").field("type", "offset_source").field("charset", "utf_8").endObject();
+        })));
+        assertThat(exc.getCause().getMessage(), containsString("Unknown value [utf_8] for field [charset]"));
+    }
+
+    public void testRejectMultiValuedFields() throws IOException {
+        DocumentMapper mapper = createDocumentMapper(mapping(b -> b.startObject("field").field("type", "offset_source").endObject()));
+
+        DocumentParsingException exc = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> {
+            b.startArray("field");
+            {
+                b.startObject().field("field", "bar1").field("start", 128).field("end", 512).endObject();
+                b.startObject().field("field", "bar2").field("start", 128).field("end", 512).endObject();
+            }
+            b.endArray();
+        })));
+        assertThat(exc.getCause().getMessage(), containsString("[offset_source] fields do not support indexing multiple values"));
+    }
+
+    public void testInvalidOffsets() throws IOException {
+        DocumentMapper mapper = createDocumentMapper(mapping(b -> b.startObject("field").field("type", "offset_source").endObject()));
+
+        DocumentParsingException exc = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> {
+            b.startArray("field");
+            {
+                b.startObject().field("field", "bar1").field("start", -1).field("end", 512).endObject();
+            }
+            b.endArray();
+        })));
+        assertThat(exc.getCause().getCause().getCause().getMessage(), containsString("Illegal offsets"));
+    }
+}
\ No newline at end of file
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java
new file mode 100644
index 0000000000000..1380a81eb4125
--- /dev/null
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.test.ESTestCase;
+
+public class OffsetSourceFieldTests extends ESTestCase {
+    public void testBasics() throws Exception {
+        Directory dir = newDirectory();
+        RandomIndexWriter writer = new RandomIndexWriter(
+            random(),
+            dir,
+            newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean()))
+        );
+        Document doc = new Document();
+        OffsetSourceField field1 = new OffsetSourceField(OffsetSourceFieldMapper.NAME, "field1.foo", 1, 10);
+        doc.add(field1);
+        writer.addDocument(doc);
+
+        field1.setValues("field1.bar", 10, 128);
+        writer.addDocument(doc);
+
+        writer.addDocument(new Document()); // gap
+
+        field1.setValues("field1.foo", 50, 256);
+        writer.addDocument(doc);
+
+        writer.addDocument(new Document()); // double gap
+        writer.addDocument(new Document());
+
+        field1.setValues("field1.baz", 32, 512);
+        writer.addDocument(doc);
+
+        writer.forceMerge(1);
+        var reader = writer.getReader();
+        writer.close();
+
+        var searcher = newSearcher(reader);
+        var context = searcher.getIndexReader().leaves().get(0);
+
+        var terms = context.reader().terms(OffsetSourceFieldMapper.NAME);
+        assertNotNull(terms);
+        OffsetSourceField.OffsetSourceLoader loader = OffsetSourceField.loader(terms, "field1");
+
+        var offset = loader.advanceTo(0);
+        assertEquals(new OffsetSourceFieldMapper.OffsetSource("foo", 1, 10), offset);
+
+        offset = loader.advanceTo(1);
+        assertEquals(new OffsetSourceFieldMapper.OffsetSource("bar", 10, 128), offset);
+
+        assertNull(loader.advanceTo(2));
+
+        offset = loader.advanceTo(3);
+        assertEquals(new OffsetSourceFieldMapper.OffsetSource("foo", 50, 256), offset);
+
+        offset = loader.advanceTo(6);
+        assertEquals(new OffsetSourceFieldMapper.OffsetSource("baz", 32, 512), offset);
+
+        assertNull(loader.advanceTo(189));
+
+        IOUtils.close(reader, dir);
+    }
+}
\ No newline at end of file
diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java
new file mode 100644
index 0000000000000..c7f7aa962c3b8
--- /dev/null
+++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the "Elastic License
+ * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
+ * Public License v 1"; you may not use this file except in compliance with, at
+ * your election, the "Elastic License 2.0", the "GNU Affero General Public
+ * License v3.0 only", or the "Server Side Public License, v 1".
+ */
+
+package org.elasticsearch.xpack.inference.mapper;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.elasticsearch.index.mapper.FieldTypeTestCase;
+import org.elasticsearch.index.mapper.MappedFieldType;
+
+import java.util.Collections;
+
+public class OffsetSourceFieldTypeTests extends FieldTypeTestCase {
+    public void testIsNotAggregatable() {
+        MappedFieldType fieldType = getMappedFieldType();
+        assertFalse(fieldType.isAggregatable());
+    }
+
+    @Override
+    public void testFieldHasValue() {
+        MappedFieldType fieldType = getMappedFieldType();
+        FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { getFieldInfoWithName("_offset_source") });
+        assertTrue(fieldType.fieldHasValue(fieldInfos));
+    }
+
+    @Override
+    public void testFieldHasValueWithEmptyFieldInfos() {
+        MappedFieldType fieldType = getMappedFieldType();
+        assertFalse(fieldType.fieldHasValue(FieldInfos.EMPTY));
+    }
+
+    public void testFieldHasNoValueIfOnlyOwnNameIsPresentInFieldInfos() {
+        MappedFieldType fieldType = getMappedFieldType();
+        FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { getFieldInfoWithName("field") });
+        assertFalse(fieldType.fieldHasValue(fieldInfos));
+    }
+
+    @Override
+    public MappedFieldType getMappedFieldType() {
+        return new OffsetSourceFieldMapper.OffsetSourceFieldType(
+            "field",
+            OffsetSourceFieldMapper.CharsetFormat.UTF_16,
+            Collections.emptyMap()
+        );
+    }
+}
\ No newline at end of file
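
A short worked note on the charset choice documented in OffsetSourceFieldMapper (illustrative, not part of the diff): with CharsetFormat.UTF_16 the offsets line up with Java String indices (UTF-16 code units), so a consumer can slice the source text directly with String#substring; a character outside the BMP counts as two units, unlike its UTF-8 byte length:

    class OffsetCharsetSketch {
        static void demo() {
            String source = "caf\u00e9 \uD83D\uDE00 chunk"; // "café 😀 chunk"
            int start = 5; // index of the emoji, counted in UTF-16 code units
            int end = 7;   // one past its surrogate pair
            String chunk = source.substring(start, end); // the emoji: one code point, two code units
            assert chunk.codePointCount(0, chunk.length()) == 1;
        }
    }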