From 25f7223196a97dbdfa8b6d5045097f737533dc80 Mon Sep 17 00:00:00 2001 From: Jim Ferenczi Date: Wed, 4 Dec 2024 17:37:02 +0000 Subject: [PATCH] Add a new `offset_source` field to store offsets referencing substrings of another field. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This field is primarily designed for use with the `semantic_text` field, where it enables storing offsets that point to substrings of the field used to generate its underlying chunks. To prevent external usage, the field is intentionally undocumented, with detailed javadocs explaining its specific purpose and limitations. I couldn’t find a way to fully block external usage, but skipping the docs should keep it mostly out of sight for now. --- .../xpack/inference/InferencePlugin.java | 3 +- .../inference/mapper/OffsetSourceField.java | 151 +++++++++++ .../mapper/OffsetSourceFieldMapper.java | 255 ++++++++++++++++++ .../mapper/OffsetSourceMetaFieldMapper.java | 78 ++++++ .../mapper/OffsetSourceFieldMapperTests.java | 230 ++++++++++++++++ .../mapper/OffsetSourceFieldTests.java | 74 +++++ .../mapper/OffsetSourceFieldTypeTests.java | 48 ++++ 7 files changed, 838 insertions(+), 1 deletion(-) create mode 100644 x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java create mode 100644 x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java create mode 100644 x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java create mode 100644 x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java create mode 100644 x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java create mode 100644 x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java index 48458bf4f5086..45f380c0ce4fa 100644 --- a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java @@ -68,6 +68,7 @@ import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender; import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings; import org.elasticsearch.xpack.inference.logging.ThrottlerManager; +import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper; import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper; import org.elasticsearch.xpack.inference.queries.SemanticQueryBuilder; import org.elasticsearch.xpack.inference.rank.random.RandomRankBuilder; @@ -392,7 +393,7 @@ public void close() { @Override public Map getMappers() { - return Map.of(SemanticTextFieldMapper.CONTENT_TYPE, SemanticTextFieldMapper.PARSER); + return Map.of(SemanticTextFieldMapper.CONTENT_TYPE, SemanticTextFieldMapper.PARSER, OffsetSourceFieldMapper.CONTENT_TYPE, OffsetSourceFieldMapper.PARSER); } @Override diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java new file mode 100644 index 0000000000000..1e997a2802ede --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java @@ -0,0 +1,151 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CompiledAutomaton; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Represents a {@link Field} that stores a {@link Term} along with its start and end offsets. + * Note: The {@link Charset} used to calculate these offsets is not associated with this field. + * It is the responsibility of the consumer to handle the appropriate {@link Charset}. + */ +public final class OffsetSourceField extends Field { + private static final FieldType FIELD_TYPE = new FieldType(); + + static { + FIELD_TYPE.setTokenized(false); + FIELD_TYPE.setOmitNorms(true); + FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + } + + private int startOffset; + private int endOffset; + + public OffsetSourceField(String fieldName, String sourceFieldName, int startOffset, int endOffset) { + super(fieldName, sourceFieldName, FIELD_TYPE); + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + public void setValues(String fieldName, int startOffset, int endOffset) { + this.fieldsData = fieldName; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { + OffsetTokenStream stream; + if (reuse instanceof OffsetTokenStream) { + stream = (OffsetTokenStream) reuse; + } else { + stream = new OffsetTokenStream(); + } + + stream.setValues((String) fieldsData, startOffset, endOffset); + return stream; + } + + public static OffsetSourceLoader loader(Terms terms, String fieldName) throws IOException { + return new OffsetSourceLoader(terms, fieldName); + } + + private static final class OffsetTokenStream extends TokenStream { + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class); + private boolean used = true; + private String value = null; + private int startOffset = 0; + private int endOffset = 0; + + private OffsetTokenStream() {} + + /** Sets the values */ + void setValues(String value, int startOffset, int endOffset) { + this.value = value; + this.startOffset = startOffset; + this.endOffset = endOffset; + } + + @Override + public boolean incrementToken() { + if (used) { + return false; + } + clearAttributes(); + termAttribute.append(value); + offsetAttribute.setOffset(startOffset, endOffset); + used = true; + return true; + } + + @Override + public void reset() { + used = false; + } + + @Override + public void close() { + value = null; + } + } + + public static class OffsetSourceLoader { + private final Map postingsEnums = new LinkedHashMap<>(); + + private OffsetSourceLoader(Terms terms, String fieldName) throws IOException { + Automaton prefixAutomaton = PrefixQuery.toAutomaton(new BytesRef(fieldName + ".")); + var termsEnum = terms.intersect(new CompiledAutomaton(prefixAutomaton, false, true, false), null); + while (termsEnum.next() != null) { + var postings = termsEnum.postings(null, PostingsEnum.OFFSETS); + if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + String sourceFieldName = termsEnum.term().utf8ToString().substring(fieldName.length() + 1); + postingsEnums.put(sourceFieldName, postings); + } + } + } + + public OffsetSourceFieldMapper.OffsetSource advanceTo(int doc) throws IOException { + for (var it = postingsEnums.entrySet().iterator(); it.hasNext();) { + var entry = it.next(); + var postings = entry.getValue(); + if (postings.docID() < doc) { + if (postings.advance(doc) == DocIdSetIterator.NO_MORE_DOCS) { + it.remove(); + continue; + } + } + if (postings.docID() == doc) { + assert postings.freq() == 1; + postings.nextPosition(); + return new OffsetSourceFieldMapper.OffsetSource(entry.getKey(), postings.startOffset(), postings.endOffset()); + } + } + return null; + } + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java new file mode 100644 index 0000000000000..b5843bb6000ae --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapper.java @@ -0,0 +1,255 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.elasticsearch.index.fielddata.FieldDataContext; +import org.elasticsearch.index.fielddata.IndexFieldData; +import org.elasticsearch.index.mapper.DocumentParserContext; +import org.elasticsearch.index.mapper.FieldMapper; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.MapperBuilderContext; +import org.elasticsearch.index.mapper.TextSearchInfo; +import org.elasticsearch.index.mapper.ValueFetcher; +import org.elasticsearch.index.query.SearchExecutionContext; +import org.elasticsearch.search.fetch.StoredFieldsSpec; +import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.xcontent.ConstructingObjectParser; +import org.elasticsearch.xcontent.ParseField; +import org.elasticsearch.xcontent.ToXContentObject; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentParser; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.xcontent.ConstructingObjectParser.constructorArg; + + +/** + * A {@link FieldMapper} that maps a field name to its start and end offsets. + * The {@link CharsetFormat} used to compute the offsets is specified via the charset parameter. + * Currently, only {@link CharsetFormat#UTF_16} is supported, aligning with Java's {@code String} charset + * for simpler internal usage and integration. + * + * Each document can store at most one value in this field. + * + * Note: This mapper is not yet documented and is intended exclusively for internal use by + * {@link SemanticTextFieldMapper}. If exposing this mapper directly to users becomes necessary, + * extending charset compatibility should be considered, as the current default (and sole supported charset) + * was chosen for ease of Java integration. + */ +public class OffsetSourceFieldMapper extends FieldMapper { + public static final String NAME = "_offset_source"; + public static final String CONTENT_TYPE = "offset_source"; + + private static final String SOURCE_NAME_FIELD = "field"; + private static final String START_OFFSET_FIELD = "start"; + private static final String END_OFFSET_FIELD = "end"; + + public record OffsetSource(String field, int start, int end) implements ToXContentObject { + public OffsetSource { + if (start == -1 || end == -1) { + throw new IllegalArgumentException("Illegal offsets, expected positive numbers, got: " + start + ":" + end); + } + if (start > end) { + throw new IllegalArgumentException("Illegal offsets, expected start < end, got: " + start + " > " + end); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + builder.field(SOURCE_NAME_FIELD, field); + builder.field(START_OFFSET_FIELD, start); + builder.field(END_OFFSET_FIELD, end); + return builder.endObject(); + } + } + + private static final ConstructingObjectParser OFFSET_SOURCE_PARSER = new ConstructingObjectParser<>( + CONTENT_TYPE, + true, + args -> new OffsetSource((String) args[0], (int) args[1], (int) args[2]) + ); + + static { + OFFSET_SOURCE_PARSER.declareString(constructorArg(), new ParseField(SOURCE_NAME_FIELD)); + OFFSET_SOURCE_PARSER.declareInt(constructorArg(), new ParseField(START_OFFSET_FIELD)); + OFFSET_SOURCE_PARSER.declareInt(constructorArg(), new ParseField(END_OFFSET_FIELD)); + } + + public enum CharsetFormat { + UTF_16(StandardCharsets.UTF_16); + + private Charset charSet; + + CharsetFormat(Charset charSet) { + this.charSet = charSet; + } + } + + public static class Builder extends FieldMapper.Builder { + private final Parameter charset = Parameter.enumParam("charset", false, i -> CharsetFormat.UTF_16, CharsetFormat.UTF_16, CharsetFormat.class); + private final Parameter> meta = Parameter.metaParam(); + + public Builder(String name) { + super(name); + } + + @Override + protected Parameter[] getParameters() { + return new Parameter[] { meta, charset }; + } + + @Override + public OffsetSourceFieldMapper build(MapperBuilderContext context) { + return new OffsetSourceFieldMapper( + leafName(), + new OffsetSourceFieldType(context.buildFullName(leafName()), charset.get(), meta.getValue()), + builderParams(this, context) + ); + } + } + + public static final TypeParser PARSER = new TypeParser((n, c) -> new Builder(n)); + + public static final class OffsetSourceFieldType extends MappedFieldType { + private final CharsetFormat charset; + + public OffsetSourceFieldType(String name, CharsetFormat charset, Map meta) { + super(name, true, false, false, TextSearchInfo.NONE, meta); + this.charset = charset; + } + + public Charset getCharset() { + return charset.charSet; + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public Query existsQuery(SearchExecutionContext context) { + return new PrefixQuery(new Term(NAME, name())); + } + + @Override + public boolean fieldHasValue(FieldInfos fieldInfos) { + return fieldInfos.fieldInfo(NAME) != null; + } + + @Override + public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) { + throw new IllegalArgumentException("[offset_source] fields do not support sorting, scripting or aggregating"); + } + + @Override + public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { + return new ValueFetcher() { + OffsetSourceField.OffsetSourceLoader offsetLoader; + + @Override + public void setNextReader(LeafReaderContext context) { + try { + var terms = context.reader().terms(OffsetSourceFieldMapper.NAME); + offsetLoader = terms != null ? OffsetSourceField.loader(terms, name()) : null; + } catch (IOException exc) { + throw new UncheckedIOException(exc); + } + } + + @Override + public List fetchValues(Source source, int doc, List ignoredValues) throws IOException { + var offsetSource = offsetLoader != null ? offsetLoader.advanceTo(doc) : null; + return offsetSource != null ? List.of(offsetSource) : null; + } + + @Override + public StoredFieldsSpec storedFieldsSpec() { + return StoredFieldsSpec.NO_REQUIREMENTS; + } + }; + } + + @Override + public Query termQuery(Object value, SearchExecutionContext context) { + throw new IllegalArgumentException("Queries on [offset_source] fields are not supported"); + } + + @Override + public boolean isSearchable() { + return false; + } + } + + /** + * @param simpleName the leaf name of the mapper + * @param mappedFieldType + * @param params initialization params for this field mapper + */ + protected OffsetSourceFieldMapper(String simpleName, MappedFieldType mappedFieldType, BuilderParams params) { + super(simpleName, mappedFieldType, params); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + protected boolean supportsParsingObject() { + return true; + } + + @Override + protected void parseCreateField(DocumentParserContext context) throws IOException { + var parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + // skip + return; + } + + if (context.doc().getByKey(fullPath()) != null) { + throw new IllegalArgumentException( + "[offset_source] fields do not support indexing multiple values for the same field [" + + fullPath() + + "] in the same document" + ); + } + + boolean isWithinLeafObject = context.path().isWithinLeafObject(); + // make sure that we don't expand dots in field names while parsing + context.path().setWithinLeafObject(true); + try { + var offsetSource = OFFSET_SOURCE_PARSER.parse(parser, null); + context.doc() + .addWithKey( + fullPath(), + new OffsetSourceField(NAME, fullPath() + "." + offsetSource.field, offsetSource.start, offsetSource.end) + ); + } finally { + context.path().setWithinLeafObject(isWithinLeafObject); + } + } + + @Override + public FieldMapper.Builder getMergeBuilder() { + return new Builder(leafName()).init(this); + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java new file mode 100644 index 0000000000000..91393087ea391 --- /dev/null +++ b/x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceMetaFieldMapper.java @@ -0,0 +1,78 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.search.Query; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.MetadataFieldMapper; +import org.elasticsearch.index.mapper.TextSearchInfo; +import org.elasticsearch.index.mapper.ValueFetcher; +import org.elasticsearch.index.query.SearchExecutionContext; + +import java.util.Collections; + +/** + * This meta field only exists because offset source fields index everything into a + * common _offset_source field and Elasticsearch has a custom codec that complains + * when fields exist in the index and not in mappings. + */ +public class OffsetSourceMetaFieldMapper extends MetadataFieldMapper { + + public static final String NAME = "_offset_source"; + + public static final String CONTENT_TYPE = "_offset_source"; + + public static final TypeParser PARSER = new FixedTypeParser(c -> new OffsetSourceMetaFieldMapper()); + + public static final class OffsetSourceMetaFieldType extends MappedFieldType { + + public static final OffsetSourceMetaFieldType INSTANCE = new OffsetSourceMetaFieldType(); + + // made visible for tests + OffsetSourceMetaFieldType() { + super(NAME, false, false, false, TextSearchInfo.NONE, Collections.emptyMap()); + } + + @Override + public String typeName() { + return CONTENT_TYPE; + } + + @Override + public ValueFetcher valueFetcher(SearchExecutionContext context, String format) { + throw new UnsupportedOperationException("Cannot fetch values for internal field [" + typeName() + "]."); + } + + @Override + public Query existsQuery(SearchExecutionContext context) { + throw new UnsupportedOperationException("Cannot run exists query on [_offset_source]"); + } + + @Override + public boolean fieldHasValue(FieldInfos fieldInfos) { + return fieldInfos.fieldInfo(NAME) != null; + } + + @Override + public Query termQuery(Object value, SearchExecutionContext context) { + throw new UnsupportedOperationException("The [_offset_source] field may not be queried directly"); + } + } + + private OffsetSourceMetaFieldMapper() { + super(OffsetSourceMetaFieldType.INSTANCE); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java new file mode 100644 index 0000000000000..1350289d9796a --- /dev/null +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldMapperTests.java @@ -0,0 +1,230 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.PrefixQuery; +import org.apache.lucene.search.Query; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.DocumentParsingException; +import org.elasticsearch.index.mapper.LuceneDocument; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.MapperTestCase; +import org.elasticsearch.index.mapper.ParsedDocument; +import org.elasticsearch.index.mapper.SourceToParse; +import org.elasticsearch.index.mapper.ValueFetcher; +import org.elasticsearch.index.query.SearchExecutionContext; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.lookup.Source; +import org.elasticsearch.search.lookup.SourceProvider; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xpack.inference.InferencePlugin; +import org.junit.AssumptionViolatedException; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +public class OffsetSourceFieldMapperTests extends MapperTestCase { + @Override + protected Collection getPlugins() { + return List.of(new InferencePlugin(Settings.EMPTY)); + } + + @Override + protected void minimalMapping(XContentBuilder b) throws IOException { + b.field("type", "offset_source"); + } + + @Override + protected Object getSampleValueForDocument() { + return getSampleObjectForDocument(); + } + + @Override + protected Object getSampleObjectForDocument() { + return Map.of("field", "foo", "start", 100, "end", 300); + } + + @Override + protected Object generateRandomInputValue(MappedFieldType ft) { + return new OffsetSourceFieldMapper.OffsetSource("field", randomIntBetween(0, 100), randomIntBetween(101, 1000)); + } + + @Override + protected IngestScriptSupport ingestScriptSupport() { + throw new AssumptionViolatedException("not supported"); + } + + + @Override + protected void assertExistsQuery(MappedFieldType fieldType, Query query, LuceneDocument fields) { + assertThat(query, instanceOf(PrefixQuery.class)); + PrefixQuery termQuery = (PrefixQuery) query; + assertEquals("_offset_source", termQuery.getField()); + assertEquals(new Term("_offset_source", "field"), termQuery.getPrefix()); + assertNotNull(fields.getField("_offset_source")); + } + + @Override + protected void registerParameters(ParameterChecker checker) throws IOException {} + + @Override + protected void assertSearchable(MappedFieldType fieldType) { + assertFalse(fieldType.isSearchable()); + } + + @Override + protected boolean supportsStoredFields() { + return false; + } + + @Override + protected boolean supportsEmptyInputArray() { + return false; + } + + @Override + protected boolean supportsCopyTo() { + return false; + } + + @Override + protected boolean supportsIgnoreMalformed() { + return false; + } + + @Override + protected SyntheticSourceSupport syntheticSourceSupport(boolean ignoreMalformed) { + return new SyntheticSourceSupport() { + @Override + public SyntheticSourceExample example(int maxValues) { + return new SyntheticSourceExample(getSampleValueForDocument(), getSampleValueForDocument(), null, b -> minimalMapping(b)); + } + + @Override + public List invalidExample() { + return List.of(); + } + }; + } + + @Override + public void testSyntheticSourceKeepArrays() { + // This mapper doesn't support multiple values (array of objects). + } + + public void testDefaults() throws Exception { + DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping)); + assertEquals(Strings.toString(fieldMapping(this::minimalMapping)), mapper.mappingSource().toString()); + + ParsedDocument doc1 = mapper.parse( + source(b -> b.startObject("field").field("field", "foo").field("start", 0).field("end", 128).endObject()) + ); + List fields = doc1.rootDoc().getFields("_offset_source"); + assertEquals(1, fields.size()); + assertThat(fields.get(0), instanceOf(OffsetSourceField.class)); + OffsetSourceField offsetField1 = (OffsetSourceField) fields.get(0); + + ParsedDocument doc2 = mapper.parse( + source(b -> b.startObject("field").field("field", "bar").field("start", 128).field("end", 512).endObject()) + ); + OffsetSourceField offsetField2 = (OffsetSourceField) doc2.rootDoc().getFields("_offset_source").get(0); + + assertTokenStream(offsetField1.tokenStream(null, null), "field.foo", 0, 128); + assertTokenStream(offsetField2.tokenStream(null, null), "field.bar", 128, 512); + } + + private void assertTokenStream(TokenStream tk, String expectedTerm, int expectedStartOffset, int expectedEndOffset) throws IOException { + CharTermAttribute termAttribute = tk.addAttribute(CharTermAttribute.class); + OffsetAttribute offsetAttribute = tk.addAttribute(OffsetAttribute.class); + tk.reset(); + assertTrue(tk.incrementToken()); + assertThat(new String(termAttribute.buffer(), 0, termAttribute.length()), equalTo(expectedTerm)); + assertThat(offsetAttribute.startOffset(), equalTo(expectedStartOffset)); + assertThat(offsetAttribute.endOffset(), equalTo(expectedEndOffset)); + assertFalse(tk.incrementToken()); + } + + @Override + protected void assertFetch(MapperService mapperService, String field, Object value, String format) throws IOException { + MappedFieldType ft = mapperService.fieldType(field); + MappedFieldType.FielddataOperation fdt = MappedFieldType.FielddataOperation.SEARCH; + SourceToParse source = source(b -> b.field(ft.name(), value)); + SearchExecutionContext searchExecutionContext = mock(SearchExecutionContext.class); + when(searchExecutionContext.isSourceEnabled()).thenReturn(true); + when(searchExecutionContext.sourcePath(field)).thenReturn(Set.of(field)); + when(searchExecutionContext.getForField(ft, fdt)).thenAnswer(inv -> fieldDataLookup(mapperService).apply(ft, () -> { + throw new UnsupportedOperationException(); + }, fdt)); + ValueFetcher nativeFetcher = ft.valueFetcher(searchExecutionContext, format); + ParsedDocument doc = mapperService.documentMapper().parse(source); + withLuceneIndex(mapperService, iw -> iw.addDocuments(doc.docs()), ir -> { + Source s = SourceProvider.fromStoredFields().getSource(ir.leaves().get(0), 0); + nativeFetcher.setNextReader(ir.leaves().get(0)); + List fromNative = nativeFetcher.fetchValues(s, 0, new ArrayList<>()); + assertThat(fromNative.size(), equalTo(1)); + assertThat("fetching " + value, fromNative.get(0), equalTo(value)); + }); + } + + @Override + protected void assertFetchMany(MapperService mapperService, String field, Object value, String format, int count) throws IOException { + assumeFalse("[offset_source] currently don't support multiple values in the same field", false); + } + + public void testInvalidCharset() { + var exc = expectThrows(Exception.class, () -> createDocumentMapper(mapping(b -> { b.startObject("field").field("type", "offset_source").field("charset", "utf_8").endObject(); }))); + assertThat(exc.getCause().getMessage(), containsString("Unknown value [utf_8] for field [charset]")); + } + + public void testRejectMultiValuedFields() throws IOException { + DocumentMapper mapper = createDocumentMapper(mapping(b -> { b.startObject("field").field("type", "offset_source").endObject(); })); + + DocumentParsingException exc = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> { + b.startArray("field"); + { + b.startObject().field("field", "bar1").field("start", 128).field("end", 512).endObject(); + b.startObject().field("field", "bar2").field("start", 128).field("end", 512).endObject(); + } + b.endArray(); + }))); + assertThat(exc.getCause().getMessage(), containsString("[offset_source] fields do not support indexing multiple values")); + } + + public void testInvalidOffsets() throws IOException { + DocumentMapper mapper = createDocumentMapper(mapping(b -> { b.startObject("field").field("type", "offset_source").endObject(); })); + + DocumentParsingException exc = expectThrows(DocumentParsingException.class, () -> mapper.parse(source(b -> { + b.startArray("field"); + { + b.startObject().field("field", "bar1").field("start", -1).field("end", 512).endObject(); + } + b.endArray(); + }))); + assertThat(exc.getCause().getCause().getCause().getMessage(), containsString("Illegal offsets")); + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java new file mode 100644 index 0000000000000..1380a81eb4125 --- /dev/null +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTests.java @@ -0,0 +1,74 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.document.Document; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.util.IOUtils; +import org.elasticsearch.test.ESTestCase; + +import static org.hamcrest.Matchers.containsString; + +public class OffsetSourceFieldTests extends ESTestCase { + public void testBasics() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter( + random(), + dir, + newIndexWriterConfig().setMergePolicy(newLogMergePolicy(random().nextBoolean())) + ); + Document doc = new Document(); + OffsetSourceField field1 = new OffsetSourceField(OffsetSourceFieldMapper.NAME, "field1.foo", 1, 10); + doc.add(field1); + writer.addDocument(doc); + + field1.setValues("field1.bar", 10, 128); + writer.addDocument(doc); + + writer.addDocument(new Document()); // gap + + field1.setValues("field1.foo", 50, 256); + writer.addDocument(doc); + + writer.addDocument(new Document()); // double gap + writer.addDocument(new Document()); + + field1.setValues("field1.baz", 32, 512); + writer.addDocument(doc); + + writer.forceMerge(1); + var reader = writer.getReader(); + writer.close(); + + var searcher = newSearcher(reader); + var context = searcher.getIndexReader().leaves().get(0); + + var terms = context.reader().terms(OffsetSourceFieldMapper.NAME); + assertNotNull(terms); + OffsetSourceField.OffsetSourceLoader loader = OffsetSourceField.loader(terms, "field1"); + + var offset = loader.advanceTo(0); + assertEquals(new OffsetSourceFieldMapper.OffsetSource("foo", 1, 10), offset); + + offset = loader.advanceTo(1); + assertEquals(new OffsetSourceFieldMapper.OffsetSource("bar", 10, 128), offset); + + assertNull(loader.advanceTo(2)); + + offset = loader.advanceTo(3); + assertEquals(new OffsetSourceFieldMapper.OffsetSource("foo", 50, 256), offset); + + offset = loader.advanceTo(6); + assertEquals(new OffsetSourceFieldMapper.OffsetSource("baz", 32, 512), offset); + + assertNull(loader.advanceTo(189)); + + IOUtils.close(reader, dir); + } +} \ No newline at end of file diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java new file mode 100644 index 0000000000000..c7f7aa962c3b8 --- /dev/null +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceFieldTypeTests.java @@ -0,0 +1,48 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.xpack.inference.mapper; + +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.elasticsearch.index.mapper.FieldTypeTestCase; +import org.elasticsearch.index.mapper.MappedFieldType; + +import java.util.Collections; + +public class OffsetSourceFieldTypeTests extends FieldTypeTestCase { + public void testIsNotAggregatable() { + MappedFieldType fieldType = getMappedFieldType(); + assertFalse(fieldType.isAggregatable()); + } + + @Override + public void testFieldHasValue() { + MappedFieldType fieldType = getMappedFieldType(); + FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { getFieldInfoWithName("_offset_source") }); + assertTrue(fieldType.fieldHasValue(fieldInfos)); + } + + @Override + public void testFieldHasValueWithEmptyFieldInfos() { + MappedFieldType fieldType = getMappedFieldType(); + assertFalse(fieldType.fieldHasValue(FieldInfos.EMPTY)); + } + + public void testFieldEmptyIfNameIsPresentInFieldInfos() { + MappedFieldType fieldType = getMappedFieldType(); + FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { getFieldInfoWithName("field") }); + assertFalse(fieldType.fieldHasValue(fieldInfos)); + } + + @Override + public MappedFieldType getMappedFieldType() { + return new OffsetSourceFieldMapper.OffsetSourceFieldType("field", OffsetSourceFieldMapper.CharsetFormat.UTF_16, Collections.emptyMap()); + } +} \ No newline at end of file