Add a new `offset_source` field to store offsets referencing substrings of another field.

This field is primarily designed for use with the `semantic_text` field, where it enables storing offsets that point to substrings of the field used to generate its underlying chunks. To prevent external usage, the field is intentionally undocumented, with detailed javadocs explaining its specific purpose and limitations. I couldn't find a way to fully block external usage, but skipping the docs should keep it mostly out of sight for now.
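To make the intent concrete, here is a hypothetical illustration (the text, names, and numbers are invented for this sketch, not taken from the change): the recorded offsets let a consumer recover a chunk's text from the source field's value with a plain substring, with charset handling left to the consumer, as the class javadoc below notes.

// Hypothetical illustration only; assumes Java String (UTF-16) offsets,
// since the field itself does not pin down a charset.
String source = "Elasticsearch stores offsets that point back into this text.";
int start = 0, end = 13;                     // offsets recorded for one chunk
String chunk = source.substring(start, end); // -> "Elasticsearch"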
Showing 7 changed files with 838 additions and 1 deletion.
151 changes: 151 additions & 0 deletions
...n/inference/src/main/java/org/elasticsearch/xpack/inference/mapper/OffsetSourceField.java
@@ -0,0 +1,151 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */

package org.elasticsearch.xpack.inference.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Represents a {@link Field} that stores a {@link Term} along with its start and end offsets.
 * Note: The {@link Charset} used to calculate these offsets is not associated with this field.
 * It is the responsibility of the consumer to handle the appropriate {@link Charset}.
 */
public final class OffsetSourceField extends Field {
    private static final FieldType FIELD_TYPE = new FieldType();

    static {
        FIELD_TYPE.setTokenized(false);
        FIELD_TYPE.setOmitNorms(true);
        // Offsets are recorded in the postings so they can be read back per document
        // without relying on stored fields.
        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    }

    private int startOffset;
    private int endOffset;

    public OffsetSourceField(String fieldName, String sourceFieldName, int startOffset, int endOffset) {
        super(fieldName, sourceFieldName, FIELD_TYPE);
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    public void setValues(String fieldName, int startOffset, int endOffset) {
        this.fieldsData = fieldName;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
        OffsetTokenStream stream;
        if (reuse instanceof OffsetTokenStream) {
            stream = (OffsetTokenStream) reuse;
        } else {
            stream = new OffsetTokenStream();
        }

        stream.setValues((String) fieldsData, startOffset, endOffset);
        return stream;
    }

    public static OffsetSourceLoader loader(Terms terms, String fieldName) throws IOException {
        return new OffsetSourceLoader(terms, fieldName);
    }

    /**
     * A single-token stream that emits the field's string value as the term and
     * attaches the start and end offsets through the {@link OffsetAttribute}.
     */
    private static final class OffsetTokenStream extends TokenStream {
        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
        private boolean used = true;
        private String value = null;
        private int startOffset = 0;
        private int endOffset = 0;

        private OffsetTokenStream() {}

        /** Sets the values */
        void setValues(String value, int startOffset, int endOffset) {
            this.value = value;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        @Override
        public boolean incrementToken() {
            if (used) {
                return false;
            }
            clearAttributes();
            termAttribute.append(value);
            offsetAttribute.setOffset(startOffset, endOffset);
            used = true;
            return true;
        }

        @Override
        public void reset() {
            used = false;
        }

        @Override
        public void close() {
            value = null;
        }
    }

    public static class OffsetSourceLoader {
        private final Map<String, PostingsEnum> postingsEnums = new LinkedHashMap<>();

        private OffsetSourceLoader(Terms terms, String fieldName) throws IOException {
            // Terms are laid out as "<fieldName>.<sourceFieldName>", so a prefix scan on
            // "<fieldName>." enumerates one postings list per referenced source field.
            Automaton prefixAutomaton = PrefixQuery.toAutomaton(new BytesRef(fieldName + "."));
            var termsEnum = terms.intersect(new CompiledAutomaton(prefixAutomaton, false, true, false), null);
            while (termsEnum.next() != null) {
                var postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
                if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    String sourceFieldName = termsEnum.term().utf8ToString().substring(fieldName.length() + 1);
                    postingsEnums.put(sourceFieldName, postings);
                }
            }
        }

        /**
         * Returns the offset source recorded for {@code doc}, or {@code null} if the document
         * has none. Callers must request documents in increasing doc ID order.
         */
        public OffsetSourceFieldMapper.OffsetSource advanceTo(int doc) throws IOException {
            for (var it = postingsEnums.entrySet().iterator(); it.hasNext();) {
                var entry = it.next();
                var postings = entry.getValue();
                if (postings.docID() < doc) {
                    if (postings.advance(doc) == DocIdSetIterator.NO_MORE_DOCS) {
                        // This source field has no more documents; drop it from the map.
                        it.remove();
                        continue;
                    }
                }
                if (postings.docID() == doc) {
                    assert postings.freq() == 1;
                    postings.nextPosition();
                    return new OffsetSourceFieldMapper.OffsetSource(entry.getKey(), postings.startOffset(), postings.endOffset());
                }
            }
            return null;
        }
    }
}
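For context, here is a minimal round-trip sketch of how the loader is meant to be driven. This is an assumption-laden illustration rather than code from this change: the field name `offsets` and source name `body` are invented, the value is pre-encoded as `<field>.<source>` to match the prefix the loader strips, and the accessors on `OffsetSourceFieldMapper.OffsetSource` are assumed.

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.store.ByteBuffersDirectory;

try (var dir = new ByteBuffersDirectory(); var writer = new IndexWriter(dir, new IndexWriterConfig())) {
    Document doc = new Document();
    // The value carries the "<field>.<source>" encoding that the loader strips back off.
    doc.add(new OffsetSourceField("offsets", "offsets.body", 0, 120));
    writer.addDocument(doc);
    writer.commit();
    try (var reader = DirectoryReader.open(dir)) {
        LeafReader leaf = reader.leaves().get(0).reader();
        var loader = OffsetSourceField.loader(leaf.terms("offsets"), "offsets");
        var offsetSource = loader.advanceTo(0); // docs must be requested in increasing order
        // offsetSource identifies the source field ("body") and the range [0, 120)
    }
}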