Skip to content

Commit

Permalink
Add option to store sparse_vector outside _source (elastic#117917)
Browse files Browse the repository at this point in the history
This PR introduces an option for `sparse_vector` to store its values separately from `_source` by using term vectors.
This capability is primarily needed by the semantic text field.
  • Loading branch information
jimczi committed Dec 4, 2024
1 parent 20957d6 commit 18d4078
Show file tree
Hide file tree
Showing 9 changed files with 589 additions and 29 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/117917.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117917
summary: Add option to store `sparse_vector` outside `_source`
area: Mapping
type: feature
issues: []
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,23 @@ PUT my-index

See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.

[[sparse-vectors-params]]
==== Parameters for `sparse_vector` fields

The following parameters are accepted by `sparse_vector` fields:

[horizontal]

<<mapping-store,store>>::

Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field.
Accepted values: true or false (default).
The field's data is stored using term vectors, which are more disk-efficient than the original JSON representation of the input.
The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>.
To benefit from reduced disk usage, you must either:
* Exclude the field from <<source-filtering, _source>>.
* Use <<synthetic-source,synthetic `_source`>>.

[[index-multi-value-sparse-vectors]]
==== Multi-value sparse vectors

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,120 @@

- match:
_source.ml.tokens: {}

---
"stored sparse_vector":

- requires:
cluster_features: [ "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }
- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }

---
"stored sparse_vector synthetic source":

- requires:
cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
settings:
index:
mapping.source.mode: synthetic
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- match:
hits.hits.0._source: {
ml: {
tokens: {
running: 2.0,
good: 3.0,
run: 5.0,
race: 7.0,
for: 9.0
}
}
}

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public Set<NodeFeature> getFeatures() {
);

public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");

@Override
public Set<NodeFeature> getTestFeatures() {
Expand All @@ -68,7 +69,8 @@ public Set<NodeFeature> getTestFeatures() {
MapperService.LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT,
DocumentParser.FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE,
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
META_FETCH_FIELDS_ERROR_CODE_CHANGED
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
SPARSE_VECTOR_STORE_SUPPORT
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -25,14 +31,22 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

Expand All @@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;

public static class Builder extends FieldMapper.Builder {
    /**
     * Downcasts a generic {@link FieldMapper} to {@link SparseVectorFieldMapper};
     * used by the {@code Builder} parameter initializers to read back current values.
     */
    private static SparseVectorFieldMapper toType(FieldMapper in) {
        return (SparseVectorFieldMapper) in;
    }

public static class Builder extends FieldMapper.Builder {
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
Expand All @@ -62,14 +80,14 @@ public Builder(String name) {

@Override
protected Parameter<?>[] getParameters() {
return new Parameter<?>[] { meta };
return new Parameter<?>[] { stored, meta };
}

@Override
public SparseVectorFieldMapper build(MapperBuilderContext context) {
return new SparseVectorFieldMapper(
leafName(),
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
builderParams(this, context)
);
}
Expand All @@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
Expand All @@ -103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
if (isStored()) {
return new SparseVectorValueFetcher(name());
}
return SourceValueFetcher.identity(name(), context, format);
}

Expand Down Expand Up @@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
super(simpleName, mappedFieldType, builderParams);
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (fieldType().isStored()) {
return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
}
return super.syntheticSourceSupport();
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
Expand Down Expand Up @@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -219,4 +248,114 @@ protected String contentType() {
return CONTENT_TYPE;
}

    /**
     * {@link ValueFetcher} that reads a stored {@code sparse_vector} field back from
     * Lucene term vectors rather than from {@code _source}.
     *
     * <p>Each token of the sparse vector is a term of the field's term vector, and its
     * weight is recovered from the feature-encoded term frequency via
     * {@code XFeatureField.decodeFeatureValue}.
     */
    private static class SparseVectorValueFetcher implements ValueFetcher {
        // Full path of the field whose term vectors are read.
        private final String fieldName;
        // Per-segment term vectors reader; null until setNextReader is called
        // (or if the segment exposes no term vectors).
        private TermVectors termVectors;

        private SparseVectorValueFetcher(String fieldName) {
            this.fieldName = fieldName;
        }

        @Override
        public void setNextReader(LeafReaderContext context) {
            try {
                termVectors = context.reader().termVectors();
            } catch (IOException exc) {
                // setNextReader does not declare IOException, so wrap it unchecked.
                throw new UncheckedIOException(exc);
            }
        }

        @Override
        public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
            if (termVectors == null) {
                return List.of();
            }
            var terms = termVectors.get(doc, fieldName);
            if (terms == null) {
                // This document has no value for the field.
                return List.of();
            }

            var termsEnum = terms.iterator();
            PostingsEnum postingsScratch = null;
            // LinkedHashMap preserves the term enumeration order in the returned map.
            Map<String, Float> result = new LinkedHashMap<>();
            while (termsEnum.next() != null) {
                postingsScratch = termsEnum.postings(postingsScratch);
                // A term vector enumerates a single (virtual) document; advance to it
                // before reading the frequency that encodes the feature weight.
                postingsScratch.nextDoc();
                result.put(termsEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(postingsScratch.freq()));
                assert postingsScratch.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
            }
            // The whole token->weight map is returned as a single value.
            return List.of(result);
        }

        @Override
        public StoredFieldsSpec storedFieldsSpec() {
            // Values come from term vectors, not stored fields, so no stored fields are needed.
            return StoredFieldsSpec.NO_REQUIREMENTS;
        }
    }

    /**
     * Synthetic {@code _source} loader that rebuilds a stored {@code sparse_vector}
     * field from its term vectors.
     *
     * <p>For each document, {@link #docValuesLoader} positions {@link #termsDocEnum}
     * on the field's term vector and {@link #write} serializes each term with its
     * decoded feature weight as an object under the field's leaf name.
     */
    private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
        // Full path used to look up the field's term vectors.
        private final String fullPath;
        // Leaf name used as the object key in the synthetic source.
        private final String leafName;

        // Positioned on the current document's first term when a value exists; null otherwise.
        private TermsEnum termsDocEnum;

        private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
            this.fullPath = fullPath;
            this.leafName = leafName;
        }

        @Override
        public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
            // No stored fields are involved; everything is read from term vectors.
            return Stream.of();
        }

        @Override
        public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
            var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
            if (fieldInfos == null || fieldInfos.hasVectors() == false) {
                // Field absent from this segment, or indexed without term vectors: nothing to load.
                return null;
            }
            return docId -> {
                var terms = leafReader.termVectors().get(docId, fullPath);
                if (terms == null) {
                    return false;
                }
                termsDocEnum = terms.iterator();
                if (termsDocEnum.next() == null) {
                    // Empty term vector: treat as no value for this document.
                    termsDocEnum = null;
                    return false;
                }
                return true;
            };
        }

        @Override
        public boolean hasValue() {
            return termsDocEnum != null;
        }

        @Override
        public void write(XContentBuilder b) throws IOException {
            assert termsDocEnum != null;
            PostingsEnum reuse = null;
            b.startObject(leafName);
            // termsDocEnum is already positioned on the first term by docValuesLoader,
            // so consume the current term before advancing (do/while, not while/do).
            do {
                reuse = termsDocEnum.postings(reuse);
                // Advance to the term vector's single virtual document before reading
                // the frequency that encodes the feature weight.
                reuse.nextDoc();
                b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
            } while (termsDocEnum.next() != null);
            b.endObject();
        }

        @Override
        public String fieldName() {
            return leafName;
        }

        @Override
        public void reset() {
            termsDocEnum = null;
        }
    }

}
Loading

0 comments on commit 18d4078

Please sign in to comment.