Skip to content

Commit

Permalink
Add option to store sparse_vector outside _source (elastic#117917)
Browse files Browse the repository at this point in the history
This PR introduces an option for `sparse_vector` to store its values separately from `_source` by using term vectors.
This capability is primarily needed by the semantic text field.
  • Loading branch information
jimczi committed Dec 4, 2024
1 parent 20957d6 commit 18d4078
Show file tree
Hide file tree
Showing 9 changed files with 589 additions and 29 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/117917.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117917
summary: Add option to store `sparse_vector` outside `_source`
area: Mapping
type: feature
issues: []
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,23 @@ PUT my-index

See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.

[[sparse-vectors-params]]
==== Parameters for `sparse_vector` fields

The following parameters are accepted by `sparse_vector` fields:

[horizontal]

<<mapping-store,store>>::

Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field.
Accepted values: true or false (default).
The field's data is stored using term vectors, which are more disk-efficient than the original JSON representation of the input.
The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>.
To benefit from reduced disk usage, you must either:
* Exclude the field from <<source-filtering, _source>>.
* Use <<synthetic-source,synthetic `_source`>>.

[[index-multi-value-sparse-vectors]]
==== Multi-value sparse vectors

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,120 @@

- match:
_source.ml.tokens: {}

---
"stored sparse_vector":

- requires:
cluster_features: [ "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }
- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }

---
"stored sparse_vector synthetic source":

- requires:
cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
settings:
index:
mapping.source.mode: synthetic
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- match:
hits.hits.0._source: {
ml: {
tokens: {
running: 2.0,
good: 3.0,
run: 5.0,
race: 7.0,
for: 9.0
}
}
}

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public Set<NodeFeature> getFeatures() {
);

public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");

@Override
public Set<NodeFeature> getTestFeatures() {
Expand All @@ -68,7 +69,8 @@ public Set<NodeFeature> getTestFeatures() {
MapperService.LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT,
DocumentParser.FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE,
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
META_FETCH_FIELDS_ERROR_CODE_CHANGED
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
SPARSE_VECTOR_STORE_SUPPORT
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -25,14 +31,22 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

Expand All @@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;

public static class Builder extends FieldMapper.Builder {
    /**
     * Downcasts a generic {@link FieldMapper} to {@link SparseVectorFieldMapper};
     * used by the {@code Builder} parameter initializers to read back current values.
     */
    private static SparseVectorFieldMapper toType(FieldMapper in) {
        return (SparseVectorFieldMapper) in;
    }

public static class Builder extends FieldMapper.Builder {
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
Expand All @@ -62,14 +80,14 @@ public Builder(String name) {

@Override
protected Parameter<?>[] getParameters() {
return new Parameter<?>[] { meta };
return new Parameter<?>[] { stored, meta };
}

@Override
public SparseVectorFieldMapper build(MapperBuilderContext context) {
return new SparseVectorFieldMapper(
leafName(),
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
builderParams(this, context)
);
}
Expand All @@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
Expand All @@ -103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
if (isStored()) {
return new SparseVectorValueFetcher(name());
}
return SourceValueFetcher.identity(name(), context, format);
}

Expand Down Expand Up @@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
super(simpleName, mappedFieldType, builderParams);
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (fieldType().isStored()) {
return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
}
return super.syntheticSourceSupport();
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
Expand Down Expand Up @@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -219,4 +248,114 @@ protected String contentType() {
return CONTENT_TYPE;
}

    /**
     * {@link ValueFetcher} that reads a stored {@code sparse_vector} field back from
     * Lucene term vectors rather than from {@code _source}.
     *
     * <p>Each token of the sparse vector is a term of the field's term vector, and its
     * weight is recovered from the feature-encoded term frequency via
     * {@code XFeatureField.decodeFeatureValue}.
     */
    private static class SparseVectorValueFetcher implements ValueFetcher {
        // Full path of the field whose term vectors are read.
        private final String fieldName;
        // Per-segment term vectors reader; null until setNextReader is called
        // (or if the segment exposes no term vectors).
        private TermVectors termVectors;

        private SparseVectorValueFetcher(String fieldName) {
            this.fieldName = fieldName;
        }

        @Override
        public void setNextReader(LeafReaderContext context) {
            try {
                termVectors = context.reader().termVectors();
            } catch (IOException exc) {
                // setNextReader does not declare IOException, so wrap it unchecked.
                throw new UncheckedIOException(exc);
            }
        }

        @Override
        public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
            if (termVectors == null) {
                return List.of();
            }
            var terms = termVectors.get(doc, fieldName);
            if (terms == null) {
                // This document has no value for the field.
                return List.of();
            }

            var termsEnum = terms.iterator();
            PostingsEnum postingsScratch = null;
            // LinkedHashMap preserves the term enumeration order in the returned map.
            Map<String, Float> result = new LinkedHashMap<>();
            while (termsEnum.next() != null) {
                postingsScratch = termsEnum.postings(postingsScratch);
                // A term vector enumerates a single (virtual) document; advance to it
                // before reading the frequency that encodes the feature weight.
                postingsScratch.nextDoc();
                result.put(termsEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(postingsScratch.freq()));
                assert postingsScratch.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
            }
            // The whole token->weight map is returned as a single value.
            return List.of(result);
        }

        @Override
        public StoredFieldsSpec storedFieldsSpec() {
            // Values come from term vectors, not stored fields, so no stored fields are needed.
            return StoredFieldsSpec.NO_REQUIREMENTS;
        }
    }

    /**
     * Synthetic {@code _source} loader that rebuilds a stored {@code sparse_vector}
     * field from its term vectors.
     *
     * <p>For each document, {@link #docValuesLoader} positions {@link #termsDocEnum}
     * on the field's term vector and {@link #write} serializes each term with its
     * decoded feature weight as an object under the field's leaf name.
     */
    private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
        // Full path used to look up the field's term vectors.
        private final String fullPath;
        // Leaf name used as the object key in the synthetic source.
        private final String leafName;

        // Positioned on the current document's first term when a value exists; null otherwise.
        private TermsEnum termsDocEnum;

        private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
            this.fullPath = fullPath;
            this.leafName = leafName;
        }

        @Override
        public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
            // No stored fields are involved; everything is read from term vectors.
            return Stream.of();
        }

        @Override
        public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
            var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
            if (fieldInfos == null || fieldInfos.hasVectors() == false) {
                // Field absent from this segment, or indexed without term vectors: nothing to load.
                return null;
            }
            return docId -> {
                var terms = leafReader.termVectors().get(docId, fullPath);
                if (terms == null) {
                    return false;
                }
                termsDocEnum = terms.iterator();
                if (termsDocEnum.next() == null) {
                    // Empty term vector: treat as no value for this document.
                    termsDocEnum = null;
                    return false;
                }
                return true;
            };
        }

        @Override
        public boolean hasValue() {
            return termsDocEnum != null;
        }

        @Override
        public void write(XContentBuilder b) throws IOException {
            assert termsDocEnum != null;
            PostingsEnum reuse = null;
            b.startObject(leafName);
            // termsDocEnum is already positioned on the first term by docValuesLoader,
            // so consume the current term before advancing (do/while, not while/do).
            do {
                reuse = termsDocEnum.postings(reuse);
                // Advance to the term vector's single virtual document before reading
                // the frequency that encodes the feature weight.
                reuse.nextDoc();
                b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
            } while (termsDocEnum.next() != null);
            b.endObject();
        }

        @Override
        public String fieldName() {
            return leafName;
        }

        @Override
        public void reset() {
            termsDocEnum = null;
        }
    }

}
Loading

0 comments on commit 18d4078

Please sign in to comment.