Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[8.x] Add option to store sparse_vector outside _source #118018

Merged
merged 2 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/117917.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117917
summary: Add option to store `sparse_vector` outside `_source`
area: Mapping
type: feature
issues: []
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,23 @@ PUT my-index

See <<semantic-search-elser, semantic search with ELSER>> for a complete example on adding documents to a `sparse_vector` mapped field using ELSER.

[[sparse-vectors-params]]
==== Parameters for `sparse_vector` fields

The following parameters are accepted by `sparse_vector` fields:

[horizontal]

<<mapping-store,store>>::

Indicates whether the field value should be stored and retrievable independently of the <<mapping-source-field,_source>> field.
Accepted values: `true` or `false` (default).
The field's data is stored using term vectors, a disk-efficient structure compared to the original JSON input.
The input map can be retrieved during a search request via the <<search-fields-param,`fields` parameter>>.
To benefit from reduced disk usage, you must either:
* Exclude the field from <<source-filtering, _source>>.
* Use <<synthetic-source,synthetic `_source`>>.

[[index-multi-value-sparse-vectors]]
==== Multi-value sparse vectors

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -472,3 +472,120 @@

- match:
_source.ml.tokens: {}

---
"stored sparse_vector":

- requires:
cluster_features: [ "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }
- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }

---
"stored sparse_vector synthetic source":

- requires:
cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
reason: "sparse_vector supports store parameter"

- do:
indices.create:
index: test
body:
settings:
index:
mapping.source.mode: synthetic
mappings:
properties:
ml.tokens:
type: sparse_vector
store: true

- match: { acknowledged: true }

- do:
index:
index: test
id: "1"
body:
ml:
tokens:
running: 2
good: 3
run: 5
race: 7
for: 9

- match: { result: "created" }

- do:
indices.refresh: { }

- do:
search:
index: test
body:
fields: [ "ml.tokens" ]

- match:
hits.hits.0._source: {
ml: {
tokens: {
running: 2.0,
good: 3.0,
run: 5.0,
race: 7.0,
for: 9.0
}
}
}

- length: { hits.hits.0.fields.ml\\.tokens: 1 }
- length: { hits.hits.0.fields.ml\\.tokens.0: 5 }
- match: { hits.hits.0.fields.ml\\.tokens.0.running: 2.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.good: 3.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.run: 5.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.race: 7.0 }
- match: { hits.hits.0.fields.ml\\.tokens.0.for: 9.0 }
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public Set<NodeFeature> getFeatures() {
);

public static final NodeFeature META_FETCH_FIELDS_ERROR_CODE_CHANGED = new NodeFeature("meta_fetch_fields_error_code_changed");
public static final NodeFeature SPARSE_VECTOR_STORE_SUPPORT = new NodeFeature("mapper.sparse_vector.store_support");

@Override
public Set<NodeFeature> getTestFeatures() {
Expand All @@ -68,7 +69,8 @@ public Set<NodeFeature> getTestFeatures() {
MapperService.LOGSDB_DEFAULT_IGNORE_DYNAMIC_BEYOND_LIMIT,
DocumentParser.FIX_PARSING_SUBOBJECTS_FALSE_DYNAMIC_FALSE,
CONSTANT_KEYWORD_SYNTHETIC_SOURCE_WRITE_FIX,
META_FETCH_FIELDS_ERROR_CODE_CHANGED
META_FETCH_FIELDS_ERROR_CODE_CHANGED,
SPARSE_VECTOR_STORE_SUPPORT
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -25,14 +31,22 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

Expand All @@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;

public static class Builder extends FieldMapper.Builder {
// Narrowing cast helper so parameter accessors (e.g. Builder#stored at its
// Parameter.storeParam call) can read configuration back off a generic
// FieldMapper; callers only pass mappers of this type.
private static SparseVectorFieldMapper toType(FieldMapper in) {
    return (SparseVectorFieldMapper) in;
}

public static class Builder extends FieldMapper.Builder {
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
Expand All @@ -62,14 +80,14 @@ public Builder(String name) {

@Override
protected Parameter<?>[] getParameters() {
return new Parameter<?>[] { meta };
return new Parameter<?>[] { stored, meta };
}

@Override
public SparseVectorFieldMapper build(MapperBuilderContext context) {
return new SparseVectorFieldMapper(
leafName(),
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
builderParams(this, context)
);
}
Expand All @@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
public SparseVectorFieldType(String name, boolean isStored, Map<String, String> meta) {
super(name, true, isStored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
Expand All @@ -103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
    // A stored field can be fetched straight from the index via term vectors;
    // otherwise fall back to re-reading the value out of _source.
    return isStored()
        ? new SparseVectorValueFetcher(name())
        : SourceValueFetcher.identity(name(), context, format);
}

Expand Down Expand Up @@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
super(simpleName, mappedFieldType, builderParams);
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
    // When the field is stored, synthetic _source can be rebuilt natively from
    // the index (see SparseVectorSyntheticFieldLoader); otherwise defer to the
    // superclass default.
    if (fieldType().isStored()) {
        return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
    }
    return super.syntheticSourceSupport();
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
Expand Down Expand Up @@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
Expand Down Expand Up @@ -219,4 +248,114 @@ protected String contentType() {
return CONTENT_TYPE;
}

/**
 * Fetches a stored {@code sparse_vector} back out of the index by reading the
 * field's term vectors: each term is a token and its frequency encodes the
 * feature value (decoded via {@link XFeatureField#decodeFeatureValue}).
 */
private static class SparseVectorValueFetcher implements ValueFetcher {
    private final String fieldName;
    private TermVectors termVectors;

    private SparseVectorValueFetcher(String fieldName) {
        this.fieldName = fieldName;
    }

    @Override
    public void setNextReader(LeafReaderContext context) {
        try {
            termVectors = context.reader().termVectors();
        } catch (IOException exc) {
            throw new UncheckedIOException(exc);
        }
    }

    @Override
    public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
        if (termVectors == null) {
            return List.of();
        }
        var terms = termVectors.get(doc, fieldName);
        if (terms == null) {
            // This document has no value for the field.
            return List.of();
        }

        // Preserve term order as returned by the iterator.
        Map<String, Float> tokens = new LinkedHashMap<>();
        var termsIter = terms.iterator();
        PostingsEnum postings = null;
        for (BytesRef term = termsIter.next(); term != null; term = termsIter.next()) {
            postings = termsIter.postings(postings);
            // Advance to the single document held by this term vector before
            // reading the frequency.
            postings.nextDoc();
            tokens.put(term.utf8ToString(), XFeatureField.decodeFeatureValue(postings.freq()));
            assert postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
        }
        return List.of(tokens);
    }

    @Override
    public StoredFieldsSpec storedFieldsSpec() {
        // Values come from term vectors, so no stored fields (or _source) are needed.
        return StoredFieldsSpec.NO_REQUIREMENTS;
    }
}

/**
 * Rebuilds a stored {@code sparse_vector} field for synthetic {@code _source}
 * by replaying the field's term vectors: each term is written as a key whose
 * value is decoded from the term frequency via
 * {@link XFeatureField#decodeFeatureValue}.
 */
private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
    private final String fullPath;
    private final String leafName;

    // Iterator over the current document's terms; non-null only while the
    // current document actually has a value (see docValuesLoader/hasValue).
    private TermsEnum termsDocEnum;

    private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
        this.fullPath = fullPath;
        this.leafName = leafName;
    }

    @Override
    public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
        // Values are reconstructed from term vectors, not stored fields.
        return Stream.of();
    }

    @Override
    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
        var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
        if (fieldInfos == null || fieldInfos.hasVectors() == false) {
            // Field absent from this segment, or indexed without term vectors.
            return null;
        }
        return docId -> {
            var terms = leafReader.termVectors().get(docId, fullPath);
            if (terms == null) {
                return false;
            }
            termsDocEnum = terms.iterator();
            if (termsDocEnum.next() == null) {
                termsDocEnum = null;
                return false;
            }
            return true;
        };
    }

    @Override
    public boolean hasValue() {
        return termsDocEnum != null;
    }

    @Override
    public void write(XContentBuilder b) throws IOException {
        assert termsDocEnum != null;
        PostingsEnum reuse = null;
        // The object key uses the leaf name because the enclosing object
        // hierarchy is emitted by the parent loaders.
        b.startObject(leafName);
        do {
            reuse = termsDocEnum.postings(reuse);
            // Position on the single doc in the term vector before reading freq().
            reuse.nextDoc();
            b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
        } while (termsDocEnum.next() != null);
        b.endObject();
    }

    @Override
    public String fieldName() {
        // Fix: the SyntheticFieldLoader contract expects the field's full dotted
        // path (e.g. "ml.tokens"), matching fullPath used everywhere else in this
        // loader; the leaf name is only used as the object key in write().
        return fullPath;
    }

    @Override
    public void reset() {
        termsDocEnum = null;
    }
}

}
Loading