
Add option to store sparse_vector outside _source #117917

Merged
13 commits merged on Dec 4, 2024
5 changes: 5 additions & 0 deletions docs/changelog/117917.yaml
@@ -0,0 +1,5 @@
pr: 117917
summary: Add option to store `sparse_vector` outside `_source`
area: Mapping
type: feature
issues: []
13 changes: 13 additions & 0 deletions docs/reference/mapping/types/sparse-vector.asciidoc
@@ -26,6 +26,19 @@ PUT my-index

See <<semantic-search-elser, semantic search with ELSER>> for a complete example of adding documents to a `sparse_vector` mapped field using ELSER.

[[sparse-vectors-params]]
==== Parameters for `sparse_vector` fields

The following parameters are accepted by `sparse_vector` fields:

[horizontal]
<<mapping-store,`store`>>::

Whether the field value should be stored and retrievable separately from
the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
(default).
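
For example, here is a minimal sketch of a mapping that stores the field (the
index name `my-index-2` and field name `ml.tokens` are illustrative), followed
by a search that returns the stored values through the `fields` option:

[source,console]
----
PUT my-index-2
{
  "mappings": {
    "properties": {
      "ml.tokens": {
        "type": "sparse_vector",
        "store": true
      }
    }
  }
}

GET my-index-2/_search
{
  "fields": [ "ml.tokens" ]
}
----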

[[index-multi-value-sparse-vectors]]
==== Multi-value sparse vectors

@@ -472,3 +472,129 @@

  - match:
      _source.ml.tokens: {}

---
"stored sparse_vector":

  - requires:
      cluster_features: [ "mapper.sparse_vector.store_support" ]
      reason: "sparse_vector supports store parameter"

  - do:
      indices.create:
        index: test
        body:
          mappings:
            properties:
              ml.tokens:
                type: sparse_vector
                store: true

  - match: { acknowledged: true }
  - do:
      index:
        index: test
        id: "1"
        body:
          ml:
            tokens:
              running: 2
              good: 3
              run: 5
              race: 7
              for: 9

  - match: { result: "created" }

  - do:
      indices.refresh: { }

  - do:
      search:
        index: test
        body:
          fields: [ "ml.tokens" ]

  - match:
      hits.hits.0.fields: {
        ml.tokens: [
          {
            running: 2.0,
            good: 3.0,
            run: 5.0,
            race: 7.0,
            for: 9.0
          }
        ]
      }

---
"stored sparse_vector synthetic source":

  - requires:
      cluster_features: [ "mapper.source.mode_from_index_setting", "mapper.sparse_vector.store_support" ]
      reason: "sparse_vector supports store parameter"

  - do:
      indices.create:
        index: test
        body:
          settings:
            index:
              mapping.source.mode: synthetic
          mappings:
            properties:
              ml.tokens:
                type: sparse_vector
                store: true

  - match: { acknowledged: true }

  - do:
      index:
        index: test
        id: "1"
        body:
          ml:
            tokens:
              running: 2
              good: 3
              run: 5
              race: 7
              for: 9

  - match: { result: "created" }

  - do:
      indices.refresh: { }

  - do:
      search:
        index: test
        body:
          fields: [ "ml.tokens" ]

  - match:
      hits.hits.0._source: {
        ml: {
          tokens: {
            running: 2.0,
            good: 3.0,
            run: 5.0,
            race: 7.0,
            for: 9.0
          }
        }
      }
  - match:
      hits.hits.0.fields: {
        ml.tokens: [
          {
            running: 2.0,
            good: 3.0,
            run: 5.0,
            race: 7.0,
            for: 9.0
          }
        ]
      }
SparseVectorFieldMapper.java
@@ -11,6 +11,12 @@

import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
@@ -25,14 +31,22 @@
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperBuilderContext;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.search.fetch.StoredFieldsSpec;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser.Token;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Stream;

import static org.elasticsearch.index.query.AbstractQueryBuilder.DEFAULT_BOOST;

@@ -52,8 +66,12 @@ public class SparseVectorFieldMapper extends FieldMapper {
static final IndexVersion NEW_SPARSE_VECTOR_INDEX_VERSION = IndexVersions.NEW_SPARSE_VECTOR;
static final IndexVersion SPARSE_VECTOR_IN_FIELD_NAMES_INDEX_VERSION = IndexVersions.SPARSE_VECTOR_IN_FIELD_NAMES_SUPPORT;

public static class Builder extends FieldMapper.Builder {
private static SparseVectorFieldMapper toType(FieldMapper in) {
return (SparseVectorFieldMapper) in;
}

public static class Builder extends FieldMapper.Builder {
private final Parameter<Boolean> stored = Parameter.storeParam(m -> toType(m).fieldType().isStored(), false);
private final Parameter<Map<String, String>> meta = Parameter.metaParam();

public Builder(String name) {
@@ -62,14 +80,14 @@ public Builder(String name) {

@Override
protected Parameter<?>[] getParameters() {
return new Parameter<?>[] { meta };
return new Parameter<?>[] { stored, meta };
}

@Override
public SparseVectorFieldMapper build(MapperBuilderContext context) {
return new SparseVectorFieldMapper(
leafName(),
new SparseVectorFieldType(context.buildFullName(leafName()), meta.getValue()),
new SparseVectorFieldType(context.buildFullName(leafName()), stored.getValue(), meta.getValue()),
builderParams(this, context)
);
}
@@ -87,8 +105,8 @@ public SparseVectorFieldMapper build(MapperBuilderContext context) {

public static final class SparseVectorFieldType extends MappedFieldType {

public SparseVectorFieldType(String name, Map<String, String> meta) {
super(name, true, false, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
public SparseVectorFieldType(String name, boolean stored, Map<String, String> meta) {
super(name, true, stored, false, TextSearchInfo.SIMPLE_MATCH_ONLY, meta);
}

@Override
@@ -103,6 +121,9 @@ public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext

@Override
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
if (isStored()) {
return new TermVectorsValueFetcher(name());
}
return SourceValueFetcher.identity(name(), context, format);
}

@@ -135,6 +156,14 @@ private SparseVectorFieldMapper(String simpleName, MappedFieldType mappedFieldTy
super(simpleName, mappedFieldType, builderParams);
}

@Override
protected SyntheticSourceSupport syntheticSourceSupport() {
if (fieldType().isStored()) {
return new SyntheticSourceSupport.Native(new SparseVectorSyntheticFieldLoader(fullPath(), leafName()));
}
return super.syntheticSourceSupport();
}

@Override
public Map<String, NamedAnalyzer> indexAnalyzers() {
return Map.of(mappedFieldType.name(), Lucene.KEYWORD_ANALYZER);
@@ -189,9 +218,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value));
} else if (currentField instanceof FeatureField && ((FeatureField) currentField).getFeatureValue() < value) {
((FeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
@@ -219,4 +248,114 @@ protected String contentType() {
return CONTENT_TYPE;
}

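/**
 * Value fetcher for stored {@code sparse_vector} fields: reads the document's term vectors
 * for this field and decodes each feature's value from its encoded term frequency.
 */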
private static class TermVectorsValueFetcher implements ValueFetcher {
private final String fieldName;
private TermVectors termVectors;

private TermVectorsValueFetcher(String fieldName) {
this.fieldName = fieldName;
}

@Override
public void setNextReader(LeafReaderContext context) {
try {
termVectors = context.reader().termVectors();
} catch (IOException exc) {
throw new UncheckedIOException(exc);
}
}

@Override
public List<Object> fetchValues(Source source, int doc, List<Object> ignoredValues) throws IOException {
if (termVectors == null) {
return List.of();
}
var terms = termVectors.get(doc, fieldName);
if (terms == null) {
return List.of();
}

var termsEnum = terms.iterator();
PostingsEnum postingsScratch = null;
Map<String, Float> result = new LinkedHashMap<>();
while (termsEnum.next() != null) {
postingsScratch = termsEnum.postings(postingsScratch);
postingsScratch.nextDoc();
result.put(termsEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(postingsScratch.freq()));
assert postingsScratch.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
}
return List.of(result);
}

@Override
public StoredFieldsSpec storedFieldsSpec() {
return StoredFieldsSpec.NO_REQUIREMENTS;
}
}

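/**
 * Synthetic {@code _source} loader that rebuilds a stored {@code sparse_vector} field from its
 * term vectors, decoding each feature value from the corresponding term frequency.
 */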
private static class SparseVectorSyntheticFieldLoader implements SourceLoader.SyntheticFieldLoader {
private final String fullPath;
private final String leafName;

private TermsEnum termsDocEnum;

private SparseVectorSyntheticFieldLoader(String fullPath, String leafName) {
this.fullPath = fullPath;
this.leafName = leafName;
}

@Override
public Stream<Map.Entry<String, StoredFieldLoader>> storedFieldLoaders() {
return Stream.of();
}

@Override
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
var fieldInfos = leafReader.getFieldInfos().fieldInfo(fullPath);
if (fieldInfos == null || fieldInfos.hasTermVectors() == false) {
return null;
}
return docId -> {
var terms = leafReader.termVectors().get(docId, fullPath);
if (terms == null) {
return false;
}
termsDocEnum = terms.iterator();
if (termsDocEnum.next() == null) {
termsDocEnum = null;
return false;
}
return true;
};
}

@Override
public boolean hasValue() {
return termsDocEnum != null;
}

@Override
public void write(XContentBuilder b) throws IOException {
assert termsDocEnum != null;
PostingsEnum reuse = null;
b.startObject(leafName);
do {
reuse = termsDocEnum.postings(reuse);
reuse.nextDoc();
b.field(termsDocEnum.term().utf8ToString(), XFeatureField.decodeFeatureValue(reuse.freq()));
} while (termsDocEnum.next() != null);
b.endObject();
}

@Override
public String fieldName() {
return leafName;
}

@Override
public void reset() {
termsDocEnum = null;
}
}

}