Skip to content

Commit

Permalink
Add Optional Source Filtering to Source Loaders
Browse files Browse the repository at this point in the history
Spinoff of elastic#113036.

This change introduces optional source filtering directly within source loaders (both synthetic and stored).
The main benefit is seen in synthetic source loaders, as synthetic fields are stored independently.
By filtering while loading the synthetic source, generating the source becomes linear in the number of fields that match the filter.

This update also modifies the get document API to apply source filters earlier—directly through the source loader.
The search API, however, is not affected by this change, since the loaded source is still used by other features (e.g., highlighting, fields, nested hits),
and source filtering is always applied as the final step.
A follow-up will be required to ensure careful handling of all search-related scenarios.
  • Loading branch information
jimczi committed Sep 30, 2024
1 parent 8cb1266 commit f62f4f6
Show file tree
Hide file tree
Showing 31 changed files with 629 additions and 164 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import org.elasticsearch.xcontent.provider.filtering.FilterPathBasedFilter;
import org.elasticsearch.xcontent.support.filtering.FilterPath;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public class XContentParserConfigurationImpl implements XContentParserConfiguration {
Expand Down Expand Up @@ -102,20 +104,49 @@ public RestApiVersion restApiVersion() {
}

/**
 * Returns a copy of this configuration that filters parsed content with the given
 * include/exclude path sets, interpreted relative to {@code rootPath}.
 *
 * <p>When {@code rootPath} is non-null, each compiled include/exclude filter is narrowed
 * to the sub-filters that apply under that root (via {@code FilterPath#matches}); filters
 * that do not reach the root are dropped entirely.
 *
 * @param rootPath path prefix the filters are relative to, or {@code null} for the document root
 * @param includeStrings paths to include, or {@code null} to include everything
 * @param excludeStrings paths to exclude, or {@code null} to exclude nothing
 * @param filtersMatchFieldNamesWithDots whether filters also match literal dots in field names
 */
public XContentParserConfiguration withFiltering(
    String rootPath,
    Set<String> includeStrings,
    Set<String> excludeStrings,
    boolean filtersMatchFieldNamesWithDots
) {
    FilterPath[] includePaths = FilterPath.compile(includeStrings);
    FilterPath[] excludePaths = FilterPath.compile(excludeStrings);

    if (rootPath != null) {
        if (includePaths != null) {
            List<FilterPath> includeFilters = new ArrayList<>();
            for (var incl : includePaths) {
                // Collect the sub-filters of `incl` that apply under rootPath.
                incl.matches(rootPath, includeFilters, true);
            }
            // No include reaches the root: treat as "no include filtering".
            includePaths = includeFilters.isEmpty() ? null : includeFilters.toArray(FilterPath[]::new);
        }

        if (excludePaths != null) {
            List<FilterPath> excludeFilters = new ArrayList<>();
            for (var excl : excludePaths) {
                excl.matches(rootPath, excludeFilters, true);
            }
            excludePaths = excludeFilters.isEmpty() ? null : excludeFilters.toArray(FilterPath[]::new);
        }
    }
    // Note: pass the already-compiled (and possibly root-narrowed) paths; do not recompile
    // from the raw strings here.
    return new XContentParserConfigurationImpl(
        registry,
        deprecationHandler,
        restApiVersion,
        includePaths,
        excludePaths,
        filtersMatchFieldNamesWithDots
    );
}

/**
 * Variant of {@link #withFiltering(String, Set, Set, boolean)} that interprets the
 * include/exclude filters relative to the document root (no root path prefix).
 */
public XContentParserConfiguration withFiltering(Set<String> includeStrings, Set<String> excludeStrings, boolean filtersMatchFieldNamesWithDots) {
    return withFiltering(null, includeStrings, excludeStrings, filtersMatchFieldNamesWithDots);
}

public JsonParser filter(JsonParser parser) {
JsonParser filtered = parser;
if (excludes != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,11 @@ XContentParserConfiguration withFiltering(
Set<String> excludeStrings,
boolean filtersMatchFieldNamesWithDots
);

/**
 * Returns a configuration that filters parsed content with the given include/exclude
 * path sets, interpreted relative to {@code rootPath}.
 *
 * @param rootPath path prefix the filters are relative to, or {@code null} for the document root
 * @param includeStrings paths to include, or {@code null} to include everything
 * @param excludeStrings paths to exclude, or {@code null} to exclude nothing
 * @param filtersMatchFieldNamesWithDots whether filters also match literal dots in field names
 */
XContentParserConfiguration withFiltering(
String rootPath,
Set<String> includeStrings,
Set<String> excludeStrings,
boolean filtersMatchFieldNamesWithDots
);
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -332,6 +333,24 @@ protected final void testFilter(Builder expected, Builder sample, Collection<Str
private void testFilter(Builder expected, Builder sample, Set<String> includes, Set<String> excludes, boolean matchFieldNamesWithDots)
    throws IOException {
    // First, check filtering with the paths exactly as given.
    assertFilterResult(expected.apply(createBuilder()), filter(sample, includes, excludes, matchFieldNamesWithDots));

    // Then check that the same filters, prefixed with a synthetic root path, yield the
    // same result when the parser filters relative to that root.
    String rootPrefix = "root.path.random";
    assertFilterResult(
        expected.apply(createBuilder()),
        filterSub(sample, rootPrefix, prefixPaths(rootPrefix, includes), prefixPaths(rootPrefix, excludes), matchFieldNamesWithDots)
    );
}

/**
 * Prefixes every path with {@code root} followed by either "." or "*." (chosen at random
 * per path). Returns {@code null} when {@code paths} is {@code null}.
 */
private Set<String> prefixPaths(String root, Set<String> paths) {
    if (paths == null) {
        return null;
    }
    Set<String> prefixed = new HashSet<>();
    for (String path : paths) {
        prefixed.add(root + (randomBoolean() ? "." : "*.") + path);
    }
    return prefixed;
}

public void testArrayWithEmptyObjectInInclude() throws IOException {
Expand Down Expand Up @@ -413,21 +432,36 @@ private XContentBuilder filter(Builder sample, Set<String> includes, Set<String>
&& matchFieldNamesWithDots == false) {
return filterOnBuilder(sample, includes, excludes);
}
return filterOnParser(sample, includes, excludes, matchFieldNamesWithDots);
return filterOnParser(sample, null, includes, excludes, matchFieldNamesWithDots);
}

/**
 * Filters {@code sample} through the parser-based filter, treating {@code root} as the
 * path prefix under which the include/exclude filters apply.
 */
private XContentBuilder filterSub(Builder sample, String root, Set<String> includes, Set<String> excludes, boolean matchFieldNamesWithDots)
    throws IOException {
    return filterOnParser(sample, root, includes, excludes, matchFieldNamesWithDots);
}

/**
 * Filters by constructing the {@link XContentBuilder} itself with the include/exclude
 * sets, so filtering happens while writing rather than while parsing.
 */
private XContentBuilder filterOnBuilder(Builder sample, Set<String> includes, Set<String> excludes) throws IOException {
    XContentBuilder filteringBuilder = XContentBuilder.builder(getXContentType(), includes, excludes);
    return sample.apply(filteringBuilder);
}

private XContentBuilder filterOnParser(Builder sample, Set<String> includes, Set<String> excludes, boolean matchFieldNamesWithDots)
throws IOException {
private XContentBuilder filterOnParser(
Builder sample,
String rootPath,
Set<String> includes,
Set<String> excludes,
boolean matchFieldNamesWithDots
) throws IOException {
try (XContentBuilder builtSample = sample.apply(createBuilder())) {
BytesReference sampleBytes = BytesReference.bytes(builtSample);
try (
XContentParser parser = getXContentType().xContent()
.createParser(
XContentParserConfiguration.EMPTY.withFiltering(includes, excludes, matchFieldNamesWithDots),
XContentParserConfiguration.EMPTY.withFiltering(rootPath, includes, excludes, matchFieldNamesWithDots),
sampleBytes.streamInput()
)
) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,31 +274,24 @@ public static Map<String, Object> filter(Map<String, Object> map, String[] inclu
*/
/**
 * Builds a function that filters a source map with the given include/exclude simple-match
 * patterns. Missing/empty includes mean "include everything"; missing/empty excludes mean
 * "exclude nothing".
 */
public static Function<Map<String, Object>, Map<String, Object>> filter(String[] includes, String[] excludes) {
    CharacterRunAutomaton matchAllAutomaton = new CharacterRunAutomaton(Automata.makeAnyString());

    // Compile once up front; the returned function reuses the automata on every call.
    CharacterRunAutomaton include = compileAutomaton(includes, matchAllAutomaton);
    CharacterRunAutomaton exclude = compileAutomaton(excludes, new CharacterRunAutomaton(Automata.makeEmpty()));

    // NOTE: We cannot use Operations.minus because of the special case that
    // we want all sub properties to match as soon as an object matches

    return (map) -> filter(map, include, 0, exclude, 0, matchAllAutomaton);
}

/**
 * Compiles simple-match {@code patterns} into a {@link CharacterRunAutomaton}, extended so
 * that a match on an object also matches dots in field names (its sub-fields). Returns
 * {@code defaultValue} when no patterns are given.
 */
public static CharacterRunAutomaton compileAutomaton(String[] patterns, CharacterRunAutomaton defaultValue) {
    if (patterns == null || patterns.length == 0) {
        return defaultValue;
    }
    var automaton = makeMatchDotsInFieldNames(Regex.simpleMatchToAutomaton(patterns));
    return new CharacterRunAutomaton(automaton, MAX_DETERMINIZED_STATES);
}

/** Make matches on objects also match dots in field names.
* For instance, if the original simple regex is `foo`, this will translate
* it into `foo` OR `foo.*`. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,14 @@ private GetResult innerGetFetch(
Map<String, DocumentField> metadataFields = null;
DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
SourceLoader loader = forceSyntheticSource
? new SourceLoader.Synthetic(mappingLookup.getMapping()::syntheticFieldLoader, mapperMetrics.sourceFieldMetrics())
: mappingLookup.newSourceLoader(mapperMetrics.sourceFieldMetrics());
? new SourceLoader.Synthetic(
() -> mappingLookup.getMapping().syntheticFieldLoader(fetchSourceContext.hasFilter() ? fetchSourceContext.filter() : null),
mapperMetrics.sourceFieldMetrics()
)
: mappingLookup.newSourceLoader(
fetchSourceContext.hasFilter() ? fetchSourceContext.filter() : null,
mapperMetrics.sourceFieldMetrics()
);
StoredFieldLoader storedFieldLoader = buildStoredFieldLoader(storedFields, fetchSourceContext, loader);
LeafStoredFieldLoader leafStoredFieldLoader = storedFieldLoader.getLoader(docIdAndVersion.reader.getContext(), null);
try {
Expand Down Expand Up @@ -367,10 +373,6 @@ private GetResult innerGetFetch(
if (mapperService.mappingLookup().isSourceEnabled() && fetchSourceContext.fetchSource()) {
Source source = loader.leaf(docIdAndVersion.reader, new int[] { docIdAndVersion.docId })
.source(leafStoredFieldLoader, docIdAndVersion.docId);

if (fetchSourceContext.hasFilter()) {
source = source.filter(fetchSourceContext.filter());
}
sourceBytes = source.internalSourceRef();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParserConfiguration;

import java.io.IOException;
import java.util.ArrayList;
Expand Down Expand Up @@ -169,7 +170,7 @@ public MalformedValuesLayer(String fieldName) {
@Override
protected void writeValue(Object value, XContentBuilder b) throws IOException {
if (value instanceof BytesRef r) {
XContentDataHelper.decodeAndWrite(b, r);
XContentDataHelper.decodeAndWrite(XContentParserConfiguration.EMPTY, b, r);
} else {
b.value(value);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public void validate(IndexSettings settings, boolean checkLimits) {
* with the source loading strategy declared on the source field mapper.
*/
try {
sourceMapper().newSourceLoader(mapping(), mapperMetrics.sourceFieldMetrics());
mappingLookup.newSourceLoader(null, mapperMetrics.sourceFieldMetrics());
} catch (IllegalArgumentException e) {
mapperMetrics.sourceFieldMetrics().recordSyntheticSourceIncompatibleMapping();
throw e;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,9 +156,4 @@ public FieldAliasMapper build(MapperBuilderContext context) {
return new FieldAliasMapper(leafName(), fullName, path);
}
}

@Override
public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
return SourceLoader.SyntheticFieldLoader.NOTHING;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptType;
import org.elasticsearch.search.lookup.SearchLookup;
import org.elasticsearch.search.lookup.SourceFilter;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;
Expand Down Expand Up @@ -484,15 +485,14 @@ final SyntheticSourceMode syntheticSourceMode() {
/**
* Returns synthetic field loader for the mapper.
* If mapper does not support synthetic source, it is handled using generic implementation
* in {@link DocumentParser#parseObjectOrField} and {@link ObjectMapper#syntheticFieldLoader()}.
* in {@link DocumentParser#parseObjectOrField} and {@link ObjectMapper#syntheticFieldLoader(SourceFilter)}.
* <br>
*
* This method is final in order to support common use cases like fallback synthetic source.
* Mappers that need custom support of synthetic source should override {@link #syntheticSourceSupport()}.
*
* @return implementation of {@link SourceLoader.SyntheticFieldLoader}
*/
@Override
public final SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
if (hasScript()) {
return SourceLoader.SyntheticFieldLoader.NOTHING;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;

import java.io.IOException;
import java.util.List;
Expand Down Expand Up @@ -128,7 +129,7 @@ public int count() {
public void write(XContentBuilder b) throws IOException {
for (Object v : values) {
if (v instanceof BytesRef r) {
XContentDataHelper.decodeAndWrite(b, r);
XContentDataHelper.decodeAndWrite(XContentParserConfiguration.EMPTY, b, r);
} else {
b.value(v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.SearchExecutionContext;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;

import java.io.IOException;
Expand Down Expand Up @@ -281,7 +282,7 @@ public static MappedNameValue decodeAsMap(byte[] value) throws IOException {
IgnoredSourceFieldMapper.NameValue nameValue = IgnoredSourceFieldMapper.decode(bytes);
XContentBuilder xContentBuilder = XContentBuilder.builder(XContentDataHelper.getXContentType(nameValue.value()).xContent());
xContentBuilder.startObject().field(nameValue.name());
XContentDataHelper.decodeAndWrite(xContentBuilder, nameValue.value());
XContentDataHelper.decodeAndWrite(XContentParserConfiguration.EMPTY, xContentBuilder, nameValue.value());
xContentBuilder.endObject();
Tuple<XContentType, Map<String, Object>> result = XContentHelper.convertToMap(BytesReference.bytes(xContentBuilder), true);
return new MappedNameValue(nameValue, result.v1(), result.v2());
Expand Down
12 changes: 0 additions & 12 deletions server/src/main/java/org/elasticsearch/index/mapper/Mapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -163,18 +163,6 @@ public final String leafName() {
*/
public abstract void validate(MappingLookup mappers);

/**
* Create a {@link SourceLoader.SyntheticFieldLoader} to populate synthetic source.
*
* @throws IllegalArgumentException if the field is configured in a way that doesn't
* support synthetic source. This translates nicely into a 400 error when
* users configure synthetic source in the mapping without configuring all
* fields properly.
*/
public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
throw new IllegalArgumentException("field [" + fullPath() + "] of type [" + typeName() + "] doesn't support synthetic source");
}

@Override
public String toString() {
return Strings.toString(this);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.mapper.MapperService.MergeReason;
import org.elasticsearch.search.lookup.SourceFilter;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;

Expand All @@ -22,6 +24,7 @@
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
Expand Down Expand Up @@ -126,9 +129,9 @@ private boolean isSourceSynthetic() {
return sfm != null && sfm.isSynthetic();
}

public SourceLoader.SyntheticFieldLoader syntheticFieldLoader() {
var stream = Stream.concat(Stream.of(metadataMappers), root.mappers.values().stream());
return root.syntheticFieldLoader(stream);
public SourceLoader.SyntheticFieldLoader syntheticFieldLoader(@Nullable SourceFilter filter) {
var mappers = Stream.concat(Stream.of(metadataMappers), root.mappers.values().stream()).collect(Collectors.toList());
return root.syntheticFieldLoader(filter, mappers, false);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import org.apache.lucene.codecs.PostingsFormat;
import org.elasticsearch.cluster.metadata.DataStream;
import org.elasticsearch.cluster.metadata.InferenceFieldMetadata;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.inference.InferenceService;
import org.elasticsearch.search.lookup.SourceFilter;

import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -497,9 +499,11 @@ public boolean isSourceSynthetic() {
/**
* Build something to load source {@code _source}.
*/
public SourceLoader newSourceLoader(SourceFieldMetrics metrics) {
SourceFieldMapper sfm = mapping.getMetadataMapperByClass(SourceFieldMapper.class);
return sfm == null ? SourceLoader.FROM_STORED_SOURCE : sfm.newSourceLoader(mapping, metrics);
public SourceLoader newSourceLoader(@Nullable SourceFilter filter, SourceFieldMetrics metrics) {
if (isSourceSynthetic()) {
return new SourceLoader.Synthetic(() -> mapping.syntheticFieldLoader(filter), metrics);
}
return filter == null ? SourceLoader.FROM_STORED_SOURCE : new SourceLoader.Stored(filter);
}

/**
Expand Down
Loading

0 comments on commit f62f4f6

Please sign in to comment.