diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldMapper.java index c69252332da6f..9e1ac046aa56e 100644 --- a/server/src/main/java/org/opensearch/index/mapper/DerivedFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/DerivedFieldMapper.java @@ -83,12 +83,22 @@ public DerivedFieldMapper build(BuilderContext context) { type.getValue(), name ); - DerivedFieldType ft = new DerivedFieldType( - new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()), - fieldMapper, - fieldFunction, - indexAnalyzers - ); + DerivedFieldType ft; + if (name.contains(".")) { + ft = new DerivedObjectFieldType( + new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()), + fieldMapper, + fieldFunction, + indexAnalyzers + ); + } else { + ft = new DerivedFieldType( + new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()), + fieldMapper, + fieldFunction, + indexAnalyzers + ); + } return new DerivedFieldMapper(name, ft, multiFieldsBuilder.build(this, context), copyTo.build(), this, indexAnalyzers); } } diff --git a/server/src/main/java/org/opensearch/index/mapper/DerivedObjectFieldType.java b/server/src/main/java/org/opensearch/index/mapper/DerivedObjectFieldType.java index 2adfe8d521db4..20181ab9a7776 100644 --- a/server/src/main/java/org/opensearch/index/mapper/DerivedObjectFieldType.java +++ b/server/src/main/java/org/opensearch/index/mapper/DerivedObjectFieldType.java @@ -24,7 +24,7 @@ public class DerivedObjectFieldType extends DerivedFieldType { - public DerivedObjectFieldType( + DerivedObjectFieldType( DerivedField derivedField, FieldMapper typeFieldMapper, Function fieldFunction, diff --git a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java 
b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java index fd0dbf431b099..096bc92beb29e 100644 --- a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java +++ b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java @@ -9,7 +9,6 @@ package org.opensearch.index.mapper; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.opensearch.common.xcontent.XContentFactory; import org.opensearch.common.xcontent.json.JsonXContent; @@ -20,23 +19,37 @@ import java.io.IOException; import java.util.Arrays; import java.util.HashSet; +import java.util.Iterator; import java.util.List; import java.util.Random; import java.util.Set; +/** + * This method infers the field type by examining the _source documents. For a given value, type inference is similar to the dynamic mapping type guessing logic. + * Instead of guessing the type based on the first document, it generates a random sample of documents to make a more accurate inference. + * This approach is particularly useful when dealing with missing fields, which is common in nested fields within derived fields of object types. + * + *

The sample size should be selected carefully to ensure a high probability of selecting at least one document where the field is present. + * However, it's important to maintain a balance, since a large sample size can lead to performance issues: for each sampled document, the _source field is loaded and examined. + + *

The problem of determining the sample size (S) is akin to deciding how many balls to draw from a bin, + * ensuring a high probability (>=P) of drawing at least one green ball (documents with the field) from a mixture of + * R red balls (documents without the field) and G green balls - + *

+ * P = 1 - C(R, S) / C(R + G, S)
+ * 
+ * Where C() is the binomial coefficient + * For a high confidence, we want the P >= 0.95 + */ + public class FieldTypeInference { private final IndexReader indexReader; private final String indexName; private final MapperService mapperService; // TODO expose using a index setting? private int sampleSize; - - // this will lead to the probability of more than 0.95 to select on the document containing this field, - // when at least 5% of the overall documents contain the field private static final int DEFAULT_SAMPLE_SIZE = 60; - private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000; - public FieldTypeInference(String indexName, MapperService mapperService, IndexReader indexReader) { this.indexName = indexName; this.mapperService = mapperService; @@ -53,56 +66,17 @@ public int getSampleSize() { } public Mapper infer(ValueFetcher valueFetcher) throws IOException { - int iter = 0; - int totalDocs = indexReader.numDocs(); - int sampleSize = Math.min(totalDocs, getSampleSize()); - int[] docs = getSortedRandomNum(sampleSize, totalDocs, Math.max(getSampleSize(), MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES)); - int offset = 0; - SourceLookup sourceLookup = new SourceLookup(); - for (LeafReaderContext leafReaderContext : indexReader.leaves()) { - LeafReader leafReader = leafReaderContext.reader(); - valueFetcher.setNextReader(leafReaderContext); - if (iter >= docs.length) { - break; + RandomSourceValuesGenerator valuesGenerator = new RandomSourceValuesGenerator(sampleSize, indexReader, valueFetcher); + Mapper inferredMapper = null; + while (inferredMapper == null && valuesGenerator.hasNext()) { + List values = valuesGenerator.next(); + if (values == null) { + continue; } - int docID = docs[iter] - offset; - while (docID < leafReader.numDocs()) { - sourceLookup.setSegmentAndDocument(leafReaderContext, docID); - List objects = valueFetcher.fetchValues(sourceLookup); - Mapper inferredMapper = null; - if (objects != null && !objects.isEmpty()) { - // always using first 
value in case of multi value field - inferredMapper = inferTypeFromObject(objects.get(0)); - } - if (inferredMapper != null) { - return inferredMapper; - } - iter++; - if (iter >= docs.length) { - break; - } - docID = docs[iter] - offset; - } - offset += leafReader.numDocs(); - } - return null; - } - - private static int[] getSortedRandomNum(int k, int n, int attempts) { - Set generatedNumbers = new HashSet<>(); - Random random = new Random(); - int itr = 0; - while (generatedNumbers.size() < k && itr++ < attempts) { - int randomNumber = random.nextInt(n); - generatedNumbers.add(randomNumber); + // always use first value in case of multi value field to infer type + inferredMapper = inferTypeFromObject(values.get(0)); } - int[] result = new int[generatedNumbers.size()]; - int i = 0; - for (int number : generatedNumbers) { - result[i++] = number; - } - Arrays.sort(result); - return result; + return inferredMapper; } private Mapper inferTypeFromObject(Object o) throws IOException { @@ -117,4 +91,86 @@ private Mapper inferTypeFromObject(Object o) throws IOException { Mapping mapping = parsedDocument.dynamicMappingsUpdate(); return mapping.root.getMapper("field"); } + + private static class RandomSourceValuesGenerator implements Iterator> { + private final ValueFetcher valueFetcher; + private final IndexReader indexReader; + private final SourceLookup sourceLookup; + private final int numLeaves; + private final int[] docs; + private int iter; + private int offset; + private LeafReaderContext leafReaderContext; + private int leaf; + private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000; + + public RandomSourceValuesGenerator(int sampleSize, IndexReader indexReader, ValueFetcher valueFetcher) { + this.valueFetcher = valueFetcher; + this.indexReader = indexReader; + sampleSize = Math.min(sampleSize, indexReader.numDocs()); + this.docs = getSortedRandomNum( + sampleSize, + indexReader.numDocs(), + Math.max(sampleSize, MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES) + 
); + this.iter = 0; + this.offset = 0; + this.leaf = 0; + this.numLeaves = indexReader.leaves().size(); + this.sourceLookup = new SourceLookup(); + this.leafReaderContext = indexReader.leaves().get(leaf); + valueFetcher.setNextReader(leafReaderContext); + } + + @Override + public boolean hasNext() { + return iter < docs.length && leaf < numLeaves; + } + + /** + * Ensure hasNext() is called before calling next() + */ + @Override + public List next() { + int docID = docs[iter] - offset; + if (docID >= leafReaderContext.reader().numDocs()) { + setNextLeaf(); + return next(); + } + // deleted docs are getting used to infer type, which should be okay? + sourceLookup.setSegmentAndDocument(leafReaderContext, docID); + try { + iter++; + return valueFetcher.fetchValues(sourceLookup); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void setNextLeaf() { + offset += leafReaderContext.reader().numDocs(); + leaf++; + if (leaf < numLeaves) { + leafReaderContext = indexReader.leaves().get(leaf); + valueFetcher.setNextReader(leafReaderContext); + } + } + + private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) { + Set generatedNumbers = new HashSet<>(); + Random random = new Random(); + int itr = 0; + while (generatedNumbers.size() < sampleSize && itr++ < attempts) { + int randomNumber = random.nextInt(upperBound); + generatedNumbers.add(randomNumber); + } + int[] result = new int[generatedNumbers.size()]; + int i = 0; + for (int number : generatedNumbers) { + result[i++] = number; + } + Arrays.sort(result); + return result; + } + } } diff --git a/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java b/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java index 97ae855e455cf..3762663c1fb01 100644 --- a/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java @@ -126,7 
+126,6 @@ public void setNextReader(LeafReaderContext leafReaderContext) { } }); assertNull(mapper); - // assertEquals(leaves, docsEvaluated.size()); assertEquals(typeInference.getSampleSize(), totalDocsEvaluated[0]); for (List docsPerLeaf : docsEvaluated) { for (int j = 0; j < docsPerLeaf.size() - 1; j++) {