javadoc and code refactor
Signed-off-by: Rishabh Maurya <[email protected]>
rishabhmaurya committed May 7, 2024
1 parent 25fecf3 commit 5f5f54d
Showing 4 changed files with 127 additions and 62 deletions.
DerivedFieldMapper.java

@@ -83,12 +83,22 @@ public DerivedFieldMapper build(BuilderContext context) {
                 type.getValue(),
                 name
             );
-            DerivedFieldType ft = new DerivedFieldType(
-                new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
-                fieldMapper,
-                fieldFunction,
-                indexAnalyzers
-            );
+            DerivedFieldType ft;
+            if (name.contains(".")) {
+                ft = new DerivedObjectFieldType(
+                    new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
+                    fieldMapper,
+                    fieldFunction,
+                    indexAnalyzers
+                );
+            } else {
+                ft = new DerivedFieldType(
+                    new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
+                    fieldMapper,
+                    fieldFunction,
+                    indexAnalyzers
+                );
+            }
             return new DerivedFieldMapper(name, ft, multiFieldsBuilder.build(this, context), copyTo.build(), this, indexAnalyzers);
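Editorial aside: the hunk above keys the field-type choice off a dot in the derived field's name, so sub-paths of an object-typed derived field get the object-aware type. A minimal sketch of that dispatch, with string stand-ins for the two constructors (the method name and example paths below are illustrative, not part of the commit):

    // Minimal sketch (illustrative): mirrors the dispatch added in build() above.
    static String fieldTypeFor(String name) {
        return name.contains(".") ? "DerivedObjectFieldType" : "DerivedFieldType";
    }

    // fieldTypeFor("person.address.city") -> "DerivedObjectFieldType"
    // fieldTypeFor("price_with_tax")      -> "DerivedFieldType"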
DerivedObjectFieldType.java

@@ -24,7 +24,7 @@

 public class DerivedObjectFieldType extends DerivedFieldType {

-    public DerivedObjectFieldType(
+    DerivedObjectFieldType(
         DerivedField derivedField,
         FieldMapper typeFieldMapper,
         Function<Object, IndexableField> fieldFunction,
164 changes: 110 additions & 54 deletions server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java
Expand Up @@ -9,7 +9,6 @@
package org.opensearch.index.mapper;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.common.xcontent.json.JsonXContent;
@@ -20,23 +19,37 @@
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Random;
 import java.util.Set;

+/**
+ * This class infers the field type by examining the _source documents. For a given value, type inference follows the dynamic mapping type-guessing logic.
+ * Instead of guessing the type from the first document alone, it draws a random sample of documents to make a more accurate inference.
+ * This approach is particularly useful for fields that are missing from many documents, which is common for nested fields within derived fields of object type.
+ *
+ * <p>The sample size should be chosen carefully to ensure a high probability of selecting at least one document in which the field is present.
+ * However, it is important to strike a balance: a large sample size can cause performance issues, since the _source field is loaded and examined for each sampled document.
+ *
+ * <p>Determining the sample size (S) is akin to deciding how many balls to draw from a bin,
+ * ensuring a high probability (>= P) of drawing at least one green ball (a document containing the field) from a mixture of
+ * R red balls (documents without the field) and G green balls:
+ * <pre>
+ * P >= 1 - C(R, S) / C(R + G, S)
+ * </pre>
+ * where C() is the binomial coefficient.
+ * For high confidence we target P >= 0.95.
+ */
 public class FieldTypeInference {
     private final IndexReader indexReader;
     private final String indexName;
     private final MapperService mapperService;
     // TODO expose using an index setting?
     private int sampleSize;

     // gives a probability of more than 0.95 of selecting at least one document containing the field,
     // provided at least 5% of the overall documents contain the field
     private static final int DEFAULT_SAMPLE_SIZE = 60;

-    private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000;
-
     public FieldTypeInference(String indexName, MapperService mapperService, IndexReader indexReader) {
         this.indexName = indexName;
         this.mapperService = mapperService;
@@ -53,56 +66,17 @@ public int getSampleSize() {
     }

     public Mapper infer(ValueFetcher valueFetcher) throws IOException {
-        int iter = 0;
-        int totalDocs = indexReader.numDocs();
-        int sampleSize = Math.min(totalDocs, getSampleSize());
-        int[] docs = getSortedRandomNum(sampleSize, totalDocs, Math.max(getSampleSize(), MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES));
-        int offset = 0;
-        SourceLookup sourceLookup = new SourceLookup();
-        for (LeafReaderContext leafReaderContext : indexReader.leaves()) {
-            LeafReader leafReader = leafReaderContext.reader();
-            valueFetcher.setNextReader(leafReaderContext);
-            if (iter >= docs.length) {
-                break;
-            }
-            int docID = docs[iter] - offset;
-            while (docID < leafReader.numDocs()) {
-                sourceLookup.setSegmentAndDocument(leafReaderContext, docID);
-                List<Object> objects = valueFetcher.fetchValues(sourceLookup);
-                Mapper inferredMapper = null;
-                if (objects != null && !objects.isEmpty()) {
-                    // always using first value in case of multi value field
-                    inferredMapper = inferTypeFromObject(objects.get(0));
-                }
-                if (inferredMapper != null) {
-                    return inferredMapper;
-                }
-                iter++;
-                if (iter >= docs.length) {
-                    break;
-                }
-                docID = docs[iter] - offset;
-            }
-            offset += leafReader.numDocs();
-        }
-        return null;
-    }
-
-    private static int[] getSortedRandomNum(int k, int n, int attempts) {
-        Set<Integer> generatedNumbers = new HashSet<>();
-        Random random = new Random();
-        int itr = 0;
-        while (generatedNumbers.size() < k && itr++ < attempts) {
-            int randomNumber = random.nextInt(n);
-            generatedNumbers.add(randomNumber);
-        }
-        int[] result = new int[generatedNumbers.size()];
-        int i = 0;
-        for (int number : generatedNumbers) {
-            result[i++] = number;
-        }
-        Arrays.sort(result);
-        return result;
-    }
+        RandomSourceValuesGenerator valuesGenerator = new RandomSourceValuesGenerator(sampleSize, indexReader, valueFetcher);
+        Mapper inferredMapper = null;
+        while (inferredMapper == null && valuesGenerator.hasNext()) {
+            List<Object> values = valuesGenerator.next();
+            if (values == null || values.isEmpty()) {
+                continue;
+            }
+            // always use the first value in case of a multi-valued field to infer the type
+            inferredMapper = inferTypeFromObject(values.get(0));
+        }
+        return inferredMapper;
+    }

     private Mapper inferTypeFromObject(Object o) throws IOException {
@@ -117,4 +91,86 @@ private Mapper inferTypeFromObject(Object o) throws IOException {
         Mapping mapping = parsedDocument.dynamicMappingsUpdate();
         return mapping.root.getMapper("field");
     }
+
+    private static class RandomSourceValuesGenerator implements Iterator<List<Object>> {
+        private final ValueFetcher valueFetcher;
+        private final IndexReader indexReader;
+        private final SourceLookup sourceLookup;
+        private final int numLeaves;
+        private final int[] docs;
+        private int iter;
+        private int offset;
+        private LeafReaderContext leafReaderContext;
+        private int leaf;
+        private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000;
+
+        public RandomSourceValuesGenerator(int sampleSize, IndexReader indexReader, ValueFetcher valueFetcher) {
+            this.valueFetcher = valueFetcher;
+            this.indexReader = indexReader;
+            sampleSize = Math.min(sampleSize, indexReader.numDocs());
+            this.docs = getSortedRandomNum(
+                sampleSize,
+                indexReader.numDocs(),
+                Math.max(sampleSize, MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES)
+            );
+            this.iter = 0;
+            this.offset = 0;
+            this.leaf = 0;
+            this.numLeaves = indexReader.leaves().size();
+            this.sourceLookup = new SourceLookup();
+            this.leafReaderContext = indexReader.leaves().get(leaf);
+            valueFetcher.setNextReader(leafReaderContext);
+        }
+
+        @Override
+        public boolean hasNext() {
+            return iter < docs.length && leaf < numLeaves;
+        }
+
+        /**
+         * Callers must ensure hasNext() returns true before calling next().
+         */
+        @Override
+        public List<Object> next() {
+            int docID = docs[iter] - offset;
+            if (docID >= leafReaderContext.reader().numDocs()) {
+                setNextLeaf();
+                return next();
+            }
+            // note: deleted docs may also be sampled for type inference, which should be acceptable
+            sourceLookup.setSegmentAndDocument(leafReaderContext, docID);
+            try {
+                iter++;
+                return valueFetcher.fetchValues(sourceLookup);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        private void setNextLeaf() {
+            offset += leafReaderContext.reader().numDocs();
+            leaf++;
+            if (leaf < numLeaves) {
+                leafReaderContext = indexReader.leaves().get(leaf);
+                valueFetcher.setNextReader(leafReaderContext);
+            }
+        }
+
+        private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) {
+            Set<Integer> generatedNumbers = new HashSet<>();
+            Random random = new Random();
+            int itr = 0;
+            while (generatedNumbers.size() < sampleSize && itr++ < attempts) {
+                int randomNumber = random.nextInt(upperBound);
+                generatedNumbers.add(randomNumber);
+            }
+            int[] result = new int[generatedNumbers.size()];
+            int i = 0;
+            for (int number : generatedNumbers) {
+                result[i++] = number;
+            }
+            Arrays.sort(result);
+            return result;
+        }
+    }
 }
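Editorial aside: the sampling bound in the class javadoc above can be sanity-checked numerically. A minimal sketch, assuming an index of 1,000 documents of which 5% contain the field (all numbers are illustrative; only DEFAULT_SAMPLE_SIZE = 60 comes from the commit):

    // Sanity check of the javadoc's bound (editorial sketch, not part of the commit).
    // Drawing S distinct docs out of N, where R lack the field, misses every matching
    // doc with probability C(R, S) / C(N, S) = prod_{i=0}^{S-1} (R - i) / (N - i).
    public class SampleSizeBoundCheck {
        public static void main(String[] args) {
            int numDocs = 1000;                // N (illustrative)
            int withField = 50;                // G: 5% of docs contain the field
            int sampleSize = 60;               // S: DEFAULT_SAMPLE_SIZE above
            int without = numDocs - withField; // R
            double missProb = 1.0;
            for (int i = 0; i < sampleSize; i++) {
                missProb *= (double) (without - i) / (numDocs - i);
            }
            // prints roughly 0.96, consistent with the >= 0.95 target in the javadoc
            System.out.printf("P(field observed in sample) = %.4f%n", 1 - missProb);
        }
    }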
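A hedged usage sketch of the refactored entry point: infer() now drives the sampling internally through RandomSourceValuesGenerator, so a caller only supplies a ValueFetcher. The anonymous fetcher below simply returns a constant string, so the dynamic-mapping logic resolves a text-like mapper; the indexName, mapperService and indexReader wiring is assumed to exist elsewhere:

    // Editorial usage sketch (not from the commit); surrounding setup is assumed.
    FieldTypeInference typeInference = new FieldTypeInference(indexName, mapperService, indexReader);
    Mapper inferred = typeInference.infer(new ValueFetcher() {
        @Override
        public void setNextReader(LeafReaderContext leafReaderContext) {
            // invoked by the generator whenever it advances to the next segment
        }

        @Override
        public List<Object> fetchValues(SourceLookup sourceLookup) {
            // a real fetcher would read the candidate field out of the loaded _source;
            // a constant String makes infer() resolve a text-like mapper
            return List.of("sample-value");
        }
    });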
FieldTypeInferenceTests.java

@@ -126,7 +126,6 @@ public void setNextReader(LeafReaderContext leafReaderContext) {
             }
         });
         assertNull(mapper);
-        // assertEquals(leaves, docsEvaluated.size());
         assertEquals(typeInference.getSampleSize(), totalDocsEvaluated[0]);
         for (List<Integer> docsPerLeaf : docsEvaluated) {
             for (int j = 0; j < docsPerLeaf.size() - 1; j++) {
