From f6ed5e1aa3d672dafdf463870e3b141d669dd44d Mon Sep 17 00:00:00 2001 From: Rishabh Maurya Date: Tue, 7 May 2024 12:47:21 -0700 Subject: [PATCH] Use Randomness#get() Signed-off-by: Rishabh Maurya --- .../org/opensearch/index/mapper/FieldTypeInference.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java index f649e3c1ca251..933b53fc8e34e 100644 --- a/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java +++ b/server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java @@ -10,6 +10,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.LeafReaderContext; +import org.opensearch.common.Randomness; import org.opensearch.common.xcontent.XContentFactory; import org.opensearch.common.xcontent.json.JsonXContent; import org.opensearch.core.common.bytes.BytesReference; @@ -32,12 +33,12 @@ *

The sample size should be chosen carefully to ensure a high probability of selecting at least one document where the field is present. * However, it's essential to strike a balance because a large sample size can lead to performance issues since each sample document's _source field is loaded and examined until the field is found. * - *

Determining the sample size (S) is akin to deciding how many balls to draw from a bin, ensuring a high probability ((>=P)) of drawing at least one green ball (documents with the field) from a mixture of R red balls (documents without the field) and G green balls: + *

Determining the sample size ({@code S}) is akin to deciding how many balls to draw from a bin, ensuring a high probability ({@code >=P}) of drawing at least one green ball (documents with the field) from a mixture of {@code R} red balls (documents without the field) and {@code G} green balls: + *

{@code
  * P >= 1 - C(R, S) / C(R + G, S)
  * }
- * Here, C() represents the binomial coefficient. - * For a high confidence level, we aim for P >= 0.95. For example, with 10^7 documents where the field is present in 2% of them, the sample size S should be around 149 to achieve a probability of 0.95. + * Here, {@code C()} represents the binomial coefficient. + * For a high confidence level, we aim for {@code P >= 0.95}. For example, with {@code 10^7} documents where the field is present in {@code 2%} of them, the sample size {@code S} should be around 149 to achieve a probability of {@code 0.95}. */ public class FieldTypeInference { private final IndexReader indexReader; @@ -156,7 +157,7 @@ private void setNextLeaf() { private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) { Set generatedNumbers = new HashSet<>(); - Random random = new Random(); + Random random = Randomness.get(); int itr = 0; while (generatedNumbers.size() < sampleSize && itr++ < attempts) { int randomNumber = random.nextInt(upperBound);