similarities: provide default computeNorm implementation; remove remaining discountOverlaps setters; (#13757)

Co-authored-by: Robert Muir <[email protected]>
apache · Sep 13, 2024 · 7c056ab · 7c056ab
1 parent f778cc4
commit 7c056ab
Show file tree

Hide file tree

Showing 11 changed files with 129 additions and 151 deletions.
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -18,8 +18,6 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
@@ -33,7 +31,6 @@
 public class BM25Similarity extends Similarity {
   private final float k1;
   private final float b;
-  private final boolean discountOverlaps;
 
   /**
    * BM25 with the supplied parameter values.
@@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity {
    *     within the range {@code [0..1]}
    */
   public BM25Similarity(float k1, float b, boolean discountOverlaps) {
+    super(discountOverlaps);
     if (Float.isFinite(k1) == false || k1 < 0) {
       throw new IllegalArgumentException(
           "illegal k1 value: " + k1 + ", must be a non-negative finite value");
@@ -55,7 +53,6 @@ public BM25Similarity(float k1, float b, boolean discountOverlaps) {
     }
     this.k1 = k1;
     this.b = b;
-    this.discountOverlaps = discountOverlaps;
   }
 
   /**
@@ -110,15 +107,6 @@ protected float avgFieldLength(CollectionStatistics collectionStats) {
     return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
   }
 
-  /**
-   * Returns true if overlap tokens are discounted from the document's length.
-   *
-   * @see #BM25Similarity(float, float, boolean)
-   */
-  public boolean getDiscountOverlaps() {
-    return discountOverlaps;
-  }
-
   /** Cache of decoded bytes. */
   private static final float[] LENGTH_TABLE = new float[256];
 
@@ -128,19 +116,6 @@ public boolean getDiscountOverlaps() {
     }
   }
 
-  @Override
-  public final long computeNorm(FieldInvertState state) {
-    final int numTerms;
-    if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
-      numTerms = state.getUniqueTermCount();
-    } else if (discountOverlaps) {
-      numTerms = state.getLength() - state.getNumOverlap();
-    } else {
-      numTerms = state.getLength();
-    }
-    return SmallFloat.intToByte4(numTerms);
-  }
-
   /**
    * Computes a score factor for a simple term and returns an explanation for that score factor.
    *

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -16,7 +16,6 @@
  */
 package org.apache.lucene.search.similarities;
 
-import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
@@ -25,22 +24,15 @@
  * Simple similarity that gives terms a score that is equal to their query boost. This similarity is
  * typically used with disabled norms since neither document statistics nor index statistics are
  * used for scoring. That said, if norms are enabled, they will be computed the same way as {@link
- * SimilarityBase} and {@link BM25Similarity} with {@link
- * SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity}
- * can be changed after the index has been created.
+ * SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps()
+ * discounted overlaps} so that the {@link Similarity} can be changed after the index has been
+ * created.
  */
 public class BooleanSimilarity extends Similarity {
 
-  private static final Similarity BM25_SIM = new BM25Similarity();
-
   /** Sole constructor */
   public BooleanSimilarity() {}
 
-  @Override
-  public long computeNorm(FieldInvertState state) {
-    return BM25_SIM.computeNorm(state);
-  }
-
   @Override
   public SimScorer scorer(
       float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -26,8 +26,15 @@
  */
 public class ClassicSimilarity extends TFIDFSimilarity {
 
-  /** Sole constructor: parameter-free */
-  public ClassicSimilarity() {}
+  /** Default constructor: parameter-free */
+  public ClassicSimilarity() {
+    super();
+  }
+
+  /** Primary constructor. */
+  public ClassicSimilarity(boolean discountOverlaps) {
+    super(discountOverlaps);
+  }
 
   /**
    * Implemented as <code>1/sqrt(length)</code>.

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
@@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase {
    */
   public DFRSimilarity(
       BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) {
+    this(basicModel, afterEffect, normalization, true);
+  }
+
+  /**
+   * Creates DFRSimilarity from the three components.
+   *
+   * <p>Note that <code>null</code> values are not allowed: if you want no normalization, instead
+   * pass {@link NoNormalization}.
+   *
+   * @param basicModel Basic model of information content
+   * @param afterEffect First normalization of information gain
+   * @param normalization Second (length) normalization
+   * @param discountOverlaps True if overlap tokens (tokens with a position increment of zero)
+   *     are discounted from the document's length.
+   */
+  public DFRSimilarity(
+      BasicModel basicModel,
+      AfterEffect afterEffect,
+      Normalization normalization,
+      boolean discountOverlaps) {
+    super(discountOverlaps);
     if (basicModel == null || afterEffect == null || normalization == null) {
       throw new NullPointerException("null parameters not allowed.");
     }

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
@@ -17,8 +17,10 @@
 package org.apache.lucene.search.similarities;
 
 import java.util.Collections;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
@@ -45,7 +47,7 @@
  * is in this norm, but it is most useful for encoding length normalization information.
  *
  * <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
- * {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
+ * default implementation encodes length normalization information with {@link SmallFloat} into a
  * single byte, this might not be suitable for all purposes.
  *
  * <p>Many formulas require the use of average document length, which can be computed via a
@@ -88,13 +90,49 @@
  * @lucene.experimental
  */
 public abstract class Similarity {
-  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
-  // Explicitly declared so that we have non-empty javadoc
-  protected Similarity() {}
+  /**
+   * True if overlap tokens (tokens with a position increment of zero) are discounted from the
+   * document's length.
+   */
+  private final boolean discountOverlaps;
+
+  /**
+   * Returns true if overlap tokens are discounted from the document's length.
+   *
+   * @see #computeNorm
+   */
+  public final boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
+
+  /** Default constructor. (For invocation by subclass constructors, typically implicit.) */
+  protected Similarity() {
+    this(true);
+  }
+
+  /**
+   * Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time.
+   *
+   * <p>Overlap tokens are tokens such as synonyms, that have a {@link PositionIncrementAttribute}
+   * of zero from the analysis chain.
+   *
+   * <p><b>NOTE</b>: If you modify this parameter, you'll need to re-index for it to take effect.
+   *
+   * @param discountOverlaps true if overlap tokens should not impact document length for scoring.
+   */
+  protected Similarity(boolean discountOverlaps) {
+    this.discountOverlaps = discountOverlaps;
+  }
 
   /**
-   * Computes the normalization value for a field, given the accumulated state of term processing
-   * for this field (see {@link FieldInvertState}).
+   * Computes the normalization value for a field at index-time.
+   *
+   * <p>The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms
+   * as a single byte.
+   *
+   * <p><b>WARNING</b>: The default implementation is used by Lucene's supplied Similarity classes,
+   * which means you can change the Similarity at runtime without reindexing. If you override this
+   * method, you'll need to re-index documents for it to take effect.
    *
    * <p>Matches in longer fields are less precise, so implementations of this method usually set
    * smaller values when <code>state.getLength()</code> is large, and larger values when <code>
@@ -108,10 +146,20 @@ protected Similarity() {}
    * <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores.
    *
    * @lucene.experimental
-   * @param state current processing state for this field
+   * @param state accumulated state of term processing for this field
    * @return computed norm value
    */
-  public abstract long computeNorm(FieldInvertState state);
+  public long computeNorm(FieldInvertState state) {
+    final int numTerms;
+    if (state.getIndexOptions() == IndexOptions.DOCS) {
+      numTerms = state.getUniqueTermCount();
+    } else if (discountOverlaps) {
+      numTerms = state.getLength() - state.getNumOverlap();
+    } else {
+      numTerms = state.getLength();
+    }
+    return SmallFloat.intToByte4(numTerms);
+  }
 
   /**
    * Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@@ -18,8 +18,6 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.TermStatistics;
@@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity {
   /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
   private static final double LOG_2 = Math.log(2);
 
-  /**
-   * True if overlap tokens (tokens with a position of increment of zero) are discounted from the
-   * document's length.
-   */
-  protected boolean discountOverlaps = true;
-
-  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
-  public SimilarityBase() {}
-
-  /**
-   * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
-   * norm. By default this is true, meaning overlap tokens do not count when computing norms.
-   *
-   * @lucene.experimental
-   * @see #computeNorm
-   */
-  public void setDiscountOverlaps(boolean v) {
-    discountOverlaps = v;
+  /** Default constructor: parameter-free */
+  public SimilarityBase() {
+    super();
   }
 
-  /**
-   * Returns true if overlap tokens are discounted from the document's length.
-   *
-   * @see #setDiscountOverlaps
-   */
-  public boolean getDiscountOverlaps() {
-    return discountOverlaps;
+  /** Primary constructor. */
+  public SimilarityBase(boolean discountOverlaps) {
+    super(discountOverlaps);
   }
 
   @Override
@@ -179,20 +158,6 @@ protected Explanation explain(BasicStats stats, Explanation freq, double docLen)
     }
   }
 
-  /** Encodes the document length in the same way as {@link BM25Similarity}. */
-  @Override
-  public final long computeNorm(FieldInvertState state) {
-    final int numTerms;
-    if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
-      numTerms = state.getUniqueTermCount();
-    } else if (discountOverlaps) {
-      numTerms = state.getLength() - state.getNumOverlap();
-    } else {
-      numTerms = state.getLength();
-    }
-    return SmallFloat.intToByte4(numTerms);
-  }
-
   // ----------------------------- Static methods ------------------------------
 
   /** Returns the base two logarithm of {@code x}. */

diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -18,8 +18,6 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.search.CollectionStatistics;
 import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
@@ -326,33 +324,14 @@
  */
 public abstract class TFIDFSimilarity extends Similarity {
 
-  /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
-  public TFIDFSimilarity() {}
-
-  /**
-   * True if overlap tokens (tokens with a position of increment of zero) are discounted from the
-   * document's length.
-   */
-  protected boolean discountOverlaps = true;
-
-  /**
-   * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
-   * norm. By default this is true, meaning overlap tokens do not count when computing norms.
-   *
-   * @lucene.experimental
-   * @see #computeNorm
-   */
-  public void setDiscountOverlaps(boolean v) {
-    discountOverlaps = v;
+  /** Default constructor: parameter-free */
+  public TFIDFSimilarity() {
+    super();
   }
 
-  /**
-   * Returns true if overlap tokens are discounted from the document's length.
-   *
-   * @see #setDiscountOverlaps
-   */
-  public boolean getDiscountOverlaps() {
-    return discountOverlaps;
+  /** Primary constructor. */
+  public TFIDFSimilarity(boolean discountOverlaps) {
+    super(discountOverlaps);
   }
 
   /**
@@ -438,7 +417,7 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
   /**
    * Compute an index-time normalization value for this field instance.
    *
-   * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean)
+   * @param length the number of terms in the field, optionally {@link #getDiscountOverlaps()
    *     discounting overlaps}
    * @return a length normalization value
    */
@@ -453,19 +432,6 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
     }
   }
 
-  @Override
-  public final long computeNorm(FieldInvertState state) {
-    final int numTerms;
-    if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
-      numTerms = state.getUniqueTermCount();
-    } else if (discountOverlaps) {
-      numTerms = state.getLength() - state.getNumOverlap();
-    } else {
-      numTerms = state.getLength();
-    }
-    return SmallFloat.intToByte4(numTerms);
-  }
-
   @Override
   public final SimScorer scorer(
       float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {