From 7c056ab88c7589a0d103fd4c372ac7677515691d Mon Sep 17 00:00:00 2001 From: Christine Poerschke Date: Fri, 13 Sep 2024 09:27:35 +0100 Subject: [PATCH] similarities: provide default computeNorm implementation; remove remaining discountOverlaps setters; (#13757) Co-authored-by: Robert Muir --- .../search/similarities/BM25Similarity.java | 27 +------- .../similarities/BooleanSimilarity.java | 14 +--- .../similarities/ClassicSimilarity.java | 11 +++- .../search/similarities/DFRSimilarity.java | 21 ++++++ .../search/similarities/Similarity.java | 64 ++++++++++++++++--- .../search/similarities/SimilarityBase.java | 47 ++------------ .../search/similarities/TFIDFSimilarity.java | 48 ++------------ .../similarities/TestSimilarityBase.java | 16 ++--- .../lucene/luke/models/search/SearchImpl.java | 3 +- .../lucene/misc/SweetSpotSimilarity.java | 9 ++- .../lucene/misc/TestSweetSpotSimilarity.java | 20 +++--- 11 files changed, 129 insertions(+), 151 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java index f6dacd54c1e6..274bb475d6c7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -33,7 +31,6 @@ public class BM25Similarity extends Similarity { private final float k1; private final float b; - private final boolean discountOverlaps; /** * BM25 with the supplied parameter values. @@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity { * within the range {@code [0..1]} */ public BM25Similarity(float k1, float b, boolean discountOverlaps) { + super(discountOverlaps); if (Float.isFinite(k1) == false || k1 < 0) { throw new IllegalArgumentException( "illegal k1 value: " + k1 + ", must be a non-negative finite value"); @@ -55,7 +53,6 @@ public BM25Similarity(float k1, float b, boolean discountOverlaps) { } this.k1 = k1; this.b = b; - this.discountOverlaps = discountOverlaps; } /** @@ -110,15 +107,6 @@ protected float avgFieldLength(CollectionStatistics collectionStats) { return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount()); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #BM25Similarity(float, float, boolean) - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; - } - /** Cache of decoded bytes. */ private static final float[] LENGTH_TABLE = new float[256]; @@ -128,19 +116,6 @@ public boolean getDiscountOverlaps() { } } - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - /** * Computes a score factor for a simple term and returns an explanation for that score factor. * diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java index ca1d579c93c7..db28e8454488 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java @@ -16,7 +16,6 @@ */ package org.apache.lucene.search.similarities; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -25,22 +24,15 @@ * Simple similarity that gives terms a score that is equal to their query boost. This similarity is * typically used with disabled norms since neither document statistics nor index statistics are * used for scoring. That said, if norms are enabled, they will be computed the same way as {@link - * SimilarityBase} and {@link BM25Similarity} with {@link - * SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity} - * can be changed after the index has been created. + * SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps() + * discounted overlaps} so that the {@link Similarity} can be changed after the index has been + * created. */ public class BooleanSimilarity extends Similarity { - private static final Similarity BM25_SIM = new BM25Similarity(); - /** Sole constructor */ public BooleanSimilarity() {} - @Override - public long computeNorm(FieldInvertState state) { - return BM25_SIM.computeNorm(state); - } - @Override public SimScorer scorer( float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java index 84b688feda7b..5fad406c5856 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java @@ -26,8 +26,15 @@ */ public class ClassicSimilarity extends TFIDFSimilarity { - /** Sole constructor: parameter-free */ - public ClassicSimilarity() {} + /** Default constructor: parameter-free */ + public ClassicSimilarity() { + super(); + } + + /** Primary constructor. */ + public ClassicSimilarity(boolean discountOverlaps) { + super(discountOverlaps); + } /** * Implemented as 1/sqrt(length). diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java index f47c05be1f02..0b3c1a5e7f02 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java @@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase { */ public DFRSimilarity( BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) { + this(basicModel, afterEffect, normalization, true); + } + + /** + * Creates DFRSimilarity from the three components. + * + *

Note that null values are not allowed: if you want no normalization, instead + * pass {@link NoNormalization}. + * + * @param basicModel Basic model of information content + * @param afterEffect First normalization of information gain + * @param normalization Second (length) normalization + * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero) + * are discounted from the document's length. + */ + public DFRSimilarity( + BasicModel basicModel, + AfterEffect afterEffect, + Normalization normalization, + boolean discountOverlaps) { + super(discountOverlaps); if (basicModel == null || afterEffect == null || normalization == null) { throw new NullPointerException("null parameters not allowed."); } diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java index 24022e832774..83582e44e25f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java @@ -17,8 +17,10 @@ package org.apache.lucene.search.similarities; import java.util.Collections; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.document.NumericDocValuesField; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -45,7 +47,7 @@ * is in this norm, but it is most useful for encoding length normalization information. * *

Implementations should carefully consider how the normalization is encoded: while Lucene's - * {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a + * default implementation encodes length normalization information with {@link SmallFloat} into a * single byte, this might not be suitable for all purposes. * *

Many formulas require the use of average document length, which can be computed via a @@ -88,13 +90,49 @@ * @lucene.experimental */ public abstract class Similarity { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - // Explicitly declared so that we have non-empty javadoc - protected Similarity() {} + /** + * True if overlap tokens (tokens with a position of increment of zero) are discounted from the + * document's length. + */ + private final boolean discountOverlaps; + + /** + * Returns true if overlap tokens are discounted from the document's length. + * + * @see #computeNorm + */ + public final boolean getDiscountOverlaps() { + return discountOverlaps; + } + + /** Default constructor. (For invocation by subclass constructors, typically implicit.) */ + protected Similarity() { + this(true); + } + + /** + * Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time. + * + *

Overlap tokens are tokens such as synonyms, that have a {@link PositionIncrementAttribute} + * of zero from the analysis chain. + * + *

NOTE: If you modify this parameter, you'll need to re-index for it to take effect. + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + protected Similarity(boolean discountOverlaps) { + this.discountOverlaps = discountOverlaps; + } /** - * Computes the normalization value for a field, given the accumulated state of term processing - * for this field (see {@link FieldInvertState}). + * Computes the normalization value for a field at index-time. + * + *

The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms + * as a single byte. + * + *

WARNING: The default implementation is used by Lucene's supplied Similarity classes, + * which means you can change the Similarity at runtime without reindexing. If you override this + * method, you'll need to re-index documents for it to take effect. * *

Matches in longer fields are less precise, so implementations of this method usually set * smaller values when state.getLength() is large, and larger values when @@ -108,10 +146,20 @@ protected Similarity() {} *

{@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores. * * @lucene.experimental - * @param state current processing state for this field + * @param state accumulated state of term processing for this field * @return computed norm value */ - public abstract long computeNorm(FieldInvertState state); + public long computeNorm(FieldInvertState state) { + final int numTerms; + if (state.getIndexOptions() == IndexOptions.DOCS) { + numTerms = state.getUniqueTermCount(); + } else if (discountOverlaps) { + numTerms = state.getLength() - state.getNumOverlap(); + } else { + numTerms = state.getLength(); + } + return SmallFloat.intToByte4(numTerms); + } /** * Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java index ef5366b60380..af63b7bebeb6 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.TermStatistics; @@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity { /** For {@link #log2(double)}. Precomputed for efficiency reasons. */ private static final double LOG_2 = Math.log(2); - /** - * True if overlap tokens (tokens with a position of increment of zero) are discounted from the - * document's length. - */ - protected boolean discountOverlaps = true; - - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - public SimilarityBase() {} - - /** - * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing - * norm. By default this is true, meaning overlap tokens do not count when computing norms. - * - * @lucene.experimental - * @see #computeNorm - */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; + /** Default constructor: parameter-free */ + public SimilarityBase() { + super(); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #setDiscountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; + /** Primary constructor. */ + public SimilarityBase(boolean discountOverlaps) { + super(discountOverlaps); } @Override @@ -179,20 +158,6 @@ protected Explanation explain(BasicStats stats, Explanation freq, double docLen) } } - /** Encodes the document length in the same way as {@link BM25Similarity}. */ - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - // ----------------------------- Static methods ------------------------------ /** Returns the base two logarithm of {@code x}. */ diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java index 906c7dd2875c..c81ef763862d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java +++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java @@ -18,8 +18,6 @@ import java.util.ArrayList; import java.util.List; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; import org.apache.lucene.search.CollectionStatistics; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; @@ -326,33 +324,14 @@ */ public abstract class TFIDFSimilarity extends Similarity { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - public TFIDFSimilarity() {} - - /** - * True if overlap tokens (tokens with a position of increment of zero) are discounted from the - * document's length. - */ - protected boolean discountOverlaps = true; - - /** - * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing - * norm. By default this is true, meaning overlap tokens do not count when computing norms. - * - * @lucene.experimental - * @see #computeNorm - */ - public void setDiscountOverlaps(boolean v) { - discountOverlaps = v; + /** Default constructor: parameter-free */ + public TFIDFSimilarity() { + super(); } - /** - * Returns true if overlap tokens are discounted from the document's length. - * - * @see #setDiscountOverlaps - */ - public boolean getDiscountOverlaps() { - return discountOverlaps; + /** Primary constructor. */ + public TFIDFSimilarity(boolean discountOverlaps) { + super(discountOverlaps); } /** @@ -438,7 +417,7 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti /** * Compute an index-time normalization value for this field instance. * - * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean) + * @param length the number of terms in the field, optionally {@link #getDiscountOverlaps() * discounting overlaps} * @return a length normalization value */ @@ -453,19 +432,6 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti } } - @Override - public final long computeNorm(FieldInvertState state) { - final int numTerms; - if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { - numTerms = state.getUniqueTermCount(); - } else if (discountOverlaps) { - numTerms = state.getLength() - state.getNumOverlap(); - } else { - numTerms = state.getLength(); - } - return SmallFloat.intToByte4(numTerms); - } - @Override public final SimScorer scorer( float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java index fbdecaae2b45..b24bd9c5afb1 100644 --- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java +++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java @@ -526,17 +526,17 @@ public void tearDown() throws Exception { // LUCENE-5221 public void testDiscountOverlapsBoost() throws IOException { - BM25Similarity expected = new BM25Similarity(false); - SimilarityBase actual = - new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2()); - actual.setDiscountOverlaps(false); + final BM25Similarity expected0 = new BM25Similarity(false); + final SimilarityBase actual0 = + new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), false); FieldInvertState state = new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS); state.setLength(5); state.setNumOverlap(2); - assertEquals(expected.computeNorm(state), actual.computeNorm(state)); - expected = new BM25Similarity(); - actual.setDiscountOverlaps(true); - assertEquals(expected.computeNorm(state), actual.computeNorm(state)); + assertEquals(expected0.computeNorm(state), actual0.computeNorm(state)); + final BM25Similarity expected1 = new BM25Similarity(true); + final SimilarityBase actual1 = + new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), true); + assertEquals(expected1.computeNorm(state), actual1.computeNorm(state)); } } diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java index 3c49e215feda..4d5706be0f2a 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java @@ -395,8 +395,7 @@ private Similarity createSimilarity(SimilarityConfig config) { Similarity similarity; if (config.isUseClassicSimilarity()) { - ClassicSimilarity tfidf = new ClassicSimilarity(); - tfidf.setDiscountOverlaps(config.isDiscountOverlaps()); + ClassicSimilarity tfidf = new ClassicSimilarity(config.isDiscountOverlaps()); similarity = tfidf; } else { BM25Similarity bm25 = diff --git a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java index 3b667f3af167..44e32202fa88 100644 --- a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java +++ b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -45,10 +45,16 @@ public class SweetSpotSimilarity extends ClassicSimilarity { private double tf_hyper_base = 1.3d; private float tf_hyper_xoffset = 10.0f; + /** Default constructor: parameter-free */ public SweetSpotSimilarity() { super(); } + /** Primary constructor. */ + public SweetSpotSimilarity(boolean discountOverlaps) { + super(discountOverlaps); + } + /** * Sets the baseline and minimum function variables for baselineTf * @@ -82,11 +88,10 @@ public void setHyperbolicTfFactors(float min, float max, double base, float xoff * * @see #lengthNorm */ - public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) { + public void setLengthNormFactors(int min, int max, float steepness) { this.ln_min = min; this.ln_max = max; this.ln_steep = steepness; - this.discountOverlaps = discountOverlaps; } /** diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java index 17094692c7c3..feb66476005d 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java @@ -74,7 +74,7 @@ private static Explanation findExplanation(Explanation expl, String text) { public void testSweetSpotComputeNorm() throws IOException { final SweetSpotSimilarity ss = new SweetSpotSimilarity(); - ss.setLengthNormFactors(1, 1, 0.5f, true); + ss.setLengthNormFactors(1, 1, 0.5f); Similarity d = new ClassicSimilarity(); Similarity s = ss; @@ -87,7 +87,7 @@ public void testSweetSpotComputeNorm() throws IOException { // make a sweet spot - ss.setLengthNormFactors(3, 10, 0.5f, true); + ss.setLengthNormFactors(3, 10, 0.5f); for (int i = 3; i <= 10; i++) { assertEquals("3,10: spot i=" + i, 1.0f, computeNorm(ss, "bogus", i), 0.0f); @@ -101,14 +101,14 @@ public void testSweetSpotComputeNorm() throws IOException { // separate sweet spot for certain fields - final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(); - ssBar.setLengthNormFactors(8, 13, 0.5f, false); - final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(); - ssYak.setLengthNormFactors(6, 9, 0.5f, false); - final SweetSpotSimilarity ssA = new SweetSpotSimilarity(); - ssA.setLengthNormFactors(5, 8, 0.5f, false); - final SweetSpotSimilarity ssB = new SweetSpotSimilarity(); - ssB.setLengthNormFactors(5, 8, 0.1f, false); + final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(false); + ssBar.setLengthNormFactors(8, 13, 0.5f); + final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(false); + ssYak.setLengthNormFactors(6, 9, 0.5f); + final SweetSpotSimilarity ssA = new SweetSpotSimilarity(false); + ssA.setLengthNormFactors(5, 8, 0.5f); + final SweetSpotSimilarity ssB = new SweetSpotSimilarity(false); + ssB.setLengthNormFactors(5, 8, 0.1f); Similarity sp = new PerFieldSimilarityWrapper() {