Skip to content

Commit

Permalink
similarities: provide default computeNorm implementation; remove rema…
Browse files Browse the repository at this point in the history
…ining discountOverlaps setters; (#13757)

Co-authored-by: Robert Muir <[email protected]>
  • Loading branch information
cpoerschke and rmuir authored Sep 13, 2024
1 parent f778cc4 commit 7c056ab
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 151 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
Expand All @@ -33,7 +31,6 @@
public class BM25Similarity extends Similarity {
private final float k1;
private final float b;
private final boolean discountOverlaps;

/**
* BM25 with the supplied parameter values.
Expand All @@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity {
* within the range {@code [0..1]}
*/
public BM25Similarity(float k1, float b, boolean discountOverlaps) {
super(discountOverlaps);
if (Float.isFinite(k1) == false || k1 < 0) {
throw new IllegalArgumentException(
"illegal k1 value: " + k1 + ", must be a non-negative finite value");
Expand All @@ -55,7 +53,6 @@ public BM25Similarity(float k1, float b, boolean discountOverlaps) {
}
this.k1 = k1;
this.b = b;
this.discountOverlaps = discountOverlaps;
}

/**
Expand Down Expand Up @@ -110,15 +107,6 @@ protected float avgFieldLength(CollectionStatistics collectionStats) {
return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
}

/**
 * Reports whether overlap tokens are left out of the document length used for norms.
 *
 * @return the {@code discountOverlaps} flag this similarity was constructed with
 * @see #BM25Similarity(float, float, boolean)
 */
public boolean getDiscountOverlaps() {
  return this.discountOverlaps;
}

/** Cache of decoded bytes. */
private static final float[] LENGTH_TABLE = new float[256];

Expand All @@ -128,19 +116,6 @@ public boolean getDiscountOverlaps() {
}
}

/**
 * Encodes the length of a field as a norm using {@link SmallFloat#intToByte4}.
 *
 * <p>For fields indexed with {@link IndexOptions#DOCS} in an index created by major version 8 or
 * later, the unique term count is encoded. Otherwise the field length is encoded, minus the
 * number of overlap tokens when overlaps are discounted.
 *
 * @param state accumulated state of term processing for this field
 * @return the encoded norm value
 */
@Override
public final long computeNorm(FieldInvertState state) {
  final boolean docsOnly = state.getIndexOptions() == IndexOptions.DOCS;
  if (docsOnly && state.getIndexCreatedVersionMajor() >= 8) {
    return SmallFloat.intToByte4(state.getUniqueTermCount());
  }
  final int length = state.getLength();
  final int counted = discountOverlaps ? length - state.getNumOverlap() : length;
  return SmallFloat.intToByte4(counted);
}

/**
* Computes a score factor for a simple term and returns an explanation for that score factor.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
*/
package org.apache.lucene.search.similarities;

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
Expand All @@ -25,22 +24,15 @@
* Simple similarity that gives terms a score that is equal to their query boost. This similarity is
* typically used with disabled norms since neither document statistics nor index statistics are
* used for scoring. That said, if norms are enabled, they will be computed the same way as {@link
* SimilarityBase} and {@link BM25Similarity} with {@link
* SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity}
* can be changed after the index has been created.
* SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps()
* discounted overlaps} so that the {@link Similarity} can be changed after the index has been
* created.
*/
public class BooleanSimilarity extends Similarity {

private static final Similarity BM25_SIM = new BM25Similarity();

/** Sole constructor; takes no parameters. */
public BooleanSimilarity() {
  super();
}

/**
 * Computes the norm by delegating to a shared default-constructed {@link BM25Similarity}, so that
 * norms are encoded the same way and the {@link Similarity} can be changed after the index has
 * been created.
 *
 * @param state accumulated state of term processing for this field
 * @return the norm computed by the shared {@link BM25Similarity} instance
 */
@Override
public long computeNorm(FieldInvertState state) {
  final long norm = BM25_SIM.computeNorm(state);
  return norm;
}

@Override
public SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,15 @@
*/
public class ClassicSimilarity extends TFIDFSimilarity {

/** Sole constructor; takes no parameters. */
public ClassicSimilarity() {
  super();
}
/** Default, parameter-free constructor; defers to the parameter-free superclass constructor. */
public ClassicSimilarity() {
  super();
}

/**
 * Primary constructor.
 *
 * @param discountOverlaps passed through unchanged to the superclass constructor
 */
public ClassicSimilarity(boolean discountOverlaps) {
  super(discountOverlaps);
}

/**
* Implemented as <code>1/sqrt(length)</code>.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase {
*/
public DFRSimilarity(
    BasicModel basicModel,
    AfterEffect afterEffect,
    Normalization normalization) {
  // Delegate to the four-argument constructor with discountOverlaps set to true.
  this(basicModel, afterEffect, normalization, true);
}

/**
* Creates DFRSimilarity from the three components.
*
* <p>Note that <code>null</code> values are not allowed: if you want no normalization, instead
* pass {@link NoNormalization}.
*
* @param basicModel Basic model of information content
* @param afterEffect First normalization of information gain
* @param normalization Second (length) normalization
* @param discountOverlaps True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
public DFRSimilarity(
BasicModel basicModel,
AfterEffect afterEffect,
Normalization normalization,
boolean discountOverlaps) {
super(discountOverlaps);
if (basicModel == null || afterEffect == null || normalization == null) {
throw new NullPointerException("null parameters not allowed.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
package org.apache.lucene.search.similarities;

import java.util.Collections;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
Expand All @@ -45,7 +47,7 @@
* is in this norm, but it is most useful for encoding length normalization information.
*
* <p>Implementations should carefully consider how the normalization is encoded: while Lucene's
* {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a
* default implementation encodes length normalization information with {@link SmallFloat} into a
* single byte, this might not be suitable for all purposes.
*
* <p>Many formulas require the use of average document length, which can be computed via a
Expand Down Expand Up @@ -88,13 +90,49 @@
* @lucene.experimental
*/
public abstract class Similarity {
/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
// Explicitly declared so that we have non-empty javadoc
protected Similarity() {}
/**
* True if overlap tokens (tokens with a position increment of zero) are discounted from the
* document's length.
*/
private final boolean discountOverlaps;

/**
 * Reports whether overlap tokens are left out of the document length.
 *
 * @return the value supplied at construction time; the field is final and never changes
 * @see #computeNorm
 */
public final boolean getDiscountOverlaps() {
  return this.discountOverlaps;
}

/**
 * Default constructor, equivalent to {@code Similarity(true)}: overlap tokens are discounted.
 * (For invocation by subclass constructors, typically implicit.)
 */
protected Similarity() {
  this(true);
}

/**
 * Expert constructor that fixes the value returned by {@link #getDiscountOverlaps()}, which is
 * consulted at index-time.
 *
 * <p>Overlap tokens are tokens, such as synonyms, that the analysis chain emits with a {@link
 * PositionIncrementAttribute} of zero.
 *
 * <p><b>NOTE</b>: Changing this parameter only takes effect after re-indexing.
 *
 * @param discountOverlaps true if overlap tokens should not impact document length for scoring.
 */
protected Similarity(boolean discountOverlaps) {
  super();
  this.discountOverlaps = discountOverlaps;
}

/**
* Computes the normalization value for a field, given the accumulated state of term processing
* for this field (see {@link FieldInvertState}).
* Computes the normalization value for a field at index-time.
*
* <p>The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms
* as a single byte.
*
* <p><b>WARNING</b>: The default implementation is used by Lucene's supplied Similarity classes,
* which means you can change the Similarity at runtime without reindexing. If you override this
* method, you'll need to re-index documents for it to take effect.
*
* <p>Matches in longer fields are less precise, so implementations of this method usually set
* smaller values when <code>state.getLength()</code> is large, and larger values when <code>
Expand All @@ -108,10 +146,20 @@ protected Similarity() {}
* <p>{@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores.
*
* @lucene.experimental
* @param state current processing state for this field
* @param state accumulated state of term processing for this field
* @return computed norm value
*/
public abstract long computeNorm(FieldInvertState state);
public long computeNorm(FieldInvertState state) {
  // Fields indexed with IndexOptions.DOCS are measured by their unique term count.
  if (state.getIndexOptions() == IndexOptions.DOCS) {
    return SmallFloat.intToByte4(state.getUniqueTermCount());
  }
  // Otherwise start from the field length and drop overlap tokens when they are discounted.
  int termCount = state.getLength();
  if (discountOverlaps) {
    termCount -= state.getNumOverlap();
  }
  return SmallFloat.intToByte4(termCount);
}

/**
* Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
Expand All @@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity {
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
private static final double LOG_2 = Math.log(2);

/**
* True if overlap tokens (tokens with a position increment of zero) are discounted from the
* document's length.
*/
protected boolean discountOverlaps = true;

/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
public SimilarityBase() {}

/**
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
*
* @lucene.experimental
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
/** Default constructor: parameter-free */
public SimilarityBase() {
super();
}

/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
/** Primary constructor. */
public SimilarityBase(boolean discountOverlaps) {
super(discountOverlaps);
}

@Override
Expand Down Expand Up @@ -179,20 +158,6 @@ protected Explanation explain(BasicStats stats, Explanation freq, double docLen)
}
}

/**
 * Encodes the document length in the same way as {@link BM25Similarity}.
 *
 * @param state accumulated state of term processing for this field
 * @return the term count encoded with {@link SmallFloat#intToByte4}
 */
@Override
public final long computeNorm(FieldInvertState state) {
  final boolean useUniqueTerms =
      state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8;
  if (useUniqueTerms) {
    return SmallFloat.intToByte4(state.getUniqueTermCount());
  }
  if (discountOverlaps) {
    // Overlap tokens do not count towards the encoded length.
    return SmallFloat.intToByte4(state.getLength() - state.getNumOverlap());
  }
  return SmallFloat.intToByte4(state.getLength());
}

// ----------------------------- Static methods ------------------------------

/** Returns the base two logarithm of {@code x}. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
Expand Down Expand Up @@ -326,33 +324,14 @@
*/
public abstract class TFIDFSimilarity extends Similarity {

/** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
public TFIDFSimilarity() {}

/**
* True if overlap tokens (tokens with a position increment of zero) are discounted from the
* document's length.
*/
protected boolean discountOverlaps = true;

/**
* Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap tokens do not count when computing norms.
*
* @lucene.experimental
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
discountOverlaps = v;
/** Default constructor: parameter-free */
public TFIDFSimilarity() {
super();
}

/**
* Returns true if overlap tokens are discounted from the document's length.
*
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
return discountOverlaps;
/** Primary constructor. */
public TFIDFSimilarity(boolean discountOverlaps) {
super(discountOverlaps);
}

/**
Expand Down Expand Up @@ -438,7 +417,7 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
/**
* Compute an index-time normalization value for this field instance.
*
* @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean)
* @param length the number of terms in the field, optionally {@link #getDiscountOverlaps()
* discounting overlaps}
* @return a length normalization value
*/
Expand All @@ -453,19 +432,6 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
}
}

/**
 * Encodes the number of terms in the field as a norm using {@link SmallFloat#intToByte4}.
 *
 * @param state accumulated state of term processing for this field
 * @return the encoded norm value
 */
@Override
public final long computeNorm(FieldInvertState state) {
  int fieldLength;
  if (state.getIndexOptions() != IndexOptions.DOCS || state.getIndexCreatedVersionMajor() < 8) {
    // Use the field length, minus overlap tokens when those are discounted.
    fieldLength = state.getLength();
    if (discountOverlaps) {
      fieldLength -= state.getNumOverlap();
    }
  } else {
    // DOCS-only field in an index created by major version 8+: use the unique term count.
    fieldLength = state.getUniqueTermCount();
  }
  return SmallFloat.intToByte4(fieldLength);
}

@Override
public final SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
Expand Down
Loading

0 comments on commit 7c056ab

Please sign in to comment.