diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
index f6dacd54c1e6..274bb475d6c7 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BM25Similarity.java
@@ -18,8 +18,6 @@
import java.util.ArrayList;
import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -33,7 +31,6 @@
public class BM25Similarity extends Similarity {
private final float k1;
private final float b;
- private final boolean discountOverlaps;
/**
* BM25 with the supplied parameter values.
@@ -46,6 +43,7 @@ public class BM25Similarity extends Similarity {
* within the range {@code [0..1]}
*/
public BM25Similarity(float k1, float b, boolean discountOverlaps) {
+ super(discountOverlaps);
if (Float.isFinite(k1) == false || k1 < 0) {
throw new IllegalArgumentException(
"illegal k1 value: " + k1 + ", must be a non-negative finite value");
@@ -55,7 +53,6 @@ public BM25Similarity(float k1, float b, boolean discountOverlaps) {
}
this.k1 = k1;
this.b = b;
- this.discountOverlaps = discountOverlaps;
}
/**
@@ -110,15 +107,6 @@ protected float avgFieldLength(CollectionStatistics collectionStats) {
return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
}
- /**
- * Returns true if overlap tokens are discounted from the document's length.
- *
- * @see #BM25Similarity(float, float, boolean)
- */
- public boolean getDiscountOverlaps() {
- return discountOverlaps;
- }
-
/** Cache of decoded bytes. */
private static final float[] LENGTH_TABLE = new float[256];
@@ -128,19 +116,6 @@ public boolean getDiscountOverlaps() {
}
}
- @Override
- public final long computeNorm(FieldInvertState state) {
- final int numTerms;
- if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
- numTerms = state.getUniqueTermCount();
- } else if (discountOverlaps) {
- numTerms = state.getLength() - state.getNumOverlap();
- } else {
- numTerms = state.getLength();
- }
- return SmallFloat.intToByte4(numTerms);
- }
-
/**
* Computes a score factor for a simple term and returns an explanation for that score factor.
*
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
index ca1d579c93c7..db28e8454488 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/BooleanSimilarity.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.search.similarities;
-import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -25,22 +24,15 @@
* Simple similarity that gives terms a score that is equal to their query boost. This similarity is
* typically used with disabled norms since neither document statistics nor index statistics are
* used for scoring. That said, if norms are enabled, they will be computed the same way as {@link
- * SimilarityBase} and {@link BM25Similarity} with {@link
- * SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps} so that the {@link Similarity}
- * can be changed after the index has been created.
+ * SimilarityBase} and {@link BM25Similarity} with {@link SimilarityBase#getDiscountOverlaps()
+ * discounted overlaps} so that the {@link Similarity} can be changed after the index has been
+ * created.
*/
public class BooleanSimilarity extends Similarity {
- private static final Similarity BM25_SIM = new BM25Similarity();
-
/** Sole constructor */
public BooleanSimilarity() {}
- @Override
- public long computeNorm(FieldInvertState state) {
- return BM25_SIM.computeNorm(state);
- }
-
@Override
public SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
index 84b688feda7b..5fad406c5856 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/ClassicSimilarity.java
@@ -26,8 +26,15 @@
*/
public class ClassicSimilarity extends TFIDFSimilarity {
- /** Sole constructor: parameter-free */
- public ClassicSimilarity() {}
+ /** Default constructor: parameter-free */
+ public ClassicSimilarity() {
+ super();
+ }
+
+ /** Primary constructor. */
+ public ClassicSimilarity(boolean discountOverlaps) {
+ super(discountOverlaps);
+ }
/**
* Implemented as 1/sqrt(length)
.
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
index f47c05be1f02..0b3c1a5e7f02 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/DFRSimilarity.java
@@ -94,6 +94,27 @@ public class DFRSimilarity extends SimilarityBase {
*/
public DFRSimilarity(
BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) {
+ this(basicModel, afterEffect, normalization, true);
+ }
+
+ /**
+ * Creates DFRSimilarity from the three components.
+ *
+ *
Note that null
values are not allowed: if you want no normalization, instead
+ * pass {@link NoNormalization}.
+ *
+ * @param basicModel Basic model of information content
+ * @param afterEffect First normalization of information gain
+ * @param normalization Second (length) normalization
+ * @param discountOverlaps True if overlap tokens (tokens with a position of increment of zero)
+ * are discounted from the document's length.
+ */
+ public DFRSimilarity(
+ BasicModel basicModel,
+ AfterEffect afterEffect,
+ Normalization normalization,
+ boolean discountOverlaps) {
+ super(discountOverlaps);
if (basicModel == null || afterEffect == null || normalization == null) {
throw new NullPointerException("null parameters not allowed.");
}
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
index 24022e832774..83582e44e25f 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/Similarity.java
@@ -17,8 +17,10 @@
package org.apache.lucene.search.similarities;
import java.util.Collections;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@@ -45,7 +47,7 @@
* is in this norm, but it is most useful for encoding length normalization information.
*
*
Implementations should carefully consider how the normalization is encoded: while Lucene's - * {@link BM25Similarity} encodes length normalization information with {@link SmallFloat} into a + * default implementation encodes length normalization information with {@link SmallFloat} into a * single byte, this might not be suitable for all purposes. * *
Many formulas require the use of average document length, which can be computed via a @@ -88,13 +90,49 @@ * @lucene.experimental */ public abstract class Similarity { - /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ - // Explicitly declared so that we have non-empty javadoc - protected Similarity() {} + /** + * True if overlap tokens (tokens with a position of increment of zero) are discounted from the + * document's length. + */ + private final boolean discountOverlaps; + + /** + * Returns true if overlap tokens are discounted from the document's length. + * + * @see #computeNorm + */ + public final boolean getDiscountOverlaps() { + return discountOverlaps; + } + + /** Default constructor. (For invocation by subclass constructors, typically implicit.) */ + protected Similarity() { + this(true); + } + + /** + * Expert constructor that allows adjustment of {@link #getDiscountOverlaps()} at index-time. + * + *
Overlap tokens are tokens such as synonyms, that have a {@link PositionIncrementAttribute} + * of zero from the analysis chain. + * + *
NOTE: If you modify this parameter, you'll need to re-index for it to take effect. + * + * @param discountOverlaps true if overlap tokens should not impact document length for scoring. + */ + protected Similarity(boolean discountOverlaps) { + this.discountOverlaps = discountOverlaps; + } /** - * Computes the normalization value for a field, given the accumulated state of term processing - * for this field (see {@link FieldInvertState}). + * Computes the normalization value for a field at index-time. + * + *
The default implementation uses {@link SmallFloat#intToByte4} to encode the number of terms + * as a single byte. + * + *
WARNING: The default implementation is used by Lucene's supplied Similarity classes, + * which means you can change the Similarity at runtime without reindexing. If you override this + * method, you'll need to re-index documents for it to take effect. * *
Matches in longer fields are less precise, so implementations of this method usually set
* smaller values when {@code 0} is not a legal norm, so {@code 1} is the norm that produces the highest scores.
*
* @lucene.experimental
- * @param state current processing state for this field
+ * @param state accumulated state of term processing for this field
* @return computed norm value
*/
- public abstract long computeNorm(FieldInvertState state);
+ public long computeNorm(FieldInvertState state) {
+ final int numTerms;
+ if (state.getIndexOptions() == IndexOptions.DOCS) {
+ numTerms = state.getUniqueTermCount();
+ } else if (discountOverlaps) {
+ numTerms = state.getLength() - state.getNumOverlap();
+ } else {
+ numTerms = state.getLength();
+ }
+ return SmallFloat.intToByte4(numTerms);
+ }
/**
* Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
index ef5366b60380..af63b7bebeb6 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/SimilarityBase.java
@@ -18,8 +18,6 @@
import java.util.ArrayList;
import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
@@ -43,33 +41,14 @@ public abstract class SimilarityBase extends Similarity {
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
private static final double LOG_2 = Math.log(2);
- /**
- * True if overlap tokens (tokens with a position of increment of zero) are discounted from the
- * document's length.
- */
- protected boolean discountOverlaps = true;
-
- /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
- public SimilarityBase() {}
-
- /**
- * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
- * norm. By default this is true, meaning overlap tokens do not count when computing norms.
- *
- * @lucene.experimental
- * @see #computeNorm
- */
- public void setDiscountOverlaps(boolean v) {
- discountOverlaps = v;
+ /** Default constructor: parameter-free */
+ public SimilarityBase() {
+ super();
}
- /**
- * Returns true if overlap tokens are discounted from the document's length.
- *
- * @see #setDiscountOverlaps
- */
- public boolean getDiscountOverlaps() {
- return discountOverlaps;
+ /** Primary constructor. */
+ public SimilarityBase(boolean discountOverlaps) {
+ super(discountOverlaps);
}
@Override
@@ -179,20 +158,6 @@ protected Explanation explain(BasicStats stats, Explanation freq, double docLen)
}
}
- /** Encodes the document length in the same way as {@link BM25Similarity}. */
- @Override
- public final long computeNorm(FieldInvertState state) {
- final int numTerms;
- if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
- numTerms = state.getUniqueTermCount();
- } else if (discountOverlaps) {
- numTerms = state.getLength() - state.getNumOverlap();
- } else {
- numTerms = state.getLength();
- }
- return SmallFloat.intToByte4(numTerms);
- }
-
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */
diff --git a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
index 906c7dd2875c..c81ef763862d 100644
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/TFIDFSimilarity.java
@@ -18,8 +18,6 @@
import java.util.ArrayList;
import java.util.List;
-import org.apache.lucene.index.FieldInvertState;
-import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
@@ -326,33 +324,14 @@
*/
public abstract class TFIDFSimilarity extends Similarity {
- /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
- public TFIDFSimilarity() {}
-
- /**
- * True if overlap tokens (tokens with a position of increment of zero) are discounted from the
- * document's length.
- */
- protected boolean discountOverlaps = true;
-
- /**
- * Determines whether overlap tokens (Tokens with 0 position increment) are ignored when computing
- * norm. By default this is true, meaning overlap tokens do not count when computing norms.
- *
- * @lucene.experimental
- * @see #computeNorm
- */
- public void setDiscountOverlaps(boolean v) {
- discountOverlaps = v;
+ /** Default constructor: parameter-free */
+ public TFIDFSimilarity() {
+ super();
}
- /**
- * Returns true if overlap tokens are discounted from the document's length.
- *
- * @see #setDiscountOverlaps
- */
- public boolean getDiscountOverlaps() {
- return discountOverlaps;
+ /** Primary constructor. */
+ public TFIDFSimilarity(boolean discountOverlaps) {
+ super(discountOverlaps);
}
/**
@@ -438,7 +417,7 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
/**
* Compute an index-time normalization value for this field instance.
*
- * @param length the number of terms in the field, optionally {@link #setDiscountOverlaps(boolean)
+ * @param length the number of terms in the field, optionally {@link #getDiscountOverlaps()
* discounting overlaps}
* @return a length normalization value
*/
@@ -453,19 +432,6 @@ public Explanation idfExplain(CollectionStatistics collectionStats, TermStatisti
}
}
- @Override
- public final long computeNorm(FieldInvertState state) {
- final int numTerms;
- if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) {
- numTerms = state.getUniqueTermCount();
- } else if (discountOverlaps) {
- numTerms = state.getLength() - state.getNumOverlap();
- } else {
- numTerms = state.getLength();
- }
- return SmallFloat.intToByte4(numTerms);
- }
-
@Override
public final SimScorer scorer(
float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
diff --git a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
index fbdecaae2b45..b24bd9c5afb1 100644
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestSimilarityBase.java
@@ -526,17 +526,17 @@ public void tearDown() throws Exception {
// LUCENE-5221
public void testDiscountOverlapsBoost() throws IOException {
- BM25Similarity expected = new BM25Similarity(false);
- SimilarityBase actual =
- new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2());
- actual.setDiscountOverlaps(false);
+ final BM25Similarity expected0 = new BM25Similarity(false);
+ final SimilarityBase actual0 =
+ new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), false);
FieldInvertState state =
new FieldInvertState(Version.LATEST.major, "foo", IndexOptions.DOCS_AND_FREQS);
state.setLength(5);
state.setNumOverlap(2);
- assertEquals(expected.computeNorm(state), actual.computeNorm(state));
- expected = new BM25Similarity();
- actual.setDiscountOverlaps(true);
- assertEquals(expected.computeNorm(state), actual.computeNorm(state));
+ assertEquals(expected0.computeNorm(state), actual0.computeNorm(state));
+ final BM25Similarity expected1 = new BM25Similarity(true);
+ final SimilarityBase actual1 =
+ new DFRSimilarity(new BasicModelIne(), new AfterEffectB(), new NormalizationH2(), true);
+ assertEquals(expected1.computeNorm(state), actual1.computeNorm(state));
}
}
diff --git a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
index 3c49e215feda..4d5706be0f2a 100644
--- a/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
+++ b/lucene/luke/src/java/org/apache/lucene/luke/models/search/SearchImpl.java
@@ -395,8 +395,7 @@ private Similarity createSimilarity(SimilarityConfig config) {
Similarity similarity;
if (config.isUseClassicSimilarity()) {
- ClassicSimilarity tfidf = new ClassicSimilarity();
- tfidf.setDiscountOverlaps(config.isDiscountOverlaps());
+ ClassicSimilarity tfidf = new ClassicSimilarity(config.isDiscountOverlaps());
similarity = tfidf;
} else {
BM25Similarity bm25 =
diff --git a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
index 3b667f3af167..44e32202fa88 100644
--- a/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
+++ b/lucene/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
@@ -45,10 +45,16 @@ public class SweetSpotSimilarity extends ClassicSimilarity {
private double tf_hyper_base = 1.3d;
private float tf_hyper_xoffset = 10.0f;
+ /** Default constructor: parameter-free */
public SweetSpotSimilarity() {
super();
}
+ /** Primary constructor. */
+ public SweetSpotSimilarity(boolean discountOverlaps) {
+ super(discountOverlaps);
+ }
+
/**
* Sets the baseline and minimum function variables for baselineTf
*
@@ -82,11 +88,10 @@ public void setHyperbolicTfFactors(float min, float max, double base, float xoff
*
* @see #lengthNorm
*/
- public void setLengthNormFactors(int min, int max, float steepness, boolean discountOverlaps) {
+ public void setLengthNormFactors(int min, int max, float steepness) {
this.ln_min = min;
this.ln_max = max;
this.ln_steep = steepness;
- this.discountOverlaps = discountOverlaps;
}
/**
diff --git a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java
index 17094692c7c3..feb66476005d 100644
--- a/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java
+++ b/lucene/misc/src/test/org/apache/lucene/misc/TestSweetSpotSimilarity.java
@@ -74,7 +74,7 @@ private static Explanation findExplanation(Explanation expl, String text) {
public void testSweetSpotComputeNorm() throws IOException {
final SweetSpotSimilarity ss = new SweetSpotSimilarity();
- ss.setLengthNormFactors(1, 1, 0.5f, true);
+ ss.setLengthNormFactors(1, 1, 0.5f);
Similarity d = new ClassicSimilarity();
Similarity s = ss;
@@ -87,7 +87,7 @@ public void testSweetSpotComputeNorm() throws IOException {
// make a sweet spot
- ss.setLengthNormFactors(3, 10, 0.5f, true);
+ ss.setLengthNormFactors(3, 10, 0.5f);
for (int i = 3; i <= 10; i++) {
assertEquals("3,10: spot i=" + i, 1.0f, computeNorm(ss, "bogus", i), 0.0f);
@@ -101,14 +101,14 @@ public void testSweetSpotComputeNorm() throws IOException {
// separate sweet spot for certain fields
- final SweetSpotSimilarity ssBar = new SweetSpotSimilarity();
- ssBar.setLengthNormFactors(8, 13, 0.5f, false);
- final SweetSpotSimilarity ssYak = new SweetSpotSimilarity();
- ssYak.setLengthNormFactors(6, 9, 0.5f, false);
- final SweetSpotSimilarity ssA = new SweetSpotSimilarity();
- ssA.setLengthNormFactors(5, 8, 0.5f, false);
- final SweetSpotSimilarity ssB = new SweetSpotSimilarity();
- ssB.setLengthNormFactors(5, 8, 0.1f, false);
+ final SweetSpotSimilarity ssBar = new SweetSpotSimilarity(false);
+ ssBar.setLengthNormFactors(8, 13, 0.5f);
+ final SweetSpotSimilarity ssYak = new SweetSpotSimilarity(false);
+ ssYak.setLengthNormFactors(6, 9, 0.5f);
+ final SweetSpotSimilarity ssA = new SweetSpotSimilarity(false);
+ ssA.setLengthNormFactors(5, 8, 0.5f);
+ final SweetSpotSimilarity ssB = new SweetSpotSimilarity(false);
+ ssB.setLengthNormFactors(5, 8, 0.1f);
Similarity sp =
new PerFieldSimilarityWrapper() {
state.getLength()
is large, and larger values when
@@ -108,10 +146,20 @@ protected Similarity() {}
*