Skip to content

Commit

Permalink
WordBreakSpellChecker.suggestWordBreaks now does a breadth first sear…
Browse files Browse the repository at this point in the history
…ch (apache#12100)
  • Loading branch information
hossman committed Jul 10, 2024
1 parent 49e7810 commit cc854f4
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 35 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,9 @@ Optimizations

* GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)

* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
better matches with fewer evaluations (hossman)

Changes in runtime behavior
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
import java.util.Queue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;

/**
* A spell checker whose sole function is to offer suggestions by combining multiple terms into one
Expand Down Expand Up @@ -246,51 +249,80 @@ private int generateBreakUpSuggestions(
int totalEvaluations,
BreakSuggestionSortMethod sortMethod)
throws IOException {
String termText = term.text();
int termLength = termText.codePointCount(0, termText.length());
int useMinBreakWordLength = minBreakWordLength;
if (useMinBreakWordLength < 1) {
useMinBreakWordLength = 1;
}
final String termText = term.text();
final int termLength = termText.codePointCount(0, termText.length());
final int useMinBreakWordLength = Math.max(minBreakWordLength, 1);

if (termLength < (useMinBreakWordLength * 2)) {
return totalEvaluations;
}

for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
// Two phase breadth first search.
//
// Phase #1: checking for bi-sects of our termText, and recording the
// positions of valid leftText splits for later

final int maxSplitPosition = termLength - useMinBreakWordLength;
final BitSet validLeftSplitPositions = new FixedBitSet(termText.length());

for (int i = useMinBreakWordLength; i <= maxSplitPosition; i++) {
if (totalEvaluations >= maxEvaluations) {
break;
return totalEvaluations;
}
totalEvaluations++;

int end = termText.offsetByCodePoints(0, i);
String leftText = termText.substring(0, end);
String rightText = termText.substring(end);
SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
final int end = termText.offsetByCodePoints(0, i);
final String leftText = termText.substring(0, end);
final String rightText = termText.substring(end);

final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
if (leftWord.freq >= useMinSuggestionFrequency) {
SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
validLeftSplitPositions.set(end);
final SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
if (rightWord.freq >= useMinSuggestionFrequency) {
SuggestWordArrayWrapper suggestion =
new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord));
suggestions.offer(suggestion);
suggestions.offer(
new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord)));
if (suggestions.size() > maxSuggestions) {
suggestions.poll();
}
}
int newNumberBreaks = numberBreaks + 1;
if (newNumberBreaks <= maxChanges) {
totalEvaluations =
generateBreakUpSuggestions(
new Term(term.field(), rightWord.string),
ir,
newNumberBreaks,
maxSuggestions,
useMinSuggestionFrequency,
newPrefix(prefix, leftWord),
suggestions,
totalEvaluations,
sortMethod);
}
}
}

// if we are about to exceed our maxChanges *OR* we have a full list of
// suggestions, we can return now.
//
// (because any subsequent suggestions are garunteed to have more changes
// then anything currently in the queue, and not be competitive)

final int newNumberBreaks = numberBreaks + 1;
if (totalEvaluations >= maxEvaluations
|| newNumberBreaks > maxChanges
|| suggestions.size() >= maxSuggestions) {
return totalEvaluations;
}

// Phase #2: recursing on the right side of any valid leftText terms
final BitSetIterator leftIter = new BitSetIterator(validLeftSplitPositions, 0);
for (int pos = leftIter.nextDoc();
pos != BitSetIterator.NO_MORE_DOCS;
pos = leftIter.nextDoc()) {
final String leftText = termText.substring(0, pos);
final String rightText = termText.substring(pos);
final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
totalEvaluations =
generateBreakUpSuggestions(
new Term(term.field(), rightText),
ir,
newNumberBreaks,
maxSuggestions,
useMinSuggestionFrequency,
newPrefix(prefix, leftWord),
suggestions,
totalEvaluations,
sortMethod);
if (totalEvaluations >= maxEvaluations) {
break;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,20 +89,57 @@ public void tearDown() throws Exception {
}

public void testMaxEvaluations() throws Exception {
final int maxEvals = 100;
try (IndexReader ir = DirectoryReader.open(dir)) {

final String input = "ab".repeat(5);
final int maxEvals = 100;
final int maxSuggestions = maxEvals * 2; // plenty

WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);

wbsp.setMaxChanges(2 * input.length()); // plenty
wbsp.setMaxEvaluations(maxEvals);

Term term = new Term("abba", input);
SuggestWord[][] sw =
wbsp.suggestWordBreaks(
term,
maxSuggestions,
ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);

// sanity check that our suggester isn't completely broken
MatcherAssert.assertThat(sw.length, greaterThan(0));

// if maxEvaluations is respected, we can't possibly have more suggestions than that
MatcherAssert.assertThat(sw.length, lessThan(maxEvals));
}
}

public void testSmallMaxEvaluations() throws Exception {
// even using small maxEvals (relative to maxChanges) should produce
// good results if possible

try (IndexReader ir = DirectoryReader.open(dir)) {

final int maxEvals = TestUtil.nextInt(random(), 6, 20);
final int maxSuggestions = maxEvals * 10; // plenty

WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
wbsp.setMaxChanges(10);
wbsp.setMinBreakWordLength(1);
wbsp.setMinSuggestionFrequency(1);
wbsp.setMaxEvaluations(100);

Term term = new Term("abba", "ab".repeat(5));
wbsp.setMaxChanges(maxEvals * 10); // plenty
wbsp.setMaxEvaluations(maxEvals);

Term term = new Term("abba", "ababab");
SuggestWord[][] sw =
wbsp.suggestWordBreaks(
term,
500,
maxSuggestions,
ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
Expand All @@ -112,6 +149,12 @@ public void testMaxEvaluations() throws Exception {

// if maxEvaluations is respected, we can't possibly have more suggestions than that
MatcherAssert.assertThat(sw.length, lessThan(maxEvals));

// we should have been able to find this "optimal" (due to fewest num changes)
// suggestion before hitting our small maxEvals (and before any suggests with more breaks)
assertEquals(2, sw[0].length);
assertEquals("aba", sw[0][0].string);
assertEquals("bab", sw[0][1].string);
}
}

Expand Down

0 comments on commit cc854f4

Please sign in to comment.