From cc854f408fd06553de4780b1874992c16b71f26e Mon Sep 17 00:00:00 2001 From: Chris Hostetter Date: Wed, 10 Jul 2024 11:07:30 -0700 Subject: [PATCH] WordBreakSpellChecker.suggestWordBreaks now does a breadth first search (#12100) --- lucene/CHANGES.txt | 3 + .../search/spell/WordBreakSpellChecker.java | 92 +++++++++++++------ .../spell/TestWordBreakSpellChecker.java | 53 ++++++++++- 3 files changed, 113 insertions(+), 35 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 26a7c06e4833..d67318da6785 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -282,6 +282,9 @@ Optimizations * GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent) +* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return + better matches with fewer evaluations (hossman) + Changes in runtime behavior --------------------- diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java index b5572f717d60..b3a84e9aaf82 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java @@ -22,6 +22,9 @@ import java.util.Queue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; +import org.apache.lucene.util.BitSet; +import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.FixedBitSet; /** * A spell checker whose sole function is to offer suggestions by combining multiple terms into one @@ -246,51 +249,80 @@ private int generateBreakUpSuggestions( int totalEvaluations, BreakSuggestionSortMethod sortMethod) throws IOException { - String termText = term.text(); - int termLength = termText.codePointCount(0, termText.length()); - int useMinBreakWordLength = minBreakWordLength; - if (useMinBreakWordLength < 1) { - useMinBreakWordLength = 1; - } + final String termText = term.text(); + final int termLength = termText.codePointCount(0, termText.length()); + final int useMinBreakWordLength = Math.max(minBreakWordLength, 1); if (termLength < (useMinBreakWordLength * 2)) { return totalEvaluations; } - for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) { + // Two phase breadth first search. + // + // Phase #1: checking for bi-sects of our termText, and recording the + // positions of valid leftText splits for later + + final int maxSplitPosition = termLength - useMinBreakWordLength; + final BitSet validLeftSplitPositions = new FixedBitSet(termText.length()); + + for (int i = useMinBreakWordLength; i <= maxSplitPosition; i++) { if (totalEvaluations >= maxEvaluations) { - break; + return totalEvaluations; } totalEvaluations++; - int end = termText.offsetByCodePoints(0, i); - String leftText = termText.substring(0, end); - String rightText = termText.substring(end); - SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText); + final int end = termText.offsetByCodePoints(0, i); + final String leftText = termText.substring(0, end); + final String rightText = termText.substring(end); + + final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText); if (leftWord.freq >= useMinSuggestionFrequency) { - SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText); + validLeftSplitPositions.set(end); + final SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText); if (rightWord.freq >= useMinSuggestionFrequency) { - SuggestWordArrayWrapper suggestion = - new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord)); - suggestions.offer(suggestion); + suggestions.offer( + new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord))); if (suggestions.size() > maxSuggestions) { suggestions.poll(); } } - int newNumberBreaks = numberBreaks + 1; - if (newNumberBreaks <= maxChanges) { - totalEvaluations = - generateBreakUpSuggestions( - new Term(term.field(), rightWord.string), - ir, - newNumberBreaks, - maxSuggestions, - useMinSuggestionFrequency, - newPrefix(prefix, leftWord), - suggestions, - totalEvaluations, - sortMethod); - } + } + } + + // if we are about to exceed our maxChanges *OR* we have a full list of + // suggestions, we can return now. + // + // (because any subsequent suggestions are garunteed to have more changes + // then anything currently in the queue, and not be competitive) + + final int newNumberBreaks = numberBreaks + 1; + if (totalEvaluations >= maxEvaluations + || newNumberBreaks > maxChanges + || suggestions.size() >= maxSuggestions) { + return totalEvaluations; + } + + // Phase #2: recursing on the right side of any valid leftText terms + final BitSetIterator leftIter = new BitSetIterator(validLeftSplitPositions, 0); + for (int pos = leftIter.nextDoc(); + pos != BitSetIterator.NO_MORE_DOCS; + pos = leftIter.nextDoc()) { + final String leftText = termText.substring(0, pos); + final String rightText = termText.substring(pos); + final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText); + totalEvaluations = + generateBreakUpSuggestions( + new Term(term.field(), rightText), + ir, + newNumberBreaks, + maxSuggestions, + useMinSuggestionFrequency, + newPrefix(prefix, leftWord), + suggestions, + totalEvaluations, + sortMethod); + if (totalEvaluations >= maxEvaluations) { + break; } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java index 83b7066fa0b9..38121ced691e 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java @@ -89,20 +89,57 @@ public void tearDown() throws Exception { } public void testMaxEvaluations() throws Exception { - final int maxEvals = 100; + try (IndexReader ir = DirectoryReader.open(dir)) { + + final String input = "ab".repeat(5); + final int maxEvals = 100; + final int maxSuggestions = maxEvals * 2; // plenty + + WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); + wbsp.setMinBreakWordLength(1); + wbsp.setMinSuggestionFrequency(1); + + wbsp.setMaxChanges(2 * input.length()); // plenty + wbsp.setMaxEvaluations(maxEvals); + + Term term = new Term("abba", input); + SuggestWord[][] sw = + wbsp.suggestWordBreaks( + term, + maxSuggestions, + ir, + SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, + BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); + + // sanity check that our suggester isn't completely broken + MatcherAssert.assertThat(sw.length, greaterThan(0)); + + // if maxEvaluations is respected, we can't possibly have more suggestions than that + MatcherAssert.assertThat(sw.length, lessThan(maxEvals)); + } + } + + public void testSmallMaxEvaluations() throws Exception { + // even using small maxEvals (relative to maxChanges) should produce + // good results if possible try (IndexReader ir = DirectoryReader.open(dir)) { + + final int maxEvals = TestUtil.nextInt(random(), 6, 20); + final int maxSuggestions = maxEvals * 10; // plenty + WordBreakSpellChecker wbsp = new WordBreakSpellChecker(); - wbsp.setMaxChanges(10); wbsp.setMinBreakWordLength(1); wbsp.setMinSuggestionFrequency(1); - wbsp.setMaxEvaluations(100); - Term term = new Term("abba", "ab".repeat(5)); + wbsp.setMaxChanges(maxEvals * 10); // plenty + wbsp.setMaxEvaluations(maxEvals); + + Term term = new Term("abba", "ababab"); SuggestWord[][] sw = wbsp.suggestWordBreaks( term, - 500, + maxSuggestions, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY); @@ -112,6 +149,12 @@ public void testMaxEvaluations() throws Exception { // if maxEvaluations is respected, we can't possibly have more suggestions than that MatcherAssert.assertThat(sw.length, lessThan(maxEvals)); + + // we should have been able to find this "optimal" (due to fewest num changes) + // suggestion before hitting our small maxEvals (and before any suggests with more breaks) + assertEquals(2, sw[0].length); + assertEquals("aba", sw[0][0].string); + assertEquals("bab", sw[0][1].string); } }