WordBreakSpellChecker.suggestWordBreaks now does a breadth first sear…

…ch (apache#12100)
kaivalnp · Jul 10, 2024 · cc854f4 · cc854f4
1 parent 49e7810
commit cc854f4
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 35 deletions.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -282,6 +282,9 @@ Optimizations
 
 * GITHUB#13538: Slightly reduce heap usage for HNSW and scalar quantized vector writers. (Ben Trent)
 
+* GITHUB#12100: WordBreakSpellChecker.suggestWordBreaks now does a breadth first search, allowing it to return
+  better matches with fewer evaluations (hossman)
+
 Changes in runtime behavior
 ---------------------
 

diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/WordBreakSpellChecker.java
@@ -22,6 +22,9 @@
 import java.util.Queue;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.BitSetIterator;
+import org.apache.lucene.util.FixedBitSet;
 
 /**
  * A spell checker whose sole function is to offer suggestions by combining multiple terms into one
@@ -246,51 +249,80 @@ private int generateBreakUpSuggestions(
       int totalEvaluations,
       BreakSuggestionSortMethod sortMethod)
       throws IOException {
-    String termText = term.text();
-    int termLength = termText.codePointCount(0, termText.length());
-    int useMinBreakWordLength = minBreakWordLength;
-    if (useMinBreakWordLength < 1) {
-      useMinBreakWordLength = 1;
-    }
+    final String termText = term.text();
+    final int termLength = termText.codePointCount(0, termText.length());
+    final int useMinBreakWordLength = Math.max(minBreakWordLength, 1);
 
     if (termLength < (useMinBreakWordLength * 2)) {
       return totalEvaluations;
     }
 
-    for (int i = useMinBreakWordLength; i <= (termLength - useMinBreakWordLength); i++) {
+    // Two phase breadth first search.
+    //
+    // Phase #1: checking for bi-sects of our termText, and recording the
+    // positions of valid leftText splits for later
+
+    final int maxSplitPosition = termLength - useMinBreakWordLength;
+    final BitSet validLeftSplitPositions = new FixedBitSet(termText.length());
+
+    for (int i = useMinBreakWordLength; i <= maxSplitPosition; i++) {
       if (totalEvaluations >= maxEvaluations) {
-        break;
+        return totalEvaluations;
       }
       totalEvaluations++;
 
-      int end = termText.offsetByCodePoints(0, i);
-      String leftText = termText.substring(0, end);
-      String rightText = termText.substring(end);
-      SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
+      final int end = termText.offsetByCodePoints(0, i);
+      final String leftText = termText.substring(0, end);
+      final String rightText = termText.substring(end);
+
+      final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
       if (leftWord.freq >= useMinSuggestionFrequency) {
-        SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
+        validLeftSplitPositions.set(end);
+        final SuggestWord rightWord = generateSuggestWord(ir, term.field(), rightText);
         if (rightWord.freq >= useMinSuggestionFrequency) {
-          SuggestWordArrayWrapper suggestion =
-              new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord));
-          suggestions.offer(suggestion);
+          suggestions.offer(
+              new SuggestWordArrayWrapper(newSuggestion(prefix, leftWord, rightWord)));
           if (suggestions.size() > maxSuggestions) {
             suggestions.poll();
           }
         }
-        int newNumberBreaks = numberBreaks + 1;
-        if (newNumberBreaks <= maxChanges) {
-          totalEvaluations =
-              generateBreakUpSuggestions(
-                  new Term(term.field(), rightWord.string),
-                  ir,
-                  newNumberBreaks,
-                  maxSuggestions,
-                  useMinSuggestionFrequency,
-                  newPrefix(prefix, leftWord),
-                  suggestions,
-                  totalEvaluations,
-                  sortMethod);
-        }
+      }
+    }
+
+    // if we are about to exceed our maxChanges *OR* we have a full list of
+    // suggestions, we can return now.
+    //
+    // (because any subsequent suggestions are garunteed to have more changes
+    // then anything currently in the queue, and not be competitive)
+
+    final int newNumberBreaks = numberBreaks + 1;
+    if (totalEvaluations >= maxEvaluations
+        || newNumberBreaks > maxChanges
+        || suggestions.size() >= maxSuggestions) {
+      return totalEvaluations;
+    }
+
+    // Phase #2: recursing on the right side of any valid leftText terms
+    final BitSetIterator leftIter = new BitSetIterator(validLeftSplitPositions, 0);
+    for (int pos = leftIter.nextDoc();
+        pos != BitSetIterator.NO_MORE_DOCS;
+        pos = leftIter.nextDoc()) {
+      final String leftText = termText.substring(0, pos);
+      final String rightText = termText.substring(pos);
+      final SuggestWord leftWord = generateSuggestWord(ir, term.field(), leftText);
+      totalEvaluations =
+          generateBreakUpSuggestions(
+              new Term(term.field(), rightText),
+              ir,
+              newNumberBreaks,
+              maxSuggestions,
+              useMinSuggestionFrequency,
+              newPrefix(prefix, leftWord),
+              suggestions,
+              totalEvaluations,
+              sortMethod);
+      if (totalEvaluations >= maxEvaluations) {
+        break;
       }
     }
 

diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestWordBreakSpellChecker.java
@@ -89,20 +89,57 @@ public void tearDown() throws Exception {
   }
 
   public void testMaxEvaluations() throws Exception {
-    final int maxEvals = 100;
+    try (IndexReader ir = DirectoryReader.open(dir)) {
+
+      final String input = "ab".repeat(5);
+      final int maxEvals = 100;
+      final int maxSuggestions = maxEvals * 2; // plenty
+
+      WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
+      wbsp.setMinBreakWordLength(1);
+      wbsp.setMinSuggestionFrequency(1);
+
+      wbsp.setMaxChanges(2 * input.length()); // plenty
+      wbsp.setMaxEvaluations(maxEvals);
+
+      Term term = new Term("abba", input);
+      SuggestWord[][] sw =
+          wbsp.suggestWordBreaks(
+              term,
+              maxSuggestions,
+              ir,
+              SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
+              BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
+
+      // sanity check that our suggester isn't completely broken
+      MatcherAssert.assertThat(sw.length, greaterThan(0));
+
+      // if maxEvaluations is respected, we can't possibly have more suggestions than that
+      MatcherAssert.assertThat(sw.length, lessThan(maxEvals));
+    }
+  }
+
+  public void testSmallMaxEvaluations() throws Exception {
+    // even using small maxEvals (relative to maxChanges) should produce
+    // good results if possible
 
     try (IndexReader ir = DirectoryReader.open(dir)) {
+
+      final int maxEvals = TestUtil.nextInt(random(), 6, 20);
+      final int maxSuggestions = maxEvals * 10; // plenty
+
       WordBreakSpellChecker wbsp = new WordBreakSpellChecker();
-      wbsp.setMaxChanges(10);
       wbsp.setMinBreakWordLength(1);
       wbsp.setMinSuggestionFrequency(1);
-      wbsp.setMaxEvaluations(100);
 
-      Term term = new Term("abba", "ab".repeat(5));
+      wbsp.setMaxChanges(maxEvals * 10); // plenty
+      wbsp.setMaxEvaluations(maxEvals);
+
+      Term term = new Term("abba", "ababab");
       SuggestWord[][] sw =
           wbsp.suggestWordBreaks(
               term,
-              500,
+              maxSuggestions,
               ir,
               SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
               BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY);
@@ -112,6 +149,12 @@ public void testMaxEvaluations() throws Exception {
 
       // if maxEvaluations is respected, we can't possibly have more suggestions than that
       MatcherAssert.assertThat(sw.length, lessThan(maxEvals));
+
+      // we should have been able to find this "optimal" (due to fewest num changes)
+      // suggestion before hitting our small maxEvals (and before any suggests with more breaks)
+      assertEquals(2, sw[0].length);
+      assertEquals("aba", sw[0][0].string);
+      assertEquals("bab", sw[0][1].string);
     }
   }