From 3c4ab8a2ea9dcfa6e95da36c99ee7105b4faaf45 Mon Sep 17 00:00:00 2001
From: Jim Ferenczi
Date: Fri, 10 Nov 2023 12:24:51 +0000
Subject: [PATCH] Corrected the expansion of overlapping terms in the unified highlighter (#101912)

This commit addresses an issue in the passage formatter of the unified
highlighter, where overlapping terms were not correctly expanded and could
end up highlighted as separate objects instead of a single one. Matches are
sorted by ascending start offset and then by ascending end offset, so a
later match may end before an earlier, longer one; the fix adjusts the
expansion logic to track the maximum end offset seen so far, ensuring the
merged region covers every overlapping match. An illustrative sketch of the
corrected expansion loop appears after the diff.
---
 .../uhighlight/CustomPassageFormatter.java |  2 +-
 .../CustomUnifiedHighlighterTests.java     | 48 ++++++++++++++++++-
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomPassageFormatter.java b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomPassageFormatter.java
index eb87dc982543f..6ae2f53a94ad8 100644
--- a/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomPassageFormatter.java
+++ b/server/src/main/java/org/elasticsearch/lucene/search/uhighlight/CustomPassageFormatter.java
@@ -48,7 +48,7 @@ public Snippet[] format(Passage[] passages, String content) {
                 assert end > start;
                 // Look ahead to expand 'end' past all overlapping:
                 while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
-                    end = passage.getMatchEnds()[++i];
+                    end = Math.max(passage.getMatchEnds()[++i], end);
                 }
                 end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
diff --git a/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
index 5df6840640264..bf249ba4409ab 100644
--- a/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
+++ b/server/src/test/java/org/elasticsearch/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java
@@ -13,6 +13,10 @@
 import org.apache.lucene.analysis.custom.CustomAnalyzer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
+import org.apache.lucene.analysis.synonym.SolrSynonymParser;
+import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
+import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.FieldType;
@@ -35,11 +39,15 @@
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.util.ResourceLoader;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.test.ESTestCase;
 
+import java.io.IOException;
+import java.io.StringReader;
 import java.text.BreakIterator;
+import java.text.ParseException;
 import java.util.Locale;
 import java.util.Map;
 import java.util.TreeMap;
@@ -153,9 +161,9 @@ private void assertHighlightOneDoc(
                 true
             );
             final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
-            assertEquals(snippets.length, expectedPassages.length);
+            assertEquals(expectedPassages.length, snippets.length);
             for (int i = 0; i < snippets.length; i++) {
-                assertEquals(snippets[i].getText(), expectedPassages[i]);
+                assertEquals(expectedPassages[i], snippets[i].getText());
             }
         }
     }
@@ -356,6 +364,42 @@ public void testOverlappingTerms() throws Exception {
         assertHighlightOneDoc("text", inputs, analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
     }
 
+    public static class NYCFilterFactory extends SynonymFilterFactory {
+        public NYCFilterFactory(Map<String, String> args) {
+            super(args);
+        }
+
+        @Override
+        protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException,
+            ParseException {
+            SynonymMap.Parser parser = new SolrSynonymParser(false, false, analyzer);
+            parser.parse(new StringReader("new york city => nyc, new york city"));
+            return parser.build();
+        }
+    }
+
+    public void testOverlappingPositions() throws Exception {
+        final String[] inputs = { "new york city" };
+        final String[] outputs = { "<b>new york city</b>" };
+        BooleanQuery query = new BooleanQuery.Builder().add(
+            new BooleanQuery.Builder().add(new TermQuery(new Term("text", "nyc")), BooleanClause.Occur.SHOULD)
+                .add(
+                    new BooleanQuery.Builder().add(new TermQuery(new Term("text", "new")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("text", "york")), BooleanClause.Occur.MUST)
+                        .add(new TermQuery(new Term("text", "city")), BooleanClause.Occur.MUST)
+                        .build(),
+                    BooleanClause.Occur.SHOULD
+                )
+                .build(),
+            BooleanClause.Occur.MUST
+        ).build();
+        Analyzer analyzer = CustomAnalyzer.builder()
+            .withTokenizer(StandardTokenizerFactory.class)
+            .addTokenFilter(NYCFilterFactory.class, "synonyms", "N/A")
+            .build();
+        assertHighlightOneDoc("text", inputs, analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
+    }
+
     public void testExceedMaxAnalyzedOffset() throws Exception {
         TermQuery query = new TermQuery(new Term("text", "max"));
         Analyzer analyzer = CustomAnalyzer.builder()
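
The sketch referenced in the commit message: a minimal standalone illustration of the merging behaviour the one-line change restores. This is not Elasticsearch code; the class name, the int[] {start, end} match representation, and the sample offsets are hypothetical, and only the look-ahead loop with Math.max mirrors the corrected logic in CustomPassageFormatter.

// Hypothetical sketch, not part of Elasticsearch or Lucene. Given match offsets
// sorted by ascending start and then ascending end, merge overlapping matches
// into single highlight regions by tracking the maximum end offset seen so far.
import java.util.ArrayList;
import java.util.List;

public class OverlapExpansionSketch {

    /**
     * Merges match offsets into highlight regions. The input must be sorted by
     * ascending start offset, then ascending end offset, as passage matches are
     * in the unified highlighter.
     */
    static List<int[]> merge(List<int[]> matches) {
        List<int[]> regions = new ArrayList<>();
        for (int i = 0; i < matches.size(); i++) {
            int start = matches.get(i)[0];
            int end = matches.get(i)[1];
            // Look ahead to expand 'end' past all overlapping matches. Math.max is
            // needed because a later match can start inside the current region yet
            // end before it; taking its end verbatim would shrink the region.
            while (i + 1 < matches.size() && matches.get(i + 1)[0] < end) {
                end = Math.max(matches.get(++i)[1], end);
            }
            regions.add(new int[] { start, end });
        }
        return regions;
    }

    public static void main(String[] args) {
        // Offsets for "new york city" when a synonym token "nyc" spans the whole
        // phrase: new=[0,3], nyc=[0,13], york=[4,8], city=[9,13].
        List<int[]> matches = List.of(
            new int[] { 0, 3 },
            new int[] { 0, 13 },
            new int[] { 4, 8 },
            new int[] { 9, 13 }
        );
        for (int[] region : merge(matches)) {
            System.out.println("[" + region[0] + ", " + region[1] + ")");
        }
        // Prints a single region [0, 13); without Math.max the loop would emit
        // [0, 8) and [9, 13), splitting the highlight in two.
    }
}

Tracking a running maximum is the cheapest way to keep the merged region growing monotonically, since the matches are only guaranteed to be ordered by start offset and re-sorting them by end offset inside the formatter would be unnecessary work.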