Skip to content

Commit

Permalink
Corrected the expansion of overlapping terms in the unified highlight…
Browse files Browse the repository at this point in the history
…er (elastic#101912)

This commit fixes a bug in the passage formatter of the unified highlighter, where overlapping terms were not correctly expanded to be highlighted as a single span. Matches are sorted by ascending start offset and then by ascending end offset, so a later overlapping match can end before an earlier one does (for example, a short term contained inside a longer match). The expansion logic now keeps the maximum end offset seen so far instead of always taking the end offset of the last overlapping match.
  • Loading branch information
jimczi committed Nov 10, 2023
1 parent df73595 commit 3c4ab8a
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public Snippet[] format(Passage[] passages, String content) {
assert end > start;
// Look ahead to expand 'end' past all overlapping:
while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
end = passage.getMatchEnds()[++i];
end = Math.max(passage.getMatchEnds()[++i], end);
}
end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
Expand All @@ -35,11 +39,15 @@
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.util.ResourceLoader;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.io.StringReader;
import java.text.BreakIterator;
import java.text.ParseException;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
Expand Down Expand Up @@ -153,9 +161,9 @@ private void assertHighlightOneDoc(
true
);
final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
assertEquals(snippets.length, expectedPassages.length);
assertEquals(expectedPassages.length, snippets.length);
for (int i = 0; i < snippets.length; i++) {
assertEquals(snippets[i].getText(), expectedPassages[i]);
assertEquals(expectedPassages[i], snippets[i].getText());
}
}
}
Expand Down Expand Up @@ -356,6 +364,42 @@ public void testOverlappingTerms() throws Exception {
assertHighlightOneDoc("text", inputs, analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
}

/**
 * Synonym filter factory used by the highlighter tests. Instead of reading synonyms from a
 * resource, it always builds its {@link SynonymMap} from one fixed rule that maps
 * "new york city" to both "nyc" and "new york city".
 */
public static class NYCFilterFactory extends SynonymFilterFactory {
    /** The single rule fed to the synonym parser. */
    private static final String NYC_RULE = "new york city => nyc, new york city";

    public NYCFilterFactory(Map<String, String> args) {
        super(args);
    }

    /**
     * Builds the synonym map from the fixed rule. The {@code loader}, {@code cname} and
     * {@code dedup} arguments are not consulted; only {@code analyzer} is handed to the parser,
     * which is created with {@code false} for both of its boolean constructor flags.
     */
    @Override
    protected SynonymMap loadSynonyms(ResourceLoader loader, String cname, boolean dedup, Analyzer analyzer) throws IOException,
        ParseException {
        SolrSynonymParser rules = new SolrSynonymParser(false, false, analyzer);
        rules.parse(new StringReader(NYC_RULE));
        return rules.build();
    }
}

/**
 * Highlights "new york city" with an analyzer that also emits the synonym "nyc", using a query
 * that matches either "nyc" or all three individual words. The overlapping matches are expected
 * to be rendered as one highlighted fragment covering the whole phrase.
 */
public void testOverlappingPositions() throws Exception {
    final String field = "text";

    // Inner clause: every individual word of the phrase is required.
    BooleanQuery.Builder allWords = new BooleanQuery.Builder();
    for (String word : new String[] { "new", "york", "city" }) {
        allWords.add(new TermQuery(new Term(field, word)), BooleanClause.Occur.MUST);
    }

    // Middle clause: either the synonym or the three individual words.
    BooleanQuery.Builder synonymOrWords = new BooleanQuery.Builder();
    synonymOrWords.add(new TermQuery(new Term(field, "nyc")), BooleanClause.Occur.SHOULD);
    synonymOrWords.add(allWords.build(), BooleanClause.Occur.SHOULD);

    // Outer query: the middle clause is mandatory.
    BooleanQuery.Builder outer = new BooleanQuery.Builder();
    outer.add(synonymOrWords.build(), BooleanClause.Occur.MUST);
    BooleanQuery query = outer.build();

    Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer(StandardTokenizerFactory.class)
        .addTokenFilter(NYCFilterFactory.class, "synonyms", "N/A")
        .build();

    final String[] inputs = { "new york city" };
    final String[] outputs = { "<b>new york city</b>" };
    BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
    assertHighlightOneDoc(field, inputs, analyzer, query, Locale.ROOT, sentences, 0, outputs);
}

public void testExceedMaxAnalyzedOffset() throws Exception {
TermQuery query = new TermQuery(new Term("text", "max"));
Analyzer analyzer = CustomAnalyzer.builder()
Expand Down

0 comments on commit 3c4ab8a

Please sign in to comment.