From bcc00cb235da5902f8942ad46c89f980f13b1147 Mon Sep 17 00:00:00 2001
From: Michael Froh
Date: Thu, 30 May 2024 22:36:54 +0000
Subject: [PATCH] Add more unit tests

Signed-off-by: Michael Froh
---
 .../index/mapper/WildcardFieldMapper.java     |  25 ++-
 .../mapper/WildcardFieldMapperTests.java      | 176 ++++++++++++++++++
 2 files changed, 195 insertions(+), 6 deletions(-)

diff --git a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java
index 809280833f249..8ac0044077f6b 100644
--- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java
+++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java
@@ -10,8 +10,10 @@
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
 import org.apache.lucene.document.SortedSetDocValuesField;
-import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
@@ -177,6 +179,15 @@ public int ignoreAbove() {
         return ignoreAbove;
     }
 
+    private static final FieldType FIELD_TYPE = new FieldType();
+    static {
+        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
+        FIELD_TYPE.setTokenized(true);
+        FIELD_TYPE.setOmitNorms(true);
+        FIELD_TYPE.setStored(false);
+        FIELD_TYPE.freeze();
+    }
+
     @Override
     protected void parseCreateField(ParseContext context) throws IOException {
         String value;
@@ -204,7 +215,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
             final BytesRef binaryValue = new BytesRef(value);
             Tokenizer tokenizer = new WildcardFieldTokenizer();
             tokenizer.setReader(new StringReader(value));
-            context.doc().add(new TextField(fieldType().name(), tokenizer));
+            context.doc().add(new Field(fieldType().name(), tokenizer, FIELD_TYPE));
             if (fieldType().hasDocValues()) {
                 context.doc().add(new SortedSetDocValuesField(fieldType().name(), binaryValue));
             } else {
@@ -283,13 +294,15 @@ public boolean incrementToken() throws IOException {
                     // Two zeroes usually means we're done.
                     if (length == 3 && charTermAttribute.buffer()[1] != 0) {
                         // The only case where we're not done is if the input has exactly 1 character, so the buffer
-                        // contains 0, char, 0. In that case, we return char,0, and it's our last token.
+                        // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration.
                         charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1];
                         charTermAttribute.buffer()[1] = 0;
-                        charTermAttribute.setLength(2);
-                    } else {
-                        return false;
+                        charTermAttribute.setLength(1);
+                        length = 2;
+                        offset = 1;
+                        return true;
                     }
+                    return false;
                 }
                 if (length == 3) {
                     // Read the next character, overwriting the current offset
diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java
index 2b52a1b91eca0..0ef1b2ed6d03a 100644
--- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java
+++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java
@@ -8,14 +8,36 @@
 
 package org.opensearch.index.mapper;
 
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.IndexableFieldType;
+import org.apache.lucene.util.BytesRef;
 import org.opensearch.core.xcontent.XContentBuilder;
+import org.opensearch.index.IndexSettings;
+import org.opensearch.index.analysis.AnalyzerScope;
+import org.opensearch.index.analysis.CharFilterFactory;
+import org.opensearch.index.analysis.CustomAnalyzer;
+import org.opensearch.index.analysis.IndexAnalyzers;
+import org.opensearch.index.analysis.LowercaseNormalizer;
+import org.opensearch.index.analysis.NamedAnalyzer;
+import org.opensearch.index.analysis.TokenFilterFactory;
+import org.opensearch.index.analysis.TokenizerFactory;
 
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
+
+import static java.util.Collections.singletonMap;
 
 public class WildcardFieldMapperTests extends MapperTestCase {
 
@@ -71,5 +93,159 @@ public void testTokenizer() throws IOException {
             ),
             terms
         );
+        terms.clear();
+        try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) {
+            tokenizer.setReader(new StringReader("a"));
+            tokenizer.reset();
+            CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
+            while (tokenizer.incrementToken()) {
+                terms.add(charTermAttribute.toString());
+            }
+        }
+        assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms);
     }
+
+    public void testEnableDocValues() throws IOException {
+        DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("doc_values", true)));
+        ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234")));
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+        assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
+        assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());
+
+        mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard")));
+        doc = mapper.parse(source(b -> b.field("field", "1234")));
+        fields = doc.rootDoc().getFields("field");
+        assertEquals(1, fields.length);
+        assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
+    }
+
+    @Override
+    protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) {
+        return new IndexAnalyzers(
+            singletonMap("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())),
+            Map.of(
+                "lowercase",
+                new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()),
+                "other_lowercase",
+                new NamedAnalyzer("other_lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())
+            ),
+            singletonMap(
+                "lowercase",
+                new NamedAnalyzer(
+                    "lowercase",
+                    AnalyzerScope.INDEX,
+                    new CustomAnalyzer(
+                        TokenizerFactory.newFactory("lowercase", WhitespaceTokenizer::new),
+                        new CharFilterFactory[0],
+                        new TokenFilterFactory[] { new TokenFilterFactory() {
+
+                            @Override
+                            public String name() {
+                                return "lowercase";
+                            }
+
+                            @Override
+                            public TokenStream create(TokenStream tokenStream) {
+                                return new LowerCaseFilter(tokenStream);
+                            }
+                        } }
+                    )
+                )
+            )
+        );
+    }
+
+    public void testNormalizer() throws IOException {
+        DocumentMapper mapper = createDocumentMapper(
+            fieldMapping(b -> b.field("type", "wildcard").field("normalizer", "lowercase").field("doc_values", true))
+        );
+        ParsedDocument doc = mapper.parse(source(b -> b.field("field", "AbC")));
+
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+
+        assertTrue(fields[0] instanceof Field);
+        Field textField = (Field) fields[0];
+        List<String> terms = new ArrayList<>();
+        try (TokenStream tokenStream = textField.tokenStreamValue()) {
+            tokenStream.reset();
+            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+            while (tokenStream.incrementToken()) {
+                terms.add(charTermAttribute.toString());
+            }
+        }
+        assertEquals(
+            List.of(
+                WildcardFieldTypeTests.prefixAnchored("a"),
+                WildcardFieldTypeTests.prefixAnchored("ab"),
+                "a",
+                "ab",
+                "abc",
+                "b",
+                "bc",
+                WildcardFieldTypeTests.suffixAnchored("bc"),
+                "c",
+                WildcardFieldTypeTests.suffixAnchored("c")
+            ),
+            terms
+        );
+        IndexableFieldType fieldType = fields[0].fieldType();
+        assertTrue(fieldType.omitNorms());
+        assertTrue(fieldType.tokenized());
+        assertFalse(fieldType.stored());
+        assertEquals(IndexOptions.DOCS, fieldType.indexOptions());
+        assertFalse(fieldType.storeTermVectors());
+        assertFalse(fieldType.storeTermVectorOffsets());
+        assertFalse(fieldType.storeTermVectorPositions());
+        assertFalse(fieldType.storeTermVectorPayloads());
+        assertEquals(DocValuesType.NONE, fieldType.docValuesType());
+
+        assertEquals(new BytesRef("abc"), fields[1].binaryValue());
+        fieldType = fields[1].fieldType();
+        assertEquals(IndexOptions.NONE, fieldType.indexOptions());
+        assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType());
+    }
+
+    public void testNullValue() throws IOException {
+        DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
+        ParsedDocument doc = mapper.parse(source(b -> b.nullField("field")));
+        assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field"));
+
+        mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("null_value", "uri").field("doc_values", true)));
+        doc = mapper.parse(source(b -> {}));
+        IndexableField[] fields = doc.rootDoc().getFields("field");
+        assertEquals(0, fields.length);
+        doc = mapper.parse(source(b -> b.nullField("field")));
+        fields = doc.rootDoc().getFields("field");
+        assertEquals(2, fields.length);
+        assertTrue(fields[0] instanceof Field);
+        Field textField = (Field) fields[0];
+        List<String> terms = new ArrayList<>();
+        try (TokenStream tokenStream = textField.tokenStreamValue()) {
+            tokenStream.reset();
+            CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+            while (tokenStream.incrementToken()) {
+                terms.add(charTermAttribute.toString());
+            }
+        }
+        assertEquals(
+            List.of(
+                WildcardFieldTypeTests.prefixAnchored("u"),
+                WildcardFieldTypeTests.prefixAnchored("ur"),
+                "u",
+                "ur",
+                "uri",
+                "r",
+                "ri",
+                WildcardFieldTypeTests.suffixAnchored("ri"),
+                "i",
+                WildcardFieldTypeTests.suffixAnchored("i")
+            ),
+            terms
+        );
+        assertEquals(new BytesRef("uri"), fields[1].binaryValue());
+        assertEquals(IndexOptions.NONE, fields[1].fieldType().indexOptions());
+        assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());
+    }
 }
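
Note (not part of the patch): the frozen FIELD_TYPE above replaces TextField because the wildcard field's n-gram terms only need doc-ID postings. Below is a minimal sketch contrasting the two Lucene configurations; FieldTypeComparison is a made-up class name for illustration, and only lucene-core is assumed on the classpath.

```java
// Contrast Lucene's TextField defaults with the patch's frozen FIELD_TYPE.
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexOptions;

public class FieldTypeComparison {
    public static void main(String[] args) {
        // TextField indexes positions and frequencies and keeps norms for scoring.
        FieldType text = TextField.TYPE_NOT_STORED;
        System.out.println(text.indexOptions()); // DOCS_AND_FREQS_AND_POSITIONS
        System.out.println(text.omitNorms());    // false: norms are kept

        // The wildcard field only needs to know which docs contain a gram.
        FieldType wildcard = new FieldType();
        wildcard.setIndexOptions(IndexOptions.DOCS);
        wildcard.setTokenized(true);
        wildcard.setOmitNorms(true);
        wildcard.setStored(false);
        wildcard.freeze(); // immutable from here on, safe to share as a static
        System.out.println(wildcard.indexOptions()); // DOCS
    }
}
```

Dropping frequencies, positions, and norms shrinks the postings for what is purely a yes/no term-matching structure, which is why the tests assert IndexOptions.DOCS and omitNorms() on the indexed field.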
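The tests assert a specific term stream per value. As a back-of-the-envelope model (not the patch's implementation, which streams grams incrementally from a three-character buffer), the stream is every 1- to 3-character substring of the value wrapped in 0-anchor characters, skipping grams anchored at both ends. This assumes prefixAnchored/suffixAnchored in WildcardFieldTypeTests prepend/append that 0 character, per the "contains 0, char, 0" buffer comments above; WildcardGramModel is a made-up name.

```java
import java.util.ArrayList;
import java.util.List;

// Illustrative model of the wildcard field's term stream, reproducing the
// expected lists in testTokenizer and testNormalizer above.
public class WildcardGramModel {
    static List<String> grams(String value) {
        String anchored = '\0' + value + '\0';
        List<String> out = new ArrayList<>();
        for (int start = 0; start < anchored.length(); start++) {
            for (int len = 1; len <= 3 && start + len <= anchored.length(); len++) {
                String gram = anchored.substring(start, start + len);
                // Skip grams that begin AND end with the anchor: the bare "\0"
                // and, for single-character values, "\0a\0".
                if (gram.charAt(0) == 0 && gram.charAt(gram.length() - 1) == 0) {
                    continue;
                }
                out.add(gram);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(grams("a").size());   // 3, as the new single-character test expects
        System.out.println(grams("abc").size()); // 10, matching testNormalizer's expected list
    }
}
```

For "a" this yields anchor+"a", then "a", then "a"+anchor, which is exactly why the fixed incrementToken() must emit "a" first and then "a", 0 on the next call instead of returning false.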