Add more unit tests
Signed-off-by: Michael Froh <[email protected]>
msfroh committed May 30, 2024
1 parent d6bdc7c commit bcc00cb
Showing 2 changed files with 195 additions and 6 deletions.
WildcardFieldMapper.java
@@ -10,8 +10,10 @@

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ import org.apache.lucene.document.Field;
+ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
- import org.apache.lucene.document.TextField;
+ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -177,6 +179,15 @@ public int ignoreAbove() {
return ignoreAbove;
}

private static final FieldType FIELD_TYPE = new FieldType();
static {
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
FIELD_TYPE.setTokenized(true);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setStored(false);
FIELD_TYPE.freeze();
}

@Override
protected void parseCreateField(ParseContext context) throws IOException {
String value;
@@ -204,7 +215,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
final BytesRef binaryValue = new BytesRef(value);
Tokenizer tokenizer = new WildcardFieldTokenizer();
tokenizer.setReader(new StringReader(value));
- context.doc().add(new TextField(fieldType().name(), tokenizer));
+ context.doc().add(new Field(fieldType().name(), tokenizer, FIELD_TYPE));
if (fieldType().hasDocValues()) {
context.doc().add(new SortedSetDocValuesField(fieldType().name(), binaryValue));
} else {
@@ -283,13 +294,15 @@ public boolean incrementToken() throws IOException {
// Two zeroes usually means we're done.
if (length == 3 && charTermAttribute.buffer()[1] != 0) {
// The only case where we're not done is if the input has exactly 1 character, so the buffer
- // contains 0, char, 0. In that case, we return char,0, and it's our last token.
+ // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration
charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1];
charTermAttribute.buffer()[1] = 0;
- charTermAttribute.setLength(2);
- } else {
- return false;
+ charTermAttribute.setLength(1);
+ length = 2;
+ offset = 1;
+ return true;
}
+ return false;
}
if (length == 3) {
// Read the next character, overwriting the current offset
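(Not part of the commit.) A minimal sketch of the single-character case the tokenizer change above addresses, mirroring the new unit test added below; it assumes the same package and imports as the test file, and that the anchored prefix/suffix terms wrap the value with a 0 character, as the 0-padded term buffer above suggests:

    // Sketch only: collect every term WildcardFieldTokenizer emits for a value.
    static List<String> tokenize(String value) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) {
            tokenizer.setReader(new StringReader(value));
            tokenizer.reset();
            CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
            while (tokenizer.incrementToken()) {
                terms.add(term.toString());
            }
            tokenizer.end();
        }
        return terms;
    }
    // With this fix, tokenize("a") should yield three terms: the prefix-anchored "\0a",
    // the bare "a" (which the old code never emitted), and the suffix-anchored "a\0".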
WildcardFieldMapperTests.java
@@ -8,14 +8,36 @@

package org.opensearch.index.mapper;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.util.BytesRef;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalyzerScope;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.LowercaseNormalizer;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static java.util.Collections.singletonMap;

public class WildcardFieldMapperTests extends MapperTestCase {

@@ -71,5 +93,159 @@ public void testTokenizer() throws IOException {
),
terms
);
terms.clear();
try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) {
tokenizer.setReader(new StringReader("a"));
tokenizer.reset();
CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
while (tokenizer.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms);
}

public void testEnableDocValues() throws IOException {
DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("doc_values", true)));
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234")));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());

mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard")));
doc = mapper.parse(source(b -> b.field("field", "1234")));
fields = doc.rootDoc().getFields("field");
assertEquals(1, fields.length);
assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
}

@Override
protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) {
return new IndexAnalyzers(
singletonMap("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())),
Map.of(
"lowercase",
new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()),
"other_lowercase",
new NamedAnalyzer("other_lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())
),
singletonMap(
"lowercase",
new NamedAnalyzer(
"lowercase",
AnalyzerScope.INDEX,
new CustomAnalyzer(
TokenizerFactory.newFactory("lowercase", WhitespaceTokenizer::new),
new CharFilterFactory[0],
new TokenFilterFactory[] { new TokenFilterFactory() {

@Override
public String name() {
return "lowercase";
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new LowerCaseFilter(tokenStream);
}
} }
)
)
)
);
}

public void testNormalizer() throws IOException {
DocumentMapper mapper = createDocumentMapper(
fieldMapping(b -> b.field("type", "wildcard").field("normalizer", "lowercase").field("doc_values", true))
);
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "AbC")));

IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);

assertTrue(fields[0] instanceof Field);
Field textField = (Field) fields[0];
List<String> terms = new ArrayList<>();
try (TokenStream tokenStream = textField.tokenStreamValue()) {
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(
List.of(
WildcardFieldTypeTests.prefixAnchored("a"),
WildcardFieldTypeTests.prefixAnchored("ab"),
"a",
"ab",
"abc",
"b",
"bc",
WildcardFieldTypeTests.suffixAnchored("bc"),
"c",
WildcardFieldTypeTests.suffixAnchored("c")
),
terms
);
IndexableFieldType fieldType = fields[0].fieldType();
assertTrue(fieldType.omitNorms());
assertTrue(fieldType.tokenized());
assertFalse(fieldType.stored());
assertEquals(IndexOptions.DOCS, fieldType.indexOptions());
assertFalse(fieldType.storeTermVectors());
assertFalse(fieldType.storeTermVectorOffsets());
assertFalse(fieldType.storeTermVectorPositions());
assertFalse(fieldType.storeTermVectorPayloads());
assertEquals(DocValuesType.NONE, fieldType.docValuesType());

assertEquals(new BytesRef("abc"), fields[1].binaryValue());
fieldType = fields[1].fieldType();
assertEquals(IndexOptions.NONE, fieldType.indexOptions());
assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType());
}

public void testNullValue() throws IOException {
DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
ParsedDocument doc = mapper.parse(source(b -> b.nullField("field")));
assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field"));

mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("null_value", "uri").field("doc_values", true)));
doc = mapper.parse(source(b -> {}));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
doc = mapper.parse(source(b -> b.nullField("field")));
fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertTrue(fields[0] instanceof Field);
Field textField = (Field) fields[0];
List<String> terms = new ArrayList<>();
try (TokenStream tokenStream = textField.tokenStreamValue()) {
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(
List.of(
WildcardFieldTypeTests.prefixAnchored("u"),
WildcardFieldTypeTests.prefixAnchored("ur"),
"u",
"ur",
"uri",
"r",
"ri",
WildcardFieldTypeTests.suffixAnchored("ri"),
"i",
WildcardFieldTypeTests.suffixAnchored("i")
),
terms
);
assertEquals(new BytesRef("uri"), fields[1].binaryValue());
assertEquals(IndexOptions.NONE, fields[1].fieldType().indexOptions());
assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());
}
}
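(Not part of the commit.) The new tests rely on WildcardFieldTypeTests.prefixAnchored and WildcardFieldTypeTests.suffixAnchored, whose bodies are not shown in this diff. A hypothetical sketch of what they are assumed to return, inferred from the 0-character anchoring the tokenizer writes into its term buffer:

    // Hypothetical helpers for illustration only; the real definitions live in WildcardFieldTypeTests.
    static String prefixAnchored(String value) {
        return (char) 0 + value;  // term meaning "value occurs at the start of the field"
    }

    static String suffixAnchored(String value) {
        return value + (char) 0;  // term meaning "value occurs at the end of the field"
    }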
