Add more unit tests
Signed-off-by: Michael Froh <[email protected]>
msfroh committed May 30, 2024
1 parent d6bdc7c commit bcc00cb
Showing 2 changed files with 195 additions and 6 deletions.
WildcardFieldMapper.java
@@ -10,8 +10,10 @@

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+ import org.apache.lucene.document.Field;
+ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.SortedSetDocValuesField;
- import org.apache.lucene.document.TextField;
+ import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@@ -177,6 +179,15 @@ public int ignoreAbove() {
return ignoreAbove;
}

private static final FieldType FIELD_TYPE = new FieldType();
static {
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS);
FIELD_TYPE.setTokenized(true);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setStored(false);
FIELD_TYPE.freeze();
}

@Override
protected void parseCreateField(ParseContext context) throws IOException {
String value;
@@ -204,7 +215,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
final BytesRef binaryValue = new BytesRef(value);
Tokenizer tokenizer = new WildcardFieldTokenizer();
tokenizer.setReader(new StringReader(value));
- context.doc().add(new TextField(fieldType().name(), tokenizer));
+ context.doc().add(new Field(fieldType().name(), tokenizer, FIELD_TYPE));
if (fieldType().hasDocValues()) {
context.doc().add(new SortedSetDocValuesField(fieldType().name(), binaryValue));
} else {
@@ -283,13 +294,15 @@ public boolean incrementToken() throws IOException {
// Two zeroes usually means we're done.
if (length == 3 && charTermAttribute.buffer()[1] != 0) {
// The only case where we're not done is if the input has exactly 1 character, so the buffer
- // contains 0, char, 0. In that case, we return char,0, and it's our last token.
+ // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration
charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1];
charTermAttribute.buffer()[1] = 0;
- charTermAttribute.setLength(2);
- } else {
- return false;
+ charTermAttribute.setLength(1);
+ length = 2;
+ offset = 1;
+ return true;
}
+ return false;
}
if (length == 3) {
// Read the next character, overwriting the current offset
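(Not part of the commit.) A minimal sketch of the single-character case the tokenizer change above addresses, mirroring the new unit test added below; it assumes the same package and imports as the test file, and that the anchored prefix/suffix terms wrap the value with a 0 character, as the 0-padded term buffer above suggests:

    // Sketch only: collect every term WildcardFieldTokenizer emits for a value.
    static List<String> tokenize(String value) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) {
            tokenizer.setReader(new StringReader(value));
            tokenizer.reset();
            CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
            while (tokenizer.incrementToken()) {
                terms.add(term.toString());
            }
            tokenizer.end();
        }
        return terms;
    }
    // With this fix, tokenize("a") should yield three terms: the prefix-anchored "\0a",
    // the bare "a" (which the old code never emitted), and the suffix-anchored "a\0".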
WildcardFieldMapperTests.java
@@ -8,14 +8,36 @@

package org.opensearch.index.mapper;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.util.BytesRef;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AnalyzerScope;
import org.opensearch.index.analysis.CharFilterFactory;
import org.opensearch.index.analysis.CustomAnalyzer;
import org.opensearch.index.analysis.IndexAnalyzers;
import org.opensearch.index.analysis.LowercaseNormalizer;
import org.opensearch.index.analysis.NamedAnalyzer;
import org.opensearch.index.analysis.TokenFilterFactory;
import org.opensearch.index.analysis.TokenizerFactory;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static java.util.Collections.singletonMap;

public class WildcardFieldMapperTests extends MapperTestCase {

@@ -71,5 +93,159 @@ public void testTokenizer() throws IOException {
),
terms
);
terms.clear();
try (Tokenizer tokenizer = new WildcardFieldMapper.WildcardFieldTokenizer()) {
tokenizer.setReader(new StringReader("a"));
tokenizer.reset();
CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
while (tokenizer.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms);
}

public void testEnableDocValues() throws IOException {
DocumentMapper mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("doc_values", true)));
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "1234")));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());

mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard")));
doc = mapper.parse(source(b -> b.field("field", "1234")));
fields = doc.rootDoc().getFields("field");
assertEquals(1, fields.length);
assertEquals(DocValuesType.NONE, fields[0].fieldType().docValuesType());
}

@Override
protected IndexAnalyzers createIndexAnalyzers(IndexSettings indexSettings) {
return new IndexAnalyzers(
singletonMap("default", new NamedAnalyzer("default", AnalyzerScope.INDEX, new StandardAnalyzer())),
Map.of(
"lowercase",
new NamedAnalyzer("lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer()),
"other_lowercase",
new NamedAnalyzer("other_lowercase", AnalyzerScope.INDEX, new LowercaseNormalizer())
),
singletonMap(
"lowercase",
new NamedAnalyzer(
"lowercase",
AnalyzerScope.INDEX,
new CustomAnalyzer(
TokenizerFactory.newFactory("lowercase", WhitespaceTokenizer::new),
new CharFilterFactory[0],
new TokenFilterFactory[] { new TokenFilterFactory() {

@Override
public String name() {
return "lowercase";
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new LowerCaseFilter(tokenStream);
}
} }
)
)
)
);
}

public void testNormalizer() throws IOException {
DocumentMapper mapper = createDocumentMapper(
fieldMapping(b -> b.field("type", "wildcard").field("normalizer", "lowercase").field("doc_values", true))
);
ParsedDocument doc = mapper.parse(source(b -> b.field("field", "AbC")));

IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);

assertTrue(fields[0] instanceof Field);
Field textField = (Field) fields[0];
List<String> terms = new ArrayList<>();
try (TokenStream tokenStream = textField.tokenStreamValue()) {
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(
List.of(
WildcardFieldTypeTests.prefixAnchored("a"),
WildcardFieldTypeTests.prefixAnchored("ab"),
"a",
"ab",
"abc",
"b",
"bc",
WildcardFieldTypeTests.suffixAnchored("bc"),
"c",
WildcardFieldTypeTests.suffixAnchored("c")
),
terms
);
IndexableFieldType fieldType = fields[0].fieldType();
assertTrue(fieldType.omitNorms());
assertTrue(fieldType.tokenized());
assertFalse(fieldType.stored());
assertEquals(IndexOptions.DOCS, fieldType.indexOptions());
assertFalse(fieldType.storeTermVectors());
assertFalse(fieldType.storeTermVectorOffsets());
assertFalse(fieldType.storeTermVectorPositions());
assertFalse(fieldType.storeTermVectorPayloads());
assertEquals(DocValuesType.NONE, fieldType.docValuesType());

assertEquals(new BytesRef("abc"), fields[1].binaryValue());
fieldType = fields[1].fieldType();
assertEquals(IndexOptions.NONE, fieldType.indexOptions());
assertEquals(DocValuesType.SORTED_SET, fieldType.docValuesType());
}

public void testNullValue() throws IOException {
DocumentMapper mapper = createDocumentMapper(fieldMapping(this::minimalMapping));
ParsedDocument doc = mapper.parse(source(b -> b.nullField("field")));
assertArrayEquals(new IndexableField[0], doc.rootDoc().getFields("field"));

mapper = createDocumentMapper(fieldMapping(b -> b.field("type", "wildcard").field("null_value", "uri").field("doc_values", true)));
doc = mapper.parse(source(b -> {}));
IndexableField[] fields = doc.rootDoc().getFields("field");
assertEquals(0, fields.length);
doc = mapper.parse(source(b -> b.nullField("field")));
fields = doc.rootDoc().getFields("field");
assertEquals(2, fields.length);
assertTrue(fields[0] instanceof Field);
Field textField = (Field) fields[0];
List<String> terms = new ArrayList<>();
try (TokenStream tokenStream = textField.tokenStreamValue()) {
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
terms.add(charTermAttribute.toString());
}
}
assertEquals(
List.of(
WildcardFieldTypeTests.prefixAnchored("u"),
WildcardFieldTypeTests.prefixAnchored("ur"),
"u",
"ur",
"uri",
"r",
"ri",
WildcardFieldTypeTests.suffixAnchored("ri"),
"i",
WildcardFieldTypeTests.suffixAnchored("i")
),
terms
);
assertEquals(new BytesRef("uri"), fields[1].binaryValue());
assertEquals(IndexOptions.NONE, fields[1].fieldType().indexOptions());
assertEquals(DocValuesType.SORTED_SET, fields[1].fieldType().docValuesType());
}
}
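(Not part of the commit.) The new tests rely on WildcardFieldTypeTests.prefixAnchored and WildcardFieldTypeTests.suffixAnchored, whose bodies are not shown in this diff. A hypothetical sketch of what they are assumed to return, inferred from the 0-character anchoring the tokenizer writes into its term buffer:

    // Hypothetical helpers for illustration only; the real definitions live in WildcardFieldTypeTests.
    static String prefixAnchored(String value) {
        return (char) 0 + value;  // term meaning "value occurs at the start of the field"
    }

    static String suffixAnchored(String value) {
        return value + (char) 0;  // term meaning "value occurs at the end of the field"
    }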
