Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Detect regex patterns we may need to handle at eval_only time #39

Merged
merged 8 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ public String normalizeRegex(String fieldRegex) throws IllegalArgumentException
throw new IllegalArgumentException("Cannot normalize a regex against a geometry field");
}

/**
 * Regexes cannot be normalized against a geometry field, so this implementation never
 * produces an answer and always throws.
 *
 * @param in
 *            the regex (not inspected)
 * @return never returns normally
 * @throws IllegalArgumentException
 *             always
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
ivakegg marked this conversation as resolved.
Show resolved Hide resolved
throw new IllegalArgumentException("Cannot normalize a regex against a geometry field");
}

/**
 * Normalize a geometry by passing it to {@code getSingleIndexFromGeometry} and feeding that
 * result to {@code getEncodedStringFromIndexBytes}.
 *
 * @param geometry
 *            the geometry to normalize
 * @return the string returned by {@code getEncodedStringFromIndexBytes}
 */
public String normalizeDelegateType(T geometry) {
return getEncodedStringFromIndexBytes(getSingleIndexFromGeometry(geometry));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,9 @@ public abstract class AbstractNormalizer<T> implements Normalizer<T> {
/**
 * Expand a value into its normalized forms. This base implementation yields exactly one
 * form: the result of {@link #normalize(String)}.
 *
 * @param in
 *            the value to expand
 * @return a single-element collection holding the normalized value
 */
public Collection<String> expand(String in) {
    final String normalized = normalize(in);
    return Collections.singletonList(normalized);
}

/**
 * Default implementation: always reports {@code false}.
 *
 * @param in
 *            the regex (not inspected)
 * @return {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
ivakegg marked this conversation as resolved.
Show resolved Hide resolved
return false;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Always reports {@code false}; see the inline note for why this is a deliberate choice.
 *
 * @param regex
 *            the regex (not inspected)
 * @return {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String regex) {
// This normalizer is in fact lossy, but we deliberately return false:
// users already expect over-matching when a pattern includes
// diacritics or upper case letters. We may revisit this decision
// down the road; for now the answer is always false.
return false;
}

@Override
public String normalizeDelegateType(String delegateIn) {
return normalize(delegateIn);
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/datawave/data/normalizer/LcNormalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Always reports {@code false}; see the inline note for why this is a deliberate choice.
 *
 * @param regex
 *            the regex (not inspected)
 * @return {@code false}
 */
@Override
public boolean normalizedRegexIsLossy(String regex) {
// This normalizer is in fact lossy, but we deliberately return false:
// users already expect over-matching when a pattern includes
// diacritics or upper case letters. We may revisit this decision
// down the road; for now the answer is always false.
return false;
}

@Override
public String normalizeDelegateType(String delegateIn) {
return normalize(delegateIn);
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/datawave/data/normalizer/Normalizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,7 @@ public interface Normalizer<T> extends Serializable {

String normalizeRegex(String in);

/**
 * Report whether this normalizer considers the normalized form of the given regex to be lossy.
 * <p>
 * NOTE(review): "lossy" presumably means the normalized pattern can match values the original
 * pattern would not (over-matching) -- confirm against the callers of this method.
 *
 * @param in
 *            the regex, before normalization
 * @return {@code true} if the normalized regex is considered lossy
 */
boolean normalizedRegexIsLossy(String in);

Collection<String> expand(String in);
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public String normalize(String fv) {
}

/**
* We cannot support regex against numbers
* We can support regex against numbers.
ivakegg marked this conversation as resolved.
Show resolved Hide resolved
*/
public String normalizeRegex(String fieldRegex) {
try {
Expand All @@ -40,6 +40,12 @@ public String normalizeRegex(String fieldRegex) {
}
}

/**
 * Report whether normalizing the given numeric regex is lossy. The regex is considered lossy
 * when {@link NumericRegexEncoder#getZeroRegexStatus(String)} classifies it as
 * {@link ZeroRegexStatus#LEADING} or {@link ZeroRegexStatus#TRAILING}.
 *
 * @param untrimmedRegex
 *            the regex as supplied, before any zero trimming
 * @return {@code true} if the zero status is LEADING or TRAILING, {@code false} otherwise
 */
@Override
public boolean normalizedRegexIsLossy(String untrimmedRegex) {
    ZeroRegexStatus status = NumericRegexEncoder.getZeroRegexStatus(untrimmedRegex);

    // Enum constants are singletons, so identity comparison is exact and also null-safe.
    return status == ZeroRegexStatus.LEADING || status == ZeroRegexStatus.TRAILING;
}

@Override
public String normalizeDelegateType(BigDecimal delegateIn) {
return normalize(delegateIn.toString());
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/datawave/data/normalizer/ZeroRegexStatus.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package datawave.data.normalizer;

/**
 * Classification of a numeric regex with respect to the zeroes it may match at its edges.
 */
public enum ZeroRegexStatus {
    /** The pattern may match leading zeroes. */
    LEADING,
    /** The pattern may match trailing zeroes. */
    TRAILING,
    /** Neither of the above applies. */
    NONE
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import com.google.common.base.CharMatcher;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.visitor.AlternationDeduper;
import datawave.data.normalizer.regex.visitor.AnchorTrimmer;
import datawave.data.normalizer.regex.visitor.DecimalPointPlacer;
Expand Down Expand Up @@ -143,6 +144,10 @@ private NumericRegexEncoder(String pattern) {
this.pattern = pattern;
}

/**
 * Parse the given regex with {@link RegexParser#parse} and return the status that
 * {@code ZeroTrimmer.getStatus} computes for the children of the parsed result.
 *
 * @param regex
 *            the regex to classify
 * @return the zero status reported by {@code ZeroTrimmer.getStatus}
 */
public static ZeroRegexStatus getZeroRegexStatus(String regex) {
return ZeroTrimmer.getStatus(RegexParser.parse(regex).getChildren());
}

private String encode() {
if (log.isDebugEnabled()) {
log.debug("Encoding pattern " + pattern);
Expand Down
35 changes: 35 additions & 0 deletions src/main/java/datawave/data/normalizer/regex/RegexUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,24 @@ public static boolean matchesChar(Node node, char character) {
}
}

/**
 * Return whether the given group node has a direct {@link SingleCharNode} child that is the
 * given character. Children of any other node type are not inspected.
 *
 * @param node
 *            the node to examine; must be a {@link GroupNode}
 * @param character
 *            the character to look for
 * @return {@code true} if a direct single-character child is the character, {@code false} otherwise
 * @throws ClassCastException
 *             if the node is not a {@link GroupNode}
 */
public static boolean groupNodeMatches(Node node, char character) {
    GroupNode group = (GroupNode) node;

    for (Node child : group.getChildren()) {
        // One matching single-character child decides the answer; stop scanning at the first hit.
        if (child instanceof SingleCharNode && isChar(child, character)) {
            return true;
        }
    }
    return false;
}

/**
* Return whether the given node is a regex element that can only match against the given character.
*
Expand Down Expand Up @@ -374,6 +392,23 @@ public static boolean matchesZero(Node node) {
return matchesChar(node, RegexConstants.ZERO);
}

/**
 * Return whether the given node explicitly matches the given character. Only single-character,
 * character-class and group nodes are examined, each through its dedicated check; any other
 * node type yields {@code false}.
 *
 * @param node
 *            the node to examine
 * @param character
 *            the character to look for
 * @return the result of the type-specific check, or {@code false} for unhandled node types
 */
public static boolean matchesCharExplicitly(Node node, char character) {
    boolean explicitMatch = false;
    switch (node.getType()) {
        case SINGLE_CHAR:
            explicitMatch = isChar(node, character);
            break;
        case CHAR_CLASS:
            explicitMatch = charClassMatches(node, character);
            break;
        case GROUP:
            explicitMatch = groupNodeMatches(node, character);
            break;
        default:
            break;
    }
    return explicitMatch;
}

/**
 * Return whether the given node explicitly matches the zero character, as decided by
 * {@link #matchesCharExplicitly(Node, char)} with {@link RegexConstants#ZERO}.
 *
 * @param node
 *            the node to examine
 * @return {@code true} if the node explicitly matches zero
 */
public static boolean matchesZeroExplicitly(Node node) {
    final char zero = RegexConstants.ZERO;
    return matchesCharExplicitly(node, zero);
}

/**
* Return whether the given node is a regex element that can only match against the character '0'.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,11 @@

import org.apache.commons.lang3.tuple.Pair;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.AnyCharNode;
import datawave.data.normalizer.regex.EncodedPatternNode;
import datawave.data.normalizer.regex.EscapedSingleCharNode;
import datawave.data.normalizer.regex.ExpressionNode;
import datawave.data.normalizer.regex.GroupNode;
import datawave.data.normalizer.regex.IntegerNode;
import datawave.data.normalizer.regex.IntegerRangeNode;
Expand Down Expand Up @@ -42,6 +45,60 @@ public static Node trim(Node node) {
return (Node) node.accept(visitor, null);
}

/**
 * Classify the given regex nodes by the zeroes they may match. The leading check runs first
 * and wins; the trailing check is only consulted when the leading check fails.
 *
 * @param encodedRegexNodes
 *            the regex nodes to classify
 * @return {@link ZeroRegexStatus#LEADING}, {@link ZeroRegexStatus#TRAILING} or
 *         {@link ZeroRegexStatus#NONE}
 */
public static ZeroRegexStatus getStatus(List<Node> encodedRegexNodes) {
    if (hasPossiblyLeadingZeroes(encodedRegexNodes)) {
        return ZeroRegexStatus.LEADING;
    }
    return hasTrailingZeroes(encodedRegexNodes) ? ZeroRegexStatus.TRAILING : ZeroRegexStatus.NONE;
}

/**
 * Scan the given nodes from last to first, looking for trailing zeroes.
 * <p>
 * Before each inspection the iterator skips past quantifiers and question marks. A node that
 * explicitly matches zero ends the scan with {@code true}; a node that cannot match zero ends
 * it with {@code false}; a node that matches zero without doing so explicitly is stepped over.
 * If every node is stepped over, the result is {@code true}.
 * <p>
 * The caller's list is never modified: the scan runs over a reversed copy.
 *
 * @param encodedRegexNodes
 *            the regex nodes, in pattern order
 * @return whether the pattern may match trailing zeroes
 */
private static boolean hasTrailingZeroes(List<Node> encodedRegexNodes) {
    // Reverse a private copy. Reversing the argument in place would silently reorder the
    // children of whatever node tree the caller handed to getStatus().
    List<Node> reversedNodes = new java.util.ArrayList<>(encodedRegexNodes);
    Collections.reverse(reversedNodes);

    NodeListIterator iter = new NodeListIterator(reversedNodes);

    while (iter.hasNext()) {
        // In reversed order a quantifier or question mark precedes the element it modifies.
        iter.seekPastQuantifiers();
        iter.seekPastQuestionMarks();

        Node next = iter.peekNext();

        if (!RegexUtils.matchesZero(next)) {
            return false;
        }
        if (RegexUtils.matchesZeroExplicitly(next)) {
            return true;
        }
        iter.next();
    }
    return true;
}

/**
 * Scan the given nodes from first to last, looking for a possible leading zero.
 * <p>
 * Hyphen characters and escaped periods are stepped over. The first other node decides the
 * result: {@code true} if it matches zero, {@code false} if it does not. If every node is
 * stepped over, the result is {@code true}.
 *
 * @param encodedRegexNodes
 *            the regex nodes, in pattern order
 * @return whether the pattern may match leading zeroes
 */
private static boolean hasPossiblyLeadingZeroes(List<Node> encodedRegexNodes) {
    for (NodeListIterator cursor = new NodeListIterator(encodedRegexNodes); cursor.hasNext(); cursor.next()) {
        Node candidate = cursor.peekNext();

        if (RegexUtils.matchesZero(candidate)) {
            return true;
        }

        boolean skippable = RegexUtils.isChar(candidate, RegexConstants.HYPHEN) || candidate.equals(new EscapedSingleCharNode(RegexConstants.PERIOD));
        if (!skippable) {
            return false;
        }
    }

    return true;
}

@Override
public Object visitEncodedPattern(EncodedPatternNode node, Object data) {
EncodedPatternNode trimmed = new EncodedPatternNode();
Expand Down
5 changes: 5 additions & 0 deletions src/main/java/datawave/data/type/BaseType.java
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ public String normalizeRegex(String in) {
return normalizer.normalizeRegex(in);
}

/**
 * Delegates to this type's normalizer.
 *
 * @param in
 *            the regex to check
 * @return whatever the normalizer's {@code normalizedRegexIsLossy} returns for {@code in}
 */
@Override
public boolean normalizedRegexIsLossy(String in) {
return normalizer.normalizedRegexIsLossy(in);
}

@Override
public void normalizeAndSetNormalizedValue(T valueToNormalize) {
setNormalizedValue(normalizer.normalizeDelegateType(valueToNormalize));
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/datawave/data/type/Type.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ public interface Type<T extends Comparable<T>> extends Comparable<Type<T>> {

String normalizeRegex(String in);

/**
 * Report whether the normalized form of the given regex is considered lossy for this type.
 * <p>
 * NOTE(review): "lossy" presumably means the normalized pattern can match values the original
 * pattern would not (over-matching) -- confirm against the callers of this method.
 *
 * @param in
 *            the regex, before normalization
 * @return {@code true} if the normalized regex is considered lossy
 */
boolean normalizedRegexIsLossy(String in);

Collection<String> expand(String in);

Collection<String> expand();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@

import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.locationtech.jts.util.Assert;

import datawave.data.normalizer.ZeroRegexStatus;
import datawave.data.normalizer.regex.Node;
import datawave.data.normalizer.regex.RegexParser;

class ZeroTrimmerTest {

Expand Down Expand Up @@ -284,6 +287,8 @@ void testNoLeadingOrTrailingZeros() {
assertTrimmedTo("45.*", "\\+[b-z]E45.*");
assertTrimmedTo("300454.*", "\\+[f-z]E300454.*");
assertTrimmedTo("300.*0003", "\\+[c-z]E300.*0003");
assertTrimmedTo("300.*000[1-9]", "\\+[c-z]E300.*000[1-9]");

}

@Test
Expand All @@ -299,6 +304,35 @@ void testSingleElementPatterns() {
assertTrimmedTo("\\d{3}", "\\+[a-c]E\\d{3}");
}

/**
 * Verify the zero status reported for a selection of patterns.
 */
@Test
void testStatus() {
    // TODO: more test cases

    // Patterns expected to report neither leading nor trailing zeroes.
    assertStatus("300.*0003", ZeroRegexStatus.NONE);
    assertStatus("300.*000[1-9]", ZeroRegexStatus.NONE);
    assertStatus("45.*", ZeroRegexStatus.NONE);
    assertStatus("-45.*", ZeroRegexStatus.NONE);

    // Patterns expected to report possible leading zeroes.
    assertStatus(".*?", ZeroRegexStatus.LEADING);
    assertStatus(".*?11", ZeroRegexStatus.LEADING);
    assertStatus("[04][05][06]", ZeroRegexStatus.LEADING);
    assertStatus("[04]{1,3}[05][06]", ZeroRegexStatus.LEADING);
    assertStatus("\\d{3}", ZeroRegexStatus.LEADING);
    assertStatus(".\\.000034.*", ZeroRegexStatus.LEADING);
    assertStatus("00345.*", ZeroRegexStatus.LEADING);
    assertStatus("\\.000034.*", ZeroRegexStatus.LEADING);
    assertStatus("-00345.*", ZeroRegexStatus.LEADING);

    // Patterns expected to report trailing zeroes.
    assertStatus("3.*0{0,}[01]", ZeroRegexStatus.TRAILING);
    assertStatus("3400\\.0000.", ZeroRegexStatus.TRAILING);
    assertStatus("340.*", ZeroRegexStatus.TRAILING);
    assertStatus("3400{3}0{2}", ZeroRegexStatus.TRAILING);
}

@Test
void testTrailingZerosWithoutQuantifiers() {
assertTrimmedTo(".*34300", "\\+[e-zA-Z]E.*343");
Expand All @@ -314,6 +348,10 @@ void testMixedAlternation() {
assertTrimmedTo("234\\.45|343.*|0\\.00[0]34.*", "\\+cE2\\.3445|\\+[c-z]E343.*|\\+WE34.*");
}

/**
 * Assert that parsing the given pattern and classifying its children yields the expected status.
 *
 * @param pattern
 *            the regex pattern to parse
 * @param status
 *            the expected zero status
 */
private void assertStatus(String pattern, ZeroRegexStatus status) {
    ZeroRegexStatus actual = ZeroTrimmer.getStatus(RegexParser.parse(pattern).getChildren());
    // JTS Assert.equals takes (expected, actual, message); passing them in that order keeps the
    // failure text accurate, and the message identifies the offending pattern.
    Assert.equals(status, actual, "Unexpected zero status for pattern " + pattern);
}

private void assertTrimmedTo(String pattern, String expectedPattern) {
Node actual = SimpleNumberEncoder.encode(parse(pattern));
actual = ExponentialBinAdder.addBins(actual);
Expand Down
Loading