From 56206f0b9e66c6dae5a6b6c2743daa12c45382fb Mon Sep 17 00:00:00 2001 From: Craig Perkins Date: Thu, 9 Nov 2023 12:59:13 -0500 Subject: [PATCH] [Refactor] Use iterative approach to evaluate Regex.simpleMatch (#11060) * Remove adjacent duplicates to optimize regex before processing Signed-off-by: Craig Perkins * Add tests Signed-off-by: Craig Perkins * Rename to removeAdjacentDuplicates Signed-off-by: Craig Perkins * Add additional test case Signed-off-by: Craig Perkins * Add CHANGELOG entry Signed-off-by: Craig Perkins * Throw IllegalArgumentException if input is invalid Signed-off-by: Craig Perkins * Add a space Signed-off-by: Craig Perkins * Change error message Signed-off-by: Craig Perkins * Use iterative solution with 2 pointers Signed-off-by: Craig Perkins * Remove unused method Signed-off-by: Craig Perkins * add tests and changelog Signed-off-by: Stephen Crawford * remove bad push Signed-off-by: Stephen Crawford * Update CHANGELOG.md Signed-off-by: Stephen Crawford <65832608+scrawfor99@users.noreply.github.com> * Update CHANGELOG.md Signed-off-by: Stephen Crawford <65832608+scrawfor99@users.noreply.github.com> * spotless Signed-off-by: Stephen Crawford * Update comment Signed-off-by: Craig Perkins * re-run CI Signed-off-by: Craig Perkins --------- Signed-off-by: Craig Perkins Signed-off-by: Craig Perkins Signed-off-by: Stephen Crawford Signed-off-by: Stephen Crawford <65832608+scrawfor99@users.noreply.github.com> Co-authored-by: Stephen Crawford Co-authored-by: Stephen Crawford <65832608+scrawfor99@users.noreply.github.com> Signed-off-by: Shivansh Arora --- CHANGELOG.md | 3 +- .../org/opensearch/common/regex/Regex.java | 54 +++++++++---------- .../opensearch/common/regex/RegexTests.java | 15 ++++++ 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ff0d4e3fd285..e8f27b9323306 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -129,6 +129,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - [BUG] Disable sort optimization for HALF_FLOAT ([#10999](https://github.com/opensearch-project/OpenSearch/pull/10999)) - Performance improvement for MultiTerm Queries on Keyword fields ([#7057](https://github.com/opensearch-project/OpenSearch/issues/7057)) - Disable concurrent aggs for Diversified Sampler and Sampler aggs ([#11087](https://github.com/opensearch-project/OpenSearch/issues/11087)) +- Use iterative approach to evaluate Regex.simpleMatch ([#11060](https://github.com/opensearch-project/OpenSearch/pull/11060)) ### Deprecated @@ -147,4 +148,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Security [Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.x...HEAD -[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x \ No newline at end of file +[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x diff --git a/server/src/main/java/org/opensearch/common/regex/Regex.java b/server/src/main/java/org/opensearch/common/regex/Regex.java index 396af77c8a751..323b460af62df 100644 --- a/server/src/main/java/org/opensearch/common/regex/Regex.java +++ b/server/src/main/java/org/opensearch/common/regex/Regex.java @@ -129,35 +129,35 @@ public static boolean simpleMatch(String pattern, String str, boolean caseInsens } private static boolean simpleMatchWithNormalizedStrings(String pattern, String str) { - final int firstIndex = pattern.indexOf('*'); - if (firstIndex == -1) { - return pattern.equals(str); - } - if (firstIndex == 0) { - if (pattern.length() == 1) { - return true; - } - final int nextIndex = pattern.indexOf('*', firstIndex + 1); - if (nextIndex == -1) { - // str.endsWith(pattern.substring(1)), but avoiding the construction of pattern.substring(1): - return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1); - } else if (nextIndex == 1) { - // Double wildcard "**" - skipping the first "*" - return simpleMatchWithNormalizedStrings(pattern.substring(1), str); - } - final String part = pattern.substring(1, nextIndex); - int partIndex = str.indexOf(part); - while (partIndex != -1) { - if (simpleMatchWithNormalizedStrings(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) { - return true; - } - partIndex = str.indexOf(part, partIndex + 1); + int sIdx = 0, pIdx = 0, match = 0, wildcardIdx = -1; + while (sIdx < str.length()) { + // both chars matching, incrementing both pointers + if (pIdx < pattern.length() && str.charAt(sIdx) == pattern.charAt(pIdx)) { + sIdx++; + pIdx++; + } else if (pIdx < pattern.length() && pattern.charAt(pIdx) == '*') { + // wildcard found, only incrementing pattern pointer + wildcardIdx = pIdx; + match = sIdx; + pIdx++; + } else if (wildcardIdx != -1) { + // last pattern pointer was a wildcard, incrementing string pointer + pIdx = wildcardIdx + 1; + match++; + sIdx = match; + } else { + // current pattern pointer is not a wildcard, last pattern pointer was also not a wildcard + // characters do not match + return false; } - return false; } - return str.regionMatches(0, pattern, 0, firstIndex) - && (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string - || simpleMatchWithNormalizedStrings(pattern.substring(firstIndex), str.substring(firstIndex))); + + // check for remaining characters in pattern + while (pIdx < pattern.length() && pattern.charAt(pIdx) == '*') { + pIdx++; + } + + return pIdx == pattern.length(); } /** diff --git a/server/src/test/java/org/opensearch/common/regex/RegexTests.java b/server/src/test/java/org/opensearch/common/regex/RegexTests.java index b92fcdad56d74..21d3cb2df8f61 100644 --- a/server/src/test/java/org/opensearch/common/regex/RegexTests.java +++ b/server/src/test/java/org/opensearch/common/regex/RegexTests.java @@ -96,7 +96,22 @@ public void testDoubleWildcardMatch() { assertFalse(Regex.simpleMatch("fff**ddd", "fffabcdd")); assertTrue(Regex.simpleMatch("fff*******ddd", "fffabcddd")); assertTrue(Regex.simpleMatch("fff*******ddd", "FffAbcdDd", true)); + assertFalse(Regex.simpleMatch("fff*******ddd", "FffAbcdDd", false)); assertFalse(Regex.simpleMatch("fff******ddd", "fffabcdd")); + assertTrue(Regex.simpleMatch("abCDefGH******ddd", "abCDefGHddd", false)); + assertTrue(Regex.simpleMatch("******", "a")); + assertTrue(Regex.simpleMatch("***WILDcard***", "aaaaaaaaWILDcardZZZZZZ", false)); + assertFalse(Regex.simpleMatch("***xxxxx123456789xxxxxx***", "xxxxxabcdxxxxx", false)); + assertFalse(Regex.simpleMatch("***xxxxxabcdxxxxx***", "xxxxxABCDxxxxx", false)); + assertTrue(Regex.simpleMatch("***xxxxxabcdxxxxx***", "xxxxxABCDxxxxx", true)); + assertTrue(Regex.simpleMatch("**stephenIsSuperCool**", "ItIsTrueThatStephenIsSuperCoolSoYouShouldLetThisIn", true)); + assertTrue( + Regex.simpleMatch( + "**w**X**y**Z**", + "abcdeFGHIJKLMNOPqrstuvwabcdeFGHIJKLMNOPqrstuvwXabcdeFGHIJKLMNOPqrstuvwXyabcdeFGHIJKLMNOPqrstuvwXyZ", + false + ) + ); } public void testSimpleMatch() {