From 313e35c046d39d3c003dceaace105a3a43265f53 Mon Sep 17 00:00:00 2001
From: Songkan Tang <songkant@amazon.com>
Date: Mon, 2 Dec 2024 12:49:08 +0800
Subject: [PATCH 1/5] Introduce log pattern lib with initial implementation of
 Brain algorithm log parser

Signed-off-by: Songkan Tang <songkant@amazon.com>
---
 gradle/missing-javadoc.gradle                 |  10 +-
 libs/pattern/build.gradle                     |  39 +++
 .../opensearch/pattern/BrainLogParser.java    | 304 ++++++++++++++++++
 .../pattern/BrainLogParserTests.java          | 153 +++++++++
 4 files changed, 498 insertions(+), 8 deletions(-)
 create mode 100644 libs/pattern/build.gradle
 create mode 100644 libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
 create mode 100644 libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java

diff --git a/gradle/missing-javadoc.gradle b/gradle/missing-javadoc.gradle
index 751da941d25dd..77479c93cf4a0 100644
--- a/gradle/missing-javadoc.gradle
+++ b/gradle/missing-javadoc.gradle
@@ -8,7 +8,6 @@
 
 
 import javax.annotation.Nullable
-import javax.inject.Inject
 import org.gradle.api.tasks.PathSensitive;
 import org.gradle.api.tasks.PathSensitivity;
 import org.gradle.internal.jvm.Jvm
@@ -102,6 +101,7 @@ configure([
   project(":libs:opensearch-geo"),
   project(":libs:opensearch-grok"),
   project(":libs:opensearch-nio"),
+  project(":libs:opensearch-pattern"),
   project(":libs:opensearch-plugin-classloader"),
   project(":libs:opensearch-secure-sm"),
   project(":libs:opensearch-ssl-config"),
@@ -228,11 +228,6 @@ class MissingJavadocTask extends DefaultTask {
   @PathSensitive(PathSensitivity.RELATIVE)
   def taskResources
 
-  // See please https://docs.gradle.org/8.11/userguide/service_injection.html#execoperations
-  interface InjectedExecOps {
-    @Inject ExecOperations getExecOps()
-  }
-
   /** Utility method to recursively collect all tasks with same name like this one that we depend on */
   private Set findRenderTasksInDependencies() {
     Set found = []
@@ -323,12 +318,11 @@ class MissingJavadocTask extends DefaultTask {
       }
     }()
 
-    def execOps = project.objects.newInstance(InjectedExecOps)
     def outputFile = project.file("${getTemporaryDir()}/javadoc-output.txt")
     def result
 
     outputFile.withOutputStream { output ->
-      result = execOps.execOps.exec {
+      result = project.exec {
         executable javadocCmd
 
         // we want to capture both stdout and stderr to the same
diff --git a/libs/pattern/build.gradle b/libs/pattern/build.gradle
new file mode 100644
index 0000000000000..bd12d772f3673
--- /dev/null
+++ b/libs/pattern/build.gradle
@@ -0,0 +1,39 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+apply plugin: 'opensearch.build'
+apply plugin: 'opensearch.publish'
+
+dependencies {
+  testImplementation(project(":test:framework")) {
+    exclude group: 'org.opensearch', module: 'opensearch-pattern'
+  }
+}
+
+tasks.named('forbiddenApisMain').configure {
+  replaceSignatureFiles 'jdk-signatures'
+}
diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
new file mode 100644
index 0000000000000..b2c9337938ca4
--- /dev/null
+++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
@@ -0,0 +1,304 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.pattern;
+
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.OptionalLong;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Log parser Brain algorithm implementation. See: https://ieeexplore.ieee.org/document/10109145
+ */
+public class BrainLogParser {
+
+    private static final List<String> defaultFilterPatterns = List.of(
+        "(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)", // IP
+        "(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$" // Numbers
+    );
+    private static final List<String> defaultDelimiters = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+");
+    private static final String variableDenoter = "<*>";
+    // counting frequency will be grouped by composite of position and token string
+    private static final String positionedTokenKeyFormat = "%d-%s";
+    // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position.
+    private static final String groupTokenSetKeyFormat = "%d-%s-%d";
+
+    private final Map<String, Long> tokenFreqMap;
+    private final Map<String, Set<String>> groupTokenSetMap;
+    private final Map<String, String> logIdGroupCandidateMap;
+    private final int variableCountThreshold;
+    private final float thresholdPercentage;
+    private final List<String> filterPatterns;
+    private final List<String> delimiters;
+
+    /**
+     * Creates new Brain log parser with default parameters
+     */
+    public BrainLogParser() {
+        this(2, 0.0f, defaultFilterPatterns, defaultDelimiters);
+    }
+
+    /**
+     * Creates new Brain log parser with overridden variableCountThreshold
+     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
+     */
+    public BrainLogParser(int variableCountThreshold) {
+        this(variableCountThreshold, 0.0f, defaultFilterPatterns, defaultDelimiters);
+    }
+
+    /**
+     * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage
+     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
+     * @param thresholdPercentage the threshold percentage to decide which frequency is representative
+     *                            frequency per log message
+     */
+    public BrainLogParser(int variableCountThreshold, float thresholdPercentage) {
+        this(variableCountThreshold, thresholdPercentage, defaultFilterPatterns, defaultDelimiters);
+    }
+
+    /**
+     * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage and
+     * overridden filter patterns and delimiters
+     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
+     * @param thresholdPercentage the threshold percentage to decide which frequency is representative
+     *                            frequency per log message
+     * @param filterPatterns a list of regex to replace matched pattern to be replaced with variable denoter
+     * @param delimiters a list of delimiters to be replaced with empty string after regex replacement
+     */
+    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<String> filterPatterns, List<String> delimiters) {
+        this.tokenFreqMap = new HashMap<>();
+        this.groupTokenSetMap = new HashMap<>();
+        this.logIdGroupCandidateMap = new HashMap<>();
+        this.variableCountThreshold = variableCountThreshold;
+        this.thresholdPercentage = thresholdPercentage;
+        this.filterPatterns = filterPatterns;
+        this.delimiters = delimiters;
+    }
+
+    /**
+     * Preprocess single line of log message with logId
+     * @param logMessage log message body per log
+     * @param logId logId of the log
+     * @return list of tokens by splitting preprocessed log message
+     */
+    public List<String> preprocess(String logMessage, String logId) {
+        // match regex and replace it with variable denoter
+        for (String pattern : filterPatterns) {
+            logMessage = logMessage.replaceAll(pattern, variableDenoter);
+        }
+
+        for (String delimiter : delimiters) {
+            logMessage = logMessage.replace(delimiter, "");
+        }
+
+        // Append logId/docId to the end of the split tokens
+        logMessage = logMessage.trim() + " " + logId;
+
+        return Arrays.asList(logMessage.split(" "));
+    }
+
+    /**
+     * Count token frequency per position/index in the token list
+     * @param tokens list of tokens from preprocessed log message
+     */
+    public void processTokenHistogram(List<String> tokens) {
+        // Ignore last element since it's designed to be appended logId
+        for (int i = 0; i < tokens.size() - 1; i++) {
+            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
+            tokenFreqMap.put(tokenKey, tokenFreqMap.getOrDefault(tokenKey, 0L) + 1);
+        }
+    }
+
+    /**
+     * Preprocess all lines of log messages with logId list. Empty logId list is allowed as the index within
+     * the list will be logId by default
+     * @param logMessages list of log messages
+     * @param logIds list of logIds corresponded to log message
+     * @return list of token lists
+     */
+    public List<List<String>> preprocessAllLogs(List<String> logMessages, List<String> logIds) {
+        List<List<String>> preprocessedLogs = new ArrayList<>();
+        int size = logIds.isEmpty() ? logMessages.size() : Math.min(logMessages.size(), logIds.size());
+
+        for (int i = 0; i < size; i++) {
+            String logId = logIds.isEmpty() ? String.valueOf(i) : logIds.get(i);
+            List<String> tokens = this.preprocess(logMessages.get(i), logId);
+            if (tokens.size() > 1) {
+                preprocessedLogs.add(tokens);
+                this.processTokenHistogram(tokens);
+            }
+        }
+
+        return preprocessedLogs;
+    }
+
+    /**
+     * The second process step to calculate initial groups of tokens based on previous token histogram.
+     * The group will be represented by the representative word combination of the log message. The word
+     * combination usually selects the longest word combination with the same frequency that should be above
+     * designed threshold.
+     * <p>
+     * Within initial group, new group level token set per position is counted for final log pattern calculation
+     * @param preprocessedLogs preprocessed list of log messages
+     */
+    public void calculateGroupTokenFreq(List<List<String>> preprocessedLogs) {
+        for (List<String> tokens : preprocessedLogs) {
+            Map<Long, Integer> wordOccurrences = this.getWordOccurrences(tokens);
+            List<Map.Entry<Long, Integer>> sortedOccurrences = this.getSortedWordCombinations(wordOccurrences);
+            Map.Entry<Long, Integer> candidate = this.findCandidate(sortedOccurrences);
+            String groupCandidateStr = String.format(Locale.ROOT, "%d,%d", candidate.getKey(), candidate.getValue());
+            this.logIdGroupCandidateMap.put(tokens.get(tokens.size() - 1), groupCandidateStr);
+            this.updateGroupTokenFreqMap(tokens, groupCandidateStr);
+        }
+    }
+
+    /**
+     * Parse single line of log pattern after preprocess - processTokenHistogram - calculateGroupTokenFreq
+     * @param tokens list of tokens for a specific log message
+     * @return parsed log pattern that is a list of string
+     */
+    public List<String> parseLogPattern(List<String> tokens) {
+        String logId = tokens.get(tokens.size() - 1);
+        String groupCandidateStr = this.logIdGroupCandidateMap.get(logId);
+        String[] groupCandidate = groupCandidateStr.split(",");
+        Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group
+        return IntStream.range(0, tokens.size() - 1).mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))).map(entry -> {
+            int index = entry.getKey();
+            String token = entry.getValue();
+            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, index, token);
+            assert this.tokenFreqMap.get(tokenKey) != null : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index);
+
+            boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq;
+            boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq;
+            String groupTokenKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokens.size() - 1, groupCandidateStr, index);
+            assert this.groupTokenSetMap.get(groupTokenKey) != null : String.format(
+                Locale.ROOT,
+                "Not found any token in group: %s",
+                groupTokenKey
+            );
+
+            if (isHigherFrequency) {
+                // For higher frequency token that doesn't belong to word combination, it's likely to be constant token only if
+                // it's unique token on that position within the group
+                boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1;
+                if (!isUniqueToken) {
+                    return variableDenoter;
+                }
+            } else if (isLowerFrequency) {
+                // For lower frequency token that doesn't belong to word combination, it's likely to be constant token only if
+                // it doesn't exceed the preset variable count threshold. For example, some variable are limited number of enums,
+                // and sometimes they could be treated as constant tokens.
+                if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) {
+                    return variableDenoter;
+                }
+            }
+            return token;
+        }).collect(Collectors.toList());
+    }
+
+    /**
+     * Parse all lines of log messages to generate the log pattern map.
+     * @param logMessages all lines of log messages
+     * @param logIds corresponding logIds for all lines of log messages
+     * @return log pattern map with log pattern string as key, grouped logIds as value
+     */
+    public Map<String, List<String>> parseAllLogPatterns(List<String> logMessages, List<String> logIds) {
+        List<List<String>> processedMessages = this.preprocessAllLogs(logMessages, logIds);
+
+        this.calculateGroupTokenFreq(processedMessages);
+
+        Map<String, List<String>> logPatternMap = new HashMap<>();
+        for (int i = 0; i < processedMessages.size(); i++) {
+            List<String> processedMessage = processedMessages.get(i);
+            String logId = logIds.isEmpty() ? String.valueOf(i) : processedMessage.get(processedMessage.size() - 1);
+            List<String> logPattern = this.parseLogPattern(processedMessages.get(i));
+            String patternKey = String.join(" ", logPattern);
+            logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId);
+        }
+        return logPatternMap;
+    }
+
+    /**
+     * Get token histogram
+     * @return map of token per position key and its frequency
+     */
+    public Map<String, Long> getTokenFreqMap() {
+        return this.tokenFreqMap;
+    }
+
+    /**
+     * Get group per length per position to its token set map
+     * @return map of pattern group per length per position key and its token set
+     */
+    public Map<String, Set<String>> getGroupTokenSetMap() {
+        return this.groupTokenSetMap;
+    }
+
+    /**
+     * Get logId to its group candidate map
+     * @return map of logId and group candidate
+     */
+    public Map<String, String> getLogIdGroupCandidateMap() {
+        return this.logIdGroupCandidateMap;
+    }
+
+    private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
+        Map<Long, Integer> occurrences = new HashMap<>();
+        for (int i = 0; i < tokens.size() - 1; i++) {
+            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
+            Long tokenFreq = tokenFreqMap.get(tokenKey);
+            occurrences.put(tokenFreq, occurrences.getOrDefault(tokenFreq, 0) + 1);
+        }
+        return occurrences;
+    }
+
+    private List<Map.Entry<Long, Integer>> getSortedWordCombinations(Map<Long, Integer> occurrences) {
+        List<Map.Entry<Long, Integer>> sortedOccurrences = new ArrayList<>(occurrences.entrySet());
+        sortedOccurrences.sort((entry1, entry2) -> {
+            int wordCombinationLengthComparison = entry2.getValue().compareTo(entry1.getValue());
+            if (wordCombinationLengthComparison != 0) {
+                return wordCombinationLengthComparison;
+            } else {
+                return entry2.getKey().compareTo(entry1.getKey());
+            }
+        });
+
+        return sortedOccurrences;
+    }
+
+    private Map.Entry<Long, Integer> findCandidate(List<Map.Entry<Long, Integer>> sortedWordCombinations) {
+        OptionalLong maxFreqOptional = sortedWordCombinations.stream().mapToLong(Map.Entry::getKey).max();
+        if (maxFreqOptional.isPresent()) {
+            long maxFreq = maxFreqOptional.getAsLong();
+            float threshold = maxFreq * this.thresholdPercentage;
+            for (Map.Entry<Long, Integer> entry : sortedWordCombinations) {
+                if (entry.getKey() > threshold) {
+                    return entry;
+                }
+            }
+        }
+        return sortedWordCombinations.get(0);
+    }
+
+    private void updateGroupTokenFreqMap(List<String> tokens, String groupCandidateStr) {
+        int tokensLen = tokens.size() - 1;
+        for (int i = 0; i < tokensLen; i++) {
+            String groupTokenFreqKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokensLen, groupCandidateStr, i);
+            this.groupTokenSetMap.computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()).add(tokens.get(i));
+        }
+    }
+}
diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
new file mode 100644
index 0000000000000..137e2eb590c4f
--- /dev/null
+++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
@@ -0,0 +1,153 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+package org.opensearch.pattern;
+
+import org.opensearch.test.OpenSearchTestCase;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+public class BrainLogParserTests extends OpenSearchTestCase {
+
+    private static final List<String> TEST_HDFS_LOGS = Arrays.asList(
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31.85:50010 is added to blk_-7017553867379051457 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296. blk_-6620182933895093708",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.244:50010 is added to blk_-6956067134432991406 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000230_0/part-00230. blk_559204981722276126",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000169_0/part-00169. blk_-7105305952901940477",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.107.19:50010 is added to blk_-3249711809227781266 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318. blk_-207775976836691685",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.6.4:50010 is added to blk_5114010683183383297 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318. blk_2096692261399680562",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.15.240:50010 is added to blk_-1055254430948037872 size 67108864",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.146:50010 is added to blk_278357163850888 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000138_0/part-00138. blk_-210021574616486609",
+        "Verification succeeded for blk_-1547954353065580372",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.39.242:50010 is added to blk_-4110733372292809607 size 67108864",
+        "BLOCK* NameSystem.allocateBlock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_000382_0/part-00382. blk_8935202950442998446",
+        "BLOCK* NameSystem.allocateBlock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_000392_0/part-00392. blk_-3010126661650043258",
+        "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.25.237:50010 is added to blk_541463031152673662 size 67108864",
+        "Verification succeeded for blk_6996194389878584395",
+        "PacketResponder failed for blk_6996194389878584395",
+        "PacketResponder failed for blk_-1547954353065580372"
+    );
+
+    private BrainLogParser parser;
+
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        parser = new BrainLogParser();
+    }
+
+    public void testPreprocess() {
+        String logMessage = "127.0.0.1 - 1234 something";
+        String logId = "log1";
+        List<String> expectedResult = Arrays.asList("<*>", "", "<*>", "something", "log1");
+        List<String> result = parser.preprocess(logMessage, logId);
+        assertEquals(expectedResult, result);
+
+        // Test with different delimiter
+        logMessage = "127.0.0.1=1234 something";
+        logId = "log2";
+        expectedResult = Arrays.asList("<*><*>", "something", "log2");
+        result = parser.preprocess(logMessage, logId);
+        assertEquals(expectedResult, result);
+    }
+
+    public void testPreprocessAllLogs() {
+        List<String> logMessages = Arrays.asList("127.0.0.1 - 1234 something", "192.168.0.1 - 5678 something_else");
+        List<String> logIds = Arrays.asList("log1", "log2");
+
+        List<List<String>> result = parser.preprocessAllLogs(logMessages, logIds);
+
+        assertEquals(2, result.size());
+        assertEquals(Arrays.asList("<*>", "", "<*>", "something", "log1"), result.get(0));
+        assertEquals(Arrays.asList("<*>", "", "<*>", "something_else", "log2"), result.get(1));
+    }
+
+    public void testProcessTokenHistogram() {
+        String something = String.format(Locale.ROOT, "%d-%s", 0, "something");
+        String up = String.format(Locale.ROOT, "%d-%s", 1, "up");
+        List<String> firstTokens = Arrays.asList("something", "up", "0");
+        parser.processTokenHistogram(firstTokens);
+        assertEquals(1L, parser.getTokenFreqMap().get(something).longValue());
+        assertEquals(1L, parser.getTokenFreqMap().get(up).longValue());
+
+        List<String> secondTokens = Arrays.asList("something", "down", "1");
+        parser.processTokenHistogram(secondTokens);
+        assertEquals(2L, parser.getTokenFreqMap().get(something).longValue());
+        assertEquals(1L, parser.getTokenFreqMap().get(up).longValue());
+    }
+
+    public void testCalculateGroupTokenFreq() {
+        List<String> logMessages = Arrays.asList(
+            "127.0.0.1 - 1234 something",
+            "192.168.0.1:5678 something_else",
+            "0.0.0.0:42 something_else"
+        );
+        List<String> logIds = Arrays.asList("log1", "log2", "log3");
+
+        List<List<String>> preprocessedLogs = parser.preprocessAllLogs(logMessages, logIds);
+        parser.calculateGroupTokenFreq(preprocessedLogs);
+
+        for (String logId : logIds) {
+            String groupCandidate = parser.getLogIdGroupCandidateMap().get(logId);
+            assertNotNull(groupCandidate);
+        }
+        assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something")));
+        assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something_else")));
+        String sampleGroupTokenKey = String.format(Locale.ROOT, "%d-%s-%d", 4, parser.getLogIdGroupCandidateMap().get("log1"), 3);
+        assertTrue(parser.getGroupTokenSetMap().get(sampleGroupTokenKey).contains("something"));
+    }
+
+    public void testParseLogPattern() {
+        List<List<String>> preprocessedLogs = parser.preprocessAllLogs(TEST_HDFS_LOGS, List.of());
+        parser.calculateGroupTokenFreq(preprocessedLogs);
+
+        List<String> expectedLogPattern = Arrays.asList(
+            "BLOCK*",
+            "NameSystem.addStoredBlock",
+            "blockMap",
+            "updated",
+            "<*>",
+            "is",
+            "added",
+            "to",
+            "blk_<*>",
+            "size",
+            "<*>"
+        );
+        List<String> logPattern = parser.parseLogPattern(preprocessedLogs.get(0));
+        assertEquals(expectedLogPattern, logPattern);
+    }
+
+    public void testParseAllLogPatterns() {
+        Map<String, List<String>> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS, List.of());
+
+        Map<String, Integer> expectedResult = Map.of(
+            "PacketResponder failed for blk_<*>",
+            2,
+            "Verification succeeded for blk_<*>",
+            2,
+            "BLOCK* NameSystem.addStoredBlock blockMap updated <*> is added to blk_<*> size <*>",
+            8,
+            "BLOCK* NameSystem.allocateBlock <*> blk_<*>",
+            8
+        );
+        Map<String, Integer> logPatternByCountMap = logPatternMap.entrySet()
+            .stream()
+            .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().size()));
+        assertEquals(expectedResult, logPatternByCountMap);
+    }
+}

From bf5a834d6899caccf98a9f67808f2c0d689cb9f1 Mon Sep 17 00:00:00 2001
From: Songkan Tang <songkant@amazon.com>
Date: Mon, 2 Dec 2024 13:41:39 +0800
Subject: [PATCH 2/5] Remove unnecessary changes in missing-javadoc.gradle

Signed-off-by: Songkan Tang <songkant@amazon.com>
---
 gradle/missing-javadoc.gradle | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gradle/missing-javadoc.gradle b/gradle/missing-javadoc.gradle
index 77479c93cf4a0..97831e88043d7 100644
--- a/gradle/missing-javadoc.gradle
+++ b/gradle/missing-javadoc.gradle
@@ -8,6 +8,7 @@
 
 
 import javax.annotation.Nullable
+import javax.inject.Inject
 import org.gradle.api.tasks.PathSensitive;
 import org.gradle.api.tasks.PathSensitivity;
 import org.gradle.internal.jvm.Jvm
@@ -228,6 +229,11 @@ class MissingJavadocTask extends DefaultTask {
   @PathSensitive(PathSensitivity.RELATIVE)
   def taskResources
 
+  // See please https://docs.gradle.org/8.11/userguide/service_injection.html#execoperations
+  interface InjectedExecOps {
+    @Inject ExecOperations getExecOps()
+  }
+
   /** Utility method to recursively collect all tasks with same name like this one that we depend on */
   private Set findRenderTasksInDependencies() {
     Set found = []
@@ -318,11 +324,12 @@ class MissingJavadocTask extends DefaultTask {
       }
     }()
 
+    def execOps = project.objects.newInstance(InjectedExecOps)
     def outputFile = project.file("${getTemporaryDir()}/javadoc-output.txt")
     def result
 
     outputFile.withOutputStream { output ->
-      result = project.exec {
+      result = execOps.execOps.exec {
         executable javadocCmd
 
         // we want to capture both stdout and stderr to the same

From 00de7ad0f42609dba5f204c874e04144661d2cb7 Mon Sep 17 00:00:00 2001
From: Songkan Tang <songkant@amazon.com>
Date: Mon, 2 Dec 2024 14:37:33 +0800
Subject: [PATCH 3/5] Update changelog

Signed-off-by: Songkan Tang <songkant@amazon.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c81586548d210..9f64b5001a0aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Support prefix list for remote repository attributes([#16271](https://github.com/opensearch-project/OpenSearch/pull/16271))
 - Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).
 - Add stats for remote publication failure and move download failure stats to remote methods([#16682](https://github.com/opensearch-project/OpenSearch/pull/16682/))
+- Introduce log pattern lib with initial implementation of Brain algorithm log parser([#16751](https://github.com/opensearch-project/OpenSearch/pull/16751))
 
 ### Dependencies
 - Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504))

From 3dd41565350c56453a3ba883b1b581413b4359f1 Mon Sep 17 00:00:00 2001
From: Songkan Tang <songkant@amazon.com>
Date: Tue, 3 Dec 2024 16:23:28 +0800
Subject: [PATCH 4/5] Address comments and add more unit test cases

Signed-off-by: Songkan Tang <songkant@amazon.com>
---
 libs/pattern/build.gradle                     | 19 ----
 .../opensearch/pattern/BrainLogParser.java    | 96 ++++++++++++-------
 .../pattern/BrainLogParserTests.java          | 76 +++++++++++++++
 3 files changed, 137 insertions(+), 54 deletions(-)

diff --git a/libs/pattern/build.gradle b/libs/pattern/build.gradle
index bd12d772f3673..f63206f2cee33 100644
--- a/libs/pattern/build.gradle
+++ b/libs/pattern/build.gradle
@@ -6,25 +6,6 @@
  * compatible open source license.
  */
 
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
 apply plugin: 'opensearch.build'
 apply plugin: 'opensearch.publish'
 
diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
index b2c9337938ca4..50c190d3529e4 100644
--- a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
+++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
@@ -18,6 +18,7 @@
 import java.util.Map;
 import java.util.OptionalLong;
 import java.util.Set;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
@@ -26,30 +27,37 @@
  */
 public class BrainLogParser {
 
-    private static final List<String> defaultFilterPatterns = List.of(
-        "(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)", // IP
-        "(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$" // Numbers
+    private static final List<Pattern> DEFAULT_FILTER_PATTERNS = Arrays.asList(
+        Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), // IP
+        Pattern.compile("(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$") // Numbers
     );
-    private static final List<String> defaultDelimiters = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+");
-    private static final String variableDenoter = "<*>";
+    private static final List<String> DEFAULT_DELIMITERS = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+");
+    private static final String VARIABLE_DENOTER = "<*>";
     // counting frequency will be grouped by composite of position and token string
-    private static final String positionedTokenKeyFormat = "%d-%s";
+    private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s";
     // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position.
-    private static final String groupTokenSetKeyFormat = "%d-%s-%d";
+    private static final String GROUP_TOKEN_SET_KEY_FORMAT = "%d-%s-%d";
+    // By default, a lower-frequency token is treated as variable when its position holds 2 or more distinct tokens in the group
+    private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 2;
+    /*
+     * By default, the algorithm treats the longest word combination as the group root, no matter what its frequency is.
+     * Otherwise, the longest word combination whose frequency > highest frequency of the log * threshold percentage is selected
+     */
+    private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.0f;
 
     private final Map<String, Long> tokenFreqMap;
     private final Map<String, Set<String>> groupTokenSetMap;
     private final Map<String, String> logIdGroupCandidateMap;
     private final int variableCountThreshold;
     private final float thresholdPercentage;
-    private final List<String> filterPatterns;
+    private final List<Pattern> filterPatterns;
     private final List<String> delimiters;
 
     /**
      * Creates new Brain log parser with default parameters
      */
     public BrainLogParser() {
-        this(2, 0.0f, defaultFilterPatterns, defaultDelimiters);
+        this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
     }
 
     /**
@@ -57,33 +65,47 @@ public BrainLogParser() {
      * @param variableCountThreshold the threshold to decide whether low frequency token is variable
      */
     public BrainLogParser(int variableCountThreshold) {
-        this(variableCountThreshold, 0.0f, defaultFilterPatterns, defaultDelimiters);
+        this(variableCountThreshold, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
     }
 
     /**
-     * Creates new Brain log parser with overridden variableCountThreshold amd thresholdPercentage
+     * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage
      * @param variableCountThreshold the threshold to decide whether low frequency token is variable
      * @param thresholdPercentage the threshold percentage to decide which frequency is representative
      *                            frequency per log message
      */
     public BrainLogParser(int variableCountThreshold, float thresholdPercentage) {
-        this(variableCountThreshold, thresholdPercentage, defaultFilterPatterns, defaultDelimiters);
+        this(variableCountThreshold, thresholdPercentage, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
     }
 
     /**
-     * Creates new Brain log parser with overridden variableCountThreshold amd thresholdPercentage and
+     * Creates new Brain log parser with overridden variableCountThreshold, thresholdPercentage and filter patterns
+     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
+     * @param thresholdPercentage the threshold percentage to decide which frequency is representative
+     *                            frequency per log message
+     * @param filterPatterns a list of regex to replace matched pattern with variable denoter
+     */
+    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<Pattern> filterPatterns) {
+        this(variableCountThreshold, thresholdPercentage, filterPatterns, DEFAULT_DELIMITERS);
+    }
+
+    /**
+     * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage and
      * overridden filter patterns and delimiters
      * @param variableCountThreshold the threshold to decide whether low frequency token is variable
      * @param thresholdPercentage the threshold percentage to decide which frequency is representative
      *                            frequency per log message
-     * @param filterPatterns a list of regex to replace matched pattern to be replaced with variable denoter
+     * @param filterPatterns a list of regex to replace matched pattern with variable denoter
      * @param delimiters a list of delimiters to be replaced with empty string after regex replacement
      */
-    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<String> filterPatterns, List<String> delimiters) {
+    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<Pattern> filterPatterns, List<String> delimiters) {
         this.tokenFreqMap = new HashMap<>();
         this.groupTokenSetMap = new HashMap<>();
         this.logIdGroupCandidateMap = new HashMap<>();
         this.variableCountThreshold = variableCountThreshold;
+        if (thresholdPercentage < 0.0f || thresholdPercentage > 1.0f) {
+            throw new IllegalArgumentException("Threshold percentage must be between 0.0 and 1.0");
+        }
         this.thresholdPercentage = thresholdPercentage;
         this.filterPatterns = filterPatterns;
         this.delimiters = delimiters;
@@ -96,9 +118,12 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis
      * @return list of tokens by splitting preprocessed log message
      */
     public List<String> preprocess(String logMessage, String logId) {
+        if (logMessage == null || logId == null) {
+            throw new IllegalArgumentException("log message or logId must not be null");
+        }
         // match regex and replace it with variable denoter
-        for (String pattern : filterPatterns) {
-            logMessage = logMessage.replaceAll(pattern, variableDenoter);
+        for (Pattern pattern : filterPatterns) {
+            logMessage = pattern.matcher(logMessage).replaceAll(VARIABLE_DENOTER);
         }
 
         for (String delimiter : delimiters) {
@@ -118,7 +143,7 @@ public List<String> preprocess(String logMessage, String logId) {
     public void processTokenHistogram(List<String> tokens) {
         // Ignore last element since it's designed to be appended logId
         for (int i = 0; i < tokens.size() - 1; i++) {
-            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
+            String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i));
             tokenFreqMap.put(tokenKey, tokenFreqMap.getOrDefault(tokenKey, 0L) + 1);
         }
     }
@@ -137,10 +162,8 @@ public List<List<String>> preprocessAllLogs(List<String> logMessages, List<Strin
         for (int i = 0; i < size; i++) {
             String logId = logIds.isEmpty() ? String.valueOf(i) : logIds.get(i);
             List<String> tokens = this.preprocess(logMessages.get(i), logId);
-            if (tokens.size() > 1) {
-                preprocessedLogs.add(tokens);
-                this.processTokenHistogram(tokens);
-            }
+            preprocessedLogs.add(tokens);
+            this.processTokenHistogram(tokens);
         }
 
         return preprocessedLogs;
@@ -179,12 +202,12 @@ public List<String> parseLogPattern(List<String> tokens) {
         return IntStream.range(0, tokens.size() - 1).mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))).map(entry -> {
             int index = entry.getKey();
             String token = entry.getValue();
-            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, index, token);
+            String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, index, token);
             assert this.tokenFreqMap.get(tokenKey) != null : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index);
 
             boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq;
             boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq;
-            String groupTokenKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokens.size() - 1, groupCandidateStr, index);
+            String groupTokenKey = String.format(Locale.ROOT, GROUP_TOKEN_SET_KEY_FORMAT, tokens.size() - 1, groupCandidateStr, index);
             assert this.groupTokenSetMap.get(groupTokenKey) != null : String.format(
                 Locale.ROOT,
                 "Not found any token in group: %s",
@@ -196,14 +219,14 @@ public List<String> parseLogPattern(List<String> tokens) {
                 // it's unique token on that position within the group
                 boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1;
                 if (!isUniqueToken) {
-                    return variableDenoter;
+                    return VARIABLE_DENOTER;
                 }
             } else if (isLowerFrequency) {
                 // For lower frequency token that doesn't belong to word combination, it's likely to be constant token only if
                 // it doesn't exceed the preset variable count threshold. For example, some variable are limited number of enums,
                 // and sometimes they could be treated as constant tokens.
                 if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) {
-                    return variableDenoter;
+                    return VARIABLE_DENOTER;
                 }
             }
             return token;
@@ -259,7 +282,7 @@ public Map<String, String> getLogIdGroupCandidateMap() {
     private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
         Map<Long, Integer> occurrences = new HashMap<>();
         for (int i = 0; i < tokens.size() - 1; i++) {
-            String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
+            String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i));
             Long tokenFreq = tokenFreqMap.get(tokenKey);
             occurrences.put(tokenFreq, occurrences.getOrDefault(tokenFreq, 0) + 1);
         }
@@ -269,10 +292,12 @@ private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
     private List<Map.Entry<Long, Integer>> getSortedWordCombinations(Map<Long, Integer> occurrences) {
         List<Map.Entry<Long, Integer>> sortedOccurrences = new ArrayList<>(occurrences.entrySet());
         sortedOccurrences.sort((entry1, entry2) -> {
+            // Sort by length of the word combination in descending order
             int wordCombinationLengthComparison = entry2.getValue().compareTo(entry1.getValue());
             if (wordCombinationLengthComparison != 0) {
                 return wordCombinationLengthComparison;
             } else {
+                // If the lengths of the word combinations are the same, sort by frequency in descending order
                 return entry2.getKey().compareTo(entry1.getKey());
             }
         });
@@ -281,14 +306,15 @@ private List<Map.Entry<Long, Integer>> getSortedWordCombinations(Map<Long, Integ
     }
 
     private Map.Entry<Long, Integer> findCandidate(List<Map.Entry<Long, Integer>> sortedWordCombinations) {
+        if (sortedWordCombinations.isEmpty()) {
+            throw new IllegalArgumentException("Sorted word combinations must be non empty");
+        }
         OptionalLong maxFreqOptional = sortedWordCombinations.stream().mapToLong(Map.Entry::getKey).max();
-        if (maxFreqOptional.isPresent()) {
-            long maxFreq = maxFreqOptional.getAsLong();
-            float threshold = maxFreq * this.thresholdPercentage;
-            for (Map.Entry<Long, Integer> entry : sortedWordCombinations) {
-                if (entry.getKey() > threshold) {
-                    return entry;
-                }
+        long maxFreq = maxFreqOptional.getAsLong();
+        float threshold = maxFreq * this.thresholdPercentage;
+        for (Map.Entry<Long, Integer> entry : sortedWordCombinations) {
+            if (entry.getKey() > threshold) {
+                return entry;
             }
         }
         return sortedWordCombinations.get(0);
@@ -297,7 +323,7 @@ private Map.Entry<Long, Integer> findCandidate(List<Map.Entry<Long, Integer>> so
     private void updateGroupTokenFreqMap(List<String> tokens, String groupCandidateStr) {
         int tokensLen = tokens.size() - 1;
         for (int i = 0; i < tokensLen; i++) {
-            String groupTokenFreqKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokensLen, groupCandidateStr, i);
+            String groupTokenFreqKey = String.format(Locale.ROOT, GROUP_TOKEN_SET_KEY_FORMAT, tokensLen, groupCandidateStr, i);
             this.groupTokenSetMap.computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()).add(tokens.get(i));
         }
     }
diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
index 137e2eb590c4f..bcbabbf46a446 100644
--- a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
+++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
@@ -50,6 +50,14 @@ public void setUp() throws Exception {
         parser = new BrainLogParser();
     }
 
+    public void testNewParserWithIllegalArgument() {
+        String exceptionMessage = "Threshold percentage must be between 0.0 and 1.0";
+        Throwable throwable = assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, -1.0f));
+        assertEquals(exceptionMessage, throwable.getMessage());
+        throwable = assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, 1.1f));
+        assertEquals(exceptionMessage, throwable.getMessage());
+    }
+
     public void testPreprocess() {
         String logMessage = "127.0.0.1 - 1234 something";
         String logId = "log1";
@@ -65,6 +73,18 @@ public void testPreprocess() {
         assertEquals(expectedResult, result);
     }
 
+    public void testPreprocessWithIllegalInput() {
+        String logMessage = "127.0.0.1 - 1234 something";
+        String logId = "log1";
+        String exceptionMessage = "log message or logId must not be null";
+        Throwable throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId));
+        assertEquals(exceptionMessage, throwable.getMessage());
+        throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null));
+        assertEquals(exceptionMessage, throwable.getMessage());
+        throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null));
+        assertEquals(exceptionMessage, throwable.getMessage());
+    }
+
     public void testPreprocessAllLogs() {
         List<String> logMessages = Arrays.asList("127.0.0.1 - 1234 something", "192.168.0.1 - 5678 something_else");
         List<String> logIds = Arrays.asList("log1", "log2");
@@ -111,6 +131,13 @@ public void testCalculateGroupTokenFreq() {
         assertTrue(parser.getGroupTokenSetMap().get(sampleGroupTokenKey).contains("something"));
     }
 
+    public void testCalculateGroupTokenFreqWithIllegalInput() {
+        List<List<String>> preprocessedLogs = Arrays.asList(List.of());
+        String exceptionMessage = "Sorted word combinations must be non empty";
+        Throwable throwable = assertThrows(IllegalArgumentException.class, () -> parser.calculateGroupTokenFreq(preprocessedLogs));
+        assertEquals(exceptionMessage, throwable.getMessage());
+    }
+
     public void testParseLogPattern() {
         List<List<String>> preprocessedLogs = parser.preprocessAllLogs(TEST_HDFS_LOGS, List.of());
         parser.calculateGroupTokenFreq(preprocessedLogs);
@@ -150,4 +177,53 @@ public void testParseAllLogPatterns() {
             .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().size()));
         assertEquals(expectedResult, logPatternByCountMap);
     }
+
+    public void testParseLogPatternWhenLowerFrequencyTokenIsVariable() {
+        int testVariableCountThreshold = 3;
+        parser = new BrainLogParser(testVariableCountThreshold);
+        List<String> logMessages = Arrays.asList(
+            "Verification succeeded a blk_-1547954353065580372",
+            "Verification succeeded b blk_6996194389878584395",
+            "Verification succeeded c blk_6996194389878584395",
+            "Verification succeeded d blk_6996194389878584395"
+        );
+
+        Map<String, List<String>> expectedResult = Map.of("Verification succeeded <*> blk_<*>", Arrays.asList("0", "1", "2", "3"));
+        Map<String, List<String>> logPatternMap = parser.parseAllLogPatterns(logMessages, List.of());
+        assertEquals(expectedResult, logPatternMap);
+        /*
+         * Tokens 'a', 'b', 'c' and 'd' are on the 3rd position in the group 4,3, and their frequency is lower than the
+         * representative frequency. Since that position's distinct token count reaches the variable count threshold,
+         * the third position in this log group is treated as variable
+         */
+        assertTrue(parser.getTokenFreqMap().get("2-a") < parser.getTokenFreqMap().get("0-Verification"));
+        assertTrue(parser.getTokenFreqMap().get("2-b") < parser.getTokenFreqMap().get("0-Verification"));
+        assertTrue(testVariableCountThreshold <= parser.getGroupTokenSetMap().get("4-4,3-2").size());
+    }
+
+    public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() {
+        List<String> logMessages = Arrays.asList(
+            "Verification succeeded for blk_-1547954353065580372",
+            "Verification succeeded for blk_6996194389878584395",
+            "Test succeeded for blk_6996194389878584395",
+            "Verification",
+            "Verification"
+        );
+
+        Map<String, List<String>> expectedResult = Map.of(
+            "<*> succeeded for blk_<*>",
+            Arrays.asList("0", "1", "2"),
+            "Verification",
+            Arrays.asList("3", "4")
+        );
+        Map<String, List<String>> logPatternMap = parser.parseAllLogPatterns(logMessages, List.of());
+        assertEquals(expectedResult, logPatternMap);
+        /*
+         * Tokens 'Verification' and 'Test' are on the 1st position in the group 3,3. The frequency of 'Verification' is higher
+         * than the representative frequency because other groups have the 'Verification' token on the 1st position as well.
+         * Since the first position holds more than one distinct token, 'Verification' is eventually treated as variable.
+         */
+        assertTrue(parser.getTokenFreqMap().get("0-Verification") > parser.getTokenFreqMap().get("1-succeeded"));
+        assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1);
+    }
 }

From 1f833e9153bb0a5dd3bf64d49adb2147f211c979 Mon Sep 17 00:00:00 2001
From: Songkan Tang <songkant@amazon.com>
Date: Mon, 16 Dec 2024 14:41:54 +0800
Subject: [PATCH 5/5] Refine default regex and delimiters

Signed-off-by: Songkan Tang <songkant@amazon.com>
---
 .../opensearch/pattern/BrainLogParser.java    | 78 +++++++++++--------
 .../pattern/BrainLogParserTests.java          | 20 ++---
 2 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
index 50c190d3529e4..00b2ae23290b4 100644
--- a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
+++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
@@ -13,6 +13,7 @@
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -27,12 +28,25 @@
  */
 public class BrainLogParser {
 
-    private static final List<Pattern> DEFAULT_FILTER_PATTERNS = Arrays.asList(
-        Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), // IP
-        Pattern.compile("(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$") // Numbers
-    );
-    private static final List<String> DEFAULT_DELIMITERS = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+");
     private static final String VARIABLE_DENOTER = "<*>";
+    private static final Map<Pattern, String> DEFAULT_FILTER_PATTERN_VARIABLE_MAP = new LinkedHashMap<>();
+    static {
+        // IP
+        DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), "<*IP*>");
+        // Simple ISO date and time
+        DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(
+            Pattern.compile("(\\d{4}-\\d{2}-\\d{2})[T ]?(\\d{2}:\\d{2}:\\d{2})(\\.\\d{3})?(Z|([+-]\\d{2}:?\\d{2}))?"),
+            "<*DATETIME*>"
+        );
+        // Hexadecimal numbers, letters followed by digits, float numbers
+        DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(
+            Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(\\d+(\\.\\d*)?|\\.\\d+))"),
+            VARIABLE_DENOTER
+        );
+        // generic number surrounded by non-alphanumeric
+        DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(Pattern.compile("(?<=[^A-Za-z0-9])(-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$"), VARIABLE_DENOTER);
+    }
+    private static final List<String> DEFAULT_DELIMITERS = List.of(",", "+");
     // counting frequency will be grouped by composite of position and token string
     private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s";
     // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position.
@@ -50,22 +64,19 @@ public class BrainLogParser {
     private final Map<String, String> logIdGroupCandidateMap;
     private final int variableCountThreshold;
     private final float thresholdPercentage;
-    private final List<Pattern> filterPatterns;
+    private final Map<Pattern, String> filterPatternVariableMap;
     private final List<String> delimiters;
 
     /**
      * Creates new Brain log parser with default parameters
      */
     public BrainLogParser() {
-        this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
-    }
-
-    /**
-     * Creates new Brain log parser with overridden variableCountThreshold
-     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
-     */
-    public BrainLogParser(int variableCountThreshold) {
-        this(variableCountThreshold, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
+        this(
+            DEFAULT_VARIABLE_COUNT_THRESHOLD,
+            DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE,
+            DEFAULT_FILTER_PATTERN_VARIABLE_MAP,
+            DEFAULT_DELIMITERS
+        );
     }
 
     /**
@@ -75,18 +86,17 @@ public BrainLogParser(int variableCountThreshold) {
      *                            frequency per log message
      */
     public BrainLogParser(int variableCountThreshold, float thresholdPercentage) {
-        this(variableCountThreshold, thresholdPercentage, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS);
+        this(variableCountThreshold, thresholdPercentage, DEFAULT_FILTER_PATTERN_VARIABLE_MAP, DEFAULT_DELIMITERS);
     }
 
     /**
-     * Creates new Brain log parser with overridden variableCountThreshold, thresholdPercentage and filter patterns
-     * @param variableCountThreshold the threshold to decide whether low frequency token is variable
-     * @param thresholdPercentage the threshold percentage to decide which frequency is representative
-     *                            frequency per log message
-     * @param filterPatterns a list of regex to replace matched pattern with variable denoter
+     * Creates new Brain log parser with overridden filter patterns and delimiters
+     * @param filterPatternVariableMap a map of regex patterns to the variable denoter that replaces each matched pattern;
+     *                                 a LinkedHashMap is recommended so that patterns are applied in insertion order
+     * @param delimiters a list of delimiters to be replaced with a space after regex replacement
      */
-    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<Pattern> filterPatterns) {
-        this(variableCountThreshold, thresholdPercentage, filterPatterns, DEFAULT_DELIMITERS);
+    public BrainLogParser(Map<Pattern, String> filterPatternVariableMap, List<String> delimiters) {
+        this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, filterPatternVariableMap, delimiters);
     }
 
     /**
@@ -95,10 +105,16 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis
      * @param variableCountThreshold the threshold to decide whether low frequency token is variable
      * @param thresholdPercentage the threshold percentage to decide which frequency is representative
      *                            frequency per log message
-     * @param filterPatterns a list of regex to replace matched pattern with variable denoter
+     * @param filterPatternVariableMap a map of regex patterns to the variable denoter that replaces each matched pattern;
+     *                                 a LinkedHashMap is recommended so that patterns are applied in insertion order
      * @param delimiters a list of delimiters to be replaced with empty string after regex replacement
      */
-    public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<Pattern> filterPatterns, List<String> delimiters) {
+    public BrainLogParser(
+        int variableCountThreshold,
+        float thresholdPercentage,
+        Map<Pattern, String> filterPatternVariableMap,
+        List<String> delimiters
+    ) {
         this.tokenFreqMap = new HashMap<>();
         this.groupTokenSetMap = new HashMap<>();
         this.logIdGroupCandidateMap = new HashMap<>();
@@ -107,7 +123,7 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis
             throw new IllegalArgumentException("Threshold percentage must be between 0.0 and 1.0");
         }
         this.thresholdPercentage = thresholdPercentage;
-        this.filterPatterns = filterPatterns;
+        this.filterPatternVariableMap = filterPatternVariableMap;
         this.delimiters = delimiters;
     }
 
@@ -121,19 +137,19 @@ public List<String> preprocess(String logMessage, String logId) {
         if (logMessage == null || logId == null) {
             throw new IllegalArgumentException("log message or logId must not be null");
         }
-        // match regex and replace it with variable denoter
-        for (Pattern pattern : filterPatterns) {
-            logMessage = pattern.matcher(logMessage).replaceAll(VARIABLE_DENOTER);
+        // match regex and replace it with variable denoter in order
+        for (Map.Entry<Pattern, String> patternVariablePair : filterPatternVariableMap.entrySet()) {
+            logMessage = patternVariablePair.getKey().matcher(logMessage).replaceAll(patternVariablePair.getValue());
         }
 
         for (String delimiter : delimiters) {
-            logMessage = logMessage.replace(delimiter, "");
+            logMessage = logMessage.replace(delimiter, " ");
         }
 
         // Append logId/docId to the end of the split tokens
         logMessage = logMessage.trim() + " " + logId;
 
-        return Arrays.asList(logMessage.split(" "));
+        return Arrays.asList(logMessage.split("\\s+"));
     }
 
     /**
diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
index bcbabbf46a446..ff1389cdb698b 100644
--- a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
+++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java
@@ -61,14 +61,14 @@ public void testNewParserWithIllegalArgument() {
     public void testPreprocess() {
         String logMessage = "127.0.0.1 - 1234 something";
         String logId = "log1";
-        List<String> expectedResult = Arrays.asList("<*>", "", "<*>", "something", "log1");
+        List<String> expectedResult = Arrays.asList("<*IP*>", "-", "<*>", "something", "log1");
         List<String> result = parser.preprocess(logMessage, logId);
         assertEquals(expectedResult, result);
 
         // Test with different delimiter
         logMessage = "127.0.0.1=1234 something";
         logId = "log2";
-        expectedResult = Arrays.asList("<*><*>", "something", "log2");
+        expectedResult = Arrays.asList("<*IP*>=<*>", "something", "log2");
         result = parser.preprocess(logMessage, logId);
         assertEquals(expectedResult, result);
     }
@@ -92,8 +92,8 @@ public void testPreprocessAllLogs() {
         List<List<String>> result = parser.preprocessAllLogs(logMessages, logIds);
 
         assertEquals(2, result.size());
-        assertEquals(Arrays.asList("<*>", "", "<*>", "something", "log1"), result.get(0));
-        assertEquals(Arrays.asList("<*>", "", "<*>", "something_else", "log2"), result.get(1));
+        assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something", "log1"), result.get(0));
+        assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something_else", "log2"), result.get(1));
     }
 
     public void testProcessTokenHistogram() {
@@ -144,10 +144,10 @@ public void testParseLogPattern() {
 
         List<String> expectedLogPattern = Arrays.asList(
             "BLOCK*",
-            "NameSystem.addStoredBlock",
+            "NameSystem.addStoredBlock:",
             "blockMap",
-            "updated",
-            "<*>",
+            "updated:",
+            "<*IP*>",
             "is",
             "added",
             "to",
@@ -167,9 +167,9 @@ public void testParseAllLogPatterns() {
             2,
             "Verification succeeded for blk_<*>",
             2,
-            "BLOCK* NameSystem.addStoredBlock blockMap updated <*> is added to blk_<*> size <*>",
+            "BLOCK* NameSystem.addStoredBlock: blockMap updated: <*IP*> is added to blk_<*> size <*>",
             8,
-            "BLOCK* NameSystem.allocateBlock <*> blk_<*>",
+            "BLOCK* NameSystem.allocateBlock: <*> blk_<*>",
             8
         );
         Map<String, Integer> logPatternByCountMap = logPatternMap.entrySet()
@@ -180,7 +180,7 @@ public void testParseAllLogPatterns() {
 
     public void testParseLogPatternWhenLowerFrequencyTokenIsVariable() {
         int testVariableCountThreshold = 3;
-        parser = new BrainLogParser(testVariableCountThreshold);
+        parser = new BrainLogParser(testVariableCountThreshold, 0.0f);
         List<String> logMessages = Arrays.asList(
             "Verification succeeded a blk_-1547954353065580372",
             "Verification succeeded b blk_6996194389878584395",