From 313e35c046d39d3c003dceaace105a3a43265f53 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Mon, 2 Dec 2024 12:49:08 +0800 Subject: [PATCH 1/5] Introduce log pattern lib with initial implementation of Brain algorithm log parser Signed-off-by: Songkan Tang --- gradle/missing-javadoc.gradle | 10 +- libs/pattern/build.gradle | 39 +++ .../opensearch/pattern/BrainLogParser.java | 304 ++++++++++++++++++ .../pattern/BrainLogParserTests.java | 153 +++++++++ 4 files changed, 498 insertions(+), 8 deletions(-) create mode 100644 libs/pattern/build.gradle create mode 100644 libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java create mode 100644 libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java diff --git a/gradle/missing-javadoc.gradle b/gradle/missing-javadoc.gradle index 751da941d25dd..77479c93cf4a0 100644 --- a/gradle/missing-javadoc.gradle +++ b/gradle/missing-javadoc.gradle @@ -8,7 +8,6 @@ import javax.annotation.Nullable -import javax.inject.Inject import org.gradle.api.tasks.PathSensitive; import org.gradle.api.tasks.PathSensitivity; import org.gradle.internal.jvm.Jvm @@ -102,6 +101,7 @@ configure([ project(":libs:opensearch-geo"), project(":libs:opensearch-grok"), project(":libs:opensearch-nio"), + project(":libs:opensearch-pattern"), project(":libs:opensearch-plugin-classloader"), project(":libs:opensearch-secure-sm"), project(":libs:opensearch-ssl-config"), @@ -228,11 +228,6 @@ class MissingJavadocTask extends DefaultTask { @PathSensitive(PathSensitivity.RELATIVE) def taskResources - // See please https://docs.gradle.org/8.11/userguide/service_injection.html#execoperations - interface InjectedExecOps { - @Inject ExecOperations getExecOps() - } - /** Utility method to recursively collect all tasks with same name like this one that we depend on */ private Set findRenderTasksInDependencies() { Set found = [] @@ -323,12 +318,11 @@ class MissingJavadocTask extends DefaultTask { } }() - def execOps = 
project.objects.newInstance(InjectedExecOps) def outputFile = project.file("${getTemporaryDir()}/javadoc-output.txt") def result outputFile.withOutputStream { output -> - result = execOps.execOps.exec { + result = project.exec { executable javadocCmd // we want to capture both stdout and stderr to the same diff --git a/libs/pattern/build.gradle b/libs/pattern/build.gradle new file mode 100644 index 0000000000000..bd12d772f3673 --- /dev/null +++ b/libs/pattern/build.gradle @@ -0,0 +1,39 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +apply plugin: 'opensearch.build' +apply plugin: 'opensearch.publish' + +dependencies { + testImplementation(project(":test:framework")) { + exclude group: 'org.opensearch', module: 'opensearch-pattern' + } +} + +tasks.named('forbiddenApisMain').configure { + replaceSignatureFiles 'jdk-signatures' +} diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java new file mode 100644 index 0000000000000..b2c9337938ca4 --- /dev/null +++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java @@ -0,0 +1,304 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.pattern; + +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.OptionalLong; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Log parser Brain algorithm implementation. See: https://ieeexplore.ieee.org/document/10109145 + */ +public class BrainLogParser { + + private static final List defaultFilterPatterns = List.of( + "(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)", // IP + "(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$" // Numbers + ); + private static final List defaultDelimiters = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+"); + private static final String variableDenoter = "<*>"; + // counting frequency will be grouped by composite of position and token string + private static final String positionedTokenKeyFormat = "%d-%s"; + // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position. 
+ private static final String groupTokenSetKeyFormat = "%d-%s-%d"; + + private final Map tokenFreqMap; + private final Map> groupTokenSetMap; + private final Map logIdGroupCandidateMap; + private final int variableCountThreshold; + private final float thresholdPercentage; + private final List filterPatterns; + private final List delimiters; + + /** + * Creates new Brain log parser with default parameters + */ + public BrainLogParser() { + this(2, 0.0f, defaultFilterPatterns, defaultDelimiters); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + */ + public BrainLogParser(int variableCountThreshold) { + this(variableCountThreshold, 0.0f, defaultFilterPatterns, defaultDelimiters); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold amd thresholdPercentage + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + * @param thresholdPercentage the threshold percentage to decide which frequency is representative + * frequency per log message + */ + public BrainLogParser(int variableCountThreshold, float thresholdPercentage) { + this(variableCountThreshold, thresholdPercentage, defaultFilterPatterns, defaultDelimiters); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold amd thresholdPercentage and + * overridden filter patterns and delimiters + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + * @param thresholdPercentage the threshold percentage to decide which frequency is representative + * frequency per log message + * @param filterPatterns a list of regex to replace matched pattern to be replaced with variable denoter + * @param delimiters a list of delimiters to be replaced with empty string after regex replacement + */ + public BrainLogParser(int variableCountThreshold, float 
thresholdPercentage, List filterPatterns, List delimiters) { + this.tokenFreqMap = new HashMap<>(); + this.groupTokenSetMap = new HashMap<>(); + this.logIdGroupCandidateMap = new HashMap<>(); + this.variableCountThreshold = variableCountThreshold; + this.thresholdPercentage = thresholdPercentage; + this.filterPatterns = filterPatterns; + this.delimiters = delimiters; + } + + /** + * Preprocess single line of log message with logId + * @param logMessage log message body per log + * @param logId logId of the log + * @return list of tokens by splitting preprocessed log message + */ + public List preprocess(String logMessage, String logId) { + // match regex and replace it with variable denoter + for (String pattern : filterPatterns) { + logMessage = logMessage.replaceAll(pattern, variableDenoter); + } + + for (String delimiter : delimiters) { + logMessage = logMessage.replace(delimiter, ""); + } + + // Append logId/docId to the end of the split tokens + logMessage = logMessage.trim() + " " + logId; + + return Arrays.asList(logMessage.split(" ")); + } + + /** + * Count token frequency per position/index in the token list + * @param tokens list of tokens from preprocessed log message + */ + public void processTokenHistogram(List tokens) { + // Ignore last element since it's designed to be appended logId + for (int i = 0; i < tokens.size() - 1; i++) { + String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i)); + tokenFreqMap.put(tokenKey, tokenFreqMap.getOrDefault(tokenKey, 0L) + 1); + } + } + + /** + * Preprocess all lines of log messages with logId list. Empty logId list is allowed as the index within + * the list will be logId by default + * @param logMessages list of log messages + * @param logIds list of logIds corresponded to log message + * @return list of token lists + */ + public List> preprocessAllLogs(List logMessages, List logIds) { + List> preprocessedLogs = new ArrayList<>(); + int size = logIds.isEmpty() ? 
logMessages.size() : Math.min(logMessages.size(), logIds.size()); + + for (int i = 0; i < size; i++) { + String logId = logIds.isEmpty() ? String.valueOf(i) : logIds.get(i); + List tokens = this.preprocess(logMessages.get(i), logId); + if (tokens.size() > 1) { + preprocessedLogs.add(tokens); + this.processTokenHistogram(tokens); + } + } + + return preprocessedLogs; + } + + /** + * The second process step to calculate initial groups of tokens based on previous token histogram. + * The group will be represented by the representative word combination of the log message. The word + * combination usually selects the longest word combination with the same frequency that should be above + * designed threshold. + *

+ * Within initial group, new group level token set per position is counted for final log pattern calculation + * @param preprocessedLogs preprocessed list of log messages + */ + public void calculateGroupTokenFreq(List> preprocessedLogs) { + for (List tokens : preprocessedLogs) { + Map wordOccurrences = this.getWordOccurrences(tokens); + List> sortedOccurrences = this.getSortedWordCombinations(wordOccurrences); + Map.Entry candidate = this.findCandidate(sortedOccurrences); + String groupCandidateStr = String.format(Locale.ROOT, "%d,%d", candidate.getKey(), candidate.getValue()); + this.logIdGroupCandidateMap.put(tokens.get(tokens.size() - 1), groupCandidateStr); + this.updateGroupTokenFreqMap(tokens, groupCandidateStr); + } + } + + /** + * Parse single line of log pattern after preprocess - processTokenHistogram - calculateGroupTokenFreq + * @param tokens list of tokens for a specific log message + * @return parsed log pattern that is a list of string + */ + public List parseLogPattern(List tokens) { + String logId = tokens.get(tokens.size() - 1); + String groupCandidateStr = this.logIdGroupCandidateMap.get(logId); + String[] groupCandidate = groupCandidateStr.split(","); + Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group + return IntStream.range(0, tokens.size() - 1).mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))).map(entry -> { + int index = entry.getKey(); + String token = entry.getValue(); + String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, index, token); + assert this.tokenFreqMap.get(tokenKey) != null : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index); + + boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq; + boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq; + String groupTokenKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokens.size() - 1, groupCandidateStr, index); + assert 
this.groupTokenSetMap.get(groupTokenKey) != null : String.format( + Locale.ROOT, + "Not found any token in group: %s", + groupTokenKey + ); + + if (isHigherFrequency) { + // For higher frequency token that doesn't belong to word combination, it's likely to be constant token only if + // it's unique token on that position within the group + boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1; + if (!isUniqueToken) { + return variableDenoter; + } + } else if (isLowerFrequency) { + // For lower frequency token that doesn't belong to word combination, it's likely to be constant token only if + // it doesn't exceed the preset variable count threshold. For example, some variable are limited number of enums, + // and sometimes they could be treated as constant tokens. + if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) { + return variableDenoter; + } + } + return token; + }).collect(Collectors.toList()); + } + + /** + * Parse all lines of log messages to generate the log pattern map. + * @param logMessages all lines of log messages + * @param logIds corresponding logIds for all lines of log messages + * @return log pattern map with log pattern string as key, grouped logIds as value + */ + public Map> parseAllLogPatterns(List logMessages, List logIds) { + List> processedMessages = this.preprocessAllLogs(logMessages, logIds); + + this.calculateGroupTokenFreq(processedMessages); + + Map> logPatternMap = new HashMap<>(); + for (int i = 0; i < processedMessages.size(); i++) { + List processedMessage = processedMessages.get(i); + String logId = logIds.isEmpty() ? 
String.valueOf(i) : processedMessage.get(processedMessage.size() - 1); + List logPattern = this.parseLogPattern(processedMessages.get(i)); + String patternKey = String.join(" ", logPattern); + logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId); + } + return logPatternMap; + } + + /** + * Get token histogram + * @return map of token per position key and its frequency + */ + public Map getTokenFreqMap() { + return this.tokenFreqMap; + } + + /** + * Get group per length per position to its token set map + * @return map of pattern group per length per position key and its token set + */ + public Map> getGroupTokenSetMap() { + return this.groupTokenSetMap; + } + + /** + * Get logId to its group candidate map + * @return map of logId and group candidate + */ + public Map getLogIdGroupCandidateMap() { + return this.logIdGroupCandidateMap; + } + + private Map getWordOccurrences(List tokens) { + Map occurrences = new HashMap<>(); + for (int i = 0; i < tokens.size() - 1; i++) { + String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i)); + Long tokenFreq = tokenFreqMap.get(tokenKey); + occurrences.put(tokenFreq, occurrences.getOrDefault(tokenFreq, 0) + 1); + } + return occurrences; + } + + private List> getSortedWordCombinations(Map occurrences) { + List> sortedOccurrences = new ArrayList<>(occurrences.entrySet()); + sortedOccurrences.sort((entry1, entry2) -> { + int wordCombinationLengthComparison = entry2.getValue().compareTo(entry1.getValue()); + if (wordCombinationLengthComparison != 0) { + return wordCombinationLengthComparison; + } else { + return entry2.getKey().compareTo(entry1.getKey()); + } + }); + + return sortedOccurrences; + } + + private Map.Entry findCandidate(List> sortedWordCombinations) { + OptionalLong maxFreqOptional = sortedWordCombinations.stream().mapToLong(Map.Entry::getKey).max(); + if (maxFreqOptional.isPresent()) { + long maxFreq = maxFreqOptional.getAsLong(); + float threshold = maxFreq * 
this.thresholdPercentage; + for (Map.Entry entry : sortedWordCombinations) { + if (entry.getKey() > threshold) { + return entry; + } + } + } + return sortedWordCombinations.get(0); + } + + private void updateGroupTokenFreqMap(List tokens, String groupCandidateStr) { + int tokensLen = tokens.size() - 1; + for (int i = 0; i < tokensLen; i++) { + String groupTokenFreqKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokensLen, groupCandidateStr, i); + this.groupTokenSetMap.computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()).add(tokens.get(i)); + } + } +} diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java new file mode 100644 index 0000000000000..137e2eb590c4f --- /dev/null +++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java @@ -0,0 +1,153 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.pattern; + +import org.opensearch.test.OpenSearchTestCase; + +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +public class BrainLogParserTests extends OpenSearchTestCase { + + private static final List TEST_HDFS_LOGS = Arrays.asList( + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.31.85:50010 is added to blk_-7017553867379051457 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000296_0/part-00296. blk_-6620182933895093708", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.244:50010 is added to blk_-6956067134432991406 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000230_0/part-00230. 
blk_559204981722276126", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000169_0/part-00169. blk_-7105305952901940477", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.107.19:50010 is added to blk_-3249711809227781266 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318. blk_-207775976836691685", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.6.4:50010 is added to blk_5114010683183383297 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000318_0/part-00318. blk_2096692261399680562", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.15.240:50010 is added to blk_-1055254430948037872 size 67108864", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.250.7.146:50010 is added to blk_278357163850888 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000138_0/part-00138. blk_-210021574616486609", + "Verification succeeded for blk_-1547954353065580372", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.39.242:50010 is added to blk_-4110733372292809607 size 67108864", + "BLOCK* NameSystem.allocateBlock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_000382_0/part-00382. blk_8935202950442998446", + "BLOCK* NameSystem.allocateBlock: /user/root/randtxt/_temporary/_task_200811092030_0003_m_000392_0/part-00392. 
blk_-3010126661650043258", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.25.237:50010 is added to blk_541463031152673662 size 67108864", + "Verification succeeded for blk_6996194389878584395", + "PacketResponder failed for blk_6996194389878584395", + "PacketResponder failed for blk_-1547954353065580372" + ); + + private BrainLogParser parser; + + @Override + public void setUp() throws Exception { + super.setUp(); + parser = new BrainLogParser(); + } + + public void testPreprocess() { + String logMessage = "127.0.0.1 - 1234 something"; + String logId = "log1"; + List expectedResult = Arrays.asList("<*>", "", "<*>", "something", "log1"); + List result = parser.preprocess(logMessage, logId); + assertEquals(expectedResult, result); + + // Test with different delimiter + logMessage = "127.0.0.1=1234 something"; + logId = "log2"; + expectedResult = Arrays.asList("<*><*>", "something", "log2"); + result = parser.preprocess(logMessage, logId); + assertEquals(expectedResult, result); + } + + public void testPreprocessAllLogs() { + List logMessages = Arrays.asList("127.0.0.1 - 1234 something", "192.168.0.1 - 5678 something_else"); + List logIds = Arrays.asList("log1", "log2"); + + List> result = parser.preprocessAllLogs(logMessages, logIds); + + assertEquals(2, result.size()); + assertEquals(Arrays.asList("<*>", "", "<*>", "something", "log1"), result.get(0)); + assertEquals(Arrays.asList("<*>", "", "<*>", "something_else", "log2"), result.get(1)); + } + + public void testProcessTokenHistogram() { + String something = String.format(Locale.ROOT, "%d-%s", 0, "something"); + String up = String.format(Locale.ROOT, "%d-%s", 1, "up"); + List firstTokens = Arrays.asList("something", "up", "0"); + parser.processTokenHistogram(firstTokens); + assertEquals(1L, parser.getTokenFreqMap().get(something).longValue()); + assertEquals(1L, parser.getTokenFreqMap().get(up).longValue()); + + List secondTokens = Arrays.asList("something", "down", "1"); + 
parser.processTokenHistogram(secondTokens); + assertEquals(2L, parser.getTokenFreqMap().get(something).longValue()); + assertEquals(1L, parser.getTokenFreqMap().get(up).longValue()); + } + + public void testCalculateGroupTokenFreq() { + List logMessages = Arrays.asList( + "127.0.0.1 - 1234 something", + "192.168.0.1:5678 something_else", + "0.0.0.0:42 something_else" + ); + List logIds = Arrays.asList("log1", "log2", "log3"); + + List> preprocessedLogs = parser.preprocessAllLogs(logMessages, logIds); + parser.calculateGroupTokenFreq(preprocessedLogs); + + for (String logId : logIds) { + String groupCandidate = parser.getLogIdGroupCandidateMap().get(logId); + assertNotNull(groupCandidate); + } + assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something"))); + assertTrue(parser.getGroupTokenSetMap().containsValue(Set.of("something_else"))); + String sampleGroupTokenKey = String.format(Locale.ROOT, "%d-%s-%d", 4, parser.getLogIdGroupCandidateMap().get("log1"), 3); + assertTrue(parser.getGroupTokenSetMap().get(sampleGroupTokenKey).contains("something")); + } + + public void testParseLogPattern() { + List> preprocessedLogs = parser.preprocessAllLogs(TEST_HDFS_LOGS, List.of()); + parser.calculateGroupTokenFreq(preprocessedLogs); + + List expectedLogPattern = Arrays.asList( + "BLOCK*", + "NameSystem.addStoredBlock", + "blockMap", + "updated", + "<*>", + "is", + "added", + "to", + "blk_<*>", + "size", + "<*>" + ); + List logPattern = parser.parseLogPattern(preprocessedLogs.get(0)); + assertEquals(expectedLogPattern, logPattern); + } + + public void testParseAllLogPatterns() { + Map> logPatternMap = parser.parseAllLogPatterns(TEST_HDFS_LOGS, List.of()); + + Map expectedResult = Map.of( + "PacketResponder failed for blk_<*>", + 2, + "Verification succeeded for blk_<*>", + 2, + "BLOCK* NameSystem.addStoredBlock blockMap updated <*> is added to blk_<*> size <*>", + 8, + "BLOCK* NameSystem.allocateBlock <*> blk_<*>", + 8 + ); + Map logPatternByCountMap = 
logPatternMap.entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().size())); + assertEquals(expectedResult, logPatternByCountMap); + } +} From bf5a834d6899caccf98a9f67808f2c0d689cb9f1 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Mon, 2 Dec 2024 13:41:39 +0800 Subject: [PATCH 2/5] Remove unnecessary changes in missing-javadoc.gradle Signed-off-by: Songkan Tang --- gradle/missing-javadoc.gradle | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gradle/missing-javadoc.gradle b/gradle/missing-javadoc.gradle index 77479c93cf4a0..97831e88043d7 100644 --- a/gradle/missing-javadoc.gradle +++ b/gradle/missing-javadoc.gradle @@ -8,6 +8,7 @@ import javax.annotation.Nullable +import javax.inject.Inject import org.gradle.api.tasks.PathSensitive; import org.gradle.api.tasks.PathSensitivity; import org.gradle.internal.jvm.Jvm @@ -228,6 +229,11 @@ class MissingJavadocTask extends DefaultTask { @PathSensitive(PathSensitivity.RELATIVE) def taskResources + // See please https://docs.gradle.org/8.11/userguide/service_injection.html#execoperations + interface InjectedExecOps { + @Inject ExecOperations getExecOps() + } + /** Utility method to recursively collect all tasks with same name like this one that we depend on */ private Set findRenderTasksInDependencies() { Set found = [] @@ -318,11 +324,12 @@ class MissingJavadocTask extends DefaultTask { } }() + def execOps = project.objects.newInstance(InjectedExecOps) def outputFile = project.file("${getTemporaryDir()}/javadoc-output.txt") def result outputFile.withOutputStream { output -> - result = project.exec { + result = execOps.execOps.exec { executable javadocCmd // we want to capture both stdout and stderr to the same From 00de7ad0f42609dba5f204c874e04144661d2cb7 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Mon, 2 Dec 2024 14:37:33 +0800 Subject: [PATCH 3/5] Update changelog Signed-off-by: Songkan Tang --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index c81586548d210..9f64b5001a0aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Support prefix list for remote repository attributes([#16271](https://github.com/opensearch-project/OpenSearch/pull/16271)) - Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)). - Add stats for remote publication failure and move download failure stats to remote methods([#16682](https://github.com/opensearch-project/OpenSearch/pull/16682/)) +- Introduce log pattern lib with initial implementation of Brain algorithm log parser([#16751](https://github.com/opensearch-project/OpenSearch/pull/16751)) ### Dependencies - Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504)) From 3dd41565350c56453a3ba883b1b581413b4359f1 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Tue, 3 Dec 2024 16:23:28 +0800 Subject: [PATCH 4/5] Address comments and add more unit test cases Signed-off-by: Songkan Tang --- libs/pattern/build.gradle | 19 ---- .../opensearch/pattern/BrainLogParser.java | 96 ++++++++++++------- .../pattern/BrainLogParserTests.java | 76 +++++++++++++++ 3 files changed, 137 insertions(+), 54 deletions(-) diff --git a/libs/pattern/build.gradle b/libs/pattern/build.gradle index bd12d772f3673..f63206f2cee33 100644 --- a/libs/pattern/build.gradle +++ b/libs/pattern/build.gradle @@ -6,25 +6,6 @@ * compatible open source license. */ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. 
Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - apply plugin: 'opensearch.build' apply plugin: 'opensearch.publish' diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java index b2c9337938ca4..50c190d3529e4 100644 --- a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java +++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java @@ -18,6 +18,7 @@ import java.util.Map; import java.util.OptionalLong; import java.util.Set; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -26,30 +27,37 @@ */ public class BrainLogParser { - private static final List defaultFilterPatterns = List.of( - "(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)", // IP - "(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$" // Numbers + private static final List DEFAULT_FILTER_PATTERNS = Arrays.asList( + Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), // IP + Pattern.compile("(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$") // Numbers ); - private static final List defaultDelimiters = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+"); - private static final String variableDenoter = "<*>"; + private static final List DEFAULT_DELIMITERS = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+"); + private static final String VARIABLE_DENOTER = 
"<*>"; // counting frequency will be grouped by composite of position and token string - private static final String positionedTokenKeyFormat = "%d-%s"; + private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s"; // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position. - private static final String groupTokenSetKeyFormat = "%d-%s-%d"; + private static final String GROUP_TOKEN_SET_KEY_FORMAT = "%d-%s-%d"; + // By default, algorithm treats more than 2 different tokens in the group per position as variable token + private static final int DEFAULT_VARIABLE_COUNT_THRESHOLD = 2; + /* + * By default, algorithm treats the longest word combinations as the group root, no matter what its frequency is. + * Otherwise, the longest word combination will be selected when frequency >= highest frequency of log * threshold percentage + */ + private static final float DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE = 0.0f; private final Map tokenFreqMap; private final Map> groupTokenSetMap; private final Map logIdGroupCandidateMap; private final int variableCountThreshold; private final float thresholdPercentage; - private final List filterPatterns; + private final List filterPatterns; private final List delimiters; /** * Creates new Brain log parser with default parameters */ public BrainLogParser() { - this(2, 0.0f, defaultFilterPatterns, defaultDelimiters); + this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); } /** @@ -57,33 +65,47 @@ public BrainLogParser() { * @param variableCountThreshold the threshold to decide whether low frequency token is variable */ public BrainLogParser(int variableCountThreshold) { - this(variableCountThreshold, 0.0f, defaultFilterPatterns, defaultDelimiters); + this(variableCountThreshold, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); } /** - * Creates new Brain log parser with 
overridden variableCountThreshold amd thresholdPercentage + * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage * @param variableCountThreshold the threshold to decide whether low frequency token is variable * @param thresholdPercentage the threshold percentage to decide which frequency is representative * frequency per log message */ public BrainLogParser(int variableCountThreshold, float thresholdPercentage) { - this(variableCountThreshold, thresholdPercentage, defaultFilterPatterns, defaultDelimiters); + this(variableCountThreshold, thresholdPercentage, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); } /** - * Creates new Brain log parser with overridden variableCountThreshold amd thresholdPercentage and + * Creates new Brain log parser with overridden variableCountThreshold, thresholdPercentage and filter patterns + * @param variableCountThreshold the threshold to decide whether low frequency token is variable + * @param thresholdPercentage the threshold percentage to decide which frequency is representative + * frequency per log message + * @param filterPatterns a list of regex to replace matched pattern with variable denoter + */ + public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List filterPatterns) { + this(variableCountThreshold, thresholdPercentage, filterPatterns, DEFAULT_DELIMITERS); + } + + /** + * Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage and * overridden filter patterns and delimiters * @param variableCountThreshold the threshold to decide whether low frequency token is variable * @param thresholdPercentage the threshold percentage to decide which frequency is representative * frequency per log message - * @param filterPatterns a list of regex to replace matched pattern to be replaced with variable denoter + * @param filterPatterns a list of regex to replace matched pattern with variable denoter * @param delimiters a list of 
delimiters to be replaced with empty string after regex replacement */ - public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List filterPatterns, List delimiters) { + public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List filterPatterns, List delimiters) { this.tokenFreqMap = new HashMap<>(); this.groupTokenSetMap = new HashMap<>(); this.logIdGroupCandidateMap = new HashMap<>(); this.variableCountThreshold = variableCountThreshold; + if (thresholdPercentage < 0.0f || thresholdPercentage > 1.0f) { + throw new IllegalArgumentException("Threshold percentage must be between 0.0 and 1.0"); + } this.thresholdPercentage = thresholdPercentage; this.filterPatterns = filterPatterns; this.delimiters = delimiters; @@ -96,9 +118,12 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis * @return list of tokens by splitting preprocessed log message */ public List preprocess(String logMessage, String logId) { + if (logMessage == null || logId == null) { + throw new IllegalArgumentException("log message or logId must not be null"); + } // match regex and replace it with variable denoter - for (String pattern : filterPatterns) { - logMessage = logMessage.replaceAll(pattern, variableDenoter); + for (Pattern pattern : filterPatterns) { + logMessage = pattern.matcher(logMessage).replaceAll(VARIABLE_DENOTER); } for (String delimiter : delimiters) { @@ -118,7 +143,7 @@ public List preprocess(String logMessage, String logId) { public void processTokenHistogram(List tokens) { // Ignore last element since it's designed to be appended logId for (int i = 0; i < tokens.size() - 1; i++) { - String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i)); + String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i)); tokenFreqMap.put(tokenKey, tokenFreqMap.getOrDefault(tokenKey, 0L) + 1); } } @@ -137,10 +162,8 @@ public List> preprocessAllLogs(List 
logMessages, List tokens = this.preprocess(logMessages.get(i), logId); - if (tokens.size() > 1) { - preprocessedLogs.add(tokens); - this.processTokenHistogram(tokens); - } + preprocessedLogs.add(tokens); + this.processTokenHistogram(tokens); } return preprocessedLogs; @@ -179,12 +202,12 @@ public List parseLogPattern(List tokens) { return IntStream.range(0, tokens.size() - 1).mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))).map(entry -> { int index = entry.getKey(); String token = entry.getValue(); - String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, index, token); + String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, index, token); assert this.tokenFreqMap.get(tokenKey) != null : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index); boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq; boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq; - String groupTokenKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokens.size() - 1, groupCandidateStr, index); + String groupTokenKey = String.format(Locale.ROOT, GROUP_TOKEN_SET_KEY_FORMAT, tokens.size() - 1, groupCandidateStr, index); assert this.groupTokenSetMap.get(groupTokenKey) != null : String.format( Locale.ROOT, "Not found any token in group: %s", @@ -196,14 +219,14 @@ public List parseLogPattern(List tokens) { // it's unique token on that position within the group boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1; if (!isUniqueToken) { - return variableDenoter; + return VARIABLE_DENOTER; } } else if (isLowerFrequency) { // For lower frequency token that doesn't belong to word combination, it's likely to be constant token only if // it doesn't exceed the preset variable count threshold. For example, some variable are limited number of enums, // and sometimes they could be treated as constant tokens. 
if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) { - return variableDenoter; + return VARIABLE_DENOTER; } } return token; @@ -259,7 +282,7 @@ public Map getLogIdGroupCandidateMap() { private Map getWordOccurrences(List tokens) { Map occurrences = new HashMap<>(); for (int i = 0; i < tokens.size() - 1; i++) { - String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i)); + String tokenKey = String.format(Locale.ROOT, POSITIONED_TOKEN_KEY_FORMAT, i, tokens.get(i)); Long tokenFreq = tokenFreqMap.get(tokenKey); occurrences.put(tokenFreq, occurrences.getOrDefault(tokenFreq, 0) + 1); } @@ -269,10 +292,12 @@ private Map getWordOccurrences(List tokens) { private List> getSortedWordCombinations(Map occurrences) { List> sortedOccurrences = new ArrayList<>(occurrences.entrySet()); sortedOccurrences.sort((entry1, entry2) -> { + // Sort by length of the word combination in descending order int wordCombinationLengthComparison = entry2.getValue().compareTo(entry1.getValue()); if (wordCombinationLengthComparison != 0) { return wordCombinationLengthComparison; } else { + // If the length of word combinations are the same, sort frequency in descending order return entry2.getKey().compareTo(entry1.getKey()); } }); @@ -281,14 +306,15 @@ private List> getSortedWordCombinations(Map findCandidate(List> sortedWordCombinations) { + if (sortedWordCombinations.isEmpty()) { + throw new IllegalArgumentException("Sorted word combinations must be non empty"); + } OptionalLong maxFreqOptional = sortedWordCombinations.stream().mapToLong(Map.Entry::getKey).max(); - if (maxFreqOptional.isPresent()) { - long maxFreq = maxFreqOptional.getAsLong(); - float threshold = maxFreq * this.thresholdPercentage; - for (Map.Entry entry : sortedWordCombinations) { - if (entry.getKey() > threshold) { - return entry; - } + long maxFreq = maxFreqOptional.getAsLong(); + float threshold = maxFreq * this.thresholdPercentage; + for (Map.Entry entry : 
sortedWordCombinations) { + if (entry.getKey() > threshold) { + return entry; } } return sortedWordCombinations.get(0); @@ -297,7 +323,7 @@ private Map.Entry findCandidate(List> so private void updateGroupTokenFreqMap(List tokens, String groupCandidateStr) { int tokensLen = tokens.size() - 1; for (int i = 0; i < tokensLen; i++) { - String groupTokenFreqKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokensLen, groupCandidateStr, i); + String groupTokenFreqKey = String.format(Locale.ROOT, GROUP_TOKEN_SET_KEY_FORMAT, tokensLen, groupCandidateStr, i); this.groupTokenSetMap.computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()).add(tokens.get(i)); } } diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java index 137e2eb590c4f..bcbabbf46a446 100644 --- a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java +++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java @@ -50,6 +50,14 @@ public void setUp() throws Exception { parser = new BrainLogParser(); } + public void testNewParserWithIllegalArgument() { + String exceptionMessage = "Threshold percentage must be between 0.0 and 1.0"; + Throwable throwable = assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, -1.0f)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = assertThrows(IllegalArgumentException.class, () -> new BrainLogParser(2, 1.1f)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + public void testPreprocess() { String logMessage = "127.0.0.1 - 1234 something"; String logId = "log1"; @@ -65,6 +73,18 @@ public void testPreprocess() { assertEquals(expectedResult, result); } + public void testPreprocessWithIllegalInput() { + String logMessage = "127.0.0.1 - 1234 something"; + String logId = "log1"; + String exceptionMessage = "log message or logId must not be null"; + Throwable throwable = 
assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, logId)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(logMessage, null)); + assertEquals(exceptionMessage, throwable.getMessage()); + throwable = assertThrows(IllegalArgumentException.class, () -> parser.preprocess(null, null)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + public void testPreprocessAllLogs() { List logMessages = Arrays.asList("127.0.0.1 - 1234 something", "192.168.0.1 - 5678 something_else"); List logIds = Arrays.asList("log1", "log2"); @@ -111,6 +131,13 @@ public void testCalculateGroupTokenFreq() { assertTrue(parser.getGroupTokenSetMap().get(sampleGroupTokenKey).contains("something")); } + public void testCalculateGroupTokenFreqWithIllegalInput() { + List> preprocessedLogs = Arrays.asList(List.of()); + String exceptionMessage = "Sorted word combinations must be non empty"; + Throwable throwable = assertThrows(IllegalArgumentException.class, () -> parser.calculateGroupTokenFreq(preprocessedLogs)); + assertEquals(exceptionMessage, throwable.getMessage()); + } + public void testParseLogPattern() { List> preprocessedLogs = parser.preprocessAllLogs(TEST_HDFS_LOGS, List.of()); parser.calculateGroupTokenFreq(preprocessedLogs); @@ -150,4 +177,53 @@ public void testParseAllLogPatterns() { .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().size())); assertEquals(expectedResult, logPatternByCountMap); } + + public void testParseLogPatternWhenLowerFrequencyTokenIsVariable() { + int testVariableCountThreshold = 3; + parser = new BrainLogParser(testVariableCountThreshold); + List logMessages = Arrays.asList( + "Verification succeeded a blk_-1547954353065580372", + "Verification succeeded b blk_6996194389878584395", + "Verification succeeded c blk_6996194389878584395", + "Verification succeeded d blk_6996194389878584395" + ); + + Map> expectedResult = 
Map.of("Verification succeeded <*> blk_<*>", Arrays.asList("0", "1", "2", "3")); + Map> logPatternMap = parser.parseAllLogPatterns(logMessages, List.of()); + assertEquals(expectedResult, logPatternMap); + /* + * The 'a', 'b', 'c' and 'd' tokens are on the 3rd position in the group 4,3, and their frequency is lower than the + * representative frequency. Since that position's distinct token count reaches or exceeds the variable count threshold, + * the third position in this log group is treated as variable + */ + assertTrue(parser.getTokenFreqMap().get("2-a") < parser.getTokenFreqMap().get("0-Verification")); + assertTrue(parser.getTokenFreqMap().get("2-b") < parser.getTokenFreqMap().get("0-Verification")); + assertTrue(testVariableCountThreshold <= parser.getGroupTokenSetMap().get("4-4,3-2").size()); + } + + public void testParseLogPatternWhenHigherFrequencyTokenIsVariable() { + List logMessages = Arrays.asList( + "Verification succeeded for blk_-1547954353065580372", + "Verification succeeded for blk_6996194389878584395", + "Test succeeded for blk_6996194389878584395", + "Verification", + "Verification" + ); + + Map> expectedResult = Map.of( + "<*> succeeded for blk_<*>", + Arrays.asList("0", "1", "2"), + "Verification", + Arrays.asList("3", "4") + ); + Map> logPatternMap = parser.parseAllLogPatterns(logMessages, List.of()); + assertEquals(expectedResult, logPatternMap); + /* + * The 'Verification' and 'Test' tokens are on the 1st position in the group 3,3. The frequency of 'Verification' is higher than the + * representative frequency because there are other groups which have the 'Verification' token on the 1st position as well. + * Since the first position holds more than one distinct token within the group, 'Verification' is treated as variable eventually.
+ */ + assertTrue(parser.getTokenFreqMap().get("0-Verification") > parser.getTokenFreqMap().get("1-succeeded")); + assertTrue(parser.getGroupTokenSetMap().get("4-3,3-0").size() > 1); + } } From 1f833e9153bb0a5dd3bf64d49adb2147f211c979 Mon Sep 17 00:00:00 2001 From: Songkan Tang Date: Mon, 16 Dec 2024 14:41:54 +0800 Subject: [PATCH 5/5] Refine default regex and delimiters Signed-off-by: Songkan Tang --- .../opensearch/pattern/BrainLogParser.java | 78 +++++++++++-------- .../pattern/BrainLogParserTests.java | 20 ++--- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java index 50c190d3529e4..00b2ae23290b4 100644 --- a/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java +++ b/libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java @@ -13,6 +13,7 @@ import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; @@ -27,12 +28,25 @@ */ public class BrainLogParser { - private static final List DEFAULT_FILTER_PATTERNS = Arrays.asList( - Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), // IP - Pattern.compile("(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$") // Numbers - ); - private static final List DEFAULT_DELIMITERS = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+"); private static final String VARIABLE_DENOTER = "<*>"; + private static final Map DEFAULT_FILTER_PATTERN_VARIABLE_MAP = new LinkedHashMap<>(); + static { + // IP + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(Pattern.compile("(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)"), "<*IP*>"); + // Simple ISO date and time + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile("(\\d{4}-\\d{2}-\\d{2})[T ]?(\\d{2}:\\d{2}:\\d{2})(\\.\\d{3})?(Z|([+-]\\d{2}:?\\d{2}))?"), + "<*DATETIME*>" + ); + // Hex 
Decimal, letters followed by digits, float numbers + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put( + Pattern.compile("((0x|0X)[0-9a-fA-F]+)|[a-zA-Z]+\\d+|([+-]?(\\d+(\\.\\d*)?|\\.\\d+))"), + VARIABLE_DENOTER + ); + // generic number surrounded by non-alphanumeric + DEFAULT_FILTER_PATTERN_VARIABLE_MAP.put(Pattern.compile("(?<=[^A-Za-z0-9])(-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$"), VARIABLE_DENOTER); + } + private static final List DEFAULT_DELIMITERS = List.of(",", "+"); // counting frequency will be grouped by composite of position and token string private static final String POSITIONED_TOKEN_KEY_FORMAT = "%d-%s"; // Token set will be grouped by composite of tokens length per log message, word combination candidate and token position. @@ -50,22 +64,19 @@ public class BrainLogParser { private final Map logIdGroupCandidateMap; private final int variableCountThreshold; private final float thresholdPercentage; - private final List filterPatterns; + private final Map filterPatternVariableMap; private final List delimiters; /** * Creates new Brain log parser with default parameters */ public BrainLogParser() { - this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); - } - - /** - * Creates new Brain log parser with overridden variableCountThreshold - * @param variableCountThreshold the threshold to decide whether low frequency token is variable - */ - public BrainLogParser(int variableCountThreshold) { - this(variableCountThreshold, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); + this( + DEFAULT_VARIABLE_COUNT_THRESHOLD, + DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, + DEFAULT_FILTER_PATTERN_VARIABLE_MAP, + DEFAULT_DELIMITERS + ); } /** @@ -75,18 +86,17 @@ public BrainLogParser(int variableCountThreshold) { * frequency per log message */ public BrainLogParser(int variableCountThreshold, float thresholdPercentage) { - this(variableCountThreshold, thresholdPercentage, 
DEFAULT_FILTER_PATTERNS, DEFAULT_DELIMITERS); + this(variableCountThreshold, thresholdPercentage, DEFAULT_FILTER_PATTERN_VARIABLE_MAP, DEFAULT_DELIMITERS); } /** - * Creates new Brain log parser with overridden variableCountThreshold, thresholdPercentage and filter patterns - * @param variableCountThreshold the threshold to decide whether low frequency token is variable - * @param thresholdPercentage the threshold percentage to decide which frequency is representative - * frequency per log message - * @param filterPatterns a list of regex to replace matched pattern with variable denoter + * Creates new Brain log parser with overridden filter patterns and delimiters + * @param filterPatternVariableMap a map from regex pattern to the variable denoter that replaces each match of that pattern; + * a LinkedHashMap is recommended so that the patterns are applied in insertion order + * @param delimiters a list of delimiters to be replaced with a single space after regex replacement */ - public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List filterPatterns) { - this(variableCountThreshold, thresholdPercentage, filterPatterns, DEFAULT_DELIMITERS); + public BrainLogParser(Map filterPatternVariableMap, List delimiters) { + this(DEFAULT_VARIABLE_COUNT_THRESHOLD, DEFAULT_FREQUENCY_THRESHOLD_PERCENTAGE, filterPatternVariableMap, delimiters); } /** @@ -95,10 +105,16 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis * @param variableCountThreshold the threshold to decide whether low frequency token is variable * @param thresholdPercentage the threshold percentage to decide which frequency is representative * frequency per log message - * @param filterPatterns a list of regex to replace matched pattern with variable denoter + * @param filterPatternVariableMap a map from regex pattern to the variable denoter that replaces each match of that pattern; + * a LinkedHashMap is recommended so that the patterns are applied in insertion order * @param delimiters a
list of delimiters to be replaced with empty string after regex replacement */ - public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List filterPatterns, List delimiters) { + public BrainLogParser( + int variableCountThreshold, + float thresholdPercentage, + Map filterPatternVariableMap, + List delimiters + ) { this.tokenFreqMap = new HashMap<>(); this.groupTokenSetMap = new HashMap<>(); this.logIdGroupCandidateMap = new HashMap<>(); @@ -107,7 +123,7 @@ public BrainLogParser(int variableCountThreshold, float thresholdPercentage, Lis throw new IllegalArgumentException("Threshold percentage must be between 0.0 and 1.0"); } this.thresholdPercentage = thresholdPercentage; - this.filterPatterns = filterPatterns; + this.filterPatternVariableMap = filterPatternVariableMap; this.delimiters = delimiters; } @@ -121,19 +137,19 @@ public List preprocess(String logMessage, String logId) { if (logMessage == null || logId == null) { throw new IllegalArgumentException("log message or logId must not be null"); } - // match regex and replace it with variable denoter - for (Pattern pattern : filterPatterns) { - logMessage = pattern.matcher(logMessage).replaceAll(VARIABLE_DENOTER); + // match regex and replace it with variable denoter in order + for (Map.Entry patternVariablePair : filterPatternVariableMap.entrySet()) { + logMessage = patternVariablePair.getKey().matcher(logMessage).replaceAll(patternVariablePair.getValue()); } for (String delimiter : delimiters) { - logMessage = logMessage.replace(delimiter, ""); + logMessage = logMessage.replace(delimiter, " "); } // Append logId/docId to the end of the split tokens logMessage = logMessage.trim() + " " + logId; - return Arrays.asList(logMessage.split(" ")); + return Arrays.asList(logMessage.split("\\s+")); } /** diff --git a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java index bcbabbf46a446..ff1389cdb698b 
100644 --- a/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java +++ b/libs/pattern/src/test/java/org/opensearch/pattern/BrainLogParserTests.java @@ -61,14 +61,14 @@ public void testNewParserWithIllegalArgument() { public void testPreprocess() { String logMessage = "127.0.0.1 - 1234 something"; String logId = "log1"; - List expectedResult = Arrays.asList("<*>", "", "<*>", "something", "log1"); + List expectedResult = Arrays.asList("<*IP*>", "-", "<*>", "something", "log1"); List result = parser.preprocess(logMessage, logId); assertEquals(expectedResult, result); // Test with different delimiter logMessage = "127.0.0.1=1234 something"; logId = "log2"; - expectedResult = Arrays.asList("<*><*>", "something", "log2"); + expectedResult = Arrays.asList("<*IP*>=<*>", "something", "log2"); result = parser.preprocess(logMessage, logId); assertEquals(expectedResult, result); } @@ -92,8 +92,8 @@ public void testPreprocessAllLogs() { List> result = parser.preprocessAllLogs(logMessages, logIds); assertEquals(2, result.size()); - assertEquals(Arrays.asList("<*>", "", "<*>", "something", "log1"), result.get(0)); - assertEquals(Arrays.asList("<*>", "", "<*>", "something_else", "log2"), result.get(1)); + assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something", "log1"), result.get(0)); + assertEquals(Arrays.asList("<*IP*>", "-", "<*>", "something_else", "log2"), result.get(1)); } public void testProcessTokenHistogram() { @@ -144,10 +144,10 @@ public void testParseLogPattern() { List expectedLogPattern = Arrays.asList( "BLOCK*", - "NameSystem.addStoredBlock", + "NameSystem.addStoredBlock:", "blockMap", - "updated", - "<*>", + "updated:", + "<*IP*>", "is", "added", "to", @@ -167,9 +167,9 @@ public void testParseAllLogPatterns() { 2, "Verification succeeded for blk_<*>", 2, - "BLOCK* NameSystem.addStoredBlock blockMap updated <*> is added to blk_<*> size <*>", + "BLOCK* NameSystem.addStoredBlock: blockMap updated: <*IP*> is added to blk_<*> size <*>", 8, - 
"BLOCK* NameSystem.allocateBlock <*> blk_<*>", + "BLOCK* NameSystem.allocateBlock: <*> blk_<*>", 8 ); Map logPatternByCountMap = logPatternMap.entrySet() @@ -180,7 +180,7 @@ public void testParseAllLogPatterns() { public void testParseLogPatternWhenLowerFrequencyTokenIsVariable() { int testVariableCountThreshold = 3; - parser = new BrainLogParser(testVariableCountThreshold); + parser = new BrainLogParser(testVariableCountThreshold, 0.0f); List logMessages = Arrays.asList( "Verification succeeded a blk_-1547954353065580372", "Verification succeeded b blk_6996194389878584395",