Introduce log pattern lib with initial implementation of Brain algorithm log parser #16751

Open
wants to merge 6 commits into base: main
Changes from 3 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Support prefix list for remote repository attributes([#16271](https://github.com/opensearch-project/OpenSearch/pull/16271))
- Add new configuration setting `synonym_analyzer`, to the `synonym` and `synonym_graph` filters, enabling the specification of a custom analyzer for reading the synonym file ([#16488](https://github.com/opensearch-project/OpenSearch/pull/16488)).
- Add stats for remote publication failure and move download failure stats to remote methods([#16682](https://github.com/opensearch-project/OpenSearch/pull/16682/))
- Introduce log pattern lib with initial implementation of Brain algorithm log parser([#16751](https://github.com/opensearch-project/OpenSearch/pull/16751))

### Dependencies
- Bump `com.google.cloud:google-cloud-core-http` from 2.23.0 to 2.47.0 ([#16504](https://github.com/opensearch-project/OpenSearch/pull/16504))
1 change: 1 addition & 0 deletions gradle/missing-javadoc.gradle
@@ -102,6 +102,7 @@ configure([
project(":libs:opensearch-geo"),
project(":libs:opensearch-grok"),
project(":libs:opensearch-nio"),
project(":libs:opensearch-pattern"),
project(":libs:opensearch-plugin-classloader"),
project(":libs:opensearch-secure-sm"),
project(":libs:opensearch-ssl-config"),
39 changes: 39 additions & 0 deletions libs/pattern/build.gradle
@@ -0,0 +1,39 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

apply plugin: 'opensearch.build'
apply plugin: 'opensearch.publish'

dependencies {
testImplementation(project(":test:framework")) {
exclude group: 'org.opensearch', module: 'opensearch-pattern'
}
}

tasks.named('forbiddenApisMain').configure {
replaceSignatureFiles 'jdk-signatures'
}
304 changes: 304 additions & 0 deletions libs/pattern/src/main/java/org/opensearch/pattern/BrainLogParser.java
@@ -0,0 +1,304 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.pattern;

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.OptionalLong;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* Log parser Brain algorithm implementation. See: https://ieeexplore.ieee.org/document/10109145
*/
public class BrainLogParser {

private static final List<String> defaultFilterPatterns = List.of(
"(/|)([0-9]+\\.){3}[0-9]+(:[0-9]+|)(:|)", // IP
"(?<=[^A-Za-z0-9])(\\-?\\+?\\d+)(?=[^A-Za-z0-9])|[0-9]+$" // Numbers
);
private static final List<String> defaultDelimiters = List.of(":", "=", "[", "]", "(", ")", "-", "|", ",", "+");
private static final String variableDenoter = "<*>";
// counting frequency will be grouped by composite of position and token string
private static final String positionedTokenKeyFormat = "%d-%s";
// Token set will be grouped by composite of tokens length per log message, word combination candidate and token position.
private static final String groupTokenSetKeyFormat = "%d-%s-%d";

private final Map<String, Long> tokenFreqMap;
private final Map<String, Set<String>> groupTokenSetMap;
private final Map<String, String> logIdGroupCandidateMap;
private final int variableCountThreshold;
private final float thresholdPercentage;
private final List<String> filterPatterns;
private final List<String> delimiters;

/**
* Creates new Brain log parser with default parameters
*/
public BrainLogParser() {
this(2, 0.0f, defaultFilterPatterns, defaultDelimiters);
}

/**
* Creates new Brain log parser with overridden variableCountThreshold
* @param variableCountThreshold the threshold to decide whether low frequency token is variable
*/
public BrainLogParser(int variableCountThreshold) {
this(variableCountThreshold, 0.0f, defaultFilterPatterns, defaultDelimiters);
}

/**
* Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage
* @param variableCountThreshold the threshold to decide whether low frequency token is variable
* @param thresholdPercentage the threshold percentage to decide which frequency is representative
* frequency per log message
*/
public BrainLogParser(int variableCountThreshold, float thresholdPercentage) {
this(variableCountThreshold, thresholdPercentage, defaultFilterPatterns, defaultDelimiters);
}

/**
* Creates new Brain log parser with overridden variableCountThreshold and thresholdPercentage and
* overridden filter patterns and delimiters
* @param variableCountThreshold the threshold to decide whether low frequency token is variable
* @param thresholdPercentage the threshold percentage to decide which frequency is representative
* frequency per log message
* @param filterPatterns a list of regex patterns whose matches are replaced with the variable denoter
* @param delimiters a list of delimiters to be removed (replaced with the empty string) after the regex replacement
*/
public BrainLogParser(int variableCountThreshold, float thresholdPercentage, List<String> filterPatterns, List<String> delimiters) {
this.tokenFreqMap = new HashMap<>();
this.groupTokenSetMap = new HashMap<>();
this.logIdGroupCandidateMap = new HashMap<>();
this.variableCountThreshold = variableCountThreshold;
this.thresholdPercentage = thresholdPercentage;
this.filterPatterns = filterPatterns;
this.delimiters = delimiters;
}

/**
* Preprocess a single log message with its logId
* @param logMessage log message body per log
* @param logId logId of the log
* @return list of tokens by splitting preprocessed log message
*/
public List<String> preprocess(String logMessage, String logId) {
// match regex and replace it with variable denoter
for (String pattern : filterPatterns) {
logMessage = logMessage.replaceAll(pattern, variableDenoter);
}

for (String delimiter : delimiters) {
logMessage = logMessage.replace(delimiter, "");
}

// Append logId/docId to the end of the split tokens
logMessage = logMessage.trim() + " " + logId;

return Arrays.asList(logMessage.split(" "));
}
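// Example for preprocess (hypothetical input): with the default patterns and delimiters,
// preprocess("Accepted password for admin from 10.0.0.1:22", "7") replaces the IP:port with the
// variable denoter, strips the delimiters, appends the logId and splits on spaces, yielding
// [Accepted, password, for, admin, from, <*>, 7]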

/**
* Count token frequency per position/index in the token list
* @param tokens list of tokens from preprocessed log message
*/
public void processTokenHistogram(List<String> tokens) {
// Ignore the last element since it is the appended logId
for (int i = 0; i < tokens.size() - 1; i++) {
String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
tokenFreqMap.put(tokenKey, tokenFreqMap.getOrDefault(tokenKey, 0L) + 1);
}
}
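// Histogram keys follow positionedTokenKeyFormat, e.g. the token "from" at position 4 is
// counted under the key "4-from"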

/**
* Preprocess all lines of log messages with a logId list. An empty logId list is allowed; in that case
* the index of each message within the list is used as its logId
* @param logMessages list of log messages
* @param logIds list of logIds corresponding to the log messages
* @return list of token lists
*/
public List<List<String>> preprocessAllLogs(List<String> logMessages, List<String> logIds) {
List<List<String>> preprocessedLogs = new ArrayList<>();
int size = logIds.isEmpty() ? logMessages.size() : Math.min(logMessages.size(), logIds.size());

for (int i = 0; i < size; i++) {
String logId = logIds.isEmpty() ? String.valueOf(i) : logIds.get(i);
List<String> tokens = this.preprocess(logMessages.get(i), logId);
if (tokens.size() > 1) {
preprocessedLogs.add(tokens);
this.processTokenHistogram(tokens);
}
}

return preprocessedLogs;
}
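// Note: messages that reduce to a single token (only the appended logId, i.e. an empty body
// after filtering) are skipped and do not contribute to the token histogram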

/**
* The second processing step: calculate initial token groups based on the token histogram built previously.
* Each group is represented by the representative word combination of the log message, usually the longest
* word combination whose tokens share the same frequency, provided that frequency is above the designed
* threshold.
* <p>
* Within initial group, new group level token set per position is counted for final log pattern calculation
* @param preprocessedLogs preprocessed list of log messages
*/
public void calculateGroupTokenFreq(List<List<String>> preprocessedLogs) {
for (List<String> tokens : preprocessedLogs) {
Map<Long, Integer> wordOccurrences = this.getWordOccurrences(tokens);
List<Map.Entry<Long, Integer>> sortedOccurrences = this.getSortedWordCombinations(wordOccurrences);
Map.Entry<Long, Integer> candidate = this.findCandidate(sortedOccurrences);
String groupCandidateStr = String.format(Locale.ROOT, "%d,%d", candidate.getKey(), candidate.getValue());
this.logIdGroupCandidateMap.put(tokens.get(tokens.size() - 1), groupCandidateStr);
this.updateGroupTokenFreqMap(tokens, groupCandidateStr);
}
}
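// The group candidate is encoded via "%d,%d" as "<representative frequency>,<combination length>",
// e.g. "3,6" for a message whose longest same-frequency word combination spans 6 tokens that
// each occur 3 times at their positions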

/**
* Parse the log pattern of a single log message after preprocess, processTokenHistogram and calculateGroupTokenFreq have run
* @param tokens list of tokens for a specific log message
* @return parsed log pattern as a list of strings
*/
public List<String> parseLogPattern(List<String> tokens) {
String logId = tokens.get(tokens.size() - 1);
String groupCandidateStr = this.logIdGroupCandidateMap.get(logId);
String[] groupCandidate = groupCandidateStr.split(",");
Long repFreq = Long.parseLong(groupCandidate[0]); // representative frequency of the group
return IntStream.range(0, tokens.size() - 1).mapToObj(i -> new AbstractMap.SimpleEntry<>(i, tokens.get(i))).map(entry -> {
int index = entry.getKey();
String token = entry.getValue();
String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, index, token);
assert this.tokenFreqMap.get(tokenKey) != null : String.format(Locale.ROOT, "Not found token: %s on position %d", token, index);

boolean isHigherFrequency = this.tokenFreqMap.get(tokenKey) > repFreq;
boolean isLowerFrequency = this.tokenFreqMap.get(tokenKey) < repFreq;
String groupTokenKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokens.size() - 1, groupCandidateStr, index);
assert this.groupTokenSetMap.get(groupTokenKey) != null : String.format(
Locale.ROOT,
"Not found any token in group: %s",
groupTokenKey
);

if (isHigherFrequency) {
// A higher-frequency token that doesn't belong to the word combination is likely a constant token
// only if it is the unique token at that position within the group
boolean isUniqueToken = this.groupTokenSetMap.get(groupTokenKey).size() == 1;
if (!isUniqueToken) {
return variableDenoter;
}
} else if (isLowerFrequency) {
// A lower-frequency token that doesn't belong to the word combination is likely a constant token
// only if the number of distinct tokens at its position stays below the preset variable count
// threshold. For example, some variables take only a limited set of enum values and can then be
// treated as constant tokens.
if (this.groupTokenSetMap.get(groupTokenKey).size() >= variableCountThreshold) {
return variableDenoter;
}
}
return token;
}).collect(Collectors.toList());
}
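// Decision summary for parseLogPattern: a token whose positional frequency is above the
// representative frequency stays constant only if it is the single token seen at that position
// within the group; a token below the representative frequency stays constant only while fewer
// than variableCountThreshold distinct tokens have been seen at that position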

/**
* Parse all lines of log messages to generate the log pattern map.
* @param logMessages all lines of log messages
* @param logIds corresponding logIds for all lines of log messages
* @return log pattern map with log pattern string as key, grouped logIds as value
*/
public Map<String, List<String>> parseAllLogPatterns(List<String> logMessages, List<String> logIds) {
List<List<String>> processedMessages = this.preprocessAllLogs(logMessages, logIds);

this.calculateGroupTokenFreq(processedMessages);

Map<String, List<String>> logPatternMap = new HashMap<>();
for (int i = 0; i < processedMessages.size(); i++) {
List<String> processedMessage = processedMessages.get(i);
String logId = logIds.isEmpty() ? String.valueOf(i) : processedMessage.get(processedMessage.size() - 1);
List<String> logPattern = this.parseLogPattern(processedMessages.get(i));
String patternKey = String.join(" ", logPattern);
logPatternMap.computeIfAbsent(patternKey, k -> new ArrayList<>()).add(logId);
}
return logPatternMap;
}
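// Illustrative result shape: {"Accepted password for <*> from <*> port <*>" -> ["0", "2", "5"]},
// where each key is a parsed pattern and each value lists the logIds sharing that pattern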

/**
* Get token histogram
* @return map of token per position key and its frequency
*/
public Map<String, Long> getTokenFreqMap() {
return this.tokenFreqMap;
}

/**
* Get the map from group key (token-list length, group candidate and position) to its token set
* @return map of group key and its token set
*/
public Map<String, Set<String>> getGroupTokenSetMap() {
return this.groupTokenSetMap;
}

/**
* Get logId to its group candidate map
* @return map of logId and group candidate
*/
public Map<String, String> getLogIdGroupCandidateMap() {
return this.logIdGroupCandidateMap;
}

private Map<Long, Integer> getWordOccurrences(List<String> tokens) {
Map<Long, Integer> occurrences = new HashMap<>();
for (int i = 0; i < tokens.size() - 1; i++) {
String tokenKey = String.format(Locale.ROOT, positionedTokenKeyFormat, i, tokens.get(i));
Long tokenFreq = tokenFreqMap.get(tokenKey);
occurrences.put(tokenFreq, occurrences.getOrDefault(tokenFreq, 0) + 1);
Member: Are words that have the same frequency from the same lines?

Author: This is part of the algorithm that groups same-frequency tokens into the initial pattern. The longest run of same-frequency tokens is encoded as a (frequency, length) word occurrence, which is later used to refine the internal groups.
}
return occurrences;
}

private List<Map.Entry<Long, Integer>> getSortedWordCombinations(Map<Long, Integer> occurrences) {
List<Map.Entry<Long, Integer>> sortedOccurrences = new ArrayList<>(occurrences.entrySet());
sortedOccurrences.sort((entry1, entry2) -> {
int wordCombinationLengthComparison = entry2.getValue().compareTo(entry1.getValue());
if (wordCombinationLengthComparison != 0) {
return wordCombinationLengthComparison;
} else {
return entry2.getKey().compareTo(entry1.getKey());
}
});

return sortedOccurrences;
}
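// Word combinations are ordered by combination length (descending), ties broken by frequency
// (descending)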

private Map.Entry<Long, Integer> findCandidate(List<Map.Entry<Long, Integer>> sortedWordCombinations) {
OptionalLong maxFreqOptional = sortedWordCombinations.stream().mapToLong(Map.Entry::getKey).max();
if (maxFreqOptional.isPresent()) {
long maxFreq = maxFreqOptional.getAsLong();
float threshold = maxFreq * this.thresholdPercentage;
for (Map.Entry<Long, Integer> entry : sortedWordCombinations) {
if (entry.getKey() > threshold) {
return entry;
}
}
}
return sortedWordCombinations.get(0);
}
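// Illustrative: with thresholdPercentage = 0.5f and a maximum frequency of 100, the first
// (longest) sorted combination whose frequency exceeds 50 is chosen; if none qualifies, the
// first sorted combination is returned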

private void updateGroupTokenFreqMap(List<String> tokens, String groupCandidateStr) {
int tokensLen = tokens.size() - 1;
for (int i = 0; i < tokensLen; i++) {
String groupTokenFreqKey = String.format(Locale.ROOT, groupTokenSetKeyFormat, tokensLen, groupCandidateStr, i);
this.groupTokenSetMap.computeIfAbsent(groupTokenFreqKey, k -> new HashSet<>()).add(tokens.get(i));
}
}
}
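
A minimal usage sketch (not part of this change; the log lines and example class name below are made up for illustration) showing how the parser can group similar messages into patterns:

import java.util.List;
import java.util.Map;

import org.opensearch.pattern.BrainLogParser;

public class BrainLogParserExample {
    public static void main(String[] args) {
        // Default parameters: variableCountThreshold = 2, thresholdPercentage = 0.0f
        BrainLogParser parser = new BrainLogParser();
        // Hypothetical log lines, for illustration only
        List<String> logs = List.of(
            "Accepted password for admin from 10.0.0.1 port 52122",
            "Accepted password for guest from 10.0.0.2 port 53811",
            "Failed password for root from 10.0.0.3 port 41022"
        );
        // An empty logId list means the index of each message is used as its logId
        Map<String, List<String>> patterns = parser.parseAllLogPatterns(logs, List.of());
        // Each key is a parsed pattern with variable tokens replaced by <*>,
        // each value is the list of logIds that share that pattern
        patterns.forEach((pattern, ids) -> System.out.println(pattern + " -> " + ids));
    }
}

With the default thresholdPercentage of 0.0f, the longest same-frequency word combination is always chosen as the group candidate, since any positive frequency exceeds the zero threshold.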