diff --git a/models/uri_drain/template_miner.py b/models/uri_drain/template_miner.py
index 6f96362..f1a066c 100644
--- a/models/uri_drain/template_miner.py
+++ b/models/uri_drain/template_miner.py
@@ -114,6 +114,12 @@ def load_state(self):
 
         loaded_drain: Drain = jsonpickle.loads(state, keys=True)
 
+        # restore the word-correctness flag on every loaded cluster's template tokens
+        if len(loaded_drain.id_to_cluster) > 0:
+            for _, cluster in loaded_drain.id_to_cluster.items():
+                if isinstance(cluster, LogCluster):
+                    cluster.token_words_check()
+
         # json-pickle encoded keys as string by default, so we have to convert those back to int
         # this is only relevant for backwards compatibility when loading a snapshot of drain <= v0.9.1
         # which did not use json-pickle's keys=true
@@ -137,8 +143,8 @@ def save_state(self, snapshot_reason):
             state = base64.b64encode(zlib.compress(state))
 
         logger.info(f"Saving state of {len(self.drain.clusters)} clusters "
-                    f"with {self.drain.get_total_cluster_size()} messages, {len(state)} bytes, "
-                    f"reason: {snapshot_reason}")
+                    f"with {self.drain.get_total_cluster_size()} messages to service <{self.persistence_handler.get_service()}>, "
+                    f"{len(state)} bytes, reason: {snapshot_reason}")
         self.persistence_handler.save_state(state)
 
     def get_snapshot_reason(self, change_type, cluster_id):
diff --git a/models/uri_drain/uri_drain.py b/models/uri_drain/uri_drain.py
index 36380a2..2caac02 100644
--- a/models/uri_drain/uri_drain.py
+++ b/models/uri_drain/uri_drain.py
@@ -4,11 +4,11 @@
 # Again, it's further modified to suit URI clustering needs,
 # changes are kept minimal to avoid divergence from Drain3 upstream.
 # TODO Note:: Every change to upstream Drain3 algorithm MUST be commented starting with "Modified::"
-
 from typing import List, Dict, Sequence
 
 from cachetools import LRUCache, Cache
 
+from models.uri_drain.word_splitter import check_all_word_correct
 from models.utils.simple_profiler import Profiler, NullProfiler
 
 import logger
@@ -18,7 +18,7 @@ class LogCluster: # TODO Modified:: Changed to URICluster
     __slots__ = ["log_template_tokens", "cluster_id", "size", "latest_urls"]
 
     def __init__(self, log_template_tokens: list, cluster_id: int, combine_min_url_count: int):
-        self.log_template_tokens = tuple(log_template_tokens)
+        self.log_template_tokens = tuple(parse_token_list(log_template_tokens))
         self.cluster_id = cluster_id
         self.size = 1
         self.latest_urls = LRUCache(combine_min_url_count+1)
@@ -57,6 +57,27 @@ def __str__(self):
         # return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}"
         return f"size={str(self.size).ljust(10)}: {self.get_template()}"
 
+    def token_words_check(self):
+        self.log_template_tokens = parse_token_list(self.log_template_tokens)
+
+
+class Token(str):
+    __slots__ = ["token", "word_correct"]
+
+    def __new__(cls, token: str, word_correct: bool = False):
+        return super().__new__(cls, token)
+
+    def __init__(self, token: str, word_correct: bool):
+        self.token = token
+        self.word_correct = word_correct
+
+
+def parse_token_list(tokens: List[str]) -> List[Token]:
+    result = []
+    for token in tokens:
+        result.append(Token(token, check_all_word_correct(token)))
+    return result
+
 
 class SingleURILogCluster:
     __slots__ = ["uri", "cluster_id", "size"]
@@ -198,13 +219,16 @@ def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include
         max_param_count = -1
         max_cluster = None
 
+        # pre-parse tokens to avoid repeated parsing
+        parsed_token = parse_token_list(tokens)
+
         for cluster_id in cluster_ids:
             # Try to retrieve cluster from cache with bypassing eviction
            # algorithm as we are only testing candidates for a match.
             cluster = self.id_to_cluster.get(cluster_id)
             if cluster is None:
                 continue
-            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
+            cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, parsed_token, include_params)
             # self.logger.debug(f'SIMILARITY = {cur_sim} for c{cluster_id}, {cluster.log_template_tokens} param={param_count}')
             if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count):  # todo: this is known caveat
@@ -495,6 +519,9 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
             if (index == 0 or index == 1) and '.' in token1 and token1 != token2:
                 # self.logger.debug('this is domain mismatch!')
                 return 0.0, 0
+            # if either differing token is made up of real words, the sequences cannot be combined
+            if token1 != token2 and (token1.word_correct or token2.word_correct):
+                return -1, -1
             # if token1 in self.possible_params or token1 == self.param_str:
             if token1 == self.param_str:
                 param_count += 1
@@ -518,14 +545,6 @@ def create_template(self, seq1, seq2):
         ret_val = list(seq2)
         seq_length = len(seq1)
 
-        # SPECIAL ASSUMPTION THAT MIGHT BE FALSE::
-        # /api/getconnection
-        # /api/dropconnection
-        if seq_length == 2:
-            if (seq1[0] == seq2[0] and seq1[1] != seq2[1]  # can be simplified
-                    and not self.has_numbers(seq1[1]) and not self.has_numbers(seq2[1])):
-                print(f'first token match but second token mismatch, seq1 = {seq1}, seq2 = {seq2}')
-                return 'rejected'
 
         # TODO, radical assumption if there's absolutely 0 digit in seq1 and seq2, then don't consider them similar?
         # To implement this, we increase the false negative rate, but decrease false positive rate
@@ -626,7 +645,7 @@ def create_template(self, seq1, seq2):
                 ret_val[i] = self.param_str
                 # self.logger.debug(f'After change: {ret_val}')
 
-        return ret_val
+        return parse_token_list(ret_val)
 
     def match(self, content: str, full_search_strategy="never"):
         """
diff --git a/models/uri_drain/word_splitter.py b/models/uri_drain/word_splitter.py
new file mode 100644
index 0000000..c61af11
--- /dev/null
+++ b/models/uri_drain/word_splitter.py
@@ -0,0 +1,52 @@
+# Copyright 2023 SkyAPM org
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+from cachetools import LRUCache
+from textblob import TextBlob
+
+last_word_correct_lru = LRUCache(1000)
+
+
+def split_for_url(text):
+    # split text by camel case
+    pattern = r"(?<=[a-z])(?=[A-Z])"
+    return re.split(pattern, text)
+
+
+def check_all_word_correct(text):
+    # if the text contains digits, it is not a word; skip the word check
+    if any(char.isdigit() for char in text):
+        return False
+    for word in split_for_url(text):
+        # if a word is too long, it is unlikely to be a real word; skip the spell check to keep analysis time down
+        if len(word) > 20:
+            return False
+        word = word.lower()
+        cached_result = last_word_correct_lru.get(word)
+        if cached_result is not None:
+            if cached_result:
+                continue
+            else:
+                return False
+        # if the word needs no spelling correction, it is a real word rather than a param
+        # TextBlob also splits the word by the regex `\w+`, so special characters (such as "_", ".") are no concern
+        corrected_word = TextBlob(word).correct()
+        correct = word == corrected_word
+        last_word_correct_lru[word] = correct
+        if not correct:
+            return False
+
+    return True
diff --git a/poetry.lock b/poetry.lock
index c058f6b..c54169b 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1144,13 +1144,13 @@ files = [
 
 [[package]]
 name = "jsonpickle"
-version = "3.2.2"
+version = "3.3.0"
 description = "Python library for serializing arbitrary object graphs into JSON"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "jsonpickle-3.2.2-py3-none-any.whl", hash = "sha256:87cd82d237fd72c5a34970e7222dddc0accc13fddf49af84111887ed9a9445aa"},
-    {file = "jsonpickle-3.2.2.tar.gz", hash = "sha256:d425fd2b8afe9f5d7d57205153403fbf897782204437882a477e8eed60930f8c"},
+    {file = "jsonpickle-3.3.0-py3-none-any.whl", hash = "sha256:287c12143f35571ab00e224fa323aa4b090d5a7f086f5f494d7ee9c7eb1a380a"},
+    {file = "jsonpickle-3.3.0.tar.gz", hash = "sha256:ab467e601e5b1a1cd76f1819d014795165da071744ef30bf3786e9bc549de25a"},
 ]
 
 [package.extras]
@@ -1505,13 +1505,13 @@ files = [
 
 [[package]]
 name = "narwhals"
-version = "1.6.0"
+version = "1.6.1"
 description = "Extremely lightweight compatibility layer between dataframe libraries"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "narwhals-1.6.0-py3-none-any.whl", hash = "sha256:4ec5b248998ae552491bc1f497448e94f0f539f5664428a31ddf662c5d46c244"},
-    {file = "narwhals-1.6.0.tar.gz", hash = "sha256:0b0d12f994ac7832c70af29241c32a4f7afddc1cf669f40f6318533d52204595"},
+    {file = "narwhals-1.6.1-py3-none-any.whl", hash = "sha256:5dd0dd3691dbc5b44567d6dcb7506a099523ef70cd024d0e6e34af6284eed02b"},
+    {file = "narwhals-1.6.1.tar.gz", hash = "sha256:c618e451a77ade63beccd55ddbe64434f14f939cd1672d3d1156c20c1e1642ff"},
 ]
 
 [package.extras]
@@ -2511,13 +2511,13 @@ doc = ["Sphinx", "sphinx-rtd-theme"]
 
 [[package]]
 name = "setuptools"
-version = "74.0.0"
+version = "74.1.0"
 description = "Easily download, build, install, upgrade, and uninstall Python packages"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "setuptools-74.0.0-py3-none-any.whl", hash = "sha256:0274581a0037b638b9fc1c6883cc71c0210865aaa76073f7882376b641b84e8f"},
-    {file = "setuptools-74.0.0.tar.gz", hash = "sha256:a85e96b8be2b906f3e3e789adec6a9323abf79758ecfa3065bd740d81158b11e"},
+    {file = "setuptools-74.1.0-py3-none-any.whl", hash = "sha256:cee604bd76cc092355a4e43ec17aee5369095974f41f088676724dc6bc2c9ef8"},
+    {file = "setuptools-74.1.0.tar.gz", hash = "sha256:bea195a800f510ba3a2bc65645c88b7e016fe36709fefc58a880c4ae8a0138d7"},
 ]
 
 [package.extras]
@@ -2564,13 +2564,13 @@ files = [
 
 [[package]]
 name = "starlette"
-version = "0.38.3" +version = "0.38.4" description = "The little ASGI library that shines." optional = false python-versions = ">=3.8" files = [ - {file = "starlette-0.38.3-py3-none-any.whl", hash = "sha256:0e4af343a4e59324b96fbe0f3c6ad8c3a908d73f12f5c80a797803a6c3ad4687"}, - {file = "starlette-0.38.3.tar.gz", hash = "sha256:f674450f0f46a790be1f3a128f386080600b58fa358f8e320d93dbef6d7f676c"}, + {file = "starlette-0.38.4-py3-none-any.whl", hash = "sha256:526f53a77f0e43b85f583438aee1a940fd84f8fd610353e8b0c1a77ad8a87e76"}, + {file = "starlette-0.38.4.tar.gz", hash = "sha256:53a7439060304a208fea17ed407e998f46da5e5d9b1addfea3040094512a6379"}, ] [package.dependencies] @@ -2580,6 +2580,25 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\"" [package.extras] full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] +[[package]] +name = "textblob" +version = "0.18.0" +description = "Simple, Pythonic text processing. Sentiment analysis, part-of-speech tagging, noun phrase parsing, and more." +optional = false +python-versions = ">=3.8" +files = [ + {file = "textblob-0.18.0-py3-none-any.whl", hash = "sha256:eb29995ab2a9acc2e0fde10dde6b069b01193c75f3dfc2550d0d1ffdd97802bf"}, + {file = "textblob-0.18.0.tar.gz", hash = "sha256:eb507b62bf2283a71f56bed3e0fc4eec7d388ef76b03699cf994166572a8daf3"}, +] + +[package.dependencies] +nltk = ">=3.8" + +[package.extras] +dev = ["pre-commit (>=3.5,<4.0)", "textblob[tests]", "tox"] +docs = ["PyYAML (==6.0.1)", "sphinx (==7.2.6)", "sphinx-issues (==4.0.0)"] +tests = ["numpy", "pytest"] + [[package]] name = "tomli" version = "2.0.1" @@ -2889,4 +2908,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "e43f94f67c34ccdc1232bc2e63ef4b4fde01d8b2c9cd51e85dfa4b80a347ce99" +content-hash = "213599d8152e698f5954913ab45f8c084513920aba9672da65a980053d93a4e2" diff --git a/pyproject.toml b/pyproject.toml index 0f4d0a9..b7aef81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,7 @@ inflect = "^6.0.4" pytest = "^7.3.2" apache-skywalking = "^1.0.1" flask = "^2.3.2" +textblob = "0.18.0" diff --git a/servers/simple/uri_drain.ini b/servers/simple/uri_drain.ini index f844707..a6ab199 100644 --- a/servers/simple/uri_drain.ini +++ b/servers/simple/uri_drain.ini @@ -35,7 +35,7 @@ max_children = ${DRAIN_MAX_CHILDREN:100} max_clusters = ${DRAIN_MAX_CLUSTERS:1024} extra_delimiters = ${DRAIN_EXTRA_DELIMITERS:["/"]} analysis_min_url_count = ${DRAIN_ANALYSIS_MIN_URL_COUNT:20} -combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:8} +combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:3} [PROFILING] enabled = ${PROFILING_ENABLED:False} diff --git a/servers/simple/worker.py b/servers/simple/worker.py index 10c0c7a..714c27a 100644 --- a/servers/simple/worker.py +++ b/servers/simple/worker.py @@ -50,7 +50,8 @@ def run_worker(uri_main_queue, shared_results_object, config, existing_miners): uris, service = uri_package[0], uri_package[1] # print(uri_main_queue.get(timeout=1)) start_time = time.time() - for uri in uris: + sorted_uris = sorted(uris) + for uri in sorted_uris: drain_instances[service].add_log_message(uri) logger.info(f'Processed {len(uris)} uris of service {service} in {time.time() - start_time} seconds') patterns = drain_instances[service].drain.cluster_patterns diff --git a/test/e2e/expected/endpoint_hard.yaml b/test/e2e/expected/endpoint_hard.yaml index 5615201..7ca2efe 100644 --- a/test/e2e/expected/endpoint_hard.yaml +++ 
b/test/e2e/expected/endpoint_hard.yaml @@ -19,8 +19,13 @@ patterns: - /api/v1/services/{var} - /api/v1/users/{var}/posts/{var}/comments - /api/v1/wallets/{var} + - /api/v2/admin/users/{var} - /api/v2/courses/{var}/modules/{var}/lessons - /api/v2/customers/{var} - /api/v3/products/{var}/reviews/{var}/comments - /api/v4/orders/{var}/items/{var}/tracking + - /customer/{var} + - /customer/{var}/order/{var} + - ABC/{var} + - www.google.com/api/v1/users/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_hard_3k.yaml b/test/e2e/expected/endpoint_hard_3k.yaml index 5615201..fbdef4d 100644 --- a/test/e2e/expected/endpoint_hard_3k.yaml +++ b/test/e2e/expected/endpoint_hard_3k.yaml @@ -19,8 +19,12 @@ patterns: - /api/v1/services/{var} - /api/v1/users/{var}/posts/{var}/comments - /api/v1/wallets/{var} + - /api/v2/admin/users/{var} - /api/v2/courses/{var}/modules/{var}/lessons - /api/v2/customers/{var} - /api/v3/products/{var}/reviews/{var}/comments - /api/v4/orders/{var}/items/{var}/tracking + - /customer/{var} + - /customer/{var}/order/{var} + - www.google.com/api/v1/users/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_trivial.yaml b/test/e2e/expected/endpoint_trivial.yaml index 885557c..ad239bf 100644 --- a/test/e2e/expected/endpoint_trivial.yaml +++ b/test/e2e/expected/endpoint_trivial.yaml @@ -14,8 +14,13 @@ patterns: - /api/v1/accounts/{var} + - /api/v1/invoices/{var} - /api/v1/orders/{var} - /api/v1/posts/{var} - /api/v1/products/{var} - /api/v1/users/{var} + - /api/v2/data/users/{var} + - /api/v999/orders/{var} + - /user/{var}/post/{var} + - /user/{var}/profile/{var}/compare/{var}/profile/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_trivial_3k.yaml b/test/e2e/expected/endpoint_trivial_3k.yaml index 7367b04..a4c0da4 100644 --- a/test/e2e/expected/endpoint_trivial_3k.yaml +++ b/test/e2e/expected/endpoint_trivial_3k.yaml @@ -19,4 +19,9 @@ patterns: - /api/v1/posts/{var} - /api/v1/products/{var} - /api/v1/users/{var} + - /api/v2/data/users/{var} + - /api/v999/orders/{var} + - /user/{var} + - /user/{var}/post/{var} + - /user/{var}/profile/{var}/compare/{var}/profile/{var} version: '1' \ No newline at end of file
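Reviewer note: a minimal sketch of what the new word check in word_splitter.py is meant to do, assuming textblob (and its NLTK corpora) is installed locally; the sample tokens below are illustrative only and are not taken from the test data.

```python
from models.uri_drain.word_splitter import check_all_word_correct

# Segments that split (on camel case) into correctly spelled English words are kept as
# literal URI parts, so e.g. /api/createConnection and /api/dropConnection should stay in
# separate clusters instead of collapsing into /api/{var}.
print(check_all_word_correct("createConnection"))  # expected True: "create" and "connection" are real words

# Any digit short-circuits the check, so ID-like segments remain candidates for {var}.
print(check_all_word_correct("user1234"))  # False: the digit check returns early
```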