From 9daff105348c22a1099ec44fbad5dcb708eaa734 Mon Sep 17 00:00:00 2001 From: mrproliu <741550557@qq.com> Date: Wed, 7 Aug 2024 22:12:44 +0800 Subject: [PATCH] Support minimal combine pattern count setting (#17) --- demo/uri_drain.ini | 1 + models/Configuration.md | 1 + models/uri_drain/template_miner.py | 1 + models/uri_drain/template_miner_config.py | 3 ++ models/uri_drain/uri_drain.py | 45 ++++++++++++++++--- servers/simple/uri_drain.ini | 1 + .../expected/endpoint_counterexamples.yaml | 4 +- test/e2e/expected/endpoint_hard.yaml | 18 -------- test/e2e/expected/endpoint_hard_3k.yaml | 16 ------- test/e2e/expected/endpoint_trivial.yaml | 10 ----- test/e2e/expected/endpoint_trivial_3k.yaml | 7 --- 11 files changed, 47 insertions(+), 60 deletions(-) diff --git a/demo/uri_drain.ini b/demo/uri_drain.ini index 59f6438..a7e4cb1 100644 --- a/demo/uri_drain.ini +++ b/demo/uri_drain.ini @@ -33,6 +33,7 @@ depth = 4 max_children = 100 max_clusters = 1024 extra_delimiters = ["/"] +combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:8} [PROFILING] enabled = True diff --git a/models/Configuration.md b/models/Configuration.md index be9de6c..afb7725 100644 --- a/models/Configuration.md +++ b/models/Configuration.md @@ -36,6 +36,7 @@ Drain is the core algorithm of URI Drain. | max_clusters | int | DRAIN_MAX_CLUSTERS | 1024 | Max number of tracked clusters (unlimited by default). When this number is reached, model starts replacing old clusters with a new ones according to the LRU policy. | | extra_delimiters | string | DRAIN_EXTRA_DELIMITERS | \["/"\] | The extra delimiters to split the sequence. | | analysis_min_url_count | int | DRAIN_ANALYSIS_MIN_URL_COUNT | 20 | The minimum number of unique URLs(each service) to trigger the analysis. | +| combine_min_url_count | int | DRAIN_COMBINE_MIN_URL_COUNT | 8 | The minimum number of unique URLs (candidates within each service) required to mask them as a variable URL (in case some similar URLs are not RESTful, such as `/test/one` and `/test/two`). 
| ### Profiling diff --git a/models/uri_drain/template_miner.py b/models/uri_drain/template_miner.py index e49f854..33683dc 100644 --- a/models/uri_drain/template_miner.py +++ b/models/uri_drain/template_miner.py @@ -84,6 +84,7 @@ def __init__(self, max_children=self.config.drain_max_children, max_clusters=self.config.drain_max_clusters, extra_delimiters=self.config.drain_extra_delimiters, + combine_min_url_count=self.config.drain_combine_min_url_count, profiler=self.profiler, param_str=param_str, # param_extra=param_extra, # MODIFIED:: for URI Drain < It is now a dict since contains multiple types diff --git a/models/uri_drain/template_miner_config.py b/models/uri_drain/template_miner_config.py index 9e48228..09e2922 100644 --- a/models/uri_drain/template_miner_config.py +++ b/models/uri_drain/template_miner_config.py @@ -28,6 +28,7 @@ def __init__(self): self.drain_max_children = 100 self.drain_max_clusters = None self.drain_analysis_min_url_count = 20 + self.drain_combine_min_url_count = 8 self.masking_instructions = [] self.mask_prefix = "<" self.mask_suffix = ">" @@ -82,6 +83,8 @@ def load(self, config_filename: str): self.parameter_extraction_cache_capacity) self.drain_analysis_min_url_count = self.read_config_value(parser, section_drain, 'analysis_min_url_count', int, self.drain_analysis_min_url_count) + self.drain_combine_min_url_count = self.read_config_value(parser, section_drain, 'combine_min_url_count', int, + self.drain_combine_min_url_count) masking_instructions = [] masking_list = json.loads(masking_instructions_str) diff --git a/models/uri_drain/uri_drain.py b/models/uri_drain/uri_drain.py index 055cf71..36380a2 100644 --- a/models/uri_drain/uri_drain.py +++ b/models/uri_drain/uri_drain.py @@ -15,12 +15,13 @@ class LogCluster: # TODO Modified:: Changed to URICluster - __slots__ = ["log_template_tokens", "cluster_id", "size"] + __slots__ = ["log_template_tokens", "cluster_id", "size", "latest_urls"] - def __init__(self, log_template_tokens: list, 
cluster_id: int): + def __init__(self, log_template_tokens: list, cluster_id: int, combine_min_url_count: int): self.log_template_tokens = tuple(log_template_tokens) self.cluster_id = cluster_id self.size = 1 + self.latest_urls = LRUCache(combine_min_url_count+1) def get_template(self): # Modified:: Changed to join by slash instead of space for @@ -47,6 +48,27 @@ def get_template(self): template = '/'.join(self.log_template_tokens) return f'/{template}' + def adding_url(self, url: str): + if self.latest_urls.__contains__(url): + return + self.latest_urls[url] = True + + def __str__(self): + # return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}" + return f"size={str(self.size).ljust(10)}: {self.get_template()}" + + +class SingleURILogCluster: + __slots__ = ["uri", "cluster_id", "size"] + + def __init__(self, uri: str): + self.uri = uri + self.cluster_id = -1 + self.size = 1 + + def get_template(self): + return self.uri + def __str__(self): # return f"ID={str(self.cluster_id).ljust(5)} : size={str(self.size).ljust(10)}: {self.get_template()}" return f"size={str(self.size).ljust(10)}: {self.get_template()}" @@ -83,6 +105,7 @@ def __init__(self, sim_th=0.4, max_children=100, max_clusters=None, + combine_min_url_count=8, extra_delimiters=(), profiler: Profiler = NullProfiler(), param_str="{var}", # Modified:: required param_str @@ -116,6 +139,7 @@ def __init__(self, self.max_node_depth = depth - 2 # max depth of a prefix tree node, starting from zero self.sim_th = sim_th self.max_children = max_children + self.combine_min_url_count = combine_min_url_count self.root_node = Node() self.profiler = profiler self.extra_delimiters = extra_delimiters @@ -133,7 +157,14 @@ def __init__(self, @property def clusters(self): - return self.id_to_cluster.values() + result = [] + for cluster in self.id_to_cluster.values(): + if cluster.latest_urls and cluster.latest_urls.__len__() >= self.combine_min_url_count: + result.append(cluster) + 
continue + for url, _ in cluster.latest_urls.items(): + result.append(SingleURILogCluster(url)) + return result @property def cluster_patterns(self): @@ -245,7 +276,7 @@ def add_log_message(self, content: str): self.profiler.start_section("create_cluster") self.clusters_counter += 1 cluster_id = self.clusters_counter - match_cluster = LogCluster(content_tokens, cluster_id) + match_cluster = LogCluster(content_tokens, cluster_id, self.combine_min_url_count) self.id_to_cluster[cluster_id] = match_cluster self.add_seq_to_prefix_tree(self.root_node, match_cluster) update_type = "cluster_created" @@ -261,7 +292,7 @@ def add_log_message(self, content: str): update_type = "rejected (create new)" self.clusters_counter += 1 cluster_id = self.clusters_counter - match_cluster = LogCluster(content_tokens, cluster_id) + match_cluster = LogCluster(content_tokens, cluster_id, self.combine_min_url_count) self.id_to_cluster[cluster_id] = match_cluster self.add_seq_to_prefix_tree(self.root_node, match_cluster) match_cluster.size -= 1 @@ -278,6 +309,7 @@ def add_log_message(self, content: str): if self.profiler: self.profiler.end_section() + match_cluster.adding_url(content) return match_cluster, update_type def get_total_cluster_size(self): @@ -315,12 +347,13 @@ def __init__(self, sim_th=0.4, max_children=100, max_clusters=None, + combine_min_url_count=8, extra_delimiters=(), profiler: Profiler = NullProfiler(), param_str="<*>", # param_extra=None, # Modified:: Added param_extra parametrize_numeric_tokens=True): - super().__init__(depth, sim_th, max_children, max_clusters, extra_delimiters, profiler, param_str, + super().__init__(depth, sim_th, max_children, max_clusters, combine_min_url_count, extra_delimiters, profiler, param_str, # param_extra, parametrize_numeric_tokens) diff --git a/servers/simple/uri_drain.ini b/servers/simple/uri_drain.ini index 97cbb35..f844707 100644 --- a/servers/simple/uri_drain.ini +++ b/servers/simple/uri_drain.ini @@ -35,6 +35,7 @@ max_children = 
${DRAIN_MAX_CHILDREN:100} max_clusters = ${DRAIN_MAX_CLUSTERS:1024} extra_delimiters = ${DRAIN_EXTRA_DELIMITERS:["/"]} analysis_min_url_count = ${DRAIN_ANALYSIS_MIN_URL_COUNT:20} +combine_min_url_count = ${DRAIN_COMBINE_MIN_URL_COUNT:8} [PROFILING] enabled = ${PROFILING_ENABLED:False} diff --git a/test/e2e/expected/endpoint_counterexamples.yaml b/test/e2e/expected/endpoint_counterexamples.yaml index cb8987a..3b868d9 100644 --- a/test/e2e/expected/endpoint_counterexamples.yaml +++ b/test/e2e/expected/endpoint_counterexamples.yaml @@ -12,7 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -patterns: - - "/api/v1/usernames/{var}" - - "/api/v1/users/{var}" +patterns: [] version: "1" \ No newline at end of file diff --git a/test/e2e/expected/endpoint_hard.yaml b/test/e2e/expected/endpoint_hard.yaml index 1cd48d5..5615201 100644 --- a/test/e2e/expected/endpoint_hard.yaml +++ b/test/e2e/expected/endpoint_hard.yaml @@ -13,32 +13,14 @@ # limitations under the License. 
patterns: - - /api-this-is-a-special-case/v99999999999999999/orders/delete/{var} - - /api-this-is-a-special-case/v99999999999999999/orders/reorder/{var} - - /api-this-is-a-special-case/v99999999999999999/orders/update/{var} - /api/v1/bills/{var} - /api/v1/companies/{var} - - /api/v1/companies/{var}/employees/{var}/reviews/{var} - - /api/v1/companies/{var}/tasks/{var}/assignees/{var} - /api/v1/projects/{var} - /api/v1/services/{var} - /api/v1/users/{var}/posts/{var}/comments - - /api/v1/users/{var}/posts/{var}/comments/{var} - /api/v1/wallets/{var} - - /api/v2/admin/users/{var} - /api/v2/courses/{var}/modules/{var}/lessons - /api/v2/customers/{var} - /api/v3/products/{var}/reviews/{var}/comments - - /api/v3/providers/{var} - /api/v4/orders/{var}/items/{var}/tracking - - /customer/{var} - - /customer/{var}/order/{var} - - /customer/{var}/profile/{var}/compare/{var}/profile/{var} - - ABC/{var} - - HikariCP/Connection/{var} - - google.com/api/v1/users/{var} - - http://www.google.com/api/v1/users/{var} - - https://www.google.com/api/v1/users/{var} - - top1.abc.example.com.net.cn/api/v1/users/{var} - - www.google.com/api/v1/users/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_hard_3k.yaml b/test/e2e/expected/endpoint_hard_3k.yaml index 004e634..5615201 100644 --- a/test/e2e/expected/endpoint_hard_3k.yaml +++ b/test/e2e/expected/endpoint_hard_3k.yaml @@ -13,30 +13,14 @@ # limitations under the License. 
patterns: - - /api-this-is-a-special-case/v99999999999999999/orders/delete/{var} - - /api-this-is-a-special-case/v99999999999999999/orders/reorder/{var} - - /api-this-is-a-special-case/v99999999999999999/orders/update/{var} - /api/v1/bills/{var} - /api/v1/companies/{var} - - /api/v1/companies/{var}/employees/{var}/reviews/{var} - - /api/v1/companies/{var}/tasks/{var}/assignees/{var} - /api/v1/projects/{var} - /api/v1/services/{var} - /api/v1/users/{var}/posts/{var}/comments - - /api/v1/users/{var}/posts/{var}/comments/{var} - /api/v1/wallets/{var} - - /api/v2/admin/users/{var} - /api/v2/courses/{var}/modules/{var}/lessons - /api/v2/customers/{var} - /api/v3/products/{var}/reviews/{var}/comments - - /api/v3/providers/{var} - /api/v4/orders/{var}/items/{var}/tracking - - /customer/{var} - - /customer/{var}/order/{var} - - /customer/{var}/profile/{var}/compare/{var}/profile/{var} - - google.com/api/v1/users/{var} - - http://www.google.com/api/v1/users/{var} - - https://www.google.com/api/v1/users/{var} - - top1.abc.example.com.net.cn/api/v1/users/{var} - - www.google.com/api/v1/users/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_trivial.yaml b/test/e2e/expected/endpoint_trivial.yaml index ba5aae0..885557c 100644 --- a/test/e2e/expected/endpoint_trivial.yaml +++ b/test/e2e/expected/endpoint_trivial.yaml @@ -14,18 +14,8 @@ patterns: - /api/v1/accounts/{var} - - /api/v1/invoices/{var} - /api/v1/orders/{var} - /api/v1/posts/{var} - /api/v1/products/{var} - /api/v1/users/{var} - - /api/v2/data/users/{var} - - /api/v999/orders/{var} - - /product/{var} - - /user/{var} - - /user/{var}/post/{var} - - /user/{var}/profile/{var}/compare/{var}/profile/{var} - - GET:/api/v1/users/{var} - - http://www.google.com/api/v1/users/{var} - - https://www.google.com/api/v1/users/{var} version: '1' \ No newline at end of file diff --git a/test/e2e/expected/endpoint_trivial_3k.yaml b/test/e2e/expected/endpoint_trivial_3k.yaml index 437531c..7367b04 
100644 --- a/test/e2e/expected/endpoint_trivial_3k.yaml +++ b/test/e2e/expected/endpoint_trivial_3k.yaml @@ -19,11 +19,4 @@ patterns: - /api/v1/posts/{var} - /api/v1/products/{var} - /api/v1/users/{var} - - /api/v2/data/users/{var} - - /api/v999/orders/{var} - - /product/{var} - - /user/{var} - - /user/{var}/post/{var} - - /user/{var}/profile/{var}/compare/{var}/profile/{var} - - GET:/api/v1/users/{var} version: '1' \ No newline at end of file