From 0496ac197acbbe38710c5244ea00f8203d0db13e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 14 Aug 2024 19:59:28 -0400 Subject: [PATCH] adjust rule --- src/dandi_s3_log_parser/_globals.py | 46 +++++++++++++++++++++++++++ src/dandi_s3_log_parser/_log_utils.py | 19 +++++------ 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/src/dandi_s3_log_parser/_globals.py b/src/dandi_s3_log_parser/_globals.py index bdf8f8a..092278e 100644 --- a/src/dandi_s3_log_parser/_globals.py +++ b/src/dandi_s3_log_parser/_globals.py @@ -14,6 +14,52 @@ "WEBSITE.GET.OBJECT", "REST.GET.BUCKETVERSIONS", "REST.GET.BUCKET", + "BATCH.DELETE.OBJECT", + "REST.COPY.OBJECT_GET", + "REST.COPY.PART", + "REST.DELETE.OBJECT", + "REST.DELETE.OBJECT_TAGGING", + "REST.DELETE.UPLOAD", + "REST.GET.ACCELERATE", + "REST.GET.ACL", + "REST.GET.ANALYTICS", + "REST.GET.BUCKET", + "REST.GET.BUCKETPOLICY", + "REST.GET.BUCKETVERSIONS", + "REST.GET.CORS", + "REST.GET.ENCRYPTION", + "REST.GET.INTELLIGENT_TIERING", + "REST.GET.INVENTORY", + "REST.GET.LIFECYCLE", + "REST.GET.LOCATION", + "REST.GET.LOGGING_STATUS", + "REST.GET.METRICS", + "REST.GET.NOTIFICATION", + "REST.GET.OBJECT", + "REST.GET.OBJECT_LOCK_CONFIGURATION", + "REST.GET.OBJECT_TAGGING", + "REST.GET.OWNERSHIP_CONTROLS", + "REST.GET.POLICY_STATUS", + "REST.GET.PUBLIC_ACCESS_BLOCK", + "REST.GET.REPLICATION", + "REST.GET.REQUEST_PAYMENT", + "REST.GET.TAGGING", + "REST.GET.UPLOAD", + "REST.GET.VERSIONING", + "REST.GET.WEBSITE", + "REST.HEAD.BUCKET", + "REST.HEAD.OBJECT", + "REST.OPTIONS.PREFLIGHT", + "REST.POST.BUCKET", + "REST.POST.MULTI_OBJECT_DELETE", + "REST.POST.OBJECT", + "REST.POST.UPLOAD", + "REST.POST.UPLOADS", + "REST.PUT.ACL", + "REST.PUT.BUCKETPOLICY", + "REST.PUT.OBJECT", + "REST.PUT.OWNERSHIP_CONTROLS", + "REST.PUT.PART", ) _IS_OPERATION_TYPE_KNOWN = collections.defaultdict(bool) diff --git a/src/dandi_s3_log_parser/_log_utils.py b/src/dandi_s3_log_parser/_log_utils.py index 8323a84..2506db0 100644 --- 
a/src/dandi_s3_log_parser/_log_utils.py +++ b/src/dandi_s3_log_parser/_log_utils.py @@ -11,7 +11,7 @@ def find_all_known_operation_types( base_raw_s3_log_folder_path: DirectoryPath, excluded_log_files: list[FilePath] | None, - max_files: int | None = 100, + max_files: int | None = None, ) -> set: base_raw_s3_log_folder_path = pathlib.Path(base_raw_s3_log_folder_path) excluded_log_files = excluded_log_files or {} @@ -27,13 +27,14 @@ def find_all_known_operation_types( position=0, leave=True, ): - operation_types_per_file = { - field[7] - for buffered_text_reader in BufferedTextReader(file_path=raw_s3_log_file_path) - for raw_log_line in buffered_text_reader - if len((field := raw_log_line[:500].split(" "))) > 7 - } - - unique_operation_types.update(operation_types_per_file) + for buffered_text_reader in BufferedTextReader(file_path=raw_s3_log_file_path): + slice_bound = 200 + for raw_log_line in buffered_text_reader: + fields = raw_log_line[:slice_bound].split(" ") + while len(fields) < 9 and slice_bound < len(raw_log_line): + slice_bound += 100 + fields = raw_log_line[:slice_bound].split(" ") + if len(fields) > 7: + unique_operation_types.add(fields[7]) return unique_operation_types