-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
put together simple util to scan for operation values
- Loading branch information
CodyCBakerPhD
committed
Aug 14, 2024
1 parent
a6ba38d
commit e063f18
Showing
5 changed files
with
102 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import collections | ||
import re | ||
|
||
# Operation type strings that the parser recognizes in the `operation` field of a log line.
# Kept as a tuple so the collection is immutable and keeps a stable order.
_KNOWN_OPERATION_TYPES = (
    "REST.GET.OBJECT",
    "REST.PUT.OBJECT",
    "REST.HEAD.OBJECT",
    "REST.POST.OBJECT",
    "REST.COPY.PART",
    "REST.COPY.OBJECT_GET",
    "REST.DELETE.OBJECT",
    "REST.OPTIONS.PREFLIGHT",
    "BATCH.DELETE.OBJECT",
    "WEBSITE.GET.OBJECT",
    "REST.GET.BUCKETVERSIONS",
    "REST.GET.BUCKET",
)
|
||
# Lookup table mapping every known operation type to True; any other key evaluates to False.
# Built in a single expression so no loop variable is left behind in the module namespace.
# NOTE: because this is a `defaultdict(bool)`, indexing with an unknown key also *inserts* that key
# with the value False; use `.get(key, False)` or `key in ...` where that growth is undesirable.
_IS_OPERATION_TYPE_KNOWN = collections.defaultdict(bool, dict.fromkeys(_KNOWN_OPERATION_TYPES, True))
|
||
# Ordered field names for one parsed log line; position i names the i-th token of the line.
_FULL_PATTERN_TO_FIELD_MAPPING = [
    "bucket_owner",
    "bucket",
    "timestamp",
    "ip_address",
    "requester",
    "request_id",
    "operation",
    "asset_id",
    "request_uri",
    # NOTE: "http_version" is intentionally left out; the regex was observed not to split it from
    # the request_uri (presumably both sit inside the same quoted token - TODO confirm).
    # "http_version",  # Regex not splitting this from the request_uri...
    "status_code",
    "error_code",
    "bytes_sent",
    "object_size",
    "total_time",
    "turn_around_time",
    "referrer",
    "user_agent",
    "version",
    "host_id",
    "sigv",
    "cipher_suite",
    "auth_type",
    "endpoint",
    "tls_version",
    "access_point_arn",
]

# Lightweight record type exposing the fields above as named attributes, in the same order.
_FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING)
|
||
# Tokenizer for a raw log line. Each match fills exactly one of three capture groups:
#   1. the contents of a double-quoted string (quotes excluded),
#   2. the contents of a square-bracketed span (brackets excluded),
#   3. otherwise, a run of non-space characters.
_S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pathlib | ||
|
||
import tqdm | ||
from pydantic import DirectoryPath, FilePath, validate_call | ||
|
||
from ._buffered_text_reader import BufferedTextReader | ||
|
||
|
||
@validate_call
def find_all_known_operation_types(
    base_raw_s3_log_folder_path: DirectoryPath, excluded_log_files: list[FilePath] | None = None
) -> set[str]:
    """
    Scan every `*.log` file under a folder and collect the distinct operation type values found.

    Parameters
    ----------
    base_raw_s3_log_folder_path : DirectoryPath
        Folder that is searched recursively for files matching `*.log`.
    excluded_log_files : list of FilePath, optional
        Log files to skip. Defaults to None, meaning no file is excluded.

    Returns
    -------
    set of str
        The unique operation type strings sliced out of the scanned lines.
        Empty strings (produced by lines too short to reach the sliced span) are not included.
    """
    # `or ()` covers both None and an empty list without needing a separate branch
    excluded_log_file_paths = {pathlib.Path(excluded_log_file) for excluded_log_file in excluded_log_files or ()}

    # Sorted only so that the progress bar walks the files in a reproducible order;
    # the result is a set and does not depend on the order of processing
    daily_raw_s3_log_file_paths = sorted(
        set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_file_paths
    )

    unique_operation_types: set[str] = set()
    for raw_s3_log_file_path in tqdm.tqdm(
        iterable=daily_raw_s3_log_file_paths,
        desc="Extracting operation types from log files...",
        position=0,
        leave=True,
    ):
        # The start of each line should be regular enough to reliably slice out just the span of the operation type
        # (plus some extra bits on the end from the irregularity of operation type length);
        # splitting on the first space then trims those extra bits off
        for buffered_raw_lines in BufferedTextReader(file_path=raw_s3_log_file_path):
            unique_operation_types.update(
                raw_log_line[136:160].split(" ")[0] for raw_log_line in buffered_raw_lines
            )

    # A blank or truncated line slices to an empty string, which is not an operation type
    unique_operation_types.discard("")

    return unique_operation_types
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters