diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
index eeb0b88..672a2a9 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -18,9 +18,6 @@
 from pydantic import DirectoryPath, Field, FilePath, validate_call
 
 from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
-from ._ip_utils import (
-    _get_latest_github_ip_ranges,
-)
 from ._s3_log_file_parser import parse_raw_s3_log
 
 
@@ -31,7 +28,6 @@ def parse_all_dandi_raw_s3_logs(
     parsed_s3_log_folder_path: DirectoryPath,
     excluded_log_files: list[FilePath] | None = None,
     excluded_ips: collections.defaultdict[str, bool] | None = None,
-    exclude_github_ips: bool = True,
     maximum_number_of_workers: int = Field(ge=1, default=1),
     maximum_buffer_size_in_bytes: int = 4 * 10**9,
 ) -> None:
@@ -57,8 +53,6 @@ def parse_all_dandi_raw_s3_logs(
         A list of log file paths to exclude from parsing.
     excluded_ips : collections.defaultdict of strings to booleans, optional
         A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
-    exclude_github_ips : bool, default: True
-        Include all GitHub action IP addresses in the `excluded_ips`.
     maximum_number_of_workers : int, default: 1
         The maximum number of workers to distribute tasks across.
     maximum_buffer_size_in_bytes : int, default: 4 GB
@@ -74,13 +68,7 @@ def parse_all_dandi_raw_s3_logs(
     excluded_log_files = {pathlib.Path(excluded_log_file) for excluded_log_file in excluded_log_files}
 
     excluded_ips = excluded_ips or collections.defaultdict(bool)
-    if exclude_github_ips:
-        for github_ip in _get_latest_github_ip_ranges():
-            excluded_ips[github_ip] = True
-
-    def asset_id_handler(*, raw_asset_id: str) -> str:
-        split_by_slash = raw_asset_id.split("/")
-        return split_by_slash[0] + "_" + split_by_slash[-1]
+    asset_id_handler = _get_default_dandi_asset_id_handler()
 
     # The .rglob is not naturally sorted; shuffle for more uniform progress updates
     daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_files
@@ -98,7 +86,6 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
             parsed_s3_log_folder_path=parsed_s3_log_folder_path,
             mode="a",
             excluded_ips=excluded_ips,
-            exclude_github_ips=False,  # Already included in list so avoid repeated construction
             asset_id_handler=asset_id_handler,
             tqdm_kwargs=dict(position=1, leave=False),
             maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
@@ -203,10 +190,7 @@ def _multi_job_parse_dandi_raw_s3_log(
     try:
         error_message = ""
 
-        def asset_id_handler(*, raw_asset_id: str) -> str:
-            """Apparently callables, even simple built-in ones, cannot be pickled."""
-            split_by_slash = raw_asset_id.split("/")
-            return split_by_slash[0] + "_" + split_by_slash[-1]
+        asset_id_handler = _get_default_dandi_asset_id_handler()
 
         job_index = os.getpid() % maximum_number_of_workers
         per_job_temporary_folder_path = temporary_folder_path / f"job_{job_index}"
@@ -228,7 +212,6 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
             parsed_s3_log_folder_path=per_job_temporary_folder_path,
             mode="a",
             excluded_ips=excluded_ips,
-            exclude_github_ips=False,  # Already included in list so avoid repeated construction
             asset_id_handler=asset_id_handler,
             tqdm_kwargs=dict(position=job_index + 1, leave=False),
             maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
@@ -245,7 +228,6 @@ def parse_dandi_raw_s3_log(
     parsed_s3_log_folder_path: str | pathlib.Path,
     mode: Literal["w", "a"] = "a",
     excluded_ips: collections.defaultdict[str, bool] | None = None,
-    exclude_github_ips: bool = True,
     asset_id_handler: Callable | None = None,
     tqdm_kwargs: dict | None = None,
     maximum_buffer_size_in_bytes: int = 4 * 10**9,
@@ -272,8 +254,6 @@ def parse_dandi_raw_s3_log(
         over each day, parsing and binning by asset, effectively 'updating' the parsed collection on each iteration.
     excluded_ips : collections.defaultdict of strings to booleans, optional
         A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
-    exclude_github_ips : bool, default: True
-        Include all GitHub action IP addresses in the `excluded_ips`.
     asset_id_handler : callable, optional
         If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to
         translate into nested directory paths) then define a function of the following form:
@@ -290,24 +270,12 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
     """
     raw_s3_log_file_path = pathlib.Path(raw_s3_log_file_path)
     parsed_s3_log_folder_path = pathlib.Path(parsed_s3_log_folder_path)
+    asset_id_handler = asset_id_handler or _get_default_dandi_asset_id_handler()
     tqdm_kwargs = tqdm_kwargs or dict()
 
     bucket = "dandiarchive"
     request_type = "GET"
 
-    # Form a lookup for IP addresses to exclude; much faster than asking 'if in' a list on each iteration
-    # Exclude GitHub actions, which are responsible for running health checks on archive which bloat the logs
-    excluded_ips = excluded_ips or collections.defaultdict(bool)
-    if exclude_github_ips:
-        for github_ip in _get_latest_github_ip_ranges():
-            excluded_ips[github_ip] = True
-
-    if asset_id_handler is None:
-
-        def asset_id_handler(*, raw_asset_id: str) -> str:
-            split_by_slash = raw_asset_id.split("/")
-            return split_by_slash[0] + "_" + split_by_slash[-1]
-
     parse_raw_s3_log(
         raw_s3_log_file_path=raw_s3_log_file_path,
         parsed_s3_log_folder_path=parsed_s3_log_folder_path,
@@ -319,3 +287,11 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
         tqdm_kwargs=tqdm_kwargs,
         maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
     )
+
+
+def _get_default_dandi_asset_id_handler() -> Callable:
+    def asset_id_handler(*, raw_asset_id: str) -> str:
+        split_by_slash = raw_asset_id.split("/")
+        return split_by_slash[0] + "_" + split_by_slash[-1]
+
+    return asset_id_handler
diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py
index c23e4b0..a108a57 100644
--- a/src/dandi_s3_log_parser/_s3_log_file_parser.py
+++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py
@@ -9,7 +9,7 @@
 import tqdm
 
 from ._buffered_text_reader import BufferedTextReader
-from ._s3_log_line_parser import _append_reduced_log_line, _ReducedLogLine
+from ._s3_log_line_parser import _append_reduced_log_line
 
 
 def parse_raw_s3_log(
@@ -67,11 +67,16 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
     raw_s3_log_file_path = pathlib.Path(raw_s3_log_file_path)
     parsed_s3_log_folder_path = pathlib.Path(parsed_s3_log_folder_path)
     parsed_s3_log_folder_path.mkdir(exist_ok=True)
+    bucket = bucket or ""
     excluded_ips = excluded_ips or collections.defaultdict(bool)
+    asset_id_handler = asset_id_handler or (lambda *, raw_asset_id: raw_asset_id)  # called with `raw_asset_id` keyword downstream
     tqdm_kwargs = tqdm_kwargs or dict()
 
-    reduced_logs = _get_reduced_log_lines(
+    assert raw_s3_log_file_path.suffix == ".log", f"`{raw_s3_log_file_path=}` should end in '.log'!"
+
+    reduced_and_binned_logs = _get_reduced_and_binned_log_lines(
         raw_s3_log_file_path=raw_s3_log_file_path,
+        asset_id_handler=asset_id_handler,
         bucket=bucket,
         request_type=request_type,
         excluded_ips=excluded_ips,
@@ -79,53 +84,40 @@
         maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
     )
 
-    reduced_logs_binned_by_unparsed_asset = dict()
-    for reduced_log in reduced_logs:
-        raw_asset_id = reduced_log.asset_id
-        reduced_logs_binned_by_unparsed_asset[raw_asset_id] = reduced_logs_binned_by_unparsed_asset.get(
-            raw_asset_id,
-            collections.defaultdict(list),
-        )
-
-        reduced_logs_binned_by_unparsed_asset[raw_asset_id]["timestamp"].append(reduced_log.timestamp)
-        reduced_logs_binned_by_unparsed_asset[raw_asset_id]["bytes_sent"].append(reduced_log.bytes_sent)
-        reduced_logs_binned_by_unparsed_asset[raw_asset_id]["ip_address"].append(reduced_log.ip_address)
-        reduced_logs_binned_by_unparsed_asset[raw_asset_id]["line_index"].append(reduced_log.line_index)
-
-    if asset_id_handler is not None:
-        reduced_logs_binned_by_asset = dict()
-        for raw_asset_id, reduced_logs_per_asset in reduced_logs_binned_by_unparsed_asset.items():
-            parsed_asset_id = asset_id_handler(raw_asset_id=raw_asset_id)
+    for handled_asset_id, reduced_logs_per_handled_asset_id in reduced_and_binned_logs.items():
+        parsed_s3_log_file_path = parsed_s3_log_folder_path / f"{handled_asset_id}.tsv"
 
-            reduced_logs_binned_by_asset[parsed_asset_id] = reduced_logs_per_asset
-    else:
-        reduced_logs_binned_by_asset = reduced_logs_binned_by_unparsed_asset
-
-    for raw_asset_id, reduced_logs_per_asset in reduced_logs_binned_by_asset.items():
-        parsed_s3_log_file_path = parsed_s3_log_folder_path / f"{raw_asset_id}.tsv"
-
-        data_frame = pandas.DataFrame(data=reduced_logs_per_asset)
+        data_frame = pandas.DataFrame(data=reduced_logs_per_handled_asset_id)
 
         header = False if parsed_s3_log_file_path.exists() is True and mode == "a" else True
         data_frame.to_csv(path_or_buf=parsed_s3_log_file_path, mode=mode, sep="\t", header=header, index=False)
 
 
-def _get_reduced_log_lines(
+def _get_reduced_and_binned_log_lines(
     *,
     raw_s3_log_file_path: pathlib.Path,
-    bucket: str | None,
+    asset_id_handler: Callable,
+    bucket: str,
     request_type: Literal["GET", "PUT"],
     excluded_ips: collections.defaultdict[str, bool],
-    tqdm_kwargs: dict | None = None,
-    maximum_buffer_size_in_bytes: int = 4 * 10**9,
-) -> list[_ReducedLogLine]:
+    tqdm_kwargs: dict,
+    maximum_buffer_size_in_bytes: int,
+) -> collections.defaultdict[str, dict[str, list[str | int]]]:
     """
-    Reduce the full S3 log file to minimal content and return a list of in-memory collections.namedtuple objects.
+    Reduce the full S3 log file to minimal content and bin by asset ID.
 
     Parameters
     ----------
     raw_s3_log_file_path : str or pathlib.Path
         Path to the raw S3 log file.
+    asset_id_handler : callable
+        If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to
+        translate into nested directory paths) then define a function of the following form:
+
+        # For example
+        def asset_id_handler(*, raw_asset_id: str) -> str:
+            split_by_slash = raw_asset_id.split("/")
+            return split_by_slash[0] + "_" + split_by_slash[-1]
     bucket : str
         Only parse and return lines that match this bucket.
     request_type : str
         The type of request to filter for.
     excluded_ips : collections.defaultdict of strings to booleans
         A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
@@ -137,42 +129,44 @@
     maximum_buffer_size_in_bytes : int, default: 4 GB
         The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the source
         text file.
- """ - assert raw_s3_log_file_path.suffix == ".log", f"{raw_s3_log_file_path=} should end in '.log'!" - # Collapse bucket to empty string instead of asking if it is None on each iteration - bucket = "" if bucket is None else bucket + Returns + ------- + reduced_and_binned_logs : collections.defaultdict + A map of all reduced log line content binned by handled asset ID. + """ tqdm_kwargs = tqdm_kwargs or dict() # Perform I/O read in batches to improve performance - resolved_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False, mininterval=1.0) + resolved_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False, mininterval=5.0) resolved_tqdm_kwargs.update(tqdm_kwargs) - reduced_log_lines = list() - per_buffer_index = 0 + reduced_and_binned_logs = collections.defaultdict(list) buffered_text_reader = BufferedTextReader( file_path=raw_s3_log_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, ) - for buffered_raw_lines in tqdm.tqdm( + progress_bar_iterator = tqdm.tqdm( iterable=buffered_text_reader, total=len(buffered_text_reader), **resolved_tqdm_kwargs, - ): - index = 0 - for raw_line in buffered_raw_lines: + ) + + per_buffer_index = 0 + for buffered_raw_lines in progress_bar_iterator: + for index, raw_line in enumerate(buffered_raw_lines): line_index = per_buffer_index + index _append_reduced_log_line( raw_line=raw_line, - reduced_log_lines=reduced_log_lines, + reduced_and_binned_logs=reduced_and_binned_logs, + asset_id_handler=asset_id_handler, bucket=bucket, request_type=request_type, excluded_ips=excluded_ips, log_file_path=raw_s3_log_file_path, line_index=line_index, ) - index += 1 per_buffer_index += index - return reduced_log_lines + return reduced_and_binned_logs diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index af9ef1b..a162c54 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -19,9 +19,14 @@ import importlib.metadata import pathlib import re +from collections.abc import Callable from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH +_KNOWN_REQUEST_TYPES = collections.defaultdict(bool) +for request_type in ["GET", "PUT", "HEAD"]: + _KNOWN_REQUEST_TYPES[request_type] = True + _FULL_PATTERN_TO_FIELD_MAPPING = [ "bucket_owner", "bucket", @@ -50,24 +55,130 @@ "tls_version", "access_point_arn", ] -_REDUCED_PATTERN_TO_FIELD_MAPPING = ["asset_id", "timestamp", "bytes_sent", "ip_address", "line_index"] - _FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING) -_ReducedLogLine = collections.namedtuple("ReducedLogLine", _REDUCED_PATTERN_TO_FIELD_MAPPING) _S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)') +def _append_reduced_log_line( + *, + raw_line: str, + reduced_and_binned_logs: collections.defaultdict[str, dict[str, list[str | int]]], + asset_id_handler: Callable, + bucket: str, + request_type: str, + excluded_ips: collections.defaultdict[str, bool], + line_index: int, + log_file_path: pathlib.Path, +) -> None: + """ + Append the `reduced_and_binned_logs` map with informatione extracted from a single raw log line, if it is valid. + + Parameters + ---------- + raw_line : string + A single line from the raw S3 log file. + reduced_and_binned_logs : collections.defaultdict + A map of reduced log line content binned by handled asset ID. 
+    asset_id_handler : callable
+        If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to
+        translate into nested directory paths) then define a function of the following form:
+
+        # For example
+        def asset_id_handler(*, raw_asset_id: str) -> str:
+            split_by_slash = raw_asset_id.split("/")
+            return split_by_slash[0] + "_" + split_by_slash[-1]
+    bucket : string
+        Only parse and return lines that match this bucket string.
+    request_type : string
+        The type of request to filter for.
+    excluded_ips : collections.defaultdict of strings to booleans
+        A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
+    line_index : int
+        The index of the line in the raw log file.
+    log_file_path : pathlib.Path
+        The path to the log file being parsed; attached for logging purposes.
+    """
+    parsed_log_line = _parse_s3_log_line(raw_line=raw_line)
+
+    full_log_line = _get_full_log_line(
+        parsed_log_line=parsed_log_line,
+        log_file_path=log_file_path,
+        line_index=line_index,
+        raw_line=raw_line,
+    )
+
+    if full_log_line is None:
+        return
+
+    # Various early skip conditions
+    if full_log_line.bucket != bucket:
+        return
+
+    # Raise some quick parsing errors if anything indicates an improper parsing
+    # These might slow parsing down a bit, but could be important to ensuring accuracy
+    if not full_log_line.status_code.isdigit():
+        message = f"Unexpected status code: '{full_log_line.status_code}' on line {line_index} of file {log_file_path}."
+        raise ValueError(message)
+
+    # An expected operation string is "REST.GET.OBJECT"
+    operation_slice = slice(5, 8) if full_log_line.operation[8] == "." else slice(5, 9)
+    handled_request_type = full_log_line.operation[operation_slice]
+    if _KNOWN_REQUEST_TYPES[handled_request_type] is False:
+        message = (
+            f"Unexpected request type: '{handled_request_type}' handled from '{full_log_line.operation}' "
+            f"on line {line_index} of file {log_file_path}."
+        )
+        raise ValueError(message)
+
+    timezone = full_log_line.timestamp[-5:]
+    if timezone != "+0000":
+        message = f"Unexpected time shift attached to log! Have always seen '+0000', found '{timezone}'."
+        raise ValueError(message)
+
+    # More early skip conditions
+    # Only accept 200-block status codes
+    if full_log_line.status_code[0] != "2":
+        return
+
+    if handled_request_type != request_type:
+        return
+
+    if excluded_ips[full_log_line.ip_address] is True:
+        return
+
+    # All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID
+    handled_asset_id = asset_id_handler(raw_asset_id=full_log_line.asset_id)
+    handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S")
+    handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0
+
+    reduced_and_binned_logs[handled_asset_id] = reduced_and_binned_logs.get(
+        handled_asset_id,
+        collections.defaultdict(list),
+    )
+    reduced_and_binned_logs[handled_asset_id]["timestamp"].append(handled_timestamp)
+    reduced_and_binned_logs[handled_asset_id]["bytes_sent"].append(handled_bytes_sent)
+    reduced_and_binned_logs[handled_asset_id]["ip_address"].append(full_log_line.ip_address)
+    reduced_and_binned_logs[handled_asset_id]["line_index"].append(line_index)
+
+
 def _find_all_possible_substring_indices(*, string: str, substring: str) -> list[int]:
     indices = list()
 
     start = 0
-    while True:
+    max_iter = 1000
+    while len(indices) < max_iter:  # cap the number of matches rather than the character index
         next_index = string.find(substring, start)
         if next_index == -1:  # .find(...) was unable to locate the substring
             break
         indices.append(next_index)
         start = next_index + 1
 
+    if len(indices) >= max_iter:
+        message = (
+            f"Exceeded maximum iterations in `_find_all_possible_substring_indices` on `{string=}` with `{substring=}`."
+        )
+        raise StopIteration(message)
+
     return indices
 
 
@@ -150,84 +261,8 @@ def _get_full_log_line(
     date = datetime.datetime.now().strftime("%y%m%d")
     lines_errors_file_path = errors_folder_path / f"v{dandi_s3_log_parser_version}_{date}_lines_errors.txt"
 
+    # TODO: automatically attempt to anonymize any detectable IP address in the raw line by replacing with 192.0.2.0
     with open(file=lines_errors_file_path, mode="a") as io:
         io.write(f"Line {line_index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}\n\n")
 
     return full_log_line
-
-
-def _append_reduced_log_line(
-    *,
-    raw_line: str,
-    reduced_log_lines: list[_ReducedLogLine],
-    bucket: str,
-    request_type: str,
-    excluded_ips: collections.defaultdict[str, bool],
-    line_index: int,
-    log_file_path: pathlib.Path,
-) -> None:
-    """
-    Append the `reduced_log_lines` list with a ReducedLogLine constructed from a single raw log line, if it is valid.
-
-    Parameters
-    ----------
-    raw_line : string
-        A single line from the raw S3 log file.
-    reduced_log_lines : list of ReducedLogLine
-        The list of ReducedLogLine objects to mutate in place.
-        This is done to reduce overhead of copying/returning items in-memory via a return-based approach.
-    bucket : string
-        Only parse and return lines that match this bucket string.
-    request_type : string
-        The type of request to filter for.
-    excluded_ips : collections.defaultdict of strings to booleans
-        A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing.
-    line_index: int
-        The index of the line in the raw log file.
- """ - bucket = "" if bucket is None else bucket - excluded_ips = excluded_ips or collections.defaultdict(bool) - - parsed_log_line = _parse_s3_log_line(raw_line=raw_line) - - full_log_line = _get_full_log_line( - parsed_log_line=parsed_log_line, - log_file_path=log_file_path, - line_index=line_index, - raw_line=raw_line, - ) - - if full_log_line is None: - return - - # Various early skip conditions - if full_log_line.bucket != bucket: - return - - # Skip all non-success status codes (those in the 200 block) - if full_log_line.status_code[0] != "2": - return - - # Derived from operation string, e.g., "REST.GET.OBJECT" - parsed_request_type = full_log_line.operation.split(".")[1] - if parsed_request_type != request_type: - return - - if excluded_ips[full_log_line.ip_address] is True: - return - - assert ( - full_log_line.timestamp[-5:] == "+0000" - ), f"Unexpected time shift attached to log! Have always seen '+0000', found '{full_log_line.timestamp[-5:]}'." - - parsed_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S") - parsed_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 - reduced_log_line = _ReducedLogLine( - asset_id=full_log_line.asset_id, - timestamp=parsed_timestamp, - bytes_sent=parsed_bytes_sent, - ip_address=full_log_line.ip_address, - line_index=line_index, - ) - - reduced_log_lines.append(reduced_log_line)