From ed19f6e13dfe1c898b16fc1791c4bd7cea2bd595 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:17:33 -0400 Subject: [PATCH 01/55] refactor for simplicity and resumability --- pyproject.toml | 4 +- .../_command_line_interface.py | 112 +------ .../_dandi_s3_log_file_reducer.py | 252 +++++---------- src/dandi_s3_log_parser/_globals.py | 19 +- .../_s3_log_file_bin_by_key.py | 268 ++++++++++++++++ .../_s3_log_file_reducer.py | 299 ++++++++++++------ .../_s3_log_line_parser.py | 193 ++--------- 7 files changed, 595 insertions(+), 552 deletions(-) create mode 100644 src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py diff --git a/pyproject.toml b/pyproject.toml index e273be6..06d56ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ packages = ["src/dandi_s3_log_parser"] [project] name = "dandi_s3_log_parser" -version="0.3.0" +version="0.4.0" authors = [ { name="Cody Baker", email="cody.c.baker.phd@gmail.com" }, ] @@ -38,9 +38,7 @@ classifiers = [ [project.scripts] reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli" -reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli" map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli" -find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli" diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 6b69fe7..738b2c2 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -2,23 +2,19 @@ import collections import pathlib -from typing import Literal import click -from ._config import REQUEST_TYPES from ._dandi_s3_log_file_reducer import ( reduce_all_dandi_raw_s3_logs, - reduce_dandi_raw_s3_log, ) from ._dandiset_mapper import map_reduced_logs_to_dandisets -from .testing import find_random_example_line @click.command(name="reduce_all_dandi_raw_s3_logs") @click.option( - "--base_raw_s3_logs_folder_path", - help="The path to the base folder containing all raw S3 log files.", + "--raw_s3_logs_folder_path", + help="The path to the folder containing all raw S3 log files.", required=True, type=click.Path(writable=False), ) @@ -56,7 +52,7 @@ default=None, ) def _reduce_all_dandi_raw_s3_logs_cli( - base_raw_s3_logs_folder_path: str, + raw_s3_logs_folder_path: str, reduced_s3_logs_folder_path: str, maximum_number_of_workers: int, maximum_buffer_size_in_mb: int, @@ -69,65 +65,14 @@ def _reduce_all_dandi_raw_s3_logs_cli( maximum_buffer_size_in_bytes = maximum_buffer_size_in_mb * 10**6 reduce_all_dandi_raw_s3_logs( - base_raw_s3_logs_folder_path=base_raw_s3_logs_folder_path, + raw_s3_logs_folder_path=raw_s3_logs_folder_path, reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, maximum_number_of_workers=maximum_number_of_workers, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, excluded_ips=handled_excluded_ips, ) - -@click.command(name="reduce_dandi_raw_s3_log") -@click.option( - "--raw_s3_log_file_path", - help="The path to the raw S3 log file to be reduced.", - required=True, - type=click.Path(writable=False), -) -@click.option( - "--reduced_s3_logs_folder_path", - help="The path to write each reduced S3 log file to. 
There will be one file per handled asset ID.", - required=True, - type=click.Path(writable=True), -) -@click.option( - "--maximum_buffer_size_in_mb", - help=( - "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the " - "source text files. " - "Actual total RAM usage will be higher due to overhead and caching. " - "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is " - "greater than one." - ), - required=False, - type=click.IntRange(min=1), # Bare minimum of 1 MB - default=1_000, # 1 GB recommended -) -@click.option( - "--excluded_ips", - help="A comma-separated list of IP addresses to exclude from reduction.", - required=False, - type=str, - default=None, -) -def _reduce_dandi_raw_s3_log_cli( - raw_s3_log_file_path: str, - reduced_s3_logs_folder_path: str, - excluded_ips: str | None, - maximum_buffer_size_in_mb: int, -) -> None: - split_excluded_ips = excluded_ips.split(",") if excluded_ips is not None else list() - handled_excluded_ips = collections.defaultdict(bool) if len(split_excluded_ips) != 0 else None - for excluded_ip in split_excluded_ips: - handled_excluded_ips[excluded_ip] = True - maximum_buffer_size_in_bytes = maximum_buffer_size_in_mb * 10**6 - - reduce_dandi_raw_s3_log( - raw_s3_log_file_path=raw_s3_log_file_path, - reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, - maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, - excluded_ips=handled_excluded_ips, - ) + return None @click.command(name="map_reduced_logs_to_dandisets") @@ -150,51 +95,4 @@ def _map_reduced_logs_to_dandisets_cli( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path ) - -@click.command(name="find_random_example_line") -@click.option( - "--raw_s3_log_folder_path", - help="The path to the folder containing the raw S3 log files.", - required=True, - type=click.Path(writable=False), -) -@click.option( - "--request_type", - help="The type of request to filter for.", - required=True, - type=click.Choice(REQUEST_TYPES), -) -@click.option( - "--maximum_lines_per_request_type", - help=( - "The maximum number of lines to randomly sample for each request type. " - "The default is 5. " - "These lines are always found chronologically from the start of the file." - ), - required=False, - type=click.IntRange(min=2), - default=100, -) -@click.option( - "--seed", - help="The seed to use for the random number generator. 
The default is 0.", - required=False, - type=click.IntRange(min=0), - default=0, -) -def _find_random_example_line_cli( - raw_s3_log_folder_path: str | pathlib.Path, - request_type: Literal[REQUEST_TYPES], - maximum_lines_per_request_type: int = 5, - seed: int = 0, -) -> None: - """Find a randomly chosen line from a folder of raw S3 log files to serve as an example for testing purposes.""" - example_line = find_random_example_line( - raw_s3_log_folder_path=raw_s3_log_folder_path, - request_type=request_type, - maximum_lines_per_request_type=maximum_lines_per_request_type, - seed=seed, - ) - print(example_line) - return None diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index d13a827..5d990cd 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -1,17 +1,14 @@ """Primary functions for parsing raw S3 log file for DANDI.""" import collections +import datetime import os -import pathlib import random -import shutil import traceback import uuid from collections.abc import Callable from concurrent.futures import ProcessPoolExecutor, as_completed -from typing import Literal -import pandas import tqdm from pydantic import DirectoryPath, Field, FilePath, validate_call @@ -22,10 +19,11 @@ @validate_call def reduce_all_dandi_raw_s3_logs( *, - base_raw_s3_logs_folder_path: DirectoryPath, + raw_s3_logs_folder_path: DirectoryPath, reduced_s3_logs_folder_path: DirectoryPath, maximum_number_of_workers: int = Field(ge=1, default=1), maximum_buffer_size_in_bytes: int = 4 * 10**9, + excluded_years: list[str] | None = None, excluded_ips: collections.defaultdict[str, bool] | None = None, ) -> None: """ @@ -41,10 +39,10 @@ def reduce_all_dandi_raw_s3_logs( Parameters ---------- - base_raw_s3_logs_folder_path : file path - The Path to the folder containing the raw S3 log files to be reduced. + raw_s3_logs_folder_path : file path + The path to the folder containing the raw S3 log files to be reduced. reduced_s3_logs_folder_path : file path - The Path to write each reduced S3 log file to. + The [ath to write each reduced S3 log file to. There will be one file per handled asset ID. maximum_number_of_workers : int, default: 1 The maximum number of workers to distribute tasks across. @@ -59,64 +57,79 @@ def reduce_all_dandi_raw_s3_logs( excluded_ips : collections.defaultdict(bool), optional A lookup table whose keys are IP addresses to exclude from reduction. 
""" + excluded_years = excluded_years or [] excluded_ips = excluded_ips or collections.defaultdict(bool) - asset_id_handler = _get_default_dandi_asset_id_handler() + object_key_handler = _get_default_dandi_object_key_handler() - daily_raw_s3_log_file_paths = [ - path for path in base_raw_s3_logs_folder_path.rglob(pattern="*.log") if path.stem.isdigit() + # Ensure all subfolders exist once at the start + years_to_reduce = set([str(year) for year in range(2019, int(datetime.datetime.now().strftime("%Y")))]) - set( + excluded_years + ) + for year in years_to_reduce: + reduced_year_path = reduced_s3_logs_folder_path / year + reduced_year_path.mkdir(exist_ok=True) + + for month in range(1, 13): + reduced_month_path = reduced_s3_logs_folder_path / str(month).zfill(2) + reduced_month_path.mkdir(exist_ok=True) + + relative_s3_log_file_paths = [ + raw_s3_log_file_path.relative_to(raw_s3_logs_folder_path) + for raw_s3_log_file_path in raw_s3_logs_folder_path.rglob(pattern="*.log") + if raw_s3_log_file_path.stem.isdigit() + ] + relative_s3_log_file_paths_to_reduce = [ + relative_s3_log_file_path + for relative_s3_log_file_path in relative_s3_log_file_paths + if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists() ] # The .rglob is not naturally sorted; shuffle for more uniform progress updates - random.shuffle(daily_raw_s3_log_file_paths) + random.shuffle(relative_s3_log_file_paths_to_reduce) + fields_to_reduce = ["object_key", "timestamp", "bytes_sent", "ip_address"] + object_key_parents_to_reduce = ["blobs", "zarr"] + line_buffer_tqdm_kwargs = dict(position=1, leave=False) # TODO: add better reporting units to all TQDMs (lines / s, files / s, etc.) if maximum_number_of_workers == 1: - for raw_s3_log_file_path in tqdm.tqdm( - iterable=daily_raw_s3_log_file_paths, + for relative_s3_log_file_path in tqdm.tqdm( + iterable=relative_s3_log_file_paths_to_reduce, + total=len(relative_s3_log_file_paths_to_reduce), desc="Parsing log files...", position=0, leave=True, smoothing=0, # Use true historical average, not moving average since shuffling makes it more uniform ): - reduce_dandi_raw_s3_log( + raw_s3_log_file_path = raw_s3_logs_folder_path / relative_s3_log_file_path + reduced_s3_log_file_path = reduced_s3_logs_folder_path / relative_s3_log_file_path + + reduce_raw_s3_log( raw_s3_log_file_path=raw_s3_log_file_path, - reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, - mode="a", + reduced_s3_log_file_path=reduced_s3_log_file_path, + fields_to_reduce=fields_to_reduce, + object_key_parents_to_reduce=object_key_parents_to_reduce, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, excluded_ips=excluded_ips, - asset_id_handler=asset_id_handler, - tqdm_kwargs=dict(position=1, leave=False), + object_key_handler=object_key_handler, + line_buffer_tqdm_kwargs=line_buffer_tqdm_kwargs, ) else: - # Create a fresh temporary directory in the home folder and then fresh subfolders for each worker - temporary_base_folder_path = reduced_s3_logs_folder_path / ".temp" - temporary_base_folder_path.mkdir(exist_ok=True) - - # Clean up any previous tasks that failed to clean themselves up - for previous_task_folder_path in temporary_base_folder_path.iterdir(): - shutil.rmtree(path=previous_task_folder_path, ignore_errors=True) - - task_id = str(uuid.uuid4())[:5] - temporary_folder_path = temporary_base_folder_path / task_id - temporary_folder_path.mkdir(exist_ok=True) - - per_worker_temporary_folder_paths = list() - for worker_index in range(maximum_number_of_workers): - 
per_worker_temporary_folder_path = temporary_folder_path / f"worker_{worker_index}" - per_worker_temporary_folder_path.mkdir(exist_ok=True) - per_worker_temporary_folder_paths.append(per_worker_temporary_folder_path) - maximum_buffer_size_in_bytes_per_worker = maximum_buffer_size_in_bytes // maximum_number_of_workers futures = [] with ProcessPoolExecutor(max_workers=maximum_number_of_workers) as executor: - for raw_s3_log_file_path in daily_raw_s3_log_file_paths: + for relative_s3_log_file_path in relative_s3_log_file_paths_to_reduce: + raw_s3_log_file_path = raw_s3_logs_folder_path / relative_s3_log_file_path + reduced_s3_log_file_path = reduced_s3_logs_folder_path / relative_s3_log_file_path + futures.append( executor.submit( _multi_worker_reduce_dandi_raw_s3_log, raw_s3_log_file_path=raw_s3_log_file_path, - temporary_folder_path=temporary_folder_path, + reduced_s3_log_file_path=reduced_s3_log_file_path, + fields_to_reduce=fields_to_reduce, + object_key_parents_to_reduce=object_key_parents_to_reduce, maximum_number_of_workers=maximum_number_of_workers, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes_per_worker, excluded_ips=excluded_ips, @@ -125,8 +138,8 @@ def reduce_all_dandi_raw_s3_logs( progress_bar_iterable = tqdm.tqdm( iterable=as_completed(futures), + total=len(futures), desc=f"Parsing log files using {maximum_number_of_workers} workers...", - total=len(daily_raw_s3_log_file_paths), position=0, leave=True, mininterval=3.0, @@ -135,62 +148,18 @@ def reduce_all_dandi_raw_s3_logs( for future in progress_bar_iterable: future.result() # This is the call that finally triggers the deployment to the workers - print("\n\nParallel parsing complete!\n\n") - - for worker_index, per_worker_temporary_folder_path in enumerate( - tqdm.tqdm( - iterable=per_worker_temporary_folder_paths, - desc="Merging results across workers...", - total=len(per_worker_temporary_folder_paths), - position=0, - leave=True, - mininterval=3.0, - ) - ): - per_worker_reduced_s3_log_file_paths = list(per_worker_temporary_folder_path.rglob("*.tsv")) - assert ( - len(per_worker_reduced_s3_log_file_paths) != 0 - ), f"No files found in {per_worker_temporary_folder_path}!" 
- - for per_worker_reduced_s3_log_file_path in tqdm.tqdm( - iterable=per_worker_reduced_s3_log_file_paths, - desc="Merging results per worker...", - total=len(per_worker_reduced_s3_log_file_paths), - position=1, - leave=False, - mininterval=3.0, - ): - merge_target_file_path = reduced_s3_logs_folder_path / per_worker_reduced_s3_log_file_path.relative_to( - per_worker_temporary_folder_path - ) - - parsed_s3_log = pandas.read_table(filepath_or_buffer=per_worker_reduced_s3_log_file_path, header=0) - - merge_target_file_path_exists = merge_target_file_path.exists() - if not merge_target_file_path_exists and not merge_target_file_path.parent.exists(): - merge_target_file_path.parent.mkdir(exist_ok=True, parents=True) - - header = False if merge_target_file_path_exists else True - parsed_s3_log.to_csv( - path_or_buf=merge_target_file_path, - mode="a", - sep="\t", - header=header, - index=False, - ) - - shutil.rmtree(path=temporary_base_folder_path) + return None # Function cannot be covered because the line calls occur on subprocesses # pragma: no cover def _multi_worker_reduce_dandi_raw_s3_log( *, - raw_s3_log_file_path: pathlib.Path, - temporary_folder_path: pathlib.Path, + raw_s3_log_file_path: FilePath, + reduced_s3_log_file_path: FilePath, maximum_number_of_workers: int, maximum_buffer_size_in_bytes: int, - excluded_ips: collections.defaultdict[str, bool] | None, + excluded_ips: collections.defaultdict[str, bool], ) -> None: """ A mostly pass-through function to calculate the worker index on the worker and target the correct subfolder. @@ -199,21 +168,24 @@ def _multi_worker_reduce_dandi_raw_s3_log( to a log file. """ try: - asset_id_handler = _get_default_dandi_asset_id_handler() - worker_index = os.getpid() % maximum_number_of_workers - per_worker_temporary_folder_path = temporary_folder_path / f"worker_{worker_index}" - reduce_dandi_raw_s3_log( + fields_to_reduce = ["object_key", "timestamp", "bytes_sent", "ip_address"] + object_key_parents_to_reduce = ["blobs", "zarr"] + object_key_handler = _get_default_dandi_object_key_handler() + line_buffer_tqdm_kwargs = dict( + position=worker_index + 1, leave=False, desc=f"Parsing line buffers on worker {worker_index + 1}..." + ) + + reduce_raw_s3_log( raw_s3_log_file_path=raw_s3_log_file_path, - reduced_s3_logs_folder_path=per_worker_temporary_folder_path, - mode="a", + reduced_s3_log_file_path=reduced_s3_log_file_path, + fields_to_reduce=fields_to_reduce, + object_key_parents_to_reduce=object_key_parents_to_reduce, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, excluded_ips=excluded_ips, - asset_id_handler=asset_id_handler, - tqdm_kwargs=dict( - position=worker_index + 1, leave=False, desc=f"Parsing line buffers on worker {worker_index+1}..." - ), + object_key_handler=object_key_handler, + line_buffer_tqdm_kwargs=line_buffer_tqdm_kwargs, ) except Exception as exception: message = ( @@ -224,86 +196,18 @@ def _multi_worker_reduce_dandi_raw_s3_log( task_id = str(uuid.uuid4())[:5] _collect_error(message=message, error_type="parallel", task_id=task_id) - -@validate_call -def reduce_dandi_raw_s3_log( - *, - raw_s3_log_file_path: FilePath, - reduced_s3_logs_folder_path: DirectoryPath, - mode: Literal["w", "a"] = "a", - maximum_buffer_size_in_bytes: int = 4 * 10**9, - excluded_ips: collections.defaultdict[str, bool] | None = None, - asset_id_handler: Callable | None = None, - tqdm_kwargs: dict | None = None, -) -> None: - """ - Reduce a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. 
- - 'Parsing' here means: - - limiting only to requests of the specified type (i.e., GET, PUT, etc.) - - reducing the information to the asset ID, request time, request size, and geographic IP of the requester - - Parameters - ---------- - raw_s3_log_file_path : string or pathlib.Path - Path to the raw S3 log file to be reduced. - reduced_s3_logs_folder_path : string or pathlib.Path - The path to write each reduced S3 log file to. - There will be one file per handled asset ID. - mode : "w" or "a", default: "a" - How to resolve the case when files already exist in the folder containing parsed logs. - "w" will overwrite existing content, "a" will append or create if the file does not yet exist. - - The intention of the default usage is to have one consolidated raw S3 log file per day and then to iterate - over each day, parsing and binning by asset, effectively 'updating' the parsed collection on each iteration. - maximum_buffer_size_in_bytes : int, default: 4 GB - The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the - source text file. - - Actual RAM usage will be higher due to overhead and caching. - excluded_ips : collections.defaultdict(bool), optional - A lookup table whose keys are IP addresses to exclude from reduction. - asset_id_handler : callable, optional - If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to - translate into nested directory paths) then define a function of the following form: - - # For example - def asset_id_handler(*, raw_asset_id: str) -> str: - split_by_slash = raw_asset_id.split("/") - return split_by_slash[0] + "_" + split_by_slash[-1] - tqdm_kwargs : dict, optional - Keyword arguments to pass to the tqdm progress bar for line buffers. 
- """ - raw_s3_log_file_path = pathlib.Path(raw_s3_log_file_path) - reduced_s3_logs_folder_path = pathlib.Path(reduced_s3_logs_folder_path) - asset_id_handler = asset_id_handler or _get_default_dandi_asset_id_handler() - tqdm_kwargs = tqdm_kwargs or dict() - - bucket = "dandiarchive" - operation_type = "REST.GET.OBJECT" - - reduce_raw_s3_log( - raw_s3_log_file_path=raw_s3_log_file_path, - reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, - mode=mode, - maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, - bucket=bucket, - operation_type=operation_type, - excluded_ips=excluded_ips, - asset_id_handler=asset_id_handler, - tqdm_kwargs=tqdm_kwargs, - ) + return None -def _get_default_dandi_asset_id_handler() -> Callable: - def asset_id_handler(*, raw_asset_id: str) -> str: - split_by_slash = raw_asset_id.split("/") +def _get_default_dandi_object_key_handler() -> Callable: + def object_key_handler(*, object_key: str) -> str: + split_by_slash = object_key.split("/") - asset_type = split_by_slash[0] - if asset_type == "zarr": + object_type = split_by_slash[0] + if object_type == "zarr": zarr_blob_form = "/".join(split_by_slash[:2]) return zarr_blob_form - return raw_asset_id + return object_key - return asset_id_handler + return object_key_handler diff --git a/src/dandi_s3_log_parser/_globals.py b/src/dandi_s3_log_parser/_globals.py index ea29373..92ee4c9 100644 --- a/src/dandi_s3_log_parser/_globals.py +++ b/src/dandi_s3_log_parser/_globals.py @@ -73,7 +73,7 @@ for request_type in _KNOWN_OPERATION_TYPES: _IS_OPERATION_TYPE_KNOWN[request_type] = True -_FULL_PATTERN_TO_FIELD_MAPPING = [ +_S3_LOG_FIELDS = ( "bucket_owner", "bucket", "timestamp", @@ -81,10 +81,10 @@ "requester", "request_id", "operation", - "asset_id", + "object_key", "request_uri", # "http_version", # Regex not splitting this from the request_uri... 
- "status_code", + "http_status_code", "error_code", "bytes_sent", "object_size", @@ -92,15 +92,16 @@ "turn_around_time", "referrer", "user_agent", - "version", + "version_id", "host_id", - "sigv", + "signature_version", "cipher_suite", - "auth_type", - "endpoint", + "authentication_type", + "host_header", "tls_version", "access_point_arn", -] -_FullLogLine = collections.namedtuple("FullLogLine", _FULL_PATTERN_TO_FIELD_MAPPING) + "acl_required", +) +_FullLogLine = collections.namedtuple("FullLogLine", _S3_LOG_FIELDS) _S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)') diff --git a/src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py b/src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py new file mode 100644 index 0000000..6bf33a0 --- /dev/null +++ b/src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py @@ -0,0 +1,268 @@ +# """Primary functions for parsing raw S3 log file for DANDI.""" +# +# import collections +# import datetime +# import pathlib +# import uuid +# from collections.abc import Callable +# from typing import Literal +# +# import pandas +# import tqdm +# from pydantic import DirectoryPath, FilePath, validate_call +# +# from ._buffered_text_reader import BufferedTextReader +# from ._error_collection import _collect_error +# from ._s3_log_line_parser import _KNOWN_OPERATION_TYPES, _append_reduced_log_line +# +# +# @validate_call +# def reduce_raw_s3_log( +# *, +# raw_s3_log_file_path: FilePath, +# reduced_s3_logs_folder_path: DirectoryPath, +# mode: Literal["w", "a"] = "a", +# maximum_buffer_size_in_bytes: int = 4 * 10**9, +# bucket: str | None = None, +# operation_type: Literal[_KNOWN_OPERATION_TYPES] = "REST.GET.OBJECT", +# excluded_ips: collections.defaultdict[str, bool] | None = None, +# asset_id_handler: Callable | None = None, +# tqdm_kwargs: dict | None = None, +# ) -> None: +# """ +# Reduce a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. +# +# 'Reduce' here means: +# - Filtering all lines only by the bucket specified. +# - Filtering all lines only by the type of operation specified (i.e., REST.GET.OBJECT, REST.PUT.OBJECT, etc.). +# - Filtering out any non-success status codes. +# - Filtering out any excluded IP addresses. +# - Extracting only the asset ID, request timestamp, request size, and IP address that sent the request. +# +# Parameters +# ---------- +# raw_s3_log_file_path : str or pathlib.Path +# The path to the raw S3 log file. +# reduced_s3_logs_folder_path : str or pathlib.Path +# The path to write each reduced S3 log file to. +# There will be one file per handled asset ID. +# mode : "w" or "a", default: "a" +# How to resolve the case when files already exist in the folder containing parsed logs. +# "w" will overwrite existing content, "a" will append or create if the file does not yet exist. +# +# The intention of the default usage is to have one consolidated raw S3 log file per day and then to iterate +# over each day, parsing and binning by asset, effectively 'updating' the parsed collection on each iteration. +# maximum_buffer_size_in_bytes : int, default: 4 GB +# The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the +# source text file. +# +# Actual RAM usage will be higher due to overhead and caching. +# bucket : str +# Only parse and return lines that match this bucket. +# operation_type : str, default: "REST.GET" +# The type of operation to filter for. 
+# excluded_ips : collections.defaultdict of strings to booleans, optional +# A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. +# asset_id_handler : callable, optional +# If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to +# translate into nested directory paths) then define a function of the following form: +# +# # For example +# def asset_id_handler(*, raw_asset_id: str) -> str: +# split_by_slash = raw_asset_id.split("/") +# return split_by_slash[0] + "_" + split_by_slash[-1] +# tqdm_kwargs : dict, optional +# Keyword arguments to pass to the tqdm progress bar for line buffers. +# """ +# reduced_s3_logs_folder_path.mkdir(exist_ok=True) +# bucket = bucket or "" +# excluded_ips = excluded_ips or collections.defaultdict(bool) +# asset_id_handler = asset_id_handler or (lambda asset_id: asset_id) +# tqdm_kwargs = tqdm_kwargs or dict() +# +# assert raw_s3_log_file_path.suffix == ".log", f"`{raw_s3_log_file_path=}` should end in '.log'!" +# +# reduced_and_binned_logs = _get_reduced_and_binned_log_lines( +# raw_s3_log_file_path=raw_s3_log_file_path, +# maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, +# bucket=bucket, +# operation_type=operation_type, +# excluded_ips=excluded_ips, +# asset_id_handler=asset_id_handler, +# tqdm_kwargs=tqdm_kwargs, +# ) +# +# for handled_asset_id, reduced_logs_per_handled_asset_id in reduced_and_binned_logs.items(): +# handled_asset_id_path = pathlib.Path(handled_asset_id) +# blob_id = handled_asset_id_path.stem +# reduced_s3_log_file_path = reduced_s3_logs_folder_path / handled_asset_id_path.parent / f"{blob_id}.tsv" +# +# reduced_log_file_exists = reduced_s3_log_file_path.exists() +# if not reduced_log_file_exists and not reduced_s3_log_file_path.parent.exists(): +# reduced_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) +# +# data_frame = pandas.DataFrame(data=reduced_logs_per_handled_asset_id) +# +# header = False if reduced_log_file_exists is True and mode == "a" else True +# data_frame.to_csv(path_or_buf=reduced_s3_log_file_path, mode=mode, sep="\t", header=header, index=False) +# +# +# def _get_reduced_and_binned_log_lines( +# *, +# raw_s3_log_file_path: pathlib.Path, +# maximum_buffer_size_in_bytes: int, +# bucket: str, +# operation_type: Literal[_KNOWN_OPERATION_TYPES], +# excluded_ips: collections.defaultdict[str, bool], +# asset_id_handler: Callable, +# tqdm_kwargs: dict, +# ) -> collections.defaultdict[str, dict[str, list[str | int]]]: +# """Reduce the full S3 log file to minimal content and bin by asset ID.""" +# tqdm_kwargs = tqdm_kwargs or dict() +# default_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False) +# resolved_tqdm_kwargs = dict(default_tqdm_kwargs) +# resolved_tqdm_kwargs.update(tqdm_kwargs) +# +# task_id = str(uuid.uuid4())[:5] +# +# reduced_and_binned_logs = collections.defaultdict(list) +# buffered_text_reader = BufferedTextReader( +# file_path=raw_s3_log_file_path, +# maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, +# ) +# progress_bar_iterator = tqdm.tqdm( +# iterable=buffered_text_reader, +# total=len(buffered_text_reader), +# **resolved_tqdm_kwargs, +# ) +# per_buffer_index = 0 +# for buffered_raw_lines in progress_bar_iterator: +# index = 0 +# for raw_line in buffered_raw_lines: +# line_index = per_buffer_index + index +# +# _append_reduced_log_line( +# raw_line=raw_line, +# reduced_and_binned_logs=reduced_and_binned_logs, +# bucket=bucket, +# operation_type=operation_type, +# 
excluded_ips=excluded_ips, +# asset_id_handler=asset_id_handler, +# log_file_path=raw_s3_log_file_path, +# line_index=line_index, +# task_id=task_id, +# ) +# index += 1 +# per_buffer_index += index +# +# return reduced_and_binned_logs +# +# +# def _append_reduced_log_line( +# *, +# raw_line: str, +# reduced_and_binned_logs: collections.defaultdict[str, dict[str, list[str | int]]], +# operation_type: Literal[_KNOWN_OPERATION_TYPES], +# excluded_ips: collections.defaultdict[str, bool], +# object_key_handler: Callable, +# line_index: int, +# log_file_path: pathlib.Path, +# task_id: str, +# ) -> None: +# """ +# Append the `reduced_and_binned_logs` map with information extracted from a single raw log line, if it is valid. +# +# Parameters +# ---------- +# raw_line : string +# A single line from the raw S3 log file. +# reduced_and_binned_logs : collections.defaultdict +# A map of reduced log line content binned by handled asset ID. +# object_key_handler : callable, optional +# If your object keys in the raw log require custom handling (i.e., they contain slashes that you do not wish to +# translate into nested directory paths) then define a function of the following form. +# +# For example: +# +# ```python +# def asset_id_handler(*, raw_asset_id: str) -> str: +# split_by_slash = raw_asset_id.split("/") +# +# asset_type = split_by_slash[0] +# if asset_type == "zarr": +# zarr_blob_form = "/".join(split_by_slash[:2]) +# return zarr_blob_form +# +# return raw_asset_id +# ``` +# operation_type : string +# The type of operation to filter for. +# excluded_ips : collections.defaultdict of strings to booleans +# A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. +# line_index: int +# The index of the line in the raw log file. +# log_file_path: pathlib.Path +# The path to the log file being parsed; attached for error collection purposes. +# task_id: str +# A unique task ID to ensure that error collection files are unique when parallelizing to avoid race conditions. +# """ +# parsed_log_line = _parse_s3_log_line(raw_line=raw_line) +# +# full_log_line = _get_full_log_line( +# parsed_log_line=parsed_log_line, +# log_file_path=log_file_path, +# line_index=line_index, +# raw_line=raw_line, +# task_id=task_id, +# ) +# +# if full_log_line is None: +# return None +# +# # Apply some minimal validation and contribute any invalidations to error collection +# # These might slow parsing down a bit, but could be important to ensuring accuracy +# if not full_log_line.status_code.isdigit(): +# message = f"Unexpected status code: '{full_log_line.status_code}' on line {line_index} of file {log_file_path} +# _collect_error(message=message, error_type="line", task_id=task_id) +# +# return None +# +# if _IS_OPERATION_TYPE_KNOWN[full_log_line.operation] is False: +# message = ( +# f"Unexpected request type: '{full_log_line.operation}' on line {line_index} of file {log_file_path}.\n\n" +# ) +# _collect_error(message=message, error_type="line", task_id=task_id) +# +# return None +# +# timezone = full_log_line.timestamp[-5:] +# is_timezone_utc = timezone != "+0000" +# if is_timezone_utc: +# message = f"Unexpected time shift attached to log! 
Have always seen '+0000', found `{timezone=}`.\n\n" +# _collect_error(message=message, error_type="line", task_id=task_id) +# # Fine to proceed; just wanted to be made aware if there is ever a difference so can try to investigate why +# +# # More early skip conditions after validation +# # Only accept 200-block status codes +# if full_log_line.status_code[0] != "2": +# return None +# +# if full_log_line.operation != operation_type: +# return None +# +# if excluded_ips[full_log_line.ip_address] is True: +# return None +# +# # All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID +# handled_object_key = object_key_handler(raw_asset_id=full_log_line.asset_id) +# handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S") +# handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 +# +# reduced_and_binned_logs[handled_object_key] = reduced_and_binned_logs.get( +# handled_object_key, +# collections.defaultdict(list), +# ) +# reduced_and_binned_logs[handled_object_key]["timestamp"].append(handled_timestamp) +# reduced_and_binned_logs[handled_object_key]["bytes_sent"].append(handled_bytes_sent) +# reduced_and_binned_logs[handled_object_key]["ip_address"].append(full_log_line.ip_address) +# reduced_and_binned_logs[handled_object_key]["line_index"].append(line_index) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 93d55f2..5216ca5 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -1,37 +1,38 @@ """Primary functions for parsing raw S3 log file for DANDI.""" import collections -import pathlib +import datetime +import traceback import uuid from collections.abc import Callable from typing import Literal -import pandas import tqdm -from pydantic import DirectoryPath, FilePath, validate_call +from pydantic import FilePath, validate_call from ._buffered_text_reader import BufferedTextReader -from ._s3_log_line_parser import _KNOWN_OPERATION_TYPES, _append_reduced_log_line +from ._error_collection import _collect_error +from ._globals import _IS_OPERATION_TYPE_KNOWN, _KNOWN_OPERATION_TYPES, _S3_LOG_FIELDS +from ._s3_log_line_parser import _get_full_log_line, _parse_s3_log_line @validate_call def reduce_raw_s3_log( *, raw_s3_log_file_path: FilePath, - reduced_s3_logs_folder_path: DirectoryPath, - mode: Literal["w", "a"] = "a", + reduced_s3_log_file_path: FilePath, + fields_to_reduce: list[Literal[_S3_LOG_FIELDS]] | None = None, + object_key_parents_to_reduce: list[str] | None = None, maximum_buffer_size_in_bytes: int = 4 * 10**9, - bucket: str | None = None, operation_type: Literal[_KNOWN_OPERATION_TYPES] = "REST.GET.OBJECT", excluded_ips: collections.defaultdict[str, bool] | None = None, - asset_id_handler: Callable | None = None, - tqdm_kwargs: dict | None = None, + object_key_handler: Callable | None = None, + line_buffer_tqdm_kwargs: dict | None = None, ) -> None: """ - Reduce a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. + Reduce a raw S3 log file to only the requested fields. 'Reduce' here means: - - Filtering all lines only by the bucket specified. - Filtering all lines only by the type of operation specified (i.e., REST.GET.OBJECT, REST.PUT.OBJECT, etc.). - Filtering out any non-success status codes. - Filtering out any excluded IP addresses. 
@@ -39,118 +40,226 @@ def reduce_raw_s3_log( Parameters ---------- - raw_s3_log_file_path : str or pathlib.Path + raw_s3_log_file_path : file path The path to the raw S3 log file. - reduced_s3_logs_folder_path : str or pathlib.Path + reduced_s3_log_file_path : file path The path to write each reduced S3 log file to. - There will be one file per handled asset ID. - mode : "w" or "a", default: "a" - How to resolve the case when files already exist in the folder containing parsed logs. - "w" will overwrite existing content, "a" will append or create if the file does not yet exist. - - The intention of the default usage is to have one consolidated raw S3 log file per day and then to iterate - over each day, parsing and binning by asset, effectively 'updating' the parsed collection on each iteration. + fields_to_reduce : list of S3 log fields, optional + The S3 log fields to reduce the raw log file to. + Defaults to ["object_key", "timestamp", "bytes_sent", "ip_address"]. + object_key_parents_to_reduce : list of strings, optional + The parent directories of the object key to reduce the raw log file to. maximum_buffer_size_in_bytes : int, default: 4 GB The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the source text file. Actual RAM usage will be higher due to overhead and caching. - bucket : str - Only parse and return lines that match this bucket. operation_type : str, default: "REST.GET" The type of operation to filter for. excluded_ips : collections.defaultdict of strings to booleans, optional A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. - asset_id_handler : callable, optional - If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to - translate into nested directory paths) then define a function of the following form: - - # For example - def asset_id_handler(*, raw_asset_id: str) -> str: - split_by_slash = raw_asset_id.split("/") - return split_by_slash[0] + "_" + split_by_slash[-1] - tqdm_kwargs : dict, optional + object_key_handler : callable, optional + If your object keys in the raw log require custom handling (i.e., they contain slashes that you do not wish to + translate into nested directory paths) then define and pass a function that takes the `object_key` as a string + and returns the corrected form. + + For example: + + ```python + def object_key_handler(*, object_key: str) -> str: + split_by_slash = object_key.split("/") + + object_type = split_by_slash[0] + if object_type == "zarr": + zarr_blob_form = "/".join(split_by_slash[:2]) + return zarr_blob_form + + return object_key + ``` + line_buffer_tqdm_kwargs : dict, optional Keyword arguments to pass to the tqdm progress bar for line buffers. 
""" - reduced_s3_logs_folder_path.mkdir(exist_ok=True) - bucket = bucket or "" + fields_to_reduce = fields_to_reduce or ["object_key", "timestamp", "bytes_sent", "ip_address"] + object_key_parents_to_reduce = object_key_parents_to_reduce or [] # ["blobs", "zarr"] # TODO: move to DANDI side excluded_ips = excluded_ips or collections.defaultdict(bool) - asset_id_handler = asset_id_handler or (lambda asset_id: asset_id) - tqdm_kwargs = tqdm_kwargs or dict() + object_key_handler = object_key_handler or (lambda object_key: object_key) + line_buffer_tqdm_kwargs = line_buffer_tqdm_kwargs or dict() + + default_tqdm_kwargs = {"desc": "Parsing line buffers...", "leave": False} + resolved_tqdm_kwargs = {**default_tqdm_kwargs} + resolved_tqdm_kwargs.update(line_buffer_tqdm_kwargs) assert raw_s3_log_file_path.suffix == ".log", f"`{raw_s3_log_file_path=}` should end in '.log'!" - reduced_and_binned_logs = _get_reduced_and_binned_log_lines( - raw_s3_log_file_path=raw_s3_log_file_path, + if set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"}: + raise NotImplementedError("This function is not yet generalized for custom field reduction.") + + buffered_text_reader = BufferedTextReader( + file_path=raw_s3_log_file_path, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, - bucket=bucket, - operation_type=operation_type, - excluded_ips=excluded_ips, - asset_id_handler=asset_id_handler, - tqdm_kwargs=tqdm_kwargs, ) + progress_bar_iterator = tqdm.tqdm( + iterable=buffered_text_reader, + total=len(buffered_text_reader), + **resolved_tqdm_kwargs, + ) + + task_id = str(uuid.uuid4())[:5] + fast_fields_case = set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"} and set( + object_key_parents_to_reduce + ) == { + "blobs", + "zarr", + } # Admittedly, this is particular to DANDI + if fast_fields_case is True: + reduced_s3_log_lines = [ + reduced_s3_log_line + for raw_s3_log_lines_buffer in progress_bar_iterator + for raw_s3_log_line in raw_s3_log_lines_buffer + if ( + reduced_s3_log_line := _fast_dandi_reduce_raw_s3_log_line( + raw_s3_log_line=raw_s3_log_line, + operation_type=operation_type, + excluded_ips=excluded_ips, + task_id=task_id, + ) + ) + is not None + ] + else: + reduced_s3_log_lines = [ + reduced_s3_log_line + for raw_s3_log_lines_buffer in progress_bar_iterator + for raw_s3_log_line in raw_s3_log_lines_buffer + if (reduced_s3_log_line := _reduce_raw_s3_log_line(raw_s3_log_line=raw_s3_log_line, task_id=task_id)) + is not None + ] + + with open(file=reduced_s3_log_file_path, mode="w") as io: + io.writelines(reduced_s3_log_lines) + + +def _fast_dandi_reduce_raw_s3_log_line( + *, + raw_s3_log_line: str, + operation_type: str, # Should be the literal of types, but simplifying for speed here + excluded_ips: collections.defaultdict[str, bool], + task_id: str, +) -> str | None: + """ + A faster version of the parsing that makes restrictive but relatively safe assumptions about the line format. + + We trust here that various fields will exist at precise and regular positions in the string split by spaces. 
+ """ + try: + split_by_space = raw_s3_log_line.split(" ") + + ip_address = split_by_space[4] + if excluded_ips[ip_address] is True: + return None + + line_operation_type = split_by_space[7] + if line_operation_type != operation_type: + return None - for handled_asset_id, reduced_logs_per_handled_asset_id in reduced_and_binned_logs.items(): - handled_asset_id_path = pathlib.Path(handled_asset_id) - blob_id = handled_asset_id_path.stem - reduced_s3_log_file_path = reduced_s3_logs_folder_path / handled_asset_id_path.parent / f"{blob_id}.tsv" + full_object_key = split_by_space[8] + full_object_key_split_by_slash = full_object_key.split("/") + object_key_parent = full_object_key_split_by_slash[0] + match object_key_parent: + case "blobs": + object_key = full_object_key + case "zarr": + object_key = "/".join(full_object_key_split_by_slash[:2]) + case _: + return None - reduced_log_file_exists = reduced_s3_log_file_path.exists() - if not reduced_log_file_exists and not reduced_s3_log_file_path.parent.exists(): - reduced_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) + first_post_quote_block = raw_s3_log_line.split('" ')[1].split(" ") + http_status_code = first_post_quote_block[0] + bytes_sent = first_post_quote_block[2] + if len(first_post_quote_block) != 7 or not http_status_code.isdigit() or not bytes_sent.isdigit(): + return _reduce_raw_s3_log_line(raw_s3_log_line=raw_s3_log_line, task_id=task_id) + elif http_status_code[0] != "2": + return None - data_frame = pandas.DataFrame(data=reduced_logs_per_handled_asset_id) + # Forget about timezone for fast case + timestamp = datetime.datetime.strptime("".join(split_by_space[2:3]), "[%d/%b/%Y:%H:%M:%S").isoformat() - header = False if reduced_log_file_exists is True and mode == "a" else True - data_frame.to_csv(path_or_buf=reduced_s3_log_file_path, mode=mode, sep="\t", header=header, index=False) + reduced_s3_log_line = f"{timestamp}\t{ip_address}\t{object_key}\t{bytes_sent}\n" + return reduced_s3_log_line + except Exception: + message = f"Error during fast reduction of line '{raw_s3_log_line}'" + _collect_error(message=message, error_type="fast_line_reduction", task_id=task_id) -def _get_reduced_and_binned_log_lines( + +def _reduce_raw_s3_log_line( *, - raw_s3_log_file_path: pathlib.Path, - maximum_buffer_size_in_bytes: int, - bucket: str, - operation_type: Literal[_KNOWN_OPERATION_TYPES], + raw_s3_log_line: str, + operation_type: str, excluded_ips: collections.defaultdict[str, bool], - asset_id_handler: Callable, - tqdm_kwargs: dict, -) -> collections.defaultdict[str, dict[str, list[str | int]]]: - """Reduce the full S3 log file to minimal content and bin by asset ID.""" - tqdm_kwargs = tqdm_kwargs or dict() - default_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False) - resolved_tqdm_kwargs = dict(default_tqdm_kwargs) - resolved_tqdm_kwargs.update(tqdm_kwargs) + object_key_handler: Callable, + task_id: str, +) -> str | None: + try: + parsed_s3_log_line = _parse_s3_log_line(raw_s3_log_line=raw_s3_log_line) + full_log_line = _get_full_log_line(parsed_s3_log_line=parsed_s3_log_line) + except Exception as exception: + message = ( + f"Error parsing line: {raw_s3_log_line}\n\n" + f"{type(exception)}: str{exception}\n\n" + f"{traceback.format_exc()}", + ) + _collect_error(message=message, error_type="line_reduction", task_id=task_id) - task_id = str(uuid.uuid4())[:5] + return None - reduced_and_binned_logs = collections.defaultdict(list) - buffered_text_reader = BufferedTextReader( - file_path=raw_s3_log_file_path, - 
 
-def _get_reduced_and_binned_log_lines(
+def _reduce_raw_s3_log_line(
     *,
-    raw_s3_log_file_path: pathlib.Path,
-    maximum_buffer_size_in_bytes: int,
-    bucket: str,
-    operation_type: Literal[_KNOWN_OPERATION_TYPES],
+    raw_s3_log_line: str,
+    operation_type: str,
     excluded_ips: collections.defaultdict[str, bool],
-    asset_id_handler: Callable,
-    tqdm_kwargs: dict,
-) -> collections.defaultdict[str, dict[str, list[str | int]]]:
-    """Reduce the full S3 log file to minimal content and bin by asset ID."""
-    tqdm_kwargs = tqdm_kwargs or dict()
-    default_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False)
-    resolved_tqdm_kwargs = dict(default_tqdm_kwargs)
-    resolved_tqdm_kwargs.update(tqdm_kwargs)
+    object_key_handler: Callable,
+    task_id: str,
+) -> str | None:
+    try:
+        parsed_s3_log_line = _parse_s3_log_line(raw_s3_log_line=raw_s3_log_line)
+        full_log_line = _get_full_log_line(parsed_s3_log_line=parsed_s3_log_line)
+    except Exception as exception:
+        message = (
+            f"Error parsing line: {raw_s3_log_line}\n\n"
+            f"{type(exception)}: {exception}\n\n"
+            f"{traceback.format_exc()}"
+        )
+        _collect_error(message=message, error_type="line_reduction", task_id=task_id)
 
-    task_id = str(uuid.uuid4())[:5]
+        return None
 
-    reduced_and_binned_logs = collections.defaultdict(list)
-    buffered_text_reader = BufferedTextReader(
-        file_path=raw_s3_log_file_path,
-        maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
-    )
-    progress_bar_iterator = tqdm.tqdm(
-        iterable=buffered_text_reader,
-        total=len(buffered_text_reader),
-        **resolved_tqdm_kwargs,
+    # Deviant log entry; usually some very ill-formed content in the URI
+    # Dump information to a log file in the base folder for easy sharing
+    if full_log_line is None:
+        message = f"Error during parsing of line '{raw_s3_log_line}'"
+        _collect_error(message=message, error_type="line", task_id=task_id)
+
+        return None
+
+    # Apply some minimal validation and contribute any invalidations to error collection
+    # These might slow parsing down a bit, but could be important to ensuring accuracy
+    if not full_log_line.http_status_code.isdigit():
+        message = f"Unexpected status code: '{full_log_line.http_status_code}' parsed from line '{raw_s3_log_line}'."
+        _collect_error(message=message, error_type="line", task_id=task_id)
+
+        return None
+
+    if _IS_OPERATION_TYPE_KNOWN[full_log_line.operation] is False:
+        message = f"Unexpected request type: '{full_log_line.operation}' parsed from line '{raw_s3_log_line}'."
+        _collect_error(message=message, error_type="line", task_id=task_id)
+
+        return None
+
+    timezone = full_log_line.timestamp[-5:]
+    is_timezone_utc = timezone != "+0000"
+    if is_timezone_utc:
+        message = f"Unexpected time shift parsed from line '{raw_s3_log_line}'."
+        _collect_error(message=message, error_type="line", task_id=task_id)
+        # Fine to proceed; just wanted to be made aware if there is ever a difference so can try to investigate why
+
+    # More early skip conditions after validation
+    # Only accept 200-block status codes
+    if full_log_line.http_status_code[0] != "2":
+        return None
+
+    if full_log_line.operation != operation_type:
+        return None
+
+    if excluded_ips[full_log_line.ip_address] is True:
+        return None
+
+    # All early skip conditions done; the line is reduced to only the requested fields
+    handled_object_key = object_key_handler(object_key=full_log_line.object_key)
+    handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S").isoformat()
+    handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0
+
+    # TODO: generalize this
+    reduced_s3_log_line = (
+        f"{handled_timestamp}\t{full_log_line.ip_address}\t{handled_object_key}\t{handled_bytes_sent}\n"
     )
-    per_buffer_index = 0
-    for buffered_raw_lines in progress_bar_iterator:
-        index = 0
-        for raw_line in buffered_raw_lines:
-            line_index = per_buffer_index + index
-
-            _append_reduced_log_line(
-                raw_line=raw_line,
-                reduced_and_binned_logs=reduced_and_binned_logs,
-                bucket=bucket,
-                operation_type=operation_type,
-                excluded_ips=excluded_ips,
-                asset_id_handler=asset_id_handler,
-                log_file_path=raw_s3_log_file_path,
-                line_index=line_index,
-                task_id=task_id,
-            )
-            index += 1
-        per_buffer_index += index
 
-    return reduced_and_binned_logs
+    return reduced_s3_log_line
diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py
index 4250f72..fea3c4f 100644
--- a/src/dandi_s3_log_parser/_s3_log_line_parser.py
+++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py
@@ -11,138 +11,20 @@
 - Filtering out log lines from excluded IPs.
""" -import collections -import datetime -import pathlib -import traceback -from collections.abc import Callable -from typing import Literal - -from ._error_collection import _collect_error from ._globals import ( - _IS_OPERATION_TYPE_KNOWN, - _KNOWN_OPERATION_TYPES, _S3_LOG_REGEX, _FullLogLine, ) -def _append_reduced_log_line( - *, - raw_line: str, - reduced_and_binned_logs: collections.defaultdict[str, dict[str, list[str | int]]], - bucket: str, - operation_type: Literal[_KNOWN_OPERATION_TYPES], - excluded_ips: collections.defaultdict[str, bool], - asset_id_handler: Callable, - line_index: int, - log_file_path: pathlib.Path, - task_id: str, -) -> None: - """ - Append the `reduced_and_binned_logs` map with information extracted from a single raw log line, if it is valid. - - Parameters - ---------- - raw_line : string - A single line from the raw S3 log file. - reduced_and_binned_logs : collections.defaultdict - A map of reduced log line content binned by handled asset ID. - asset_id_handler : callable, optional - If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to - translate into nested directory paths) then define a function of the following form: - - # For example - def asset_id_handler(*, raw_asset_id: str) -> str: - split_by_slash = raw_asset_id.split("/") - return split_by_slash[0] + "_" + split_by_slash[-1] - bucket : string - Only parse and return lines that match this bucket string. - operation_type : string - The type of operation to filter for. - excluded_ips : collections.defaultdict of strings to booleans - A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. - line_index: int - The index of the line in the raw log file. - log_file_path: pathlib.Path - The path to the log file being parsed; attached for error collection purposes. - task_id: str - A unique task ID to ensure that error collection files are unique when parallelizing to avoid race conditions. - """ - parsed_log_line = _parse_s3_log_line(raw_line=raw_line) - - full_log_line = _get_full_log_line( - parsed_log_line=parsed_log_line, - log_file_path=log_file_path, - line_index=line_index, - raw_line=raw_line, - task_id=task_id, - ) - - if full_log_line is None: - return None - - # Various early skip conditions - if full_log_line.bucket != bucket: - return None - - # Apply some minimal validation and contribute any invalidations to error collection - # These might slow parsing down a bit, but could be important to ensuring accuracy - if not full_log_line.status_code.isdigit(): - message = f"Unexpected status code: '{full_log_line.status_code}' on line {line_index} of file {log_file_path}." - _collect_error(message=message, error_type="line", task_id=task_id) - - return None - - if _IS_OPERATION_TYPE_KNOWN[full_log_line.operation] is False: - message = ( - f"Unexpected request type: '{full_log_line.operation}' on line {line_index} of file {log_file_path}.\n\n" - ) - _collect_error(message=message, error_type="line", task_id=task_id) - - return None - - timezone = full_log_line.timestamp[-5:] - is_timezone_utc = timezone != "+0000" - if is_timezone_utc: - message = f"Unexpected time shift attached to log! 
Have always seen '+0000', found `{timezone=}`.\n\n" - _collect_error(message=message, error_type="line", task_id=task_id) - # Fine to proceed; just wanted to be made aware if there is ever a difference so can try to investigate why - - # More early skip conditions after validation - # Only accept 200-block status codes - if full_log_line.status_code[0] != "2": - return None - - if full_log_line.operation != operation_type: - return None - - if excluded_ips[full_log_line.ip_address] is True: - return None - - # All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID - handled_asset_id = asset_id_handler(raw_asset_id=full_log_line.asset_id) - handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S") - handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 - - reduced_and_binned_logs[handled_asset_id] = reduced_and_binned_logs.get( - handled_asset_id, - collections.defaultdict(list), - ) - reduced_and_binned_logs[handled_asset_id]["timestamp"].append(handled_timestamp) - reduced_and_binned_logs[handled_asset_id]["bytes_sent"].append(handled_bytes_sent) - reduced_and_binned_logs[handled_asset_id]["ip_address"].append(full_log_line.ip_address) - reduced_and_binned_logs[handled_asset_id]["line_index"].append(line_index) - - -def _parse_s3_log_line(*, raw_line: str) -> list[str]: +def _parse_s3_log_line(*, raw_s3_log_line: str) -> list[str]: """ The current method of parsing lines of an S3 log file. Bad lines reported in https://github.com/catalystneuro/dandi_s3_log_parser/issues/18 led to quote scrubbing as a pre-step. No self-contained single regex was found that could account for this uncorrected strings. """ - parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=raw_line)] + parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=raw_s3_log_line)] number_of_parsed_items = len(parsed_log_line) @@ -150,29 +32,23 @@ def _parse_s3_log_line(*, raw_line: str) -> list[str]: if number_of_parsed_items <= 26: return parsed_log_line - try: - potentially_cleaned_raw_line = _attempt_to_remove_quotes(raw_line=raw_line, bad_parsed_line=parsed_log_line) - parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)] - except Exception as exception: - message = ( - f"Error parsing line: {raw_line}\n\n" f"{type(exception)}: str{exception}\n\n" f"{traceback.format_exc()}", - ) - _collect_error(message=message, error_type="line_cleaning") - - raise exception + potentially_cleaned_raw_line = _attempt_to_remove_quotes( + raw_s3_log_line=raw_s3_log_line, bad_parsed_line=parsed_log_line + ) + parsed_log_line = [a or b or c for a, b, c in _S3_LOG_REGEX.findall(string=potentially_cleaned_raw_line)] return parsed_log_line -def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: +def _attempt_to_remove_quotes(*, raw_s3_log_line: str, bad_parsed_line: str) -> str: """ Attempt to remove bad quotes from a raw line of an S3 log file. These quotes are not properly escaped and are causing issues with the regex pattern. Various attempts to fix the regex failed, so this is the most reliable correction I could find. 
""" - starting_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring=' "') - ending_quotes_indices = _find_all_possible_substring_indices(string=raw_line, substring='" ') + starting_quotes_indices = _find_all_possible_substring_indices(string=raw_s3_log_line, substring=' "') + ending_quotes_indices = _find_all_possible_substring_indices(string=raw_s3_log_line, substring='" ') # If even further unexpected structure, just return the bad parsed line so that the error reporter can catch it if len(starting_quotes_indices) == 0: # pragma: no cover @@ -180,13 +56,13 @@ def _attempt_to_remove_quotes(*, raw_line: str, bad_parsed_line: str) -> str: if len(starting_quotes_indices) != len(ending_quotes_indices): # pragma: no cover return bad_parsed_line - cleaned_raw_line = raw_line[0 : starting_quotes_indices[0]] + cleaned_raw_s3_log_line = raw_s3_log_line[0 : starting_quotes_indices[0]] for counter in range(1, len(starting_quotes_indices) - 1): - next_block = raw_line[ending_quotes_indices[counter - 1] + 2 : starting_quotes_indices[counter]] - cleaned_raw_line += " - " + next_block - cleaned_raw_line += " - " + raw_line[ending_quotes_indices[-1] + 2 :] + next_block = raw_s3_log_line[ending_quotes_indices[counter - 1] + 2 : starting_quotes_indices[counter]] + cleaned_raw_s3_log_line += " - " + next_block + cleaned_raw_s3_log_line += " - " + raw_s3_log_line[ending_quotes_indices[-1] + 2 :] - return cleaned_raw_line + return cleaned_raw_s3_log_line def _find_all_possible_substring_indices(*, string: str, substring: str) -> list[int]: @@ -211,33 +87,22 @@ def _find_all_possible_substring_indices(*, string: str, substring: str) -> list def _get_full_log_line( *, - parsed_log_line: list[str], - log_file_path: pathlib.Path, - line_index: int, - raw_line: str, - task_id: str, -) -> _FullLogLine | None: - """Construct a FullLogLine from a single parsed log line, or dump to error collection file and return None.""" - full_log_line = None - - number_of_parsed_items = len(parsed_log_line) + parsed_s3_log_line: list[str], +) -> _FullLogLine: + number_of_parsed_items = len(parsed_s3_log_line) match number_of_parsed_items: - # ARN not detected + # Seen in a few good lines; don't know why some fields are not detected case 24: - parsed_log_line.append("-") - full_log_line = _FullLogLine(*parsed_log_line) - # Expected length for good lines + parsed_s3_log_line.append("-") + parsed_s3_log_line.append("-") + return _FullLogLine(*parsed_s3_log_line) + # Expected length for most good lines, don't know why they don't include the extra piece on the end case 25: - full_log_line = _FullLogLine(*parsed_log_line) - # Happens for certain types of HEAD requests; not sure what the extra element is + parsed_s3_log_line.append("-") + return _FullLogLine(*parsed_s3_log_line) case 26: - full_log_line = _FullLogLine(*parsed_log_line[:25]) - - # Deviant log entry; usually some very ill-formed content in the URI - # Dump information to a log file in the base folder for easy sharing - if full_log_line is None: # pragma: no cover - # TODO: automatically attempt to anonymize any detectable IP address in the raw line by replacing with 192.0.2.0 - message = f"Line {line_index} of {log_file_path} (parsed {number_of_parsed_items} items): {raw_line}" - _collect_error(message=message, error_type="line", task_id=task_id) - - return full_log_line + return _FullLogLine(*parsed_s3_log_line) + case _: + raise ValueError( + f"Unexpected number of parsed items: {number_of_parsed_items}. 
Parsed line: {parsed_s3_log_line}" + ) From f5076e7d6760b819409d473d9dc67059a750c00e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:18:40 -0400 Subject: [PATCH 02/55] refactor for simplicity and resumability --- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 5d990cd..bca7f14 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -42,7 +42,7 @@ def reduce_all_dandi_raw_s3_logs( raw_s3_logs_folder_path : file path The path to the folder containing the raw S3 log files to be reduced. reduced_s3_logs_folder_path : file path - The [ath to write each reduced S3 log file to. + The path to write each reduced S3 log file to. There will be one file per handled asset ID. maximum_number_of_workers : int, default: 1 The maximum number of workers to distribute tasks across. From 6a4853bae3764ead38f939ae9a10aa4b4e5d2eb6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:20:52 -0400 Subject: [PATCH 03/55] refactor for simplicity and resumability --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 5216ca5..5592801 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -105,12 +105,12 @@ def object_key_handler(*, object_key: str) -> str: ) task_id = str(uuid.uuid4())[:5] - fast_fields_case = set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"} and set( - object_key_parents_to_reduce - ) == { + fast_fields_to_reduce = set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"} + fast_object_key_parents_to_reduce = set(object_key_parents_to_reduce) == { "blobs", "zarr", } # Admittedly, this is particular to DANDI + fast_fields_case = fast_fields_to_reduce and fast_object_key_parents_to_reduce if fast_fields_case is True: reduced_s3_log_lines = [ reduced_s3_log_line From fb08027e29a319b1043906e706ac7bc37a78f791 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:21:28 -0400 Subject: [PATCH 04/55] refactor for simplicity and resumability --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 5592801..b65959e 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -105,11 +105,10 @@ def object_key_handler(*, object_key: str) -> str: ) task_id = str(uuid.uuid4())[:5] + + # Admittedly, this is particular to DANDI fast_fields_to_reduce = set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"} - fast_object_key_parents_to_reduce = set(object_key_parents_to_reduce) == { - "blobs", - "zarr", - } # Admittedly, this is particular to DANDI + fast_object_key_parents_to_reduce = set(object_key_parents_to_reduce) == {"blobs", "zarr"} fast_fields_case = fast_fields_to_reduce and fast_object_key_parents_to_reduce if fast_fields_case is True: reduced_s3_log_lines = [ From caf030c099cb1e9640431888083f79f0db42be1a Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 
20 Aug 2024 17:36:17 -0400 Subject: [PATCH 05/55] fixes; adapting tests --- src/dandi_s3_log_parser/__init__.py | 3 +- .../0.log | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 + .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 + .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 + tests/examples/binned_example_1/0.log | 2 + tests/examples/binned_example_1/1.log | 2 + .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 3 + .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 3 + tests/examples/binned_example_2/0.log | 4 ++ .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 + .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 + .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 + .../expected_output/2020/01/01.log | 3 + .../reduced_example_0/raw_logs/2020/01/01.log | 3 + tests/test_bin_all_dandi_raw_s3_logs.py | 58 ++++++++++++++++++ tests/test_bin_dandi_raw_s3_log.py | 60 +++++++++++++++++++ 17 files changed, 151 insertions(+), 2 deletions(-) rename tests/examples/{reduced_example_0 => binned_example_0}/0.log (100%) create mode 100644 tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv create mode 100644 tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv create mode 100644 tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv create mode 100644 tests/examples/binned_example_1/0.log create mode 100644 tests/examples/binned_example_1/1.log create mode 100644 tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv create mode 100644 tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv create mode 100644 tests/examples/binned_example_2/0.log create mode 100644 tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv create mode 100644 tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv create mode 100644 tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv create mode 100644 tests/examples/reduced_example_0/expected_output/2020/01/01.log create mode 100644 tests/examples/reduced_example_0/raw_logs/2020/01/01.log create mode 100644 tests/test_bin_all_dandi_raw_s3_logs.py create mode 100644 tests/test_bin_dandi_raw_s3_log.py diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py index 1167436..a6484dd 100644 --- a/src/dandi_s3_log_parser/__init__.py +++ b/src/dandi_s3_log_parser/__init__.py @@ -20,7 +20,7 @@ from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._s3_log_file_reducer import reduce_raw_s3_log from ._buffered_text_reader import BufferedTextReader -from ._dandi_s3_log_file_reducer import reduce_dandi_raw_s3_log, reduce_all_dandi_raw_s3_logs +from ._dandi_s3_log_file_reducer import reduce_all_dandi_raw_s3_logs from ._ip_utils import get_region_from_ip_address from ._dandiset_mapper import map_reduced_logs_to_dandisets @@ -28,7 +28,6 @@ "DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH", "reduce_raw_s3_log", "BufferedTextReader", - "reduce_dandi_raw_s3_log", "reduce_all_dandi_raw_s3_logs", "get_region_from_ip_address", "map_reduced_logs_to_dandisets", diff --git a/tests/examples/reduced_example_0/0.log b/tests/examples/binned_example_0/0.log similarity index 100% rename from tests/examples/reduced_example_0/0.log rename to tests/examples/binned_example_0/0.log diff --git 
a/tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..1cc8230 --- /dev/null +++ b/tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..5b97e6c --- /dev/null +++ b/tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv new file mode 100644 index 0000000..e25b6a1 --- /dev/null +++ b/tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2023-01-01 22:42:58 1526223 192.0.2.0 2 diff --git a/tests/examples/binned_example_1/0.log b/tests/examples/binned_example_1/0.log new file mode 100644 index 0000000..9e9f1ac --- /dev/null +++ b/tests/examples/binned_example_1/0.log @@ -0,0 +1,2 @@ +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [24/Apr/2021:12:03:05 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [16/Mar/2022:02:21:12 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/examples/binned_example_1/1.log b/tests/examples/binned_example_1/1.log new file mode 100644 index 0000000..43bacc6 --- /dev/null +++ b/tests/examples/binned_example_1/1.log @@ -0,0 +1,2 @@ +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT 
blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..570c480 --- /dev/null +++ b/tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,3 @@ +timestamp bytes_sent ip_address line_index +2022-03-16 02:21:12 512 192.0.2.0 1 +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..0851ae8 --- /dev/null +++ b/tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,3 @@ +timestamp bytes_sent ip_address line_index +2021-04-24 12:03:05 1443 192.0.2.0 0 +2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/binned_example_2/0.log b/tests/examples/binned_example_2/0.log new file mode 100644 index 0000000..9205ad3 --- /dev/null +++ b/tests/examples/binned_example_2/0.log @@ -0,0 +1,4 @@ +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a 
HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv new file mode 100644 index 0000000..a55bf13 --- /dev/null +++ b/tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2023-06-26 03:05:53 6616308 192.0.2.0 3 diff --git a/tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..1cc8230 --- /dev/null +++ b/tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..5b97e6c --- /dev/null +++ b/tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address line_index +2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/reduced_example_0/expected_output/2020/01/01.log b/tests/examples/reduced_example_0/expected_output/2020/01/01.log new file mode 100644 index 0000000..6303c18 --- /dev/null +++ b/tests/examples/reduced_example_0/expected_output/2020/01/01.log @@ -0,0 +1,3 @@ +[01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +[01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - +[01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/examples/reduced_example_0/raw_logs/2020/01/01.log b/tests/examples/reduced_example_0/raw_logs/2020/01/01.log new file mode 
100644 index 0000000..4a80253 --- /dev/null +++ b/tests/examples/reduced_example_0/raw_logs/2020/01/01.log @@ -0,0 +1,3 @@ +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_bin_all_dandi_raw_s3_logs.py b/tests/test_bin_all_dandi_raw_s3_logs.py new file mode 100644 index 0000000..32757fd --- /dev/null +++ b/tests/test_bin_all_dandi_raw_s3_logs.py @@ -0,0 +1,58 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: + """Basic test for parsing of all DANDI raw S3 logs in a directory.""" + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "reduced_example_1" + expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" + + test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1" + test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) + + dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( + base_raw_s3_logs_folder_path=examples_folder_path, + reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, + ) + test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 2 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] + for test_parsed_s3_log_file_path in test_output_file_paths: + assert ( + test_parsed_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
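+        # Each expected TSV is sharded by object key as blobs/<id[:3]>/<id[3:6]>/<id>.tsv,
+        # so the expected path is rebuilt from the file stem below; both frames are then
+        # sorted by timestamp and re-indexed so row order cannot affect assert_frame_equal.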
+ + test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) + + blob_id = test_parsed_s3_log_file_path.stem + expected_parsed_s3_log_file_path = ( + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" + ) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) + + test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") + expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") + + test_parsed_s3_log.index = range(len(test_parsed_s3_log)) + expected_parsed_s3_log.index = range(len(expected_parsed_s3_log)) + + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) + + +# TODO: add CLI diff --git a/tests/test_bin_dandi_raw_s3_log.py b/tests/test_bin_dandi_raw_s3_log.py new file mode 100644 index 0000000..484f83c --- /dev/null +++ b/tests/test_bin_dandi_raw_s3_log.py @@ -0,0 +1,60 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None: + """ + Most basic test of functionality. + + If there are failures in the parsing of any lines found in application, + please raise an issue and contribute them to the example log collection. + """ + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + examples_folder_path = file_parent / "examples" / "reduced_example_0" + example_raw_s3_log_file_path = examples_folder_path / "0.log" + expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" + + test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0" + test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) + + dandi_s3_log_parser.reduce_dandi_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, + reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, + ) + test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) + + number_of_output_files = len(test_output_file_paths) + assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" + + # Increment this over time as more examples are added + expected_number_of_output_files = 3 + assert ( + number_of_output_files == expected_number_of_output_files + ), f"The number of asset files ({number_of_output_files}) does not match expectation!" + + expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] + for test_reduced_s3_log_file_path in test_output_file_paths: + assert ( + test_reduced_s3_log_file_path.stem in expected_asset_ids + ), f"Asset ID {test_reduced_s3_log_file_path.stem} not found in expected asset IDs!" 
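+        # Zarr assets are stored flat as zarr/<zarr_id>.tsv, while blobs are sharded into
+        # blobs/<id[:3]>/<id[3:6]>/<id>.tsv, so the expected file path is reconstructed
+        # differently for each case below before the frame comparison.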
+ + is_asset_zarr = "zarr" in str(test_reduced_s3_log_file_path) + if is_asset_zarr: + blob_id = test_reduced_s3_log_file_path.stem + expected_parsed_s3_log_file_path = expected_reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" + else: + blob_id = test_reduced_s3_log_file_path.stem + expected_parsed_s3_log_file_path = ( + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" + ) + + test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) + expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) + + pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_parsed_s3_log) From b7a83f57ec9840c240a8e15118189d9a360dc8b6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:39:05 -0400 Subject: [PATCH 06/55] fix --- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index bca7f14..562d220 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -71,7 +71,7 @@ def reduce_all_dandi_raw_s3_logs( reduced_year_path.mkdir(exist_ok=True) for month in range(1, 13): - reduced_month_path = reduced_s3_logs_folder_path / str(month).zfill(2) + reduced_month_path = reduced_year_path / str(month).zfill(2) reduced_month_path.mkdir(exist_ok=True) relative_s3_log_file_paths = [ From 95e4bda0d3dc417b394c600b6b16eb5f4957acd2 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:41:25 -0400 Subject: [PATCH 07/55] fix --- src/dandi_s3_log_parser/_command_line_interface.py | 10 ++++++++++ src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 738b2c2..f01d4e5 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -44,6 +44,13 @@ type=click.IntRange(min=1), # Bare minimum of 1 MB default=1_000, # 1 GB recommended ) +@click.option( + "--excluded_years", + help="A comma-separated list of years to exclude from parsing.", + required=False, + type=str, + default=None, +) @click.option( "--excluded_ips", help="A comma-separated list of IP addresses to exclude from parsing.", @@ -56,8 +63,10 @@ def _reduce_all_dandi_raw_s3_logs_cli( reduced_s3_logs_folder_path: str, maximum_number_of_workers: int, maximum_buffer_size_in_mb: int, + excluded_years: str | None, excluded_ips: str | None, ) -> None: + split_excluded_years = excluded_years.split(",") if excluded_years is not None else list() split_excluded_ips = excluded_ips.split(",") if excluded_ips is not None else list() handled_excluded_ips = collections.defaultdict(bool) if len(split_excluded_ips) != 0 else None for excluded_ip in split_excluded_ips: @@ -69,6 +78,7 @@ def _reduce_all_dandi_raw_s3_logs_cli( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, maximum_number_of_workers=maximum_number_of_workers, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, + excluded_years=split_excluded_years, excluded_ips=handled_excluded_ips, ) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 562d220..3228c14 100644 --- 
a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -31,7 +31,7 @@ def reduce_all_dandi_raw_s3_logs( Assumes the following folder structure... - |- <base_raw_s3_logs_folder_path> + |- <raw_s3_logs_folder_path> |-- 2019 (year) |--- 01 (month) |---- 01.log (day) From fb48dbb8664723598a431f02f2fbe87d3869bad3 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:47:40 -0400 Subject: [PATCH 08/55] fix --- src/dandi_s3_log_parser/_command_line_interface.py | 4 ++-- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index f01d4e5..a5ae299 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -66,8 +66,8 @@ def _reduce_all_dandi_raw_s3_logs_cli( excluded_years: str | None, excluded_ips: str | None, ) -> None: - split_excluded_years = excluded_years.split(",") if excluded_years is not None else list() - split_excluded_ips = excluded_ips.split(",") if excluded_ips is not None else list() + split_excluded_years = excluded_years.split(",") if excluded_years is not None else [] + split_excluded_ips = excluded_ips.split(",") if excluded_ips is not None else [] handled_excluded_ips = collections.defaultdict(bool) if len(split_excluded_ips) != 0 else None for excluded_ip in split_excluded_ips: handled_excluded_ips[excluded_ip] = True diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index b65959e..8f2de0f 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -2,6 +2,7 @@ import collections import datetime +import pathlib import traceback import uuid from collections.abc import Callable @@ -20,7 +21,7 @@ def reduce_raw_s3_log( *, raw_s3_log_file_path: FilePath, - reduced_s3_log_file_path: FilePath, + reduced_s3_log_file_path: pathlib.Path, fields_to_reduce: list[Literal[_S3_LOG_FIELDS]] | None = None, object_key_parents_to_reduce: list[str] | None = None, maximum_buffer_size_in_bytes: int = 4 * 10**9, From 93bfd3f7916a96924428897d7a3a0ed5ec3359b6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:49:51 -0400 Subject: [PATCH 09/55] fix --- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 3228c14..0f13a4f 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -77,7 +77,7 @@ def reduce_all_dandi_raw_s3_logs( relative_s3_log_file_paths = [ raw_s3_log_file_path.relative_to(raw_s3_logs_folder_path) for raw_s3_log_file_path in raw_s3_logs_folder_path.rglob(pattern="*.log") - if raw_s3_log_file_path.stem.isdigit() + if raw_s3_log_file_path.stem.isdigit() and raw_s3_log_file_path.parent.parent.name in years_to_reduce ] relative_s3_log_file_paths_to_reduce = [ relative_s3_log_file_path From 661e1a776c1b112096afe2f502501d9ab7a699bd Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:50:51 -0400 Subject: [PATCH 10/55] fix --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py 
b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 8f2de0f..076f29d 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -92,7 +92,7 @@ def object_key_handler(*, object_key: str) -> str: assert raw_s3_log_file_path.suffix == ".log", f"`{raw_s3_log_file_path=}` should end in '.log'!" - if set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"}: + if set(fields_to_reduce) != {"object_key", "timestamp", "bytes_sent", "ip_address"}: raise NotImplementedError("This function is not yet generalized for custom field reduction.") buffered_text_reader = BufferedTextReader( From dc8c6d9b66162c3cace4e83a2d62e1e6cda06319 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 17:53:19 -0400 Subject: [PATCH 11/55] fix --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 076f29d..c3414c9 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -135,9 +135,14 @@ def object_key_handler(*, object_key: str) -> str: is not None ] + if len(reduced_s3_log_lines) == 0: + return None + with open(file=reduced_s3_log_file_path, mode="w") as io: io.writelines(reduced_s3_log_lines) + return None + def _fast_dandi_reduce_raw_s3_log_line( *, From 38ec7e6e382c7249a29878af295e7ff575545cff Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 18:05:27 -0400 Subject: [PATCH 12/55] fix --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index c3414c9..87a163b 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -192,8 +192,12 @@ def _fast_dandi_reduce_raw_s3_log_line( reduced_s3_log_line = f"{timestamp}\t{ip_address}\t{object_key}\t{bytes_sent}\n" return reduced_s3_log_line - except Exception: - message = f"Error during fast reduction of line '{raw_s3_log_line}'" + except Exception as exception: + message = ( + f"Error during fast reduction of line '{raw_s3_log_line}'\n" + f"{type(exception)}: {exception}\n" + f"{traceback.format_exc()}" + ) _collect_error(message=message, error_type="fast_line_reduction", task_id=task_id) return None @@ -212,9 +216,7 @@ def _reduce_raw_s3_log_line( full_log_line = _get_full_log_line(parsed_s3_log_line=parsed_s3_log_line) except Exception as exception: message = ( - f"Error parsing line: {raw_s3_log_line}\n\n" - f"{type(exception)}: str{exception}\n\n" - f"{traceback.format_exc()}", - ) + f"Error parsing line: {raw_s3_log_line}\n" f"{type(exception)}: {exception}\n" f"{traceback.format_exc()}" ) _collect_error(message=message, error_type="line_reduction", task_id=task_id) From 2e863b6b6eda7bc5b7d28c10461227bb70605282 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Tue, 20 Aug 2024 18:09:59 -0400 Subject: [PATCH 13/55] fix --- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py 
b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 0f13a4f..166c8e4 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -128,8 +128,6 @@ def reduce_all_dandi_raw_s3_logs( _multi_worker_reduce_dandi_raw_s3_log, raw_s3_log_file_path=raw_s3_log_file_path, reduced_s3_log_file_path=reduced_s3_log_file_path, - fields_to_reduce=fields_to_reduce, - object_key_parents_to_reduce=object_key_parents_to_reduce, maximum_number_of_workers=maximum_number_of_workers, maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes_per_worker, excluded_ips=excluded_ips, From 60b6ae5542affd392ce7d9ecc1b72ba7cc95eb4e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 00:04:20 -0400 Subject: [PATCH 14/55] adjusting tests --- .../_dandi_s3_log_file_reducer.py | 21 +++++- .../_s3_log_file_reducer.py | 20 ++++-- .../58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv | 3 + .../cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv | 3 + .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 +- .../expected_output/000003/0.210812.1448.tsv | 4 +- .../expected_output/000003/0.230629.1955.tsv | 4 +- .../expected_output/000003/draft.tsv | 4 +- .../expected_output/000013/0.220126.2143.tsv | 4 +- .../expected_output/000013/draft.tsv | 4 +- .../expected_output/000108/draft.tsv | 2 +- .../test_map_reduced_logs_to_all_dandisets.py | 2 +- .../examples/binned_example_0/0.log | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 0 .../examples/binned_example_1/0.log | 0 .../examples/binned_example_1/1.log | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../examples/binned_example_2/0.log | 0 .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../expected_output/2020/01/01.log | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 0 .../reduced_example_0/raw_logs/2020/01/01.log | 0 .../examples/reduced_example_1/0.log | 0 .../examples/reduced_example_1/1.log | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../examples/reduced_example_2/0.log | 0 .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 0 .../test_bin_all_dandi_raw_s3_logs.py | 0 .../test_bin_dandi_raw_s3_log.py | 0 .../test_buffered_text_reader.py | 0 .../test_reduce_all_dandi_raw_s3_logs.py | 0 ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 0 .../test_reduce_dandi_raw_s3_log_bad_lines.py | 0 .../test_reduce_raw_s3_log.py} | 2 +- .../expected_output/2020/01/01.tsv | 3 + .../reduced_example_0/raw_logs/2020/01/01.log | 3 + .../expected_output/2020/01/01.tsv | 3 + .../expected_output/2021/02/03.tsv | 3 + .../reduced_example_1/raw_logs/2020/01/01.log | 3 + .../reduced_example_1/raw_logs/2021/02/03.log | 3 + .../examples/reduced_example_2/0.log | 4 ++ .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 + .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 1 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 1 - .../test_reduce_all_dandi_raw_s3_logs.py | 53 +++++++++++++++ ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 59 ++++++++++++++++ .../test_reduce_dandi_raw_s3_log_bad_lines.py | 57 ++++++++++++++++ 
.../test_reduction/test_reduce_raw_s3_log.py | 68 +++++++++++++++++++ 58 files changed, 314 insertions(+), 24 deletions(-) create mode 100644 test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv create mode 100644 test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv rename {tests/examples/reduced_example_0/expected_output => test_live_services/examples/mapped_to_dandiset_example_0/binned_logs}/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv (51%) rename tests/{ => test_binning}/examples/binned_example_0/0.log (100%) rename tests/{ => test_binning}/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename {test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs => tests/test_binning/examples/binned_example_0/expected_output}/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_1/0.log (100%) rename tests/{ => test_binning}/examples/binned_example_1/1.log (100%) rename tests/{ => test_binning}/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_2/0.log (100%) rename tests/{ => test_binning}/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_0/expected_output/2020/01/01.log (100%) rename tests/{ => test_binning}/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename tests/{examples/binned_example_0 => test_binning/examples/reduced_example_0}/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_0/raw_logs/2020/01/01.log (100%) rename tests/{ => test_binning}/examples/reduced_example_1/0.log (100%) rename tests/{ => test_binning}/examples/reduced_example_1/1.log (100%) rename tests/{ => test_binning}/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_2/0.log (100%) rename tests/{ => test_binning}/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (100%) rename tests/{ => test_binning}/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (100%) rename tests/{ 
=> test_binning}/test_bin_all_dandi_raw_s3_logs.py (100%) rename tests/{ => test_binning}/test_bin_dandi_raw_s3_log.py (100%) rename tests/{ => test_binning}/test_buffered_text_reader.py (100%) rename tests/{ => test_binning}/test_reduce_all_dandi_raw_s3_logs.py (100%) rename tests/{ => test_binning}/test_reduce_all_dandi_raw_s3_logs_parallel.py (100%) rename tests/{ => test_binning}/test_reduce_dandi_raw_s3_log_bad_lines.py (100%) rename tests/{test_reduce_dandi_raw_s3_log.py => test_binning/test_reduce_raw_s3_log.py} (97%) create mode 100644 tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv create mode 100644 tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log create mode 100644 tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv create mode 100644 tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv create mode 100644 tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log create mode 100644 tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log create mode 100644 tests/test_reduction/examples/reduced_example_2/0.log create mode 100644 tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv rename test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv => tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv (68%) rename test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv => tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv (68%) create mode 100644 tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py create mode 100644 tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py create mode 100644 tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py create mode 100644 tests/test_reduction/test_reduce_raw_s3_log.py diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 166c8e4..759a731 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -4,6 +4,7 @@ import datetime import os import random +import shutil import traceback import uuid from collections.abc import Callable @@ -102,7 +103,9 @@ def reduce_all_dandi_raw_s3_logs( smoothing=0, # Use true historical average, not moving average since shuffling makes it more uniform ): raw_s3_log_file_path = raw_s3_logs_folder_path / relative_s3_log_file_path - reduced_s3_log_file_path = reduced_s3_logs_folder_path / relative_s3_log_file_path + reduced_s3_log_file_path = ( + reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv" + ) reduce_raw_s3_log( raw_s3_log_file_path=raw_s3_log_file_path, @@ -121,7 +124,11 @@ def reduce_all_dandi_raw_s3_logs( with ProcessPoolExecutor(max_workers=maximum_number_of_workers) as executor: for relative_s3_log_file_path in relative_s3_log_file_paths_to_reduce: raw_s3_log_file_path = raw_s3_logs_folder_path / relative_s3_log_file_path - reduced_s3_log_file_path = reduced_s3_logs_folder_path / relative_s3_log_file_path + reduced_s3_log_file_path = ( + reduced_s3_logs_folder_path + / relative_s3_log_file_path.parent + / 
f"{relative_s3_log_file_path.stem}.tsv" + ) futures.append( executor.submit( @@ -146,6 +153,16 @@ def reduce_all_dandi_raw_s3_logs( for future in progress_bar_iterable: future.result() # This is the call that finally triggers the deployment to the workers + # Final step: clean any empty directories + for year in years_to_reduce: + reduced_year_folder_path = reduced_s3_logs_folder_path / year + for month in range(1, 13): + reduced_month_folder_path = reduced_year_folder_path / str(month).zfill(2) + if not any(reduced_month_folder_path.iterdir()): + shutil.rmtree(path=reduced_month_folder_path, ignore_errors=True) + if not any(reduced_year_folder_path.iterdir()): + shutil.rmtree(path=reduced_year_folder_path, ignore_errors=True) + return None diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 87a163b..879671d 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -21,7 +21,7 @@ def reduce_raw_s3_log( *, raw_s3_log_file_path: FilePath, - reduced_s3_log_file_path: pathlib.Path, + reduced_s3_log_file_path: str | pathlib.Path, # Not a FilePath because we are creating it fields_to_reduce: list[Literal[_S3_LOG_FIELDS]] | None = None, object_key_parents_to_reduce: list[str] | None = None, maximum_buffer_size_in_bytes: int = 4 * 10**9, @@ -131,7 +131,15 @@ def object_key_handler(*, object_key: str) -> str: reduced_s3_log_line for raw_s3_log_lines_buffer in progress_bar_iterator for raw_s3_log_line in raw_s3_log_lines_buffer - if (reduced_s3_log_line := _reduce_raw_s3_log_line(raw_s3_log_line=raw_s3_log_line, task_id=task_id)) + if ( + reduced_s3_log_line := _reduce_raw_s3_log_line( + raw_s3_log_line=raw_s3_log_line, + operation_type=operation_type, + excluded_ips=excluded_ips, + object_key_handler=object_key_handler, + task_id=task_id, + ) + ) is not None ] @@ -231,8 +239,8 @@ def _reduce_raw_s3_log_line( # Apply some minimal validation and contribute any invalidations to error collection # These might slow parsing down a bit, but could be important to ensuring accuracy - if not full_log_line.status_code.isdigit(): - message = f"Unexpected status code: '{full_log_line.status_code}' parsed from line '{raw_s3_log_line}'." + if not full_log_line.http_status_code.isdigit(): + message = f"Unexpected status code: '{full_log_line.http_status_code}' parsed from line '{raw_s3_log_line}'." 
_collect_error(message=message, error_type="line", task_id=task_id) return None @@ -252,7 +260,7 @@ def _reduce_raw_s3_log_line( # More early skip conditions after validation # Only accept 200-block status codes - if full_log_line.status_code[0] != "2": + if full_log_line.http_status_code[0] != "2": return None if full_log_line.operation != operation_type: @@ -262,7 +270,7 @@ def _reduce_raw_s3_log_line( return None # All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID - handled_object_key = object_key_handler(raw_asset_id=full_log_line.asset_id) + handled_object_key = object_key_handler(object_key=full_log_line.object_key) handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S").isoformat() handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv new file mode 100644 index 0000000..b684d88 --- /dev/null +++ b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv @@ -0,0 +1,3 @@ +timestamp bytes_sent ip_address line_index +2022-03-16T02:21:12 512 192.0.2.0 1 +2022-05-04T05:06:35 512 192.0.2.0 1 diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv new file mode 100644 index 0000000..c3a1e3e --- /dev/null +++ b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv @@ -0,0 +1,3 @@ +timestamp bytes_sent ip_address line_index +2021-04-24T12:03:05 1443 192.0.2.0 0 +2021-12-31T23:06:42 1443 192.0.2.0 0 diff --git a/tests/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv similarity index 51% rename from tests/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv index e25b6a1..89c526f 100644 --- a/tests/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv +++ b/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv @@ -1,2 +1,2 @@ timestamp bytes_sent ip_address line_index -2023-01-01 22:42:58 1526223 192.0.2.0 2 +2023-01-01T22:42:58 1526223 192.0.2.0 2 diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv index 953e052..5b0b45b 100644 --- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv +++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv @@ -1,3 +1,3 @@ filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown -1 
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
+0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown
+1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv
index 953e052..5b0b45b 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv
+++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv
@@ -1,3 +1,3 @@
 filename timestamp bytes_sent region
-0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown
-1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
+0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown
+1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv
index 953e052..5b0b45b 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv
+++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv
@@ -1,3 +1,3 @@
 filename timestamp bytes_sent region
-0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown
-1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
+0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown
+1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv
index 919df08..427dfc3 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv
+++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv
@@ -1,3 +1,3 @@
 filename timestamp bytes_sent region
-0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24 12:03:05 1443 unknown
-1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31 23:06:42 1443 unknown
+0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown
+1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv
index 919df08..427dfc3 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv
+++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv
@@ -1,3 +1,3 @@
 filename timestamp bytes_sent region
-0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24 12:03:05 1443 unknown
-1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31 23:06:42 1443 unknown
+0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown
+1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv
index c733edc..af8e92c 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv
+++ b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv
@@ -1,2 +1,2 @@
 filename timestamp bytes_sent region
-0 sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 2023-01-01 22:42:58 1526223 unknown
+0 sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 2023-01-01T22:42:58 1526223 unknown
diff --git a/test_live_services/test_map_reduced_logs_to_all_dandisets.py b/test_live_services/test_map_reduced_logs_to_all_dandisets.py
index 5bbc7a2..ef4a531 100644
--- a/test_live_services/test_map_reduced_logs_to_all_dandisets.py
+++ b/test_live_services/test_map_reduced_logs_to_all_dandisets.py
@@ -11,7 +11,7 @@ def test_map_reduced_logs_to_dandisets(tmpdir: py.path.local):
     file_parent = pathlib.Path(__file__).parent
     examples_folder_path = file_parent / "examples" / "mapped_to_dandiset_example_0"
-    reduced_s3_logs_folder_path = examples_folder_path / "reduced_logs"
+    reduced_s3_logs_folder_path = examples_folder_path / "binned_logs"
     dandiset_logs_folder_path = tmpdir
 
     dandi_s3_log_parser.map_reduced_logs_to_dandisets(
diff --git a/tests/examples/binned_example_0/0.log b/tests/test_binning/examples/binned_example_0/0.log
similarity index 100%
rename from tests/examples/binned_example_0/0.log
rename to tests/test_binning/examples/binned_example_0/0.log
diff --git a/tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
similarity index 100%
rename from test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
rename to tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
diff --git a/tests/examples/binned_example_1/0.log b/tests/test_binning/examples/binned_example_1/0.log
similarity index 100%
rename from tests/examples/binned_example_1/0.log
rename to tests/test_binning/examples/binned_example_1/0.log
diff --git a/tests/examples/binned_example_1/1.log b/tests/test_binning/examples/binned_example_1/1.log
similarity index 100%
rename from tests/examples/binned_example_1/1.log
rename to tests/test_binning/examples/binned_example_1/1.log
diff --git a/tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/tests/examples/binned_example_2/0.log b/tests/test_binning/examples/binned_example_2/0.log
similarity index 100%
rename from tests/examples/binned_example_2/0.log
rename to tests/test_binning/examples/binned_example_2/0.log
diff --git a/tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
similarity index 100%
rename from tests/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
rename to tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
diff --git a/tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/tests/examples/reduced_example_0/expected_output/2020/01/01.log b/tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log
similarity index 100%
rename from tests/examples/reduced_example_0/expected_output/2020/01/01.log
rename to tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log
diff --git a/tests/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
similarity index 100%
rename from tests/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
rename to tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv
diff --git a/tests/examples/reduced_example_0/raw_logs/2020/01/01.log b/tests/test_binning/examples/reduced_example_0/raw_logs/2020/01/01.log
similarity index 100%
rename from tests/examples/reduced_example_0/raw_logs/2020/01/01.log
rename to tests/test_binning/examples/reduced_example_0/raw_logs/2020/01/01.log
diff --git a/tests/examples/reduced_example_1/0.log b/tests/test_binning/examples/reduced_example_1/0.log
similarity index 100%
rename from tests/examples/reduced_example_1/0.log
rename to tests/test_binning/examples/reduced_example_1/0.log
diff --git a/tests/examples/reduced_example_1/1.log b/tests/test_binning/examples/reduced_example_1/1.log
similarity index 100%
rename from tests/examples/reduced_example_1/1.log
rename to tests/test_binning/examples/reduced_example_1/1.log
diff --git a/tests/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/tests/examples/reduced_example_2/0.log b/tests/test_binning/examples/reduced_example_2/0.log
similarity index 100%
rename from tests/examples/reduced_example_2/0.log
rename to tests/test_binning/examples/reduced_example_2/0.log
diff --git a/tests/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
similarity index 100%
rename from tests/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
rename to tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
diff --git a/tests/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 100%
rename from tests/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
rename to tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
diff --git a/tests/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 100%
rename from tests/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
rename to tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
diff --git a/tests/test_bin_all_dandi_raw_s3_logs.py b/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py
similarity index 100%
rename from tests/test_bin_all_dandi_raw_s3_logs.py
rename to tests/test_binning/test_bin_all_dandi_raw_s3_logs.py
diff --git a/tests/test_bin_dandi_raw_s3_log.py b/tests/test_binning/test_bin_dandi_raw_s3_log.py
similarity index 100%
rename from tests/test_bin_dandi_raw_s3_log.py
rename to tests/test_binning/test_bin_dandi_raw_s3_log.py
diff --git a/tests/test_buffered_text_reader.py b/tests/test_binning/test_buffered_text_reader.py
similarity index 100%
rename from tests/test_buffered_text_reader.py
rename to tests/test_binning/test_buffered_text_reader.py
diff --git a/tests/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py
similarity index 100%
rename from tests/test_reduce_all_dandi_raw_s3_logs.py
rename to tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py
diff --git a/tests/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py
similarity index 100%
rename from tests/test_reduce_all_dandi_raw_s3_logs_parallel.py
rename to tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py
diff --git a/tests/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py
similarity index 100%
rename from tests/test_reduce_dandi_raw_s3_log_bad_lines.py
rename to tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py
diff --git a/tests/test_reduce_dandi_raw_s3_log.py b/tests/test_binning/test_reduce_raw_s3_log.py
similarity index 97%
rename from tests/test_reduce_dandi_raw_s3_log.py
rename to tests/test_binning/test_reduce_raw_s3_log.py
index 484f83c..131f2ea 100644
--- a/tests/test_reduce_dandi_raw_s3_log.py
+++ b/tests/test_binning/test_reduce_raw_s3_log.py
@@ -6,7 +6,7 @@
 import dandi_s3_log_parser
 
 
-def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None:
+def test_reduce_raw_s3_log_example_0(tmpdir: py.path.local) -> None:
     """
     Most basic test of functionality.
 
diff --git a/tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv b/tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv
new file mode 100644
index 0000000..de94244
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv
@@ -0,0 +1,3 @@
+2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
+2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223
+2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
diff --git a/tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log b/tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log
new file mode 100644
index 0000000..4a80253
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log
@@ -0,0 +1,3 @@
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
diff --git a/tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv b/tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv
new file mode 100644
index 0000000..de94244
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv
@@ -0,0 +1,3 @@
+2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
+2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223
+2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
diff --git a/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv b/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv
new file mode 100644
index 0000000..de94244
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv
@@ -0,0 +1,3 @@
+2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
+2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223
+2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
diff --git a/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log b/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log
new file mode 100644
index 0000000..4a80253
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log
@@ -0,0 +1,3 @@
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
diff --git a/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log b/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log
new file mode 100644
index 0000000..b4f4e72
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log
@@ -0,0 +1,3 @@
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
diff --git a/tests/test_reduction/examples/reduced_example_2/0.log b/tests/test_reduction/examples/reduced_example_2/0.log
new file mode 100644
index 0000000..9205ad3
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_2/0.log
@@ -0,0 +1,4 @@
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
new file mode 100644
index 0000000..a55bf13
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
@@ -0,0 +1,2 @@
+timestamp bytes_sent ip_address line_index
+2023-06-26 03:05:53 6616308 192.0.2.0 3
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
similarity index 68%
rename from test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv
rename to tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
index 570c480..1cc8230 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv
+++ b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
@@ -1,3 +1,2 @@
 timestamp bytes_sent ip_address line_index
-2022-03-16 02:21:12 512 192.0.2.0 1
 2022-05-04 05:06:35 512 192.0.2.0 1
diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
similarity index 68%
rename from test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv
rename to tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
index 0851ae8..5b97e6c 100644
--- a/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv
+++ b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
@@ -1,3 +1,2 @@
 timestamp bytes_sent ip_address line_index
-2021-04-24 12:03:05 1443 192.0.2.0 0
 2021-12-31 23:06:42 1443 192.0.2.0 0
diff --git a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py
new file mode 100644
index 0000000..9eb97fd
--- /dev/null
+++ b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py
@@ -0,0 +1,53 @@
+import pathlib
+
+import pandas
+import py
+
+import dandi_s3_log_parser
+
+
+def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None:
+    """Basic test for parsing of all DANDI raw S3 logs in a directory."""
+    tmpdir = pathlib.Path(tmpdir)
+
+    file_parent = pathlib.Path(__file__).parent
+    example_folder_path = file_parent / "examples" / "reduced_example_1"
+    example_raw_s3_logs_folder_path = example_folder_path / "raw_logs"
+
+    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1"
+    test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
+
+    expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output"
+
+    dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs(
+        raw_s3_logs_folder_path=example_raw_s3_logs_folder_path,
+        reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path,
+    )
+    test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv"))
+
+    test_number_of_output_files = len(test_output_file_paths)
+    expected_number_of_output_files = 2
+    assert (
+        test_number_of_output_files == expected_number_of_output_files
+    ), f"The number of asset files ({test_number_of_output_files}) does not match expectation!"
+
+    # First file
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
+
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)
+
+    # Second file
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2021" / "02" / "03.tsv"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2021" / "02" / "03.tsv"
+
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
+
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)
+
+
+# TODO: add CLI
diff --git a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py
new file mode 100644
index 0000000..2045792
--- /dev/null
+++ b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py
@@ -0,0 +1,59 @@
+import pathlib
+
+import pandas
+import py
+
+import dandi_s3_log_parser
+
+
+def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) -> None:
+    """Basic test for parsing of all DANDI raw S3 logs in a directory using multiple workers."""
+    tmpdir = pathlib.Path(tmpdir)
+
+    file_parent = pathlib.Path(__file__).parent
+    examples_folder_path = file_parent / "examples" / "reduced_example_1"
+    expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
+
+    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1"
+    test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
+
+    dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs(
+        base_raw_s3_logs_folder_path=examples_folder_path,
+        reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path,
+        maximum_number_of_workers=2,
+    )
+    test_output_file_paths = [path for path in test_reduced_s3_logs_folder_path.rglob("*.tsv")]
+
+    number_of_output_files = len(test_output_file_paths)
+    assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!"
+
+    # Increment this over time as more examples are added
+    expected_number_of_output_files = 2
+    assert (
+        number_of_output_files == expected_number_of_output_files
+    ), f"The number of asset files ({number_of_output_files}) does not match expectation!"
+
+    expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")]
+    for test_parsed_s3_log_file_path in test_output_file_paths:
+        assert (
+            test_parsed_s3_log_file_path.stem in expected_asset_ids
+        ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!"
+
+        test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path)
+
+        blob_id = test_parsed_s3_log_file_path.stem
+        expected_parsed_s3_log_file_path = (
+            expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv"
+        )
+        expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path)
+
+        test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
+        expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")
+
+        test_parsed_s3_log.index = range(len(test_parsed_s3_log))
+        expected_parsed_s3_log.index = range(len(expected_parsed_s3_log))
+
+        pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log)
+
+
+# TODO: add CLI
diff --git a/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py
new file mode 100644
index 0000000..f8c06d9
--- /dev/null
+++ b/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py
@@ -0,0 +1,57 @@
+import pathlib
+
+import pandas
+import py
+
+import dandi_s3_log_parser
+
+
+def test_reduce_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local) -> None:
+    """
+    'parsed_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time.
+    """
+    tmpdir = pathlib.Path(tmpdir)
+
+    # Count initial error folder contents
+    error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors"
+    error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list()
+    initial_number_of_error_folder_contents = len(error_folder_contents)
+
+    file_parent = pathlib.Path(__file__).parent
+    examples_folder_path = file_parent / "examples" / "reduced_example_2"
+    example_raw_s3_log_file_path = examples_folder_path / "0.log"
+    expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
+
+    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_2"
+    test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
+
+    dandi_s3_log_parser.reduce_dandi_raw_s3_log(
+        raw_s3_log_file_path=example_raw_s3_log_file_path,
+        reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path,
+    )
+    test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv"))
+
+    number_of_output_files = len(test_output_file_paths)
+    expected_number_of_output_files = 3
+    assert number_of_output_files == expected_number_of_output_files
+
+    expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")]
+    for test_parsed_s3_log_file_path in test_output_file_paths:
+        assert (
+            test_parsed_s3_log_file_path.stem in expected_asset_ids
+        ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!"
+
+        test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path)
+
+        blob_id = test_parsed_s3_log_file_path.stem
+        expected_parsed_s3_log_file_path = (
+            expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv"
+        )
+        expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path)
+
+        pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log)
+
+    post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list()
+    assert (
+        len(post_test_error_folder_contents) == initial_number_of_error_folder_contents
+    ), "Errors occurred during line parsing!"
diff --git a/tests/test_reduction/test_reduce_raw_s3_log.py b/tests/test_reduction/test_reduce_raw_s3_log.py
new file mode 100644
index 0000000..6a23851
--- /dev/null
+++ b/tests/test_reduction/test_reduce_raw_s3_log.py
@@ -0,0 +1,68 @@
+import pathlib
+
+import pandas
+import py
+
+import dandi_s3_log_parser
+
+
+def test_reduce_raw_s3_log_example_0_fast_case(tmpdir: py.path.local) -> None:
+    """
+    Most basic test of functionality.
+
+    If there are failures in the parsing of any lines found in application,
+    please raise an issue and contribute them to the example log collection.
+    """
+    tmpdir = pathlib.Path(tmpdir)
+
+    file_parent = pathlib.Path(__file__).parent
+    example_folder_path = file_parent / "examples" / "reduced_example_0"
+    example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2020" / "01" / "01.log"
+
+    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0_fast_case"
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+    test_reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+
+    dandi_s3_log_parser.reduce_raw_s3_log(
+        raw_s3_log_file_path=example_raw_s3_log_file_path,
+        reduced_s3_log_file_path=test_reduced_s3_log_file_path,
+        # The two specifications below trigger the 'fast' parsing
+        fields_to_reduce=["object_key", "timestamp", "bytes_sent", "ip_address"],
+        object_key_parents_to_reduce=["blobs", "zarr"],
+    )
+
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
+
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)
+
+
+def test_reduce_raw_s3_log_example_0_basic_case(tmpdir: py.path.local) -> None:
+    tmpdir = pathlib.Path(tmpdir)
+
+    file_parent = pathlib.Path(__file__).parent
+    example_folder_path = file_parent / "examples" / "reduced_example_0"
+    example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2020" / "01" / "01.log"
+
+    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0_basic_case"
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+    test_reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+
+    object_key_handler = dandi_s3_log_parser._dandi_s3_log_file_reducer._get_default_dandi_object_key_handler()
+    dandi_s3_log_parser.reduce_raw_s3_log(
+        raw_s3_log_file_path=example_raw_s3_log_file_path,
+        reduced_s3_log_file_path=test_reduced_s3_log_file_path,
+        fields_to_reduce=["object_key", "timestamp", "bytes_sent", "ip_address"],
+        object_key_handler=object_key_handler,
+    )
+
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
+
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)

From 46f33752daedaee6520b470310f05fc087971d6a Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 00:19:48 -0400
Subject: [PATCH 15/55] debug reduction tests

---
 .../expected_output/2021/02/03.tsv            |  6 +-
 .../reduced_example_1/raw_logs/2021/02/03.log |  6 +-
 .../expected_output/2022/04/06.tsv            |  3 +
 .../0801d996-200e-4173-ab49-d1784427e96a.tsv  |  2 -
 .../11ec8933-1456-4942-922b-94e5878bb991.tsv  |  2 -
 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv  |  2 -
 .../{0.log => raw_logs/2022/04/06.log}        |  8 +-
 ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 51 ++++++------
 .../test_reduce_dandi_raw_s3_log_bad_lines.py | 57 -------------
 .../test_reduce_raw_s3_log_bad_lines.py       | 82 +++++++++++++++++++
 10 files changed, 118 insertions(+), 101 deletions(-)
 create mode 100644 tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv
 delete mode 100644 tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
 delete mode 100644 tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
 delete mode 100644 tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
 rename tests/test_reduction/examples/reduced_example_2/{0.log => raw_logs/2022/04/06.log} (90%)
 delete mode 100644 tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py
 create mode 100644 tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py

diff --git a/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv b/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv
index de94244..ab1b8a1 100644
--- a/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv
+++ b/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv
@@ -1,3 +1,3 @@
-2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
-2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223
-2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
+2021-02-03T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
+2021-02-03T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223
+2021-02-03T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
diff --git a/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log b/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log
index b4f4e72..429e3b6 100644
--- a/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log
+++ b/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log
@@ -1,3 +1,3 @@
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2021:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2021:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [03/Feb/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv
new file mode 100644
index 0000000..1e829c5
--- /dev/null
+++ b/tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv
@@ -0,0 +1,3 @@
+2022-04-06T03:05:53 192.0.2.0 blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a 6616308
+2022-04-06T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512
+2022-04-06T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443
diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
deleted file mode 100644
index a55bf13..0000000
--- a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-timestamp bytes_sent ip_address line_index
-2023-06-26 03:05:53 6616308 192.0.2.0 3
diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
deleted file mode 100644
index 1cc8230..0000000
--- a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-timestamp bytes_sent ip_address line_index
-2022-05-04 05:06:35 512 192.0.2.0 1
diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
deleted file mode 100644
index 5b97e6c..0000000
--- a/tests/test_reduction/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv
+++ /dev/null
@@ -1,2 +0,0 @@
-timestamp bytes_sent ip_address line_index
-2021-12-31 23:06:42 1443 192.0.2.0 0
diff --git a/tests/test_reduction/examples/reduced_example_2/0.log b/tests/test_reduction/examples/reduced_example_2/raw_logs/2022/04/06.log
similarity index 90%
rename from tests/test_reduction/examples/reduced_example_2/0.log
rename to tests/test_reduction/examples/reduced_example_2/raw_logs/2022/04/06.log
index 9205ad3..cfd32b8 100644
--- a/tests/test_reduction/examples/reduced_example_2/0.log
+++ b/tests/test_reduction/examples/reduced_example_2/raw_logs/2022/04/06.log
@@ -1,4 +1,4 @@
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
-8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -
+8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 -
diff --git a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py
index 2045792..e2aeaa5 100644
--- a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py
+++ b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py
@@ -6,54 +6,49 @@
 import dandi_s3_log_parser
 
 
-def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) -> None:
-    """Basic test for parsing of all DANDI raw S3 logs in a directory using multiple workers."""
+def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None:
+    """Basic test for parsing of all DANDI raw S3 logs in a directory."""
     tmpdir = pathlib.Path(tmpdir)
 
     file_parent = pathlib.Path(__file__).parent
-    examples_folder_path = file_parent / "examples" / "reduced_example_1"
-    expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
+    example_folder_path = file_parent / "examples" / "reduced_example_1"
+    example_raw_s3_logs_folder_path = example_folder_path / "raw_logs"
 
     test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1"
     test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
 
+    expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output"
+
     dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs(
-        base_raw_s3_logs_folder_path=examples_folder_path,
+        raw_s3_logs_folder_path=example_raw_s3_logs_folder_path,
         reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path,
         maximum_number_of_workers=2,
     )
-    test_output_file_paths = [path for path in test_reduced_s3_logs_folder_path.rglob("*.tsv")]
-
-    number_of_output_files = len(test_output_file_paths)
-    assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!"
+    test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv"))
 
-    # Increment this over time as more examples are added
+    test_number_of_output_files = len(test_output_file_paths)
     expected_number_of_output_files = 2
     assert (
-        number_of_output_files == expected_number_of_output_files
-    ), f"The number of asset files ({number_of_output_files}) does not match expectation!"
+        test_number_of_output_files == expected_number_of_output_files
+    ), f"The number of asset files ({test_number_of_output_files}) does not match expectation!"
 
-    expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")]
-    for test_parsed_s3_log_file_path in test_output_file_paths:
-        assert (
-            test_parsed_s3_log_file_path.stem in expected_asset_ids
-        ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!"
+    # First file
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv"
 
-        test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path)
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
 
-        blob_id = test_parsed_s3_log_file_path.stem
-        expected_parsed_s3_log_file_path = (
-            expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv"
-        )
-        expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path)
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)
 
-        test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
-        expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")
+    # Second file
+    test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2021" / "02" / "03.tsv"
+    expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2021" / "02" / "03.tsv"
 
-        test_parsed_s3_log.index = range(len(test_parsed_s3_log))
-        expected_parsed_s3_log.index = range(len(expected_parsed_s3_log))
+    test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path)
+    expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path)
 
-        pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log)
+    pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log)
 
 
 # TODO: add CLI
diff --git a/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py
deleted file mode 100644
index f8c06d9..0000000
--- a/tests/test_reduction/test_reduce_dandi_raw_s3_log_bad_lines.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import pathlib
-
-import pandas
-import py
-
-import dandi_s3_log_parser
-
-
-def test_reduce_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local) -> None:
-    """
-    'parsed_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time.
-    """
-    tmpdir = pathlib.Path(tmpdir)
-
-    # Count initial error folder contents
-    error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors"
-    error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list()
-    initial_number_of_error_folder_contents = len(error_folder_contents)
-
-    file_parent = pathlib.Path(__file__).parent
-    examples_folder_path = file_parent / "examples" / "reduced_example_2"
-    example_raw_s3_log_file_path = examples_folder_path / "0.log"
-    expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
-
-    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_2"
-    test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
-
-    dandi_s3_log_parser.reduce_dandi_raw_s3_log(
-        raw_s3_log_file_path=example_raw_s3_log_file_path,
-        reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path,
-    )
-    test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv"))
-
-    number_of_output_files = len(test_output_file_paths)
-    expected_number_of_output_files = 3
-    assert number_of_output_files == expected_number_of_output_files
-
-    expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")]
-    for test_parsed_s3_log_file_path in test_output_file_paths:
-        assert (
-            test_parsed_s3_log_file_path.stem in expected_asset_ids
-        ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!"
-
-        test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path)
-
-        blob_id = test_parsed_s3_log_file_path.stem
-        expected_parsed_s3_log_file_path = (
-            expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv"
-        )
-        expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path)
-
-        pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log)
-
-    post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list()
-    assert (
-        len(post_test_error_folder_contents) == initial_number_of_error_folder_contents
-    ), "Errors occurred during line parsing!"
diff --git a/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py b/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py new file mode 100644 index 0000000..e13bb8d --- /dev/null +++ b/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py @@ -0,0 +1,82 @@ +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_reduce_raw_s3_log_example_bad_lines_fast_case(tmpdir: py.path.local) -> None: + tmpdir = pathlib.Path(tmpdir) + + # Count initial error folder contents + error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" + error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + initial_number_of_error_folder_contents = len(error_folder_contents) + + file_parent = pathlib.Path(__file__).parent + example_folder_path = file_parent / "examples" / "reduced_example_2" + example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2022" / "04" / "06.log" + + test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_bad_lines_fast_case" + test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2022" / "04" / "06.tsv" + test_reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True) + + expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output" + expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2022" / "04" / "06.tsv" + + dandi_s3_log_parser.reduce_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, + reduced_s3_log_file_path=test_reduced_s3_log_file_path, + # The two specifications below trigger the 'fast' parsing + fields_to_reduce=["object_key", "timestamp", "bytes_sent", "ip_address"], + object_key_parents_to_reduce=["blobs", "zarr"], + ) + + test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) + expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path) + + pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log) + + post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + assert ( + len(post_test_error_folder_contents) == initial_number_of_error_folder_contents + ), "Errors occurred during line parsing!" 
+ + +def test_reduce_raw_s3_log_example_bad_lines_basic_case(tmpdir: py.path.local) -> None: + tmpdir = pathlib.Path(tmpdir) + + # Count initial error folder contents + error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" + error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + initial_number_of_error_folder_contents = len(error_folder_contents) + + file_parent = pathlib.Path(__file__).parent + example_folder_path = file_parent / "examples" / "reduced_example_2" + example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2022" / "04" / "06.log" + + test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_bad_lines_basic_case" + test_reduced_s3_log_file_path = test_reduced_s3_logs_folder_path / "2022" / "04" / "06.tsv" + test_reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True) + + expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output" + expected_reduced_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2022" / "04" / "06.tsv" + + object_key_handler = dandi_s3_log_parser._dandi_s3_log_file_reducer._get_default_dandi_object_key_handler() + dandi_s3_log_parser.reduce_raw_s3_log( + raw_s3_log_file_path=example_raw_s3_log_file_path, + reduced_s3_log_file_path=test_reduced_s3_log_file_path, + fields_to_reduce=["object_key", "timestamp", "bytes_sent", "ip_address"], + object_key_handler=object_key_handler, + ) + + test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) + expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path) + + pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log) + + post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() + assert ( + len(post_test_error_folder_contents) == initial_number_of_error_folder_contents + ), "Errors occurred during line parsing!" 
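Both tests above compare reduced per-day TSV files directly with pandas. For quick manual inspection of one of these files, a sketch assuming the four-column order used throughout these examples (timestamp, ip_address, object_key, bytes_sent); at this point in the series the reduced files are written without a header row, so the names are supplied explicitly:

import pandas

# Assumed column order for the reduced per-day TSVs (a header row is only added in a later patch)
reduced_s3_log = pandas.read_table(
    filepath_or_buffer="2022/04/06.tsv",
    header=None,
    names=["timestamp", "ip_address", "object_key", "bytes_sent"],
)
total_bytes_sent = int(reduced_s3_log["bytes_sent"].sum())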
From 3bfba4843642e44fb4394141645e6bee9a00c877 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 01:23:29 -0400 Subject: [PATCH 16/55] get binning mostly there --- src/dandi_s3_log_parser/__init__.py | 2 + ....py => _bin_reduced_logs_by_object_key.py} | 69 +++++++++++++++++-- .../_dandi_s3_log_file_reducer.py | 2 +- .../_s3_log_file_reducer.py | 8 ++- .../_s3_log_line_parser.py | 13 +--- .../examples/binned_example_0/0.log | 3 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 - .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 + .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 + .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 + .../reduced_logs}/2020/01/01.tsv | 1 + .../expected_output/2020/01/01.log | 3 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 - .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 - .../examples/reduced_example_1/0.log | 2 - .../examples/reduced_example_1/1.log | 2 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 3 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 3 - .../examples/reduced_example_2/0.log | 4 -- .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 - .../test_bin_all_dandi_raw_s3_logs.py | 4 +- .../test_binning/test_bin_dandi_raw_s3_log.py | 4 +- .../test_bin_reduced_s3_logs_by_object_key.py | 27 ++++++++ .../test_reduce_all_dandi_raw_s3_logs.py | 4 +- ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 4 +- .../test_reduce_dandi_raw_s3_log_bad_lines.py | 4 +- tests/test_binning/test_reduce_raw_s3_log.py | 60 ---------------- .../test_buffered_text_reader.py | 0 .../reduced_example_1/raw_logs/2020/01/01.log | 3 - .../expected_output/2020/01/01.tsv | 1 + .../raw_logs/2020/01/01.log | 0 .../expected_output/2020/01/01.tsv | 4 ++ .../expected_output/2021/02/03.tsv | 1 + .../raw_logs/2020/01/01.log | 0 .../raw_logs/2021/02/03.log | 0 .../expected_output/2022/04/06.tsv | 1 + .../raw_logs/2022/04/06.log | 0 .../test_reduce_all_dandi_raw_s3_logs.py | 4 +- ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 4 +- .../test_reduction/test_reduce_raw_s3_log.py | 4 +- .../test_reduce_raw_s3_log_bad_lines.py | 4 +- 46 files changed, 131 insertions(+), 141 deletions(-) rename src/dandi_s3_log_parser/{_s3_log_file_bin_by_key.py => _bin_reduced_logs_by_object_key.py} (83%) delete mode 100644 tests/test_binning/examples/binned_example_0/0.log delete mode 100644 tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv create mode 100644 tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv create mode 100644 tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv create mode 100644 tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv rename tests/{test_reduction/examples/reduced_example_0/expected_output => test_binning/examples/binning_example_0/reduced_logs}/2020/01/01.tsv (85%) delete mode 100644 
tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log delete mode 100644 tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv delete mode 100644 tests/test_binning/examples/reduced_example_1/0.log delete mode 100644 tests/test_binning/examples/reduced_example_1/1.log delete mode 100644 tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_binning/examples/reduced_example_2/0.log delete mode 100644 tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv delete mode 100644 tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv create mode 100644 tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py delete mode 100644 tests/test_binning/test_reduce_raw_s3_log.py rename tests/{test_binning => }/test_buffered_text_reader.py (100%) delete mode 100644 tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log rename tests/test_reduction/examples/{reduced_example_1 => reduction_example_0}/expected_output/2020/01/01.tsv (85%) rename tests/{test_binning/examples/reduced_example_0 => test_reduction/examples/reduction_example_0}/raw_logs/2020/01/01.log (100%) create mode 100644 tests/test_reduction/examples/reduction_example_1/expected_output/2020/01/01.tsv rename tests/test_reduction/examples/{reduced_example_1 => reduction_example_1}/expected_output/2021/02/03.tsv (85%) rename tests/test_reduction/examples/{reduced_example_0 => reduction_example_1}/raw_logs/2020/01/01.log (100%) rename tests/test_reduction/examples/{reduced_example_1 => reduction_example_1}/raw_logs/2021/02/03.log (100%) rename tests/test_reduction/examples/{reduced_example_2 => reduction_example_2}/expected_output/2022/04/06.tsv (85%) rename tests/test_reduction/examples/{reduced_example_2 => reduction_example_2}/raw_logs/2022/04/06.log (100%) diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py index a6484dd..46fde63 100644 --- a/src/dandi_s3_log_parser/__init__.py +++ b/src/dandi_s3_log_parser/__init__.py @@ -23,6 +23,7 @@ from ._dandi_s3_log_file_reducer import reduce_all_dandi_raw_s3_logs from ._ip_utils import get_region_from_ip_address from ._dandiset_mapper import map_reduced_logs_to_dandisets +from ._bin_reduced_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key __all__ = [ "DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH", @@ -31,4 +32,5 @@ "reduce_all_dandi_raw_s3_logs", "get_region_from_ip_address", "map_reduced_logs_to_dandisets", + "bin_all_reduced_s3_logs_by_object_key", ] diff --git a/src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py b/src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py similarity index 83% rename from src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py rename to 
src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py
index 6bf33a0..388125f 100644
--- a/src/dandi_s3_log_parser/_s3_log_file_bin_by_key.py
+++ b/src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py
@@ -1,21 +1,76 @@
-# """Primary functions for parsing raw S3 log file for DANDI."""
-#
+"""Bin reduced logs by object key."""
+
 # import collections
 # import datetime
-# pathlib
+import pathlib
+
 # import uuid
 # from collections.abc import Callable
 # from typing import Literal
-#
-# import pandas
+import pandas
+
 # import tqdm
-# from pydantic import DirectoryPath, FilePath, validate_call
+from pydantic import DirectoryPath, validate_call
+
 #
 # from ._buffered_text_reader import BufferedTextReader
 # from ._error_collection import _collect_error
 # from ._s3_log_line_parser import _KNOWN_OPERATION_TYPES, _append_reduced_log_line
 #
-#
+
+
+@validate_call
+def bin_all_reduced_logs_by_object_key(
+    *,
+    reduced_s3_logs_folder_path: DirectoryPath,
+    binned_s3_logs_folder_path: DirectoryPath,
+) -> None:
+    """
+    Bin reduced S3 logs by object keys.
+
+    Parameters
+    ----------
+    reduced_s3_logs_folder_path : str or pathlib.Path
+        The path to the folder containing the reduced S3 log files.
+    binned_s3_logs_folder_path : str or pathlib.Path
+        The path to write each binned S3 log file to.
+    """
+    # TODO: add two status tracking YAML files
+    #  1) reduced_log_file_paths_started.yaml
+    #  2) reduced_log_file_paths_completed.yaml
+    #  Emit a warning at the start of this function if they disagree (indicating an error occurred, most likely during the I/O stage)
+    #  All reduced logs will likely need to be freshly re-binned in this case
+    #    (by manually removing or renaming the 'binned' target directory to start off empty)
+    #
+    #  But if all goes well, then use those file paths to skip over already completed binning
+    #
+    #  Note, though, that there is no guarantee that the binned contents are chronological, so maybe also
+    #  add a final step (with a flag to disable it) to re-write all binned logs in chronological order?
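+    #
+    #  A rough sketch of the status-tracking idea (every name below is a placeholder; none of this is implemented yet):
+    #      import warnings, yaml
+    #      started = set(yaml.safe_load(started_yaml_file_path.read_text()) or [])
+    #      completed = set(yaml.safe_load(completed_yaml_file_path.read_text()) or [])
+    #      if started != completed:
+    #          warnings.warn("Previous binning appears to have been interrupted - consider re-binning from an empty folder.")
+    #      for reduced_s3_log_file in sorted(reduced_s3_logs_folder_path.rglob("*.tsv")):
+    #          if str(reduced_s3_log_file) in completed:
+    #              continue  # skip files that were already binned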
+ # + # Thought: since we're doing this non-parallel, we could just iterate the reduced logs in chronological order + + reduced_s3_log_files = reduced_s3_logs_folder_path.rglob("*.tsv") + for reduced_s3_log_file in reduced_s3_log_files: + reduced_data_frame = pandas.read_csv(filepath_or_buffer=reduced_s3_log_file, sep="\t") + binned_data_frame = reduced_data_frame.groupby("object_key").agg( + { + "timestamp": list, + "bytes_sent": list, + "ip_address": list, + } + ) + + for _, row in binned_data_frame.iterrows(): + object_key_as_path = pathlib.Path(row.name) + object_key_as_path.parent.mkdir(parents=True, exist_ok=True) + binned_s3_log_file_path = ( + binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv" + ) + + header = False if binned_s3_log_file_path.exists() else True + row.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header) + + # @validate_call # def reduce_raw_s3_log( # *, diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 759a731..cf9fc88 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -1,4 +1,4 @@ -"""Primary functions for parsing raw S3 log file for DANDI.""" +"""Primary functions for reducing raw S3 log file for DANDI.""" import collections import datetime diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 879671d..1e0aee5 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -1,4 +1,4 @@ -"""Primary functions for parsing raw S3 log file for DANDI.""" +"""Primary functions for reducing raw S3 log files.""" import collections import datetime @@ -37,7 +37,8 @@ def reduce_raw_s3_log( - Filtering all lines only by the type of operation specified (i.e., REST.GET.OBJECT, REST.PUT.OBJECT, etc.). - Filtering out any non-success status codes. - Filtering out any excluded IP addresses. - - Extracting only the asset ID, request timestamp, request size, and IP address that sent the request. + - Extracting only the object key, request timestamp, request size, and IP address that sent the request. + - The object keys written to the reduced log file may also be adjusted according to the handler. Parameters ---------- @@ -146,7 +147,10 @@ def object_key_handler(*, object_key: str) -> str: if len(reduced_s3_log_lines) == 0: return None + # TODO: generalize header to rely on the selected fields and ensure order matches + header = "timestamp\tip_address\tobject_key\tbytes_sent\n" with open(file=reduced_s3_log_file_path, mode="w") as io: + io.write(header) io.writelines(reduced_s3_log_lines) return None diff --git a/src/dandi_s3_log_parser/_s3_log_line_parser.py b/src/dandi_s3_log_parser/_s3_log_line_parser.py index fea3c4f..0a2d905 100644 --- a/src/dandi_s3_log_parser/_s3_log_line_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_line_parser.py @@ -1,15 +1,4 @@ -""" -Primary functions for parsing a single line of a raw S3 log. - -The strategy is to... - -1) Parse the raw line into a list of strings using a combination of regex patterns and custom string manipulation. -2) Construct a FullLogLine object from the parsed line. A collections.namedtuple object is used for performance. -3) Reduce and map the information from the FullLogLine into a collections.defaultdict object. - Some of the mapping operations at this step include... 
- - Handling the timestamp in memory as a datetime.datetime object. - - Filtering out log lines from excluded IPs. -""" +"""Primary functions for parsing a single line of a raw S3 log.""" from ._globals import ( _S3_LOG_REGEX, diff --git a/tests/test_binning/examples/binned_example_0/0.log b/tests/test_binning/examples/binned_example_0/0.log deleted file mode 100644 index 41e10c5..0000000 --- a/tests/test_binning/examples/binned_example_0/0.log +++ /dev/null @@ -1,3 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2023:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 1cc8230..0000000 --- a/tests/test_binning/examples/binned_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 5b97e6c..0000000 --- a/tests/test_binning/examples/binned_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv deleted file mode 100644 index e25b6a1..0000000 --- a/tests/test_binning/examples/binned_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp 
bytes_sent ip_address line_index -2023-01-01 22:42:58 1526223 192.0.2.0 2 diff --git a/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv new file mode 100644 index 0000000..1c7ebb9 --- /dev/null +++ b/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address +2020-01-01 05:06:35 512 192.0.2.0 diff --git a/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv new file mode 100644 index 0000000..7415927 --- /dev/null +++ b/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address +2020-01-01 23:06:42 1443 192.0.2.0 diff --git a/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv new file mode 100644 index 0000000..f53bbd8 --- /dev/null +++ b/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address +2020-01-01 22:42:58 1526223 192.0.2.0 diff --git a/tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv b/tests/test_binning/examples/binning_example_0/reduced_logs/2020/01/01.tsv similarity index 85% rename from tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv rename to tests/test_binning/examples/binning_example_0/reduced_logs/2020/01/01.tsv index de94244..4d7ef29 100644 --- a/tests/test_reduction/examples/reduced_example_0/expected_output/2020/01/01.tsv +++ b/tests/test_binning/examples/binning_example_0/reduced_logs/2020/01/01.tsv @@ -1,3 +1,4 @@ +timestamp ip_address object_key bytes_sent 2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512 2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223 2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443 diff --git a/tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log b/tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log deleted file mode 100644 index 6303c18..0000000 --- a/tests/test_binning/examples/reduced_example_0/expected_output/2020/01/01.log +++ /dev/null @@ -1,3 +0,0 @@ -[01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -[01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 
U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - -[01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 1cc8230..0000000 --- a/tests/test_binning/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 5b97e6c..0000000 --- a/tests/test_binning/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv deleted file mode 100644 index e25b6a1..0000000 --- a/tests/test_binning/examples/reduced_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2023-01-01 22:42:58 1526223 192.0.2.0 2 diff --git a/tests/test_binning/examples/reduced_example_1/0.log b/tests/test_binning/examples/reduced_example_1/0.log deleted file mode 100644 index 9e9f1ac..0000000 --- a/tests/test_binning/examples/reduced_example_1/0.log +++ /dev/null @@ -1,2 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [24/Apr/2021:12:03:05 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [16/Mar/2022:02:21:12 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_binning/examples/reduced_example_1/1.log 
b/tests/test_binning/examples/reduced_example_1/1.log deleted file mode 100644 index 43bacc6..0000000 --- a/tests/test_binning/examples/reduced_example_1/1.log +++ /dev/null @@ -1,2 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 570c480..0000000 --- a/tests/test_binning/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,3 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-03-16 02:21:12 512 192.0.2.0 1 -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 0851ae8..0000000 --- a/tests/test_binning/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,3 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-04-24 12:03:05 1443 192.0.2.0 0 -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/examples/reduced_example_2/0.log b/tests/test_binning/examples/reduced_example_2/0.log deleted file mode 100644 index 9205ad3..0000000 --- a/tests/test_binning/examples/reduced_example_2/0.log +++ /dev/null @@ -1,4 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 
- dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv deleted file mode 100644 index a55bf13..0000000 --- a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2023-06-26 03:05:53 6616308 192.0.2.0 3 diff --git a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 1cc8230..0000000 --- a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 5b97e6c..0000000 --- a/tests/test_binning/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py b/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py index 32757fd..1473bba 100644 --- a/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py +++ b/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py @@ -11,10 +11,10 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduced_example_1" + examples_folder_path = file_parent / "examples" / "reduction_example_1" 
expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
 
-    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1"
+    test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1"
     test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
 
     dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs(
diff --git a/tests/test_binning/test_bin_dandi_raw_s3_log.py b/tests/test_binning/test_bin_dandi_raw_s3_log.py
index 484f83c..fdad307 100644
--- a/tests/test_binning/test_bin_dandi_raw_s3_log.py
+++ b/tests/test_binning/test_bin_dandi_raw_s3_log.py
@@ -16,11 +16,11 @@ def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None:
     tmpdir = pathlib.Path(tmpdir)
 
     file_parent = pathlib.Path(__file__).parent
-    examples_folder_path = file_parent / "examples" / "reduced_example_0"
+    examples_folder_path = file_parent / "examples" / "reduction_example_0"
     example_raw_s3_log_file_path = examples_folder_path / "0.log"
     expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
 
-    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0"
+    test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_0"
     test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
 
     dandi_s3_log_parser.reduce_dandi_raw_s3_log(
diff --git a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py
new file mode 100644
index 0000000..888a688
--- /dev/null
+++ b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py
@@ -0,0 +1,33 @@
+# import pathlib
+#
+# import pandas
+# import py
+#
+# import dandi_s3_log_parser
+
+
+# def test_bin_reduced_s3_logs_by_object_key_example_0(tmpdir: py.path.local) -> None:
+#     tmpdir = pathlib.Path(tmpdir)
+#
+#     file_parent = pathlib.Path(__file__).parent
+#     example_folder_path = file_parent / "examples" / "binning_example_0"
+#     reduced_s3_logs_folder_path = example_folder_path / "reduced_logs"
+#
+#     test_binned_s3_logs_folder_path = tmpdir / "binned_example_0"
+#     test_binned_s3_logs_folder_path.mkdir(exist_ok=True)
+#
+#     expected_binned_s3_logs_folder_path = example_folder_path / "expected_output"
+#
+#     dandi_s3_log_parser.bin_all_reduced_s3_logs_by_object_key(
+#         reduced_s3_logs_folder_path=reduced_s3_logs_folder_path,
+#         binned_s3_logs_folder_path=test_binned_s3_logs_folder_path,
+#     )
+#
+#     for expected_binned_s3_log_file_path in expected_binned_s3_logs_folder_path.rglob("*.tsv"):
+#         relative_file_path = expected_binned_s3_log_file_path.relative_to(expected_binned_s3_logs_folder_path)
+#         test_binned_s3_log_file_path = test_binned_s3_logs_folder_path / relative_file_path
+#
+#         test_binned_s3_log = pandas.read_table(filepath_or_buffer=test_binned_s3_log_file_path)
+#         expected_binned_s3_log = pandas.read_table(filepath_or_buffer=expected_binned_s3_log_file_path)
+#
+#         pandas.testing.assert_frame_equal(left=test_binned_s3_log, right=expected_binned_s3_log)
diff --git a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py
index 32757fd..1473bba 100644
--- a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py
+++ b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py
@@ -11,10 +11,10 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None:
     tmpdir = pathlib.Path(tmpdir)
 
     file_parent = pathlib.Path(__file__).parent
-    examples_folder_path = file_parent / "examples" / "reduced_example_1"
+    examples_folder_path = file_parent / "examples" / "reduction_example_1"
     expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output"
 
-    test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1"
+    test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1"
     test_reduced_s3_logs_folder_path.mkdir(exist_ok=True)
dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( diff --git a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py index 2045792..70987b7 100644 --- a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py +++ b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py @@ -11,10 +11,10 @@ def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduced_example_1" + examples_folder_path = file_parent / "examples" / "reduction_example_1" expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1" + test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( diff --git a/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py index f8c06d9..162761b 100644 --- a/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py +++ b/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py @@ -18,11 +18,11 @@ def test_reduce_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local) -> None: initial_number_of_error_folder_contents = len(error_folder_contents) file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduced_example_2" + examples_folder_path = file_parent / "examples" / "reduction_example_2" example_raw_s3_log_file_path = examples_folder_path / "0.log" expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_2" + test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_2" test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) dandi_s3_log_parser.reduce_dandi_raw_s3_log( diff --git a/tests/test_binning/test_reduce_raw_s3_log.py b/tests/test_binning/test_reduce_raw_s3_log.py deleted file mode 100644 index 131f2ea..0000000 --- a/tests/test_binning/test_reduce_raw_s3_log.py +++ /dev/null @@ -1,60 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_raw_s3_log_example_0(tmpdir: py.path.local) -> None: - """ - Most basic test of functionality. - - If there are failures in the parsing of any lines found in application, - please raise an issue and contribute them to the example log collection. - """ - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduced_example_0" - example_raw_s3_log_file_path = examples_folder_path / "0.log" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" 
- - # Increment this over time as more examples are added - expected_number_of_output_files = 3 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_reduced_s3_log_file_path in test_output_file_paths: - assert ( - test_reduced_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_reduced_s3_log_file_path.stem} not found in expected asset IDs!" - - is_asset_zarr = "zarr" in str(test_reduced_s3_log_file_path) - if is_asset_zarr: - blob_id = test_reduced_s3_log_file_path.stem - expected_parsed_s3_log_file_path = expected_reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" - else: - blob_id = test_reduced_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - - test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_parsed_s3_log) diff --git a/tests/test_binning/test_buffered_text_reader.py b/tests/test_buffered_text_reader.py similarity index 100% rename from tests/test_binning/test_buffered_text_reader.py rename to tests/test_buffered_text_reader.py diff --git a/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log b/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log deleted file mode 100644 index 4a80253..0000000 --- a/tests/test_reduction/examples/reduced_example_1/raw_logs/2020/01/01.log +++ /dev/null @@ -1,3 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:22:42:58 +0000] 192.0.2.0 - W3VJKP0HM8TV2N46 REST.GET.OBJECT zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19 "GET /zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6/0/0/0/1/5/19?versionId=MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 HTTP/1.1" 200 - 1526223 1526223 61 55 "-" "git-annex/10.20220927-geb4a544" MwI7yXtui4mtwTeuZxHoT4qHap44j3T2 U4WvVRIYm+n+VYNArVY/+fjDV3PZesvSaclnyALtK7rsaZ/8sTq8H1JnNAyMj/xuitYxXNUCZ+U= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [01/Jan/2020:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv 
b/tests/test_reduction/examples/reduction_example_0/expected_output/2020/01/01.tsv similarity index 85% rename from tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv rename to tests/test_reduction/examples/reduction_example_0/expected_output/2020/01/01.tsv index de94244..4d7ef29 100644 --- a/tests/test_reduction/examples/reduced_example_1/expected_output/2020/01/01.tsv +++ b/tests/test_reduction/examples/reduction_example_0/expected_output/2020/01/01.tsv @@ -1,3 +1,4 @@ +timestamp ip_address object_key bytes_sent 2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512 2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223 2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443 diff --git a/tests/test_binning/examples/reduced_example_0/raw_logs/2020/01/01.log b/tests/test_reduction/examples/reduction_example_0/raw_logs/2020/01/01.log similarity index 100% rename from tests/test_binning/examples/reduced_example_0/raw_logs/2020/01/01.log rename to tests/test_reduction/examples/reduction_example_0/raw_logs/2020/01/01.log diff --git a/tests/test_reduction/examples/reduction_example_1/expected_output/2020/01/01.tsv b/tests/test_reduction/examples/reduction_example_1/expected_output/2020/01/01.tsv new file mode 100644 index 0000000..4d7ef29 --- /dev/null +++ b/tests/test_reduction/examples/reduction_example_1/expected_output/2020/01/01.tsv @@ -0,0 +1,4 @@ +timestamp ip_address object_key bytes_sent +2020-01-01T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512 +2020-01-01T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223 +2020-01-01T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443 diff --git a/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv b/tests/test_reduction/examples/reduction_example_1/expected_output/2021/02/03.tsv similarity index 85% rename from tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv rename to tests/test_reduction/examples/reduction_example_1/expected_output/2021/02/03.tsv index ab1b8a1..96bf781 100644 --- a/tests/test_reduction/examples/reduced_example_1/expected_output/2021/02/03.tsv +++ b/tests/test_reduction/examples/reduction_example_1/expected_output/2021/02/03.tsv @@ -1,3 +1,4 @@ +timestamp ip_address object_key bytes_sent 2021-02-03T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512 2021-02-03T22:42:58 192.0.2.0 zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6 1526223 2021-02-03T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443 diff --git a/tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log b/tests/test_reduction/examples/reduction_example_1/raw_logs/2020/01/01.log similarity index 100% rename from tests/test_reduction/examples/reduced_example_0/raw_logs/2020/01/01.log rename to tests/test_reduction/examples/reduction_example_1/raw_logs/2020/01/01.log diff --git a/tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log b/tests/test_reduction/examples/reduction_example_1/raw_logs/2021/02/03.log similarity index 100% rename from tests/test_reduction/examples/reduced_example_1/raw_logs/2021/02/03.log rename to tests/test_reduction/examples/reduction_example_1/raw_logs/2021/02/03.log diff --git a/tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv b/tests/test_reduction/examples/reduction_example_2/expected_output/2022/04/06.tsv 
similarity index 85% rename from tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv rename to tests/test_reduction/examples/reduction_example_2/expected_output/2022/04/06.tsv index 1e829c5..db485b2 100644 --- a/tests/test_reduction/examples/reduced_example_2/expected_output/2022/04/06.tsv +++ b/tests/test_reduction/examples/reduction_example_2/expected_output/2022/04/06.tsv @@ -1,3 +1,4 @@ +timestamp ip_address object_key bytes_sent 2022-04-06T03:05:53 192.0.2.0 blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a 6616308 2022-04-06T05:06:35 192.0.2.0 blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 512 2022-04-06T23:06:42 192.0.2.0 blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 1443 diff --git a/tests/test_reduction/examples/reduced_example_2/raw_logs/2022/04/06.log b/tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log similarity index 100% rename from tests/test_reduction/examples/reduced_example_2/raw_logs/2022/04/06.log rename to tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log diff --git a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py index 9eb97fd..e59ed62 100644 --- a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py +++ b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs.py @@ -11,10 +11,10 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_1" + example_folder_path = file_parent / "examples" / "reduction_example_1" example_raw_s3_logs_folder_path = example_folder_path / "raw_logs" - test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1" + test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output" diff --git a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py index e2aeaa5..16df084 100644 --- a/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py +++ b/tests/test_reduction/test_reduce_all_dandi_raw_s3_logs_parallel.py @@ -11,10 +11,10 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_1" + example_folder_path = file_parent / "examples" / "reduction_example_1" example_raw_s3_logs_folder_path = example_folder_path / "raw_logs" - test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_1" + test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) expected_reduced_s3_logs_folder_path = example_folder_path / "expected_output" diff --git a/tests/test_reduction/test_reduce_raw_s3_log.py b/tests/test_reduction/test_reduce_raw_s3_log.py index 6a23851..5092a30 100644 --- a/tests/test_reduction/test_reduce_raw_s3_log.py +++ b/tests/test_reduction/test_reduce_raw_s3_log.py @@ -16,7 +16,7 @@ def test_reduce_raw_s3_log_example_0_fast_case(tmpdir: py.path.local) -> None: tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_0" + example_folder_path = file_parent / "examples" / "reduction_example_0" 
example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2020" / "01" / "01.log" test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0_fast_case" @@ -44,7 +44,7 @@ def test_reduce_raw_s3_log_example_0_basic_case(tmpdir: py.path.local) -> None: tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_0" + example_folder_path = file_parent / "examples" / "reduction_example_0" example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2020" / "01" / "01.log" test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_0_basic_case" diff --git a/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py b/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py index e13bb8d..59f3507 100644 --- a/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py +++ b/tests/test_reduction/test_reduce_raw_s3_log_bad_lines.py @@ -15,7 +15,7 @@ def test_reduce_raw_s3_log_example_bad_lines_fast_case(tmpdir: py.path.local) -> initial_number_of_error_folder_contents = len(error_folder_contents) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_2" + example_folder_path = file_parent / "examples" / "reduction_example_2" example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2022" / "04" / "06.log" test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_bad_lines_fast_case" @@ -53,7 +53,7 @@ def test_reduce_raw_s3_log_example_bad_lines_basic_case(tmpdir: py.path.local) - initial_number_of_error_folder_contents = len(error_folder_contents) file_parent = pathlib.Path(__file__).parent - example_folder_path = file_parent / "examples" / "reduced_example_2" + example_folder_path = file_parent / "examples" / "reduction_example_2" example_raw_s3_log_file_path = example_folder_path / "raw_logs" / "2022" / "04" / "06.log" test_reduced_s3_logs_folder_path = tmpdir / "reduced_example_bad_lines_basic_case" From 442b5e35bb3159b56cd64e712e9425cd976aed45 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 01:51:52 -0400 Subject: [PATCH 17/55] fix --- src/dandi_s3_log_parser/__init__.py | 2 +- ..._object_key.py => _bin_all_reduced_s3_logs_by_object_key.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename src/dandi_s3_log_parser/{_bin_reduced_logs_by_object_key.py => _bin_all_reduced_s3_logs_by_object_key.py} (99%) diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py index 46fde63..b9b3d3c 100644 --- a/src/dandi_s3_log_parser/__init__.py +++ b/src/dandi_s3_log_parser/__init__.py @@ -23,7 +23,7 @@ from ._dandi_s3_log_file_reducer import reduce_all_dandi_raw_s3_logs from ._ip_utils import get_region_from_ip_address from ._dandiset_mapper import map_reduced_logs_to_dandisets -from ._bin_reduced_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key +from ._bin_all_reduced_s3_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key __all__ = [ "DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH", diff --git a/src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py similarity index 99% rename from src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py rename to src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 388125f..62a6172 100644 --- a/src/dandi_s3_log_parser/_bin_reduced_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -20,7 +20,7 @@ 
@validate_call -def bin_all_reduced_logs_by_object_key( +def bin_all_reduced_s3_logs_by_object_key( *, reduced_s3_logs_folder_path: DirectoryPath, binned_s3_logs_folder_path: DirectoryPath, From 1f095e632e29f76e566e312f4e4329e019130ed6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 10:42:19 -0400 Subject: [PATCH 18/55] debug binning --- .../_bin_all_reduced_s3_logs_by_object_key.py | 282 +----------------- .../_s3_log_file_reducer.py | 2 + .../examples/binned_example_1/0.log | 2 - .../examples/binned_example_1/1.log | 2 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 3 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 3 - .../examples/binned_example_2/0.log | 4 - .../0801d996-200e-4173-ab49-d1784427e96a.tsv | 2 - .../11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 - .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 - .../test_bin_all_dandi_raw_s3_logs.py | 58 ---- .../test_binning/test_bin_dandi_raw_s3_log.py | 60 ---- .../test_bin_reduced_s3_logs_by_object_key.py | 65 ++-- .../test_reduce_all_dandi_raw_s3_logs.py | 58 ---- ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 59 ---- .../test_reduce_dandi_raw_s3_log_bad_lines.py | 57 ---- 16 files changed, 53 insertions(+), 608 deletions(-) delete mode 100644 tests/test_binning/examples/binned_example_1/0.log delete mode 100644 tests/test_binning/examples/binned_example_1/1.log delete mode 100644 tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_binning/examples/binned_example_2/0.log delete mode 100644 tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv delete mode 100644 tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv delete mode 100644 tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv delete mode 100644 tests/test_binning/test_bin_all_dandi_raw_s3_logs.py delete mode 100644 tests/test_binning/test_bin_dandi_raw_s3_log.py delete mode 100644 tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py delete mode 100644 tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py delete mode 100644 tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 62a6172..f49c91b 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -1,23 +1,10 @@ """Bin reduced logs by object key.""" -# import collections -# import datetime import pathlib -# import uuid -# from collections.abc import Callable -# from typing import Literal import pandas - -# import tqdm from pydantic import DirectoryPath, validate_call -# -# from ._buffered_text_reader import BufferedTextReader -# from ._error_collection import _collect_error -# from ._s3_log_line_parser import _KNOWN_OPERATION_TYPES, _append_reduced_log_line -# - @validate_call def bin_all_reduced_s3_logs_by_object_key( @@ -59,265 +46,22 @@ def bin_all_reduced_s3_logs_by_object_key( "ip_address": list, } ) + del reduced_data_frame + + object_keys_to_data = { + row.name: {"timestamp": row["timestamp"], "bytes_sent": 
row["bytes_sent"], "ip_address": row["ip_address"]} + for _, row in binned_data_frame.iterrows() + } + del binned_data_frame - for _, row in binned_data_frame.iterrows(): - object_key_as_path = pathlib.Path(row.name) - object_key_as_path.parent.mkdir(parents=True, exist_ok=True) + for object_key, data in object_keys_to_data.items(): + object_key_as_path = pathlib.Path(object_key) binned_s3_log_file_path = ( binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv" ) + binned_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) - header = False if binned_s3_log_file_path.exists() else True - row.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header) - + data_frame = pandas.DataFrame(data=data) -# @validate_call -# def reduce_raw_s3_log( -# *, -# raw_s3_log_file_path: FilePath, -# reduced_s3_logs_folder_path: DirectoryPath, -# mode: Literal["w", "a"] = "a", -# maximum_buffer_size_in_bytes: int = 4 * 10**9, -# bucket: str | None = None, -# operation_type: Literal[_KNOWN_OPERATION_TYPES] = "REST.GET.OBJECT", -# excluded_ips: collections.defaultdict[str, bool] | None = None, -# asset_id_handler: Callable | None = None, -# tqdm_kwargs: dict | None = None, -# ) -> None: -# """ -# Reduce a raw S3 log file and write the results to a folder of TSV files, one for each unique asset ID. -# -# 'Reduce' here means: -# - Filtering all lines only by the bucket specified. -# - Filtering all lines only by the type of operation specified (i.e., REST.GET.OBJECT, REST.PUT.OBJECT, etc.). -# - Filtering out any non-success status codes. -# - Filtering out any excluded IP addresses. -# - Extracting only the asset ID, request timestamp, request size, and IP address that sent the request. -# -# Parameters -# ---------- -# raw_s3_log_file_path : str or pathlib.Path -# The path to the raw S3 log file. -# reduced_s3_logs_folder_path : str or pathlib.Path -# The path to write each reduced S3 log file to. -# There will be one file per handled asset ID. -# mode : "w" or "a", default: "a" -# How to resolve the case when files already exist in the folder containing parsed logs. -# "w" will overwrite existing content, "a" will append or create if the file does not yet exist. -# -# The intention of the default usage is to have one consolidated raw S3 log file per day and then to iterate -# over each day, parsing and binning by asset, effectively 'updating' the parsed collection on each iteration. -# maximum_buffer_size_in_bytes : int, default: 4 GB -# The theoretical maximum amount of RAM (in bytes) to use on each buffer iteration when reading from the -# source text file. -# -# Actual RAM usage will be higher due to overhead and caching. -# bucket : str -# Only parse and return lines that match this bucket. -# operation_type : str, default: "REST.GET" -# The type of operation to filter for. -# excluded_ips : collections.defaultdict of strings to booleans, optional -# A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. 
-# asset_id_handler : callable, optional -# If your asset IDs in the raw log require custom handling (i.e., they contain slashes that you do not wish to -# translate into nested directory paths) then define a function of the following form: -# -# # For example -# def asset_id_handler(*, raw_asset_id: str) -> str: -# split_by_slash = raw_asset_id.split("/") -# return split_by_slash[0] + "_" + split_by_slash[-1] -# tqdm_kwargs : dict, optional -# Keyword arguments to pass to the tqdm progress bar for line buffers. -# """ -# reduced_s3_logs_folder_path.mkdir(exist_ok=True) -# bucket = bucket or "" -# excluded_ips = excluded_ips or collections.defaultdict(bool) -# asset_id_handler = asset_id_handler or (lambda asset_id: asset_id) -# tqdm_kwargs = tqdm_kwargs or dict() -# -# assert raw_s3_log_file_path.suffix == ".log", f"`{raw_s3_log_file_path=}` should end in '.log'!" -# -# reduced_and_binned_logs = _get_reduced_and_binned_log_lines( -# raw_s3_log_file_path=raw_s3_log_file_path, -# maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, -# bucket=bucket, -# operation_type=operation_type, -# excluded_ips=excluded_ips, -# asset_id_handler=asset_id_handler, -# tqdm_kwargs=tqdm_kwargs, -# ) -# -# for handled_asset_id, reduced_logs_per_handled_asset_id in reduced_and_binned_logs.items(): -# handled_asset_id_path = pathlib.Path(handled_asset_id) -# blob_id = handled_asset_id_path.stem -# reduced_s3_log_file_path = reduced_s3_logs_folder_path / handled_asset_id_path.parent / f"{blob_id}.tsv" -# -# reduced_log_file_exists = reduced_s3_log_file_path.exists() -# if not reduced_log_file_exists and not reduced_s3_log_file_path.parent.exists(): -# reduced_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) -# -# data_frame = pandas.DataFrame(data=reduced_logs_per_handled_asset_id) -# -# header = False if reduced_log_file_exists is True and mode == "a" else True -# data_frame.to_csv(path_or_buf=reduced_s3_log_file_path, mode=mode, sep="\t", header=header, index=False) -# -# -# def _get_reduced_and_binned_log_lines( -# *, -# raw_s3_log_file_path: pathlib.Path, -# maximum_buffer_size_in_bytes: int, -# bucket: str, -# operation_type: Literal[_KNOWN_OPERATION_TYPES], -# excluded_ips: collections.defaultdict[str, bool], -# asset_id_handler: Callable, -# tqdm_kwargs: dict, -# ) -> collections.defaultdict[str, dict[str, list[str | int]]]: -# """Reduce the full S3 log file to minimal content and bin by asset ID.""" -# tqdm_kwargs = tqdm_kwargs or dict() -# default_tqdm_kwargs = dict(desc="Parsing line buffers...", leave=False) -# resolved_tqdm_kwargs = dict(default_tqdm_kwargs) -# resolved_tqdm_kwargs.update(tqdm_kwargs) -# -# task_id = str(uuid.uuid4())[:5] -# -# reduced_and_binned_logs = collections.defaultdict(list) -# buffered_text_reader = BufferedTextReader( -# file_path=raw_s3_log_file_path, -# maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes, -# ) -# progress_bar_iterator = tqdm.tqdm( -# iterable=buffered_text_reader, -# total=len(buffered_text_reader), -# **resolved_tqdm_kwargs, -# ) -# per_buffer_index = 0 -# for buffered_raw_lines in progress_bar_iterator: -# index = 0 -# for raw_line in buffered_raw_lines: -# line_index = per_buffer_index + index -# -# _append_reduced_log_line( -# raw_line=raw_line, -# reduced_and_binned_logs=reduced_and_binned_logs, -# bucket=bucket, -# operation_type=operation_type, -# excluded_ips=excluded_ips, -# asset_id_handler=asset_id_handler, -# log_file_path=raw_s3_log_file_path, -# line_index=line_index, -# task_id=task_id, -# ) -# index += 1 -# 
per_buffer_index += index -# -# return reduced_and_binned_logs -# -# -# def _append_reduced_log_line( -# *, -# raw_line: str, -# reduced_and_binned_logs: collections.defaultdict[str, dict[str, list[str | int]]], -# operation_type: Literal[_KNOWN_OPERATION_TYPES], -# excluded_ips: collections.defaultdict[str, bool], -# object_key_handler: Callable, -# line_index: int, -# log_file_path: pathlib.Path, -# task_id: str, -# ) -> None: -# """ -# Append the `reduced_and_binned_logs` map with information extracted from a single raw log line, if it is valid. -# -# Parameters -# ---------- -# raw_line : string -# A single line from the raw S3 log file. -# reduced_and_binned_logs : collections.defaultdict -# A map of reduced log line content binned by handled asset ID. -# object_key_handler : callable, optional -# If your object keys in the raw log require custom handling (i.e., they contain slashes that you do not wish to -# translate into nested directory paths) then define a function of the following form. -# -# For example: -# -# ```python -# def asset_id_handler(*, raw_asset_id: str) -> str: -# split_by_slash = raw_asset_id.split("/") -# -# asset_type = split_by_slash[0] -# if asset_type == "zarr": -# zarr_blob_form = "/".join(split_by_slash[:2]) -# return zarr_blob_form -# -# return raw_asset_id -# ``` -# operation_type : string -# The type of operation to filter for. -# excluded_ips : collections.defaultdict of strings to booleans -# A lookup table / hash map whose keys are IP addresses and values are True to exclude from parsing. -# line_index: int -# The index of the line in the raw log file. -# log_file_path: pathlib.Path -# The path to the log file being parsed; attached for error collection purposes. -# task_id: str -# A unique task ID to ensure that error collection files are unique when parallelizing to avoid race conditions. -# """ -# parsed_log_line = _parse_s3_log_line(raw_line=raw_line) -# -# full_log_line = _get_full_log_line( -# parsed_log_line=parsed_log_line, -# log_file_path=log_file_path, -# line_index=line_index, -# raw_line=raw_line, -# task_id=task_id, -# ) -# -# if full_log_line is None: -# return None -# -# # Apply some minimal validation and contribute any invalidations to error collection -# # These might slow parsing down a bit, but could be important to ensuring accuracy -# if not full_log_line.status_code.isdigit(): -# message = f"Unexpected status code: '{full_log_line.status_code}' on line {line_index} of file {log_file_path} -# _collect_error(message=message, error_type="line", task_id=task_id) -# -# return None -# -# if _IS_OPERATION_TYPE_KNOWN[full_log_line.operation] is False: -# message = ( -# f"Unexpected request type: '{full_log_line.operation}' on line {line_index} of file {log_file_path}.\n\n" -# ) -# _collect_error(message=message, error_type="line", task_id=task_id) -# -# return None -# -# timezone = full_log_line.timestamp[-5:] -# is_timezone_utc = timezone != "+0000" -# if is_timezone_utc: -# message = f"Unexpected time shift attached to log! 
Have always seen '+0000', found `{timezone=}`.\n\n" -# _collect_error(message=message, error_type="line", task_id=task_id) -# # Fine to proceed; just wanted to be made aware if there is ever a difference so can try to investigate why -# -# # More early skip conditions after validation -# # Only accept 200-block status codes -# if full_log_line.status_code[0] != "2": -# return None -# -# if full_log_line.operation != operation_type: -# return None -# -# if excluded_ips[full_log_line.ip_address] is True: -# return None -# -# # All early skip conditions done; the line is parsed so bin the reduced information by handled asset ID -# handled_object_key = object_key_handler(raw_asset_id=full_log_line.asset_id) -# handled_timestamp = datetime.datetime.strptime(full_log_line.timestamp[:-6], "%d/%b/%Y:%H:%M:%S") -# handled_bytes_sent = int(full_log_line.bytes_sent) if full_log_line.bytes_sent != "-" else 0 -# -# reduced_and_binned_logs[handled_object_key] = reduced_and_binned_logs.get( -# handled_object_key, -# collections.defaultdict(list), -# ) -# reduced_and_binned_logs[handled_object_key]["timestamp"].append(handled_timestamp) -# reduced_and_binned_logs[handled_object_key]["bytes_sent"].append(handled_bytes_sent) -# reduced_and_binned_logs[handled_object_key]["ip_address"].append(full_log_line.ip_address) -# reduced_and_binned_logs[handled_object_key]["line_index"].append(line_index) + header = False if binned_s3_log_file_path.exists() else True + data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 1e0aee5..3f34aef 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -112,6 +112,8 @@ def object_key_handler(*, object_key: str) -> str: fast_fields_to_reduce = set(fields_to_reduce) == {"object_key", "timestamp", "bytes_sent", "ip_address"} fast_object_key_parents_to_reduce = set(object_key_parents_to_reduce) == {"blobs", "zarr"} fast_fields_case = fast_fields_to_reduce and fast_object_key_parents_to_reduce + # TODO: add dumping to file within comprehension to alleviate RAM accumulation + # Would need a start/completed tracking similar to binning to ensure no corruption however if fast_fields_case is True: reduced_s3_log_lines = [ reduced_s3_log_line diff --git a/tests/test_binning/examples/binned_example_1/0.log b/tests/test_binning/examples/binned_example_1/0.log deleted file mode 100644 index 9e9f1ac..0000000 --- a/tests/test_binning/examples/binned_example_1/0.log +++ /dev/null @@ -1,2 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [24/Apr/2021:12:03:05 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [16/Mar/2022:02:21:12 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - 
DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_binning/examples/binned_example_1/1.log b/tests/test_binning/examples/binned_example_1/1.log deleted file mode 100644 index 43bacc6..0000000 --- a/tests/test_binning/examples/binned_example_1/1.log +++ /dev/null @@ -1,2 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - diff --git a/tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 570c480..0000000 --- a/tests/test_binning/examples/binned_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,3 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-03-16 02:21:12 512 192.0.2.0 1 -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 0851ae8..0000000 --- a/tests/test_binning/examples/binned_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,3 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-04-24 12:03:05 1443 192.0.2.0 0 -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/examples/binned_example_2/0.log b/tests/test_binning/examples/binned_example_2/0.log deleted file mode 100644 index 9205ad3..0000000 --- a/tests/test_binning/examples/binned_example_2/0.log +++ /dev/null @@ -1,4 +0,0 @@ -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [31/Dec/2021:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [04/May/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET 
/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Jan/2023:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - -8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [26/Jun/2023:03:05:53 +0000] 192.0.2.0 - 5PCGX9WKFQMJH6FB REST.GET.OBJECT blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a "GET /blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a HTTP/1.1" 200 - 6616308 422868123111 205 35 "-" ""Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36" - A54Zaz7Sl0ygUFZ4lEOYCXHxImvTGXnvR+rr9+JcM/gceQWDObRkwnP9nO+wK70lpMaaE78SWvA= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - diff --git a/tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv deleted file mode 100644 index a55bf13..0000000 --- a/tests/test_binning/examples/binned_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2023-06-26 03:05:53 6616308 192.0.2.0 3 diff --git a/tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 1cc8230..0000000 --- a/tests/test_binning/examples/binned_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv deleted file mode 100644 index 5b97e6c..0000000 --- a/tests/test_binning/examples/binned_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ /dev/null @@ -1,2 +0,0 @@ -timestamp bytes_sent ip_address line_index -2021-12-31 23:06:42 1443 192.0.2.0 0 diff --git a/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py b/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py deleted file mode 100644 index 1473bba..0000000 --- a/tests/test_binning/test_bin_all_dandi_raw_s3_logs.py +++ /dev/null @@ -1,58 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> 
None: - """Basic test for parsing of all DANDI raw S3 logs in a directory.""" - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduction_example_1" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( - base_raw_s3_logs_folder_path=examples_folder_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) - - blob_id = test_parsed_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") - expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") - - test_parsed_s3_log.index = range(len(test_parsed_s3_log)) - expected_parsed_s3_log.index = range(len(expected_parsed_s3_log)) - - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -# TODO: add CLI diff --git a/tests/test_binning/test_bin_dandi_raw_s3_log.py b/tests/test_binning/test_bin_dandi_raw_s3_log.py deleted file mode 100644 index fdad307..0000000 --- a/tests/test_binning/test_bin_dandi_raw_s3_log.py +++ /dev/null @@ -1,60 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None: - """ - Most basic test of functionality. - - If there are failures in the parsing of any lines found in application, - please raise an issue and contribute them to the example log collection. 
- """ - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduction_example_0" - example_raw_s3_log_file_path = examples_folder_path / "0.log" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_0" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 3 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_reduced_s3_log_file_path in test_output_file_paths: - assert ( - test_reduced_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_reduced_s3_log_file_path.stem} not found in expected asset IDs!" - - is_asset_zarr = "zarr" in str(test_reduced_s3_log_file_path) - if is_asset_zarr: - blob_id = test_reduced_s3_log_file_path.stem - expected_parsed_s3_log_file_path = expected_reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" - else: - blob_id = test_reduced_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - - test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_parsed_s3_log) diff --git a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py index 888a688..8e0b0d2 100644 --- a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py +++ b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py @@ -1,27 +1,38 @@ -# import pathlib -# -# import py -# - -# def test_bin_reduced_s3_logs_by_object_key_example_0(tmpdir: py.path.local) -> None: -# tmpdir = pathlib.Path(tmpdir) -# -# file_parent = pathlib.Path(__file__).parent -# example_folder_path = file_parent / "examples" / "binning_example_0" -# reduced_s3_logs_folder_path = example_folder_path / "reduced_logs" -# -# test_binned_s3_logs_folder_path = tmpdir / "binned_example_0" -# test_binned_s3_logs_folder_path.mkdir(exist_ok=True) -# -# expected_binned_s3_logs_folder_path = example_folder_path / "expected_output" -# expected_binned_s3_log_file_path = expected_reduced_s3_logs_folder_path / "2020" / "01" / "01.tsv" -# -# dandi_s3_log_parser.bin_all_reduced_s3_logs_by_object_key( -# reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, -# binned_s3_logs_folder_path=test_binned_s3_logs_folder_path, -# ) -# -# test_reduced_s3_log = pandas.read_table(filepath_or_buffer=test_reduced_s3_log_file_path) -# expected_reduced_s3_log = pandas.read_table(filepath_or_buffer=expected_reduced_s3_log_file_path) -# -# 
pandas.testing.assert_frame_equal(left=test_reduced_s3_log, right=expected_reduced_s3_log) +import pathlib + +import pandas +import py + +import dandi_s3_log_parser + + +def test_bin_reduced_s3_logs_by_object_key_example_0(tmpdir: py.path.local) -> None: + tmpdir = pathlib.Path(tmpdir) + + file_parent = pathlib.Path(__file__).parent + example_folder_path = file_parent / "examples" / "binning_example_0" + reduced_s3_logs_folder_path = example_folder_path / "reduced_logs" + + test_binned_s3_logs_folder_path = tmpdir / "binned_example_0" + test_binned_s3_logs_folder_path.mkdir(exist_ok=True) + + expected_binned_s3_logs_folder_path = example_folder_path / "expected_output" + expected_binned_s3_log_file_paths = list(expected_binned_s3_logs_folder_path.rglob("*.tsv")) + + dandi_s3_log_parser.bin_all_reduced_s3_logs_by_object_key( + reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, + binned_s3_logs_folder_path=test_binned_s3_logs_folder_path, + ) + + for expected_binned_s3_log_file_path in expected_binned_s3_log_file_paths: + print(f"Testing binning of {expected_binned_s3_log_file_path}...") + + relative_file_path = expected_binned_s3_log_file_path.relative_to(expected_binned_s3_logs_folder_path) + test_binned_s3_log_file_path = test_binned_s3_logs_folder_path / relative_file_path + + assert test_binned_s3_log_file_path.exists() + + test_binned_s3_log = pandas.read_table(filepath_or_buffer=test_binned_s3_log_file_path) + expected_binned_s3_log = pandas.read_table(filepath_or_buffer=expected_binned_s3_log_file_path) + + pandas.testing.assert_frame_equal(left=test_binned_s3_log, right=expected_binned_s3_log) diff --git a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py deleted file mode 100644 index 1473bba..0000000 --- a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs.py +++ /dev/null @@ -1,58 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: - """Basic test for parsing of all DANDI raw S3 logs in a directory.""" - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduction_example_1" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( - base_raw_s3_logs_folder_path=examples_folder_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
- - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) - - blob_id = test_parsed_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") - expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") - - test_parsed_s3_log.index = range(len(test_parsed_s3_log)) - expected_parsed_s3_log.index = range(len(expected_parsed_s3_log)) - - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -# TODO: add CLI diff --git a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py deleted file mode 100644 index 70987b7..0000000 --- a/tests/test_binning/test_reduce_all_dandi_raw_s3_logs_parallel.py +++ /dev/null @@ -1,59 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) -> None: - """Basic test for parsing of all DANDI raw S3 logs in a directory using multiple workers.""" - tmpdir = pathlib.Path(tmpdir) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduction_example_1" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_1" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_all_dandi_raw_s3_logs( - base_raw_s3_logs_folder_path=examples_folder_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - maximum_number_of_workers=2, - ) - test_output_file_paths = [path for path in test_reduced_s3_logs_folder_path.rglob("*.tsv")] - - number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" - - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" 
- - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) - - blob_id = test_parsed_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") - expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") - - test_parsed_s3_log.index = range(len(test_parsed_s3_log)) - expected_parsed_s3_log.index = range(len(expected_parsed_s3_log)) - - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - -# TODO: add CLI diff --git a/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py deleted file mode 100644 index 162761b..0000000 --- a/tests/test_binning/test_reduce_dandi_raw_s3_log_bad_lines.py +++ /dev/null @@ -1,57 +0,0 @@ -import pathlib - -import pandas -import py - -import dandi_s3_log_parser - - -def test_reduce_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local) -> None: - """ - 'parsed_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time. - """ - tmpdir = pathlib.Path(tmpdir) - - # Count initial error folder contents - error_folder = dandi_s3_log_parser.DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "errors" - error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() - initial_number_of_error_folder_contents = len(error_folder_contents) - - file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "reduction_example_2" - example_raw_s3_log_file_path = examples_folder_path / "0.log" - expected_reduced_s3_logs_folder_path = examples_folder_path / "expected_output" - - test_reduced_s3_logs_folder_path = tmpdir / "reduction_example_2" - test_reduced_s3_logs_folder_path.mkdir(exist_ok=True) - - dandi_s3_log_parser.reduce_dandi_raw_s3_log( - raw_s3_log_file_path=example_raw_s3_log_file_path, - reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, - ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) - - number_of_output_files = len(test_output_file_paths) - expected_number_of_output_files = 3 - assert number_of_output_files == expected_number_of_output_files - - expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] - for test_parsed_s3_log_file_path in test_output_file_paths: - assert ( - test_parsed_s3_log_file_path.stem in expected_asset_ids - ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" - - test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) - - blob_id = test_parsed_s3_log_file_path.stem - expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" - ) - expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) - - pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) - - post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list() - assert ( - len(post_test_error_folder_contents) == initial_number_of_error_folder_contents - ), "Errors occurred during line parsing!" 
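An aside on the shape of the binning step that PATCH 18 above converges on: read one reduced TSV, group its rows by object key, and append each group to a per-key TSV, emitting the header only when the file is first created. A minimal sketch of that pattern, assuming a hypothetical standalone helper (`bin_one_reduced_log` and its arguments are illustrative, not the package's API):

```python
import pathlib

import pandas


def bin_one_reduced_log(reduced_log_file_path: pathlib.Path, binned_s3_logs_folder_path: pathlib.Path) -> None:
    # Each reduced log is a TSV with object_key, timestamp, bytes_sent, and ip_address columns
    reduced_data_frame = pandas.read_table(filepath_or_buffer=reduced_log_file_path)

    # Collapse all requests against the same object key into aligned lists
    binned_data_frame = reduced_data_frame.groupby("object_key").agg(
        {"timestamp": list, "bytes_sent": list, "ip_address": list}
    )

    for object_key, row in binned_data_frame.iterrows():
        object_key_as_path = pathlib.Path(object_key)
        binned_s3_log_file_path = (
            binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv"
        )
        binned_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True)

        # Write the header only when the per-key file is first created; append thereafter
        header = not binned_s3_log_file_path.exists()
        data_frame = pandas.DataFrame(
            data={"timestamp": row["timestamp"], "bytes_sent": row["bytes_sent"], "ip_address": row["ip_address"]}
        )
        data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False)
```

This mirrors the `object_keys_to_data` loop added to `_bin_all_reduced_s3_logs_by_object_key.py` above.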
From ae3b35200ec1c95a478261a0a1a00d27c1d98854 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 10:43:19 -0400 Subject: [PATCH 19/55] debug binning --- .../blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv | 2 +- .../blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv | 2 +- .../zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv index 1c7ebb9..62c702d 100644 --- a/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv +++ b/tests/test_binning/examples/binning_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv @@ -1,2 +1,2 @@ timestamp bytes_sent ip_address -2020-01-01 05:06:35 512 192.0.2.0 +2020-01-01T05:06:35 512 192.0.2.0 diff --git a/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv index 7415927..40a2987 100644 --- a/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv +++ b/tests/test_binning/examples/binning_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv @@ -1,2 +1,2 @@ timestamp bytes_sent ip_address -2020-01-01 23:06:42 1443 192.0.2.0 +2020-01-01T23:06:42 1443 192.0.2.0 diff --git a/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv index f53bbd8..3e814f6 100644 --- a/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv +++ b/tests/test_binning/examples/binning_example_0/expected_output/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv @@ -1,2 +1,2 @@ timestamp bytes_sent ip_address -2020-01-01 22:42:58 1526223 192.0.2.0 +2020-01-01T22:42:58 1526223 192.0.2.0 From e6432be97bf57007e85526e3d12af784e9d60103 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 10:49:58 -0400 Subject: [PATCH 20/55] update mapping; debug year span --- src/dandi_s3_log_parser/__init__.py | 4 ++-- src/dandi_s3_log_parser/_command_line_interface.py | 4 ++-- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +- ...t_mapper.py => _map_all_reduced_s3_logs_to_dandisets.py} | 2 +- .../blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv | 0 .../blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv | 0 .../zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 0 .../expected_output/000003/0.210812.1448.tsv | 0 .../expected_output/000003/0.230629.1955.tsv | 0 .../expected_output/000003/draft.tsv | 0 .../expected_output/000013/0.220126.2143.tsv | 0 .../expected_output/000013/draft.tsv | 0 .../expected_output/000108/draft.tsv | 0 .../test_map_all_reduced_s3_logs_to_dandisets.py} | 6 +++--- 14 files changed, 9 insertions(+), 9 deletions(-) rename src/dandi_s3_log_parser/{_dandiset_mapper.py => _map_all_reduced_s3_logs_to_dandisets.py} (99%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => 
test_mapping/examples/mapped_to_dandisets_example_0}/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000003/0.210812.1448.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000003/0.230629.1955.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000003/draft.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000013/0.220126.2143.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000013/draft.tsv (100%) rename test_live_services/{examples/mapped_to_dandiset_example_0 => test_mapping/examples/mapped_to_dandisets_example_0}/expected_output/000108/draft.tsv (100%) rename test_live_services/{test_map_reduced_logs_to_all_dandisets.py => test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py} (93%) diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py index b9b3d3c..4643b58 100644 --- a/src/dandi_s3_log_parser/__init__.py +++ b/src/dandi_s3_log_parser/__init__.py @@ -22,7 +22,7 @@ from ._buffered_text_reader import BufferedTextReader from ._dandi_s3_log_file_reducer import reduce_all_dandi_raw_s3_logs from ._ip_utils import get_region_from_ip_address -from ._dandiset_mapper import map_reduced_logs_to_dandisets +from ._map_all_reduced_s3_logs_to_dandisets import map_all_reduced_s3_logs_to_dandisets from ._bin_all_reduced_s3_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key __all__ = [ @@ -31,6 +31,6 @@ "BufferedTextReader", "reduce_all_dandi_raw_s3_logs", "get_region_from_ip_address", - "map_reduced_logs_to_dandisets", + "map_all_reduced_s3_logs_to_dandisets", "bin_all_reduced_s3_logs_by_object_key", ] diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index a5ae299..ea9226f 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -8,7 +8,7 @@ from ._dandi_s3_log_file_reducer import ( reduce_all_dandi_raw_s3_logs, ) -from ._dandiset_mapper import map_reduced_logs_to_dandisets +from ._map_all_reduced_s3_logs_to_dandisets import map_all_reduced_s3_logs_to_dandisets @click.command(name="reduce_all_dandi_raw_s3_logs") @@ -101,7 +101,7 @@ def _reduce_all_dandi_raw_s3_logs_cli( def _map_reduced_logs_to_dandisets_cli( reduced_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path ) -> None: - map_reduced_logs_to_dandisets( + map_all_reduced_s3_logs_to_dandisets( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path ) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py 
index cf9fc88..0bddad2 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -64,7 +64,7 @@ def reduce_all_dandi_raw_s3_logs( object_key_handler = _get_default_dandi_object_key_handler() # Ensure all subfolders exist once at the start - years_to_reduce = set([str(year) for year in range(2019, int(datetime.datetime.now().strftime("%Y")))]) - set( + years_to_reduce = set([str(year) for year in range(2019, int(datetime.datetime.now().strftime("%Y")) + 1)]) - set( excluded_years ) for year in years_to_reduce: diff --git a/src/dandi_s3_log_parser/_dandiset_mapper.py b/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py similarity index 99% rename from src/dandi_s3_log_parser/_dandiset_mapper.py rename to src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py index a2fab6d..f5a1899 100644 --- a/src/dandi_s3_log_parser/_dandiset_mapper.py +++ b/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py @@ -10,7 +10,7 @@ @validate_call -def map_reduced_logs_to_dandisets( +def map_all_reduced_s3_logs_to_dandisets( reduced_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath ) -> None: """ diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv 
b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv diff --git a/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv similarity index 100% rename from test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000108/draft.tsv rename to test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv diff --git a/test_live_services/test_map_reduced_logs_to_all_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py similarity index 93% rename from test_live_services/test_map_reduced_logs_to_all_dandisets.py rename to test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index ef4a531..4816bc9 100644 --- a/test_live_services/test_map_reduced_logs_to_all_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -6,15 +6,15 @@ import dandi_s3_log_parser -def test_map_reduced_logs_to_dandisets(tmpdir: py.path.local): +def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): tmpdir = pathlib.Path(tmpdir) file_parent = pathlib.Path(__file__).parent - examples_folder_path = file_parent / "examples" / "mapped_to_dandiset_example_0" + examples_folder_path = file_parent / "examples" / "mapped_to_dandisets_example_0" reduced_s3_logs_folder_path = examples_folder_path / "binned_logs" dandiset_logs_folder_path = tmpdir - dandi_s3_log_parser.map_reduced_logs_to_dandisets( + dandi_s3_log_parser.map_all_reduced_s3_logs_to_dandisets( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, ) From d3b4c80f774a5a20a738a16de7b53c1c8989ac79 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 
Aug 2024 10:56:42 -0400 Subject: [PATCH 21/55] debug skip protocol --- .../_dandi_s3_log_file_reducer.py | 31 +++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 0bddad2..8edb30d 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -1,10 +1,8 @@ """Primary functions for reducing raw S3 log file for DANDI.""" import collections -import datetime import os import random -import shutil import traceback import uuid from collections.abc import Callable @@ -63,10 +61,16 @@ def reduce_all_dandi_raw_s3_logs( object_key_handler = _get_default_dandi_object_key_handler() + relative_s3_log_file_paths = [ + raw_s3_log_file_path.relative_to(raw_s3_logs_folder_path) + for raw_s3_log_file_path in raw_s3_logs_folder_path.rglob(pattern="*.log") + if raw_s3_log_file_path.stem.isdigit() + ] + # Ensure all subfolders exist once at the start - years_to_reduce = set([str(year) for year in range(2019, int(datetime.datetime.now().strftime("%Y")) + 1)]) - set( - excluded_years - ) + years_to_reduce = set( + relative_s3_log_file_path.parent.parent for relative_s3_log_file_path in relative_s3_log_file_paths + ) - set(excluded_years) for year in years_to_reduce: reduced_year_path = reduced_s3_logs_folder_path / year reduced_year_path.mkdir(exist_ok=True) @@ -75,15 +79,11 @@ def reduce_all_dandi_raw_s3_logs( reduced_month_path = reduced_year_path / str(month).zfill(2) reduced_month_path.mkdir(exist_ok=True) - relative_s3_log_file_paths = [ - raw_s3_log_file_path.relative_to(raw_s3_logs_folder_path) - for raw_s3_log_file_path in raw_s3_logs_folder_path.rglob(pattern="*.log") - if raw_s3_log_file_path.stem.isdigit() and raw_s3_log_file_path.parent.parent.name in years_to_reduce - ] relative_s3_log_file_paths_to_reduce = [ relative_s3_log_file_path for relative_s3_log_file_path in relative_s3_log_file_paths if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists() + and relative_s3_log_file_path.parent.parent in years_to_reduce ] # The .rglob is not naturally sorted; shuffle for more uniform progress updates @@ -153,15 +153,8 @@ def reduce_all_dandi_raw_s3_logs( for future in progress_bar_iterable: future.result() # This is the call that finally triggers the deployment to the workers - # Final step: clean any empty directories - for year in years_to_reduce: - reduced_year_folder_path = reduced_s3_logs_folder_path / year - for month in range(1, 13): - reduced_month_folder_path = reduced_year_folder_path / str(month).zfill(2) - if not any(reduced_month_folder_path.iterdir()): - shutil.rmtree(path=reduced_month_folder_path, ignore_errors=True) - if not any(reduced_year_folder_path.iterdir()): - shutil.rmtree(path=reduced_year_folder_path, ignore_errors=True) + # Note that empty files and directories are kept to indicate that the file was already reduced and so can be skipped + # Even if there is no reduced activity in those files return None From 8ad2c8885ff33008a7fbd11d7b47c90f25b712d6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 11:01:20 -0400 Subject: [PATCH 22/55] improve default folder creation --- .../_dandi_s3_log_file_reducer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 8edb30d..7d18c6b 100644 --- 
a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -68,16 +68,15 @@ def reduce_all_dandi_raw_s3_logs( ] # Ensure all subfolders exist once at the start - years_to_reduce = set( - relative_s3_log_file_path.parent.parent for relative_s3_log_file_path in relative_s3_log_file_paths - ) - set(excluded_years) - for year in years_to_reduce: - reduced_year_path = reduced_s3_logs_folder_path / year - reduced_year_path.mkdir(exist_ok=True) - - for month in range(1, 13): - reduced_month_path = reduced_year_path / str(month).zfill(2) - reduced_month_path.mkdir(exist_ok=True) + years_and_months_to_reduce = { + (relative_s3_log_file_path.parent.parent, relative_s3_log_file_paths.parent) + for relative_s3_log_file_path in relative_s3_log_file_paths + } - set(excluded_years) + for years_and_months_to_reduce in years_and_months_to_reduce: + year, month = years_and_months_to_reduce + reduced_year_and_month_path = reduced_s3_logs_folder_path / year / month + reduced_year_and_month_path.mkdir(parents=True, exist_ok=True) + years_to_reduce = {year for year, _ in years_and_months_to_reduce} relative_s3_log_file_paths_to_reduce = [ relative_s3_log_file_path From cb5e7ff5f3553140bac11cb94c46e7569028b62c Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 11:09:48 -0400 Subject: [PATCH 23/55] improve default folder creation --- .../_dandi_s3_log_file_reducer.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 7d18c6b..34b17f6 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -68,22 +68,29 @@ def reduce_all_dandi_raw_s3_logs( ] # Ensure all subfolders exist once at the start - years_and_months_to_reduce = { - (relative_s3_log_file_path.parent.parent, relative_s3_log_file_paths.parent) - for relative_s3_log_file_path in relative_s3_log_file_paths - } - set(excluded_years) - for years_and_months_to_reduce in years_and_months_to_reduce: - year, month = years_and_months_to_reduce - reduced_year_and_month_path = reduced_s3_logs_folder_path / year / month - reduced_year_and_month_path.mkdir(parents=True, exist_ok=True) - years_to_reduce = {year for year, _ in years_and_months_to_reduce} - + # years_and_months_to_reduce = { + # (relative_s3_log_file_path.parent.parent.name, relative_s3_log_file_path.parent.name) + # for relative_s3_log_file_path in relative_s3_log_file_paths + # } - set(excluded_years) + # for years_and_months_to_reduce in years_and_months_to_reduce: + # year, month = years_and_months_to_reduce + # reduced_year_and_month_path = reduced_s3_logs_folder_path / year / month + # reduced_year_and_month_path.mkdir(parents=True, exist_ok=True) + # years_to_reduce = {year_and_month_to_reduce[0] for year_and_month_to_reduce in years_and_months_to_reduce} + + years_to_reduce = { + relative_s3_log_file_path.parent.parent.name for relative_s3_log_file_path in relative_s3_log_file_paths + } relative_s3_log_file_paths_to_reduce = [ relative_s3_log_file_path for relative_s3_log_file_path in relative_s3_log_file_paths if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists() - and relative_s3_log_file_path.parent.parent in years_to_reduce + and relative_s3_log_file_path.parent.parent not in years_to_reduce ] + print("\n") + print(f"{years_to_reduce=}") + 
print(f"{relative_s3_log_file_paths_to_reduce=}") + print(f"{relative_s3_log_file_paths=}") # The .rglob is not naturally sorted; shuffle for more uniform progress updates random.shuffle(relative_s3_log_file_paths_to_reduce) @@ -105,6 +112,7 @@ def reduce_all_dandi_raw_s3_logs( reduced_s3_log_file_path = ( reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv" ) + reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True) reduce_raw_s3_log( raw_s3_log_file_path=raw_s3_log_file_path, @@ -128,6 +136,7 @@ def reduce_all_dandi_raw_s3_logs( / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv" ) + reduced_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True) futures.append( executor.submit( From 08c178624b86a8937f35c89f28b57a83af1b15c4 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 11:10:11 -0400 Subject: [PATCH 24/55] improve default folder creation --- .../_dandi_s3_log_file_reducer.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index 34b17f6..a87f02f 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -67,20 +67,9 @@ def reduce_all_dandi_raw_s3_logs( if raw_s3_log_file_path.stem.isdigit() ] - # Ensure all subfolders exist once at the start - # years_and_months_to_reduce = { - # (relative_s3_log_file_path.parent.parent.name, relative_s3_log_file_path.parent.name) - # for relative_s3_log_file_path in relative_s3_log_file_paths - # } - set(excluded_years) - # for years_and_months_to_reduce in years_and_months_to_reduce: - # year, month = years_and_months_to_reduce - # reduced_year_and_month_path = reduced_s3_logs_folder_path / year / month - # reduced_year_and_month_path.mkdir(parents=True, exist_ok=True) - # years_to_reduce = {year_and_month_to_reduce[0] for year_and_month_to_reduce in years_and_months_to_reduce} - years_to_reduce = { relative_s3_log_file_path.parent.parent.name for relative_s3_log_file_path in relative_s3_log_file_paths - } + } - set(excluded_years) relative_s3_log_file_paths_to_reduce = [ relative_s3_log_file_path for relative_s3_log_file_path in relative_s3_log_file_paths From e6c68fd8069f8e65e8fe5e2a64c2bdf09bde0ff5 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 11:10:22 -0400 Subject: [PATCH 25/55] improve default folder creation --- src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index a87f02f..36545dd 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -76,10 +76,6 @@ def reduce_all_dandi_raw_s3_logs( if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists() and relative_s3_log_file_path.parent.parent not in years_to_reduce ] - print("\n") - print(f"{years_to_reduce=}") - print(f"{relative_s3_log_file_paths_to_reduce=}") - print(f"{relative_s3_log_file_paths=}") # The .rglob is not naturally sorted; shuffle for more uniform progress updates random.shuffle(relative_s3_log_file_paths_to_reduce) From 4518145fa39b4178701c71c5e4a224e3f76016b8 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 12:12:59 -0400 Subject: [PATCH 26/55] add CLI for binning --- 
pyproject.toml                                     |  3 +-
 .../_command_line_interface.py                     | 30 +++++++++++++++++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 06d56ad..8d8f9e5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,8 @@ classifiers = [
 
 [project.scripts]
 reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
+bin_all_reduced_s3_logs_by_object_key = "dandi_s3_log_parser._command_line_interface:_bin_all_reduced_s3_logs_by_object_key_cli"
+map_all_reduced_s3_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_all_reduced_s3_logs_to_dandisets_cli"



diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index ea9226f..60156c0 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -5,6 +5,7 @@
 
 import click
 
+from ._bin_all_reduced_s3_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key
 from ._dandi_s3_log_file_reducer import (
     reduce_all_dandi_raw_s3_logs,
 )
@@ -85,7 +86,32 @@ def _reduce_all_dandi_raw_s3_logs_cli(
     return None
 
 
-@click.command(name="map_reduced_logs_to_dandisets")
+@click.command(name="bin_all_reduced_s3_logs_by_object_key")
+@click.option(
+    "--reduced_s3_logs_folder_path",
+    help="The path to the folder containing all reduced S3 log files.",
+    required=True,
+    type=click.Path(writable=False),
+)
+@click.option(
+    "--binned_s3_logs_folder_path",
+    help="The path to write each binned S3 log file to. There will be one file per object key.",
+    required=True,
+    type=click.Path(writable=True),
+)
+def _bin_all_reduced_s3_logs_by_object_key_cli(
+    reduced_s3_logs_folder_path: str,
+    binned_s3_logs_folder_path: str,
+) -> None:
+    bin_all_reduced_s3_logs_by_object_key(
+        reduced_s3_logs_folder_path=reduced_s3_logs_folder_path,
+        binned_s3_logs_folder_path=binned_s3_logs_folder_path,
+    )
+
+    return None
+
+
+@click.command(name="map_all_reduced_s3_logs_to_dandisets")
 @click.option(
     "--reduced_s3_logs_folder_path",
     help="",
@@ -98,7 +124,7 @@ def _reduce_all_dandi_raw_s3_logs_cli(
     required=True,
     type=click.Path(writable=False),
 )
-def _map_reduced_logs_to_dandisets_cli(
+def _map_all_reduced_s3_logs_to_dandisets_cli(
     reduced_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
 ) -> None:
     map_all_reduced_s3_logs_to_dandisets(

From 4fcbe889a511c13593fb9ab321408baa16fb6a2c Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 12:33:35 -0400
Subject: [PATCH 27/55] update argument name; enhance readme

---
 README.md                                          | 90 ++++++++++++-------
 .../_command_line_interface.py                     |  6 +-
 .../_map_all_reduced_s3_logs_to_dandisets.py       |  6 +-
 ...st_map_all_reduced_s3_logs_to_dandisets.py      |  2 +-
 4 files changed, 66 insertions(+), 38 deletions(-)

diff --git a/README.md b/README.md
index c110c94..f70b6ba 100644
--- a/README.md
+++ b/README.md
@@ -24,11 +24,7 @@ A few summary facts as of 2024:
 - A single line of a raw S3 log file can be between 400-1000+ bytes.
 - Some of the busiest daily logs on the archive can have around 5,014,386 lines.
 - There are more than 6 TB of log files collected in total.
-- This parser reduces that total to around 20 GB of essential information.
-
-The reduced information is then additionally mapped to currently available assets in persistent published Dandiset versions and current drafts, which only comprise around 100 MB of the original data.
-
-These small Dandiset-specific summaries are soon to be shared publicly.
+- This parser reduces that total to around 100 MB of essential information.
 
 
 
@@ -40,71 +36,103 @@ pip install dandi_s3_log_parser
 
 
 
+## Workflow
+
+The workflow is comprised of three modular steps.
+
+1) **Reduction.**
+
+Filter out:
+
+- Non-success status codes.
+- Excluded IP addresses.
+- Operation types other than the one specified (`REST.GET.OBJECT` by default).
+
+Then limit data extraction to only a handful of specified fields from each full line of the raw logs; by default: `object_key`, `timestamp`, `ip_address`, and `bytes_sent`.
+
+In summer of 2024, this reduced 6 TB of raw logs to around 200 GB.
+
+The process is designed to be easily parallelized and interruptible, meaning that you can feel free to kill the process while it is running and restart it later without losing most progress.
+
+2) **Binning.**
+
+To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for easy lookup.
+
+This step reduces the total file sizes from step (1) even further by reducing repeated object keys, though it does create a large number of small files.
+
+In summer of 2024, this reduced 200 GB of reduced logs to around 20 GB.
+
+3) **Mapping.**
+
+The final step, which should be run periodically to keep the desired usage logs per Dandiset up to date, is to scan through all currently known Dandisets and their versions, mapping the asset blob IDs to their filenames and generating the most recently parsed usage logs that can be shared publicly.
+
+In summer of 2024, this reduced 20 GB of binned logs to around 100 MB of Dandiset-specific logs.
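+
+As a concrete illustration of the binning in step (2), the core operation is a single `groupby` over object keys (a minimal sketch only, not the package API; the example paths below are hypothetical):
+
+```python
+import pathlib
+
+import pandas
+
+# Hypothetical example paths; the real command discovers these automatically
+reduced_s3_log_file_path = pathlib.Path("reduced/2024/01/01.tsv")
+binned_s3_logs_folder_path = pathlib.Path("binned")
+
+reduced_data_frame = pandas.read_csv(filepath_or_buffer=reduced_s3_log_file_path, sep="\t")
+for object_key, data_frame in reduced_data_frame.groupby("object_key"):
+    # One file per object key, e.g., 'binned/blobs/11e/c89/< blob ID >.tsv'
+    object_key_as_path = pathlib.Path(object_key)
+    binned_s3_log_file_path = (
+        binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv"
+    )
+    binned_s3_log_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Append so that activity for the same object key accumulates across many reduced files
+    header = False if binned_s3_log_file_path.exists() else True
+    data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False)
+```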
+
+
+
 ## Usage
 
-### Reduce entire history
+### Reduction
 
-To iteratively parse all historical logs all at once (parallelization strongly recommended):
+To reduce:
 
 ```bash
 reduce_all_dandi_raw_s3_logs \
-  --base_raw_s3_logs_folder_path < base log folder > \
-  --reduced_s3_logs_folder_path < output folder > \
-  --maximum_number_of_workers < number of CPUs to use > \
+  --raw_s3_logs_folder_path < base raw S3 logs folder > \
+  --reduced_s3_logs_folder_path < reduced S3 logs folder path > \
+  --maximum_number_of_workers < number of workers to use > \
   --maximum_buffer_size_in_mb < approximate amount of RAM to use > \
-  --excluded_ips < comma-separated list of known IPs to exclude >
+  --excluded_ips < comma-separated list of known IPs to exclude >
 ```
 
 For example, on Drogon:
 
 ```bash
 reduce_all_dandi_raw_s3_logs \
-  --base_raw_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs \
-  --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
-  --maximum_number_of_workers 6 \
-  --maximum_buffer_size_in_mb 5000 \
+  --raw_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs \
+  --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \
+  --maximum_number_of_workers 3 \
+  --maximum_buffer_size_in_mb 3000 \
   --excluded_ips < Drogon's IP >
 ```
 
-### Reduce a single log file
+### Binning
 
-To parse only a single log file at a time, such as in a CRON job:
+To bin:
 
 ```bash
-reduce_dandi_raw_s3_log \
-  --raw_s3_log_file_path < s3 log file path > \
-  --reduced_s3_logs_folder_path < output folder > \
-  --excluded_ips < comma-separated list of known IPs to exclude >
+bin_all_reduced__s3_logs \
+  --reduced_s3_logs_folder_path < reduced S3 logs folder path > \
+  --binned_s3_logs_folder_path < binned S3 logs folder path >
 ```
 
 For example, on Drogon:
 
 ```bash
-reduce_dandi_raw_s3_log \
-  --raw_s3_log_file_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
-  --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
-  --excluded_ips < Drogons IP >
+bin_all_reduced__s3_logs \
+  --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \
+  --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs
 ```
 
-### Map to Dandisets
+### Mapping
 
 The next step, which should also be run regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets.
 
 ```bash
 map_reduced_logs_to_dandisets \
-  --reduced_s3_logs_folder_path < reduced s3 logs folder path > \
-  --dandiset_logs_folder_path < mapped logs folder >
+  --binned_s3_logs_folder_path < binned S3 logs folder path > \
+  --dandiset_logs_folder_path < mapped Dandiset logs folder >
 ```
 
 For example, on Drogon:
 
 ```bash
 map_reduced_logs_to_dandisets \
-  --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
-  --dandiset_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/mapped_logs_8_15_2024
+  --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs \
+  --dandiset_logs_folder_path /mnt/backup/dandi/mapped-dandiset-logs
 ```
 
 ## Submit line decoding errors
 
-Please email line decoding errors collected from your local config file to the core maintainer before raising issues or submitting PRs contributing them as examples, to more easily correct any aspects that might require anonymization.
+Please email line decoding errors collected from your local config file (located in `~/.dandi_s3_log_parser/errors`) to the core maintainer before raising issues or submitting PRs contributing them as examples, to more easily correct any aspects that might require anonymization.
diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 60156c0..b819628 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -113,7 +113,7 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
 
 @click.command(name="map_all_reduced_s3_logs_to_dandisets")
 @click.option(
-    "--reduced_s3_logs_folder_path",
+    "--binned_s3_logs_folder_path",
     help="",
     required=True,
     type=click.Path(writable=False),
@@ -125,10 +125,10 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     type=click.Path(writable=False),
 )
 def _map_all_reduced_s3_logs_to_dandisets_cli(
-    reduced_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
+    binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
 ) -> None:
     map_all_reduced_s3_logs_to_dandisets(
-        reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path
+        binned_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path
     )
 
     return None
diff --git a/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py
index f5a1899..0889020 100644
--- a/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py
+++ b/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py
@@ -11,7 +11,7 @@
 
 @validate_call
 def map_all_reduced_s3_logs_to_dandisets(
-    reduced_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath
+    binned_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath
 ) -> None:
     """
     Iterate over all dandisets and create a single .tsv per dandiset version containing the reduced logs for all assets.
 
     Parameters
     ----------
-    reduced_s3_logs_folder_path : DirectoryPath
+    binned_s3_logs_folder_path : DirectoryPath
         The path to the folder containing the reduced S3 log files.
     dandiset_logs_folder_path : DirectoryPath
         The path to the folder where the mapped logs will be saved.
@@ -51,7 +51,7 @@ def map_all_reduced_s3_logs_to_dandisets( ): _map_reduced_logs_to_dandiset( dandiset=dandiset, - reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, + reduced_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, client=client, ip_hash_to_region=ip_hash_to_region, diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index 4816bc9..3506d01 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -15,7 +15,7 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): dandiset_logs_folder_path = tmpdir dandi_s3_log_parser.map_all_reduced_s3_logs_to_dandisets( - reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, + binned_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, ) From 344b09b8ce1bd20be0e45bb3d62b497294dbecf6 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 12:45:48 -0400 Subject: [PATCH 28/55] add binning tracking --- .../_bin_all_reduced_s3_logs_by_object_key.py | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index f49c91b..74f45c0 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -22,22 +22,37 @@ def bin_all_reduced_s3_logs_by_object_key( binned_s3_logs_folder_path : str The path to write each binned S3 log file to. """ - # TODO: add two status tracking YAML files - # 1) reduced_log_file_paths_started.yaml - # 2) reduced_log_file_paths_completed.yaml - # Throw warning on start of this function if they disagree (indicating error occurred, most likely during I/O stage) - # All reduced logs will likely need to be freshly re-binned in this case - # (by manually removing or renaming the 'binned' target directory to start off empty) - # - # But if all goes well, then use those file paths to skip over already completed binning - # - # Although; there was no guarantee that the binned contents were chronological, so maybe also - # add a final step (with flag to disable) to re-write all binned logs in chronological order? - # - # Thought: since we're doing this non-parallel, we could just iterate the reduced logs in chronological order + started_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_started.txt" + completed_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_completed.txt" - reduced_s3_log_files = reduced_s3_logs_folder_path.rglob("*.tsv") + if started_tracking_file_path.exists() != completed_tracking_file_path.exists(): + raise FileNotFoundError( + "One of the tracking files is missing, indicating corruption in the binning process. " + "Please clean the binning directory and re-run this function." 
+        )
+
+    completed = None
+    if not started_tracking_file_path.exists():
+        started_tracking_file_path.touch()
+        completed_tracking_file_path.touch()
+    else:
+        with open(file=started_tracking_file_path, mode="r") as io:
+            started = {pathlib.Path(line.strip()) for line in io.readlines()}
+        with open(file=completed_tracking_file_path, mode="r") as io:
+            completed = {pathlib.Path(line.strip()) for line in io.readlines()}
+
+        if started != completed:
+            raise ValueError(
+                "The tracking files do not agree on the state of the binning process. "
+                "Please clean the binning directory and re-run this function."
+            )
+    completed = completed or set()
+
+    reduced_s3_log_files = set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed
     for reduced_s3_log_file in reduced_s3_log_files:
+        with open(file=started_tracking_file_path, mode="a") as started_tracking_file:
+            started_tracking_file.write(f"{reduced_s3_log_file}\n")
+
         reduced_data_frame = pandas.read_csv(filepath_or_buffer=reduced_s3_log_file, sep="\t")
         binned_data_frame = reduced_data_frame.groupby("object_key").agg(
             {
@@ -65,3 +80,6 @@
 
         header = False if binned_s3_log_file_path.exists() else True
         data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False)
+
+    with open(file=completed_tracking_file_path, mode="a") as completed_tracking_file:
+        completed_tracking_file.write(f"{reduced_s3_log_file}\n")

From d5ed6da81c45c8cfd4c982f0fe88695205fcbca9 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 12:50:18 -0400
Subject: [PATCH 29/55] reformat

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index f70b6ba..cf11b04 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,9 @@ pip install dandi_s3_log_parser
 
 ## Workflow
 
-The workflow is comprised of three modular steps.
+The process is comprised of three modular steps.
 
-1) **Reduction.**
+### 1. **Reduction.**
 
 Filter out:
 
@@ -52,21 +52,21 @@ Then limit data extraction to only a handful of specified fields from each full
 
 In summer of 2024, this reduced 6 TB of raw logs to around 200 GB.
 
-The process is designed to be easily parallelized and interruptible, meaning that you can feel free to kill the process while it is running and restart it later without losing most progress.
+The process is designed to be easily parallelized and interruptible, meaning that you can feel free to kill any processes while they are running and restart later without losing most progress.
 
-2) **Binning.**
+### 2. **Binning.**
 
-To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for easy lookup.
+To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for fast lookup.
 
 This step reduces the total file sizes from step (1) even further by reducing repeated object keys, though it does create a large number of small files.
 
 In summer of 2024, this reduced 200 GB of reduced logs to around 20 GB.
 
-3) **Mapping.**
+### 3. **Mapping.**
 
 The final step, which should be run periodically to keep the desired usage logs per Dandiset up to date, is to scan through all currently known Dandisets and their versions, mapping the asset blob IDs to their filenames and generating the most recently parsed usage logs that can be shared publicly.
 
-In summer of 2024, this reduced 20 GB of binned logs to around 100 MB of Dandiset-specific logs.
+In summer of 2024, this reduced 20 GB of binned logs to around 100 MB of Dandiset logs.
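
For reference, the resumability contract enforced by the two binning tracking ledgers above can be sketched as follows (a simplified sketch, not the package API; the folder path is hypothetical, and each ledger is assumed to hold one file path per line):

```python
import pathlib

# Hypothetical binned-logs folder; the real function receives this as an argument
binned_s3_logs_folder_path = pathlib.Path("binned")
started_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_started.txt"
completed_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_completed.txt"

started = {line.strip() for line in started_tracking_file_path.read_text().splitlines()}
completed = {line.strip() for line in completed_tracking_file_path.read_text().splitlines()}

# A path recorded as 'started' but never 'completed' means a binning pass was interrupted
# mid-write, so its binned output cannot be trusted and the directory should be cleaned
interrupted = started - completed
print(f"{len(interrupted)} reduced log file(s) did not finish binning")
```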
From bb16fd43596d6dc5261decd940e50ae8f5aaaecc Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 12:50:49 -0400 Subject: [PATCH 30/55] reformat --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cf11b04..c13ce61 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ pip install dandi_s3_log_parser The process is comprised of three modular steps. -### 1. **Reduction.** +### 1. **Reduction** Filter out: @@ -54,7 +54,7 @@ In summer of 2024, this reduced 6 TB of raw logs to around 200 GB. The process is designed to be easily parallelized and interruptible, meaning that you can feel free to kill any processes while they are running and restart later without losing most progress. -### 2. **Binning.** +### 2. **Binning** To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for fast lookup. @@ -62,7 +62,7 @@ This step reduces the total file sizes from step (1) even further by reducing re In summer of 2024, this reduced 200 GB of reduced logs to around 20 GB. -### 3. **Mapping.** +### 3. **Mapping** The final step, which should be run periodically to keep the desired usage logs per Dandiset up to date, is to scan through all currently known Dandisets and their versions, mapping the asset blob IDs to their filenames and generating the most recently parsed usage logs that can be shared publicly. From ae5b057191e807e6b30d840e8c8a8e475a73d597 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 12:52:08 -0400 Subject: [PATCH 31/55] reformat --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c13ce61..790b571 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ reduce_all_dandi_raw_s3_logs \ To bin: ```bash -bin_all_reduced__s3_logs \ +bin_all_reduced_s3_logs \ --reduced_s3_logs_folder_path < reduced S3 logs folder path > \ --binned_s3_logs_folder_path < binned S3 logs folder path > ``` @@ -109,7 +109,7 @@ bin_all_reduced__s3_logs \ For example, on Drogon: ```bash -bin_all_reduced__s3_logs \ +bin_all_reduced_s3_logs \ --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \ --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs ``` From 0000b2abf30a933aa2e6144eb99dd39a80d25b5d Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 12:54:56 -0400 Subject: [PATCH 32/55] adjust name --- README.md | 8 ++++---- pyproject.toml | 2 +- src/dandi_s3_log_parser/__init__.py | 4 ++-- src/dandi_s3_log_parser/_command_line_interface.py | 8 ++++---- ...o_dandisets.py => _map_binned_s3_logs_to_dandisets.py} | 2 +- .../test_map_all_reduced_s3_logs_to_dandisets.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) rename src/dandi_s3_log_parser/{_map_all_reduced_s3_logs_to_dandisets.py => _map_binned_s3_logs_to_dandisets.py} (99%) diff --git a/README.md b/README.md index 790b571..ef5ae5c 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ reduce_all_dandi_raw_s3_logs \ To bin: ```bash -bin_all_reduced_s3_logs \ +bin_all_reduced_s3_logs_by_object_key \ --reduced_s3_logs_folder_path < reduced S3 logs folder path > \ --binned_s3_logs_folder_path < binned S3 logs folder path > ``` @@ -109,7 +109,7 @@ bin_all_reduced_s3_logs \ For example, on Drogon: ```bash -bin_all_reduced_s3_logs \ +bin_all_reduced_s3_logs_by_object_key \ --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \ --binned_s3_logs_folder_path 
/mnt/backup/dandi/binned-dandiarchive-logs
 ```
@@ -119,7 +119,7 @@ bin_all_reduced_s3_logs \
 
 ### Mapping
 
 The next step, which should also be run regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets.
 
 ```bash
-map_reduced_logs_to_dandisets \
+map_binned_logs_to_dandisets \
   --binned_s3_logs_folder_path < binned S3 logs folder path > \
   --dandiset_logs_folder_path < mapped Dandiset logs folder >
 ```
@@ -127,7 +127,7 @@ map_reduced_logs_to_dandisets \
 For example, on Drogon:
 
 ```bash
-map_reduced_logs_to_dandisets \
+map_binned_logs_to_dandisets \
   --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs \
   --dandiset_logs_folder_path /mnt/backup/dandi/mapped-dandiset-logs
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 8d8f9e5..6bb4a24 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ classifiers = [
 [project.scripts]
 reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
 bin_all_reduced_s3_logs_by_object_key = "dandi_s3_log_parser._command_line_interface:_bin_all_reduced_s3_logs_by_object_key_cli"
-map_all_reduced_s3_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_all_reduced_s3_logs_to_dandisets_cli"
+map_binned_s3_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_binned_s3_logs_to_dandisets_cli"
diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py
index 4643b58..f873948 100644
--- a/src/dandi_s3_log_parser/__init__.py
+++ b/src/dandi_s3_log_parser/__init__.py
@@ -22,7 +22,7 @@
 from ._buffered_text_reader import BufferedTextReader
 from ._dandi_s3_log_file_reducer import reduce_all_dandi_raw_s3_logs
 from ._ip_utils import get_region_from_ip_address
-from ._map_all_reduced_s3_logs_to_dandisets import map_all_reduced_s3_logs_to_dandisets
+from ._map_binned_s3_logs_to_dandisets import map_binned_s3_logs_to_dandisets
 from ._bin_all_reduced_s3_logs_by_object_key import bin_all_reduced_s3_logs_by_object_key
 
 __all__ = [
@@ -31,6 +31,6 @@
     "BufferedTextReader",
     "reduce_all_dandi_raw_s3_logs",
     "get_region_from_ip_address",
-    "map_all_reduced_s3_logs_to_dandisets",
+    "map_binned_s3_logs_to_dandisets",
     "bin_all_reduced_s3_logs_by_object_key",
 ]
diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index b819628..b73a802 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -9,7 +9,7 @@
 from ._dandi_s3_log_file_reducer import (
     reduce_all_dandi_raw_s3_logs,
 )
-from ._map_all_reduced_s3_logs_to_dandisets import map_all_reduced_s3_logs_to_dandisets
+from ._map_binned_s3_logs_to_dandisets import map_binned_s3_logs_to_dandisets
 
 
 @click.command(name="reduce_all_dandi_raw_s3_logs")
@@ -111,7 +111,7 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     return None
 
 
-@click.command(name="map_all_reduced_s3_logs_to_dandisets")
+@click.command(name="map_binned_s3_logs_to_dandisets")
 @click.option(
-    "--reduced_s3_logs_folder_path",
+    "--binned_s3_logs_folder_path",
     help="",
     required=True,
     type=click.Path(writable=False),
@@ -124,10 +124,10 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     required=True,
     type=click.Path(writable=False),
 )
-def _map_all_reduced_s3_logs_to_dandisets_cli(
+def _map_binned_s3_logs_to_dandisets_cli(
     binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
 ) -> None:
-    map_all_reduced_s3_logs_to_dandisets(
+    map_binned_s3_logs_to_dandisets(
binned_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path
     )
diff --git a/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py
similarity index 99%
rename from src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py
rename to src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py
index 0889020..b7eda04 100644
--- a/src/dandi_s3_log_parser/_map_all_reduced_s3_logs_to_dandisets.py
+++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py
@@ -10,7 +10,7 @@
 
 @validate_call
-def map_all_reduced_s3_logs_to_dandisets(
+def map_binned_s3_logs_to_dandisets(
     binned_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath
 ) -> None:
     """
diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py
index 4816bc9..3506d01 100644
--- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py
+++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py
@@ -15,7 +15,7 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local):
     dandiset_logs_folder_path = tmpdir
 
-    dandi_s3_log_parser.map_all_reduced_s3_logs_to_dandisets(
+    dandi_s3_log_parser.map_binned_s3_logs_to_dandisets(
         binned_s3_logs_folder_path=reduced_s3_logs_folder_path,
         dandiset_logs_folder_path=dandiset_logs_folder_path,
     )

From 4bc592442b9df498560ba5cfd374592d06bfcf4a Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 12:57:58 -0400
Subject: [PATCH 33/55] add progress bar to binning

---
 .../_bin_all_reduced_s3_logs_by_object_key.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
index 74f45c0..8a821d0 100644
--- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
+++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
@@ -3,6 +3,7 @@
 import pathlib
 
 import pandas
+import tqdm
 from pydantic import DirectoryPath, validate_call
 
 
@@ -48,8 +49,16 @@ def bin_all_reduced_s3_logs_by_object_key(
         )
     completed = completed or set()
 
-    reduced_s3_log_files = set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed
-    for reduced_s3_log_file in reduced_s3_log_files:
+    reduced_s3_log_files = list(set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed)
+    for reduced_s3_log_file in tqdm.tqdm(
+        iterable=reduced_s3_log_files,
+        total=len(reduced_s3_log_files),
+        desc="Binning reduced logs...",
+        position=0,
+        leave=True,
+        mininterval=3.0,
+        smoothing=0,
+    ):
         with open(file=started_tracking_file_path, mode="a") as started_tracking_file:
             started_tracking_file.write(f"{reduced_s3_log_file}\n")

From c4426962c10719b39424cc1569eaef427e57faab Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 13:15:08 -0400
Subject: [PATCH 34/55] add a file buffer to binning

---
 README.md                                          | 11 +++++++++++
 .../_bin_all_reduced_s3_logs_by_object_key.py      |  6 +++++-
 src/dandi_s3_log_parser/_command_line_interface.py |  9 +++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ef5ae5c..63abaad 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,17 @@ bin_all_reduced_s3_logs_by_object_key \
   --binned_s3_logs_folder_path
/mnt/backup/dandi/binned-dandiarchive-logs ``` +This process is not as friendly to random interruption as the reduction step is. If corruption is detected, the target binning folder will have to be cleaned before re-attempting. + +The `--file_processing_limit < integer >` flag can be used to limit the number of files processed in a single run, which can be useful for breaking the process up into resumable pieces, such as: + +```bash +bin_all_reduced_s3_logs_by_object_key \ + --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \ + --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs \ + --file_processing_limit 20 +``` + ### Mapping The next step, that should also be updated regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets. diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 8a821d0..693d7bd 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -12,6 +12,7 @@ def bin_all_reduced_s3_logs_by_object_key( *, reduced_s3_logs_folder_path: DirectoryPath, binned_s3_logs_folder_path: DirectoryPath, + file_processing_limit: int | None = None, ) -> None: """ Bin reduced S3 logs by object keys. @@ -22,6 +23,9 @@ def bin_all_reduced_s3_logs_by_object_key( The path to the folder containing the reduced S3 log files. binned_s3_logs_folder_path : str The path to write each binned S3 log file to. + There will be one file per object key. + file_processing_limit : int, optional + The maximum number of files to process per call. """ started_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_started.txt" completed_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_completed.txt" @@ -49,7 +53,7 @@ def bin_all_reduced_s3_logs_by_object_key( ) completed = completed or set() - reduced_s3_log_files = list(set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed) + reduced_s3_log_files = list(set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed)[:file_processing_limit] for reduced_s3_log_file in tqdm.tqdm( iterable=reduced_s3_log_files, total=len(reduced_s3_log_files), diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index b73a802..d2dd7d7 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -99,13 +99,22 @@ def _reduce_all_dandi_raw_s3_logs_cli( required=True, type=click.Path(writable=True), ) +@click.option( + "--file_processing_limit", + help="The maximum number of files to process per call.", + required=False, + type=int, + default=None, +) def _bin_all_reduced_s3_logs_by_object_key_cli( reduced_s3_logs_folder_path: str, binned_s3_logs_folder_path: str, + file_processing_limit: int | None, ) -> None: bin_all_reduced_s3_logs_by_object_key( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, binned_s3_logs_folder_path=binned_s3_logs_folder_path, + file_processing_limit=file_processing_limit, ) return None From d2830daeb4ea5a64f8458ddabc984ad440222e28 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 14:09:47 -0400 Subject: [PATCH 35/55] add CIDR regions --- src/dandi_s3_log_parser/_ip_utils.py | 99 ++++++++++++++++------------ 1 file changed, 58 insertions(+), 41 deletions(-) diff --git 
a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 66dc7c4..16fd1b2 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -1,5 +1,6 @@ """Various private utility functions for handling IP address related tasks.""" +import functools import hashlib import ipaddress import os @@ -8,7 +9,6 @@ import ipinfo import requests import yaml -from pydantic import FilePath from ._config import ( _IP_HASH_TO_REGION_FILE_PATH, @@ -39,6 +39,21 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str raise ValueError(message) # pragma: no cover ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"]) + # Determine if IP address belongs to GitHub, AWS, Google, or known VPNs + # Azure not yet easily doable; keep an eye on + # https://learn.microsoft.com/en-us/answers/questions/1410071/up-to-date-azure-public-api-to-get-azure-ip-ranges + # and others, maybe it will change in the future + known_services = ["GitHub", "AWS", "GCP", "VPN"] + for service_name in known_services: + cidr_addresses = _get_cidr_address_ranges(service_name=service_name) + + if any( + ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address) + for cidr_address in cidr_addresses + ): + return service_name + + # Probably a legitimate user, so fetch the geographic region ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest() # Early return for speed @@ -82,35 +97,48 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str return "unknown" -def _cidr_address_to_ip_range(*, cidr_address: str) -> list[str]: - """Convert a CIDR address to a list of IP addresses.""" - cidr_address_class = type(ipaddress.ip_address(cidr_address.split("/")[0])) - ip_address_range = [] - if cidr_address_class is ipaddress.IPv4Address: - ip_address_range = ipaddress.IPv4Network(address=cidr_address) - elif cidr_address_class is ipaddress.IPv6Address: # pragma: no cover - ip_address_range = ipaddress.IPv6Network(address=cidr_address) - - return [str(ip_address) for ip_address in ip_address_range] - - -def _get_latest_github_ip_ranges() -> list[str]: - """Retrieve the latest GitHub CIDR ranges from their API and expand them into a list of IP addresses.""" - github_ip_request = requests.get("https://api.github.com/meta").json() - - skip_keys = ["domains", "ssh_key_fingerprints", "verifiable_password_authentication", "ssh_keys"] - keys = set(github_ip_request.keys()) - set(skip_keys) - github_cidr_addresses = [ - cidr_address for key in keys for cidr_address in github_ip_request[key] if "::" not in cidr_address # Skip IPv6 - ] - - all_github_ips = [ - str(ip_address) - for cidr_address in github_cidr_addresses - for ip_address in _cidr_address_to_ip_range(cidr_address=cidr_address) - ] - - return all_github_ips +@functools.lru_cache +def _get_cidr_address_ranges(*, service_name: str) -> list[str]: + match service_name: + case "GitHub": + github_cidr_request = requests.get(url="https://api.github.com/meta").json() + skip_keys = ["domains", "ssh_key_fingerprints", "verifiable_password_authentication", "ssh_keys"] + keys = set(github_cidr_request.keys()) - set(skip_keys) + github_cidr_addresses = [ + cidr_address + for key in keys + for cidr_address in github_cidr_request[key] + if "::" not in cidr_address + # Skip IPv6 + ] + + return github_cidr_addresses + # Note: these endpoints also return the 'locations' of the specific subnet, such as 'us-east-2' + case "AWS": + aws_cidr_request = 
requests.get(url="https://ip-ranges.amazonaws.com/ip-ranges.json").json() + aws_cidr_addresses = [prefix["ip_prefix"] for prefix in aws_cidr_request["prefixes"]] + + return aws_cidr_addresses + case "GCP": + gcp_cidr_request = requests.get(url="https://www.gstatic.com/ipranges/cloud.json").json() + gcp_cidr_addresses = [prefix["ipv4Prefix"] for prefix in gcp_cidr_request["prefixes"]] + + return gcp_cidr_addresses + case "Azure": + raise NotImplementedError("Azure CIDR address fetching is not yet implemented!") # pragma: no cover + case "VPN": + # Very nice public and maintained listing! Hope this stays stable. + vpn_cidr_addresses = ( + requests.get( + url="https://raw.githubusercontent.com/josephrocca/is-vpn/main/vpn-or-datacenter-ipv4-ranges.txt" + ) + .content.decode("utf-8") + .splitlines() + ) + + return vpn_cidr_addresses + case _: + raise ValueError(f"Service name '{service_name}' is not supported!") # pragma: no cover def _load_ip_hash_to_region_cache() -> dict[str, str]: @@ -126,14 +154,3 @@ def _save_ip_hash_to_region_cache(*, ip_hash_to_region: dict[str, str]) -> None: """Save the IP hash to region cache to disk.""" with open(file=_IP_HASH_TO_REGION_FILE_PATH, mode="w") as stream: yaml.dump(data=ip_hash_to_region, stream=stream) - - -def _save_ip_address_to_region_cache( - ip_hash_to_region: dict[str, str], - ip_hash_to_region_file_path: FilePath | None = None, -) -> None: - """Save the IP address to region cache to disk.""" - ip_hash_to_region_file_path = ip_hash_to_region_file_path or _IP_HASH_TO_REGION_FILE_PATH - - with open(file=ip_hash_to_region_file_path, mode="w") as stream: - yaml.dump(data=ip_hash_to_region, stream=stream) From 297e4ffec0b07c523b8bd75ac85104e2ea3a9d87 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 14:12:52 -0400 Subject: [PATCH 36/55] make argument simpler --- README.md | 2 +- .../_bin_all_reduced_s3_logs_by_object_key.py | 6 +++--- src/dandi_s3_log_parser/_command_line_interface.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 63abaad..b565105 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ The `--file_processing_limit < integer >` flag can be used to limit the number o bin_all_reduced_s3_logs_by_object_key \ --reduced_s3_logs_folder_path /mnt/backup/dandi/reduced-dandiarchive-logs \ --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs \ - --file_processing_limit 20 + --file_limit 20 ``` ### Mapping diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 693d7bd..167eaca 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -12,7 +12,7 @@ def bin_all_reduced_s3_logs_by_object_key( *, reduced_s3_logs_folder_path: DirectoryPath, binned_s3_logs_folder_path: DirectoryPath, - file_processing_limit: int | None = None, + file_limit: int | None = None, ) -> None: """ Bin reduced S3 logs by object keys. @@ -24,7 +24,7 @@ def bin_all_reduced_s3_logs_by_object_key( binned_s3_logs_folder_path : str The path to write each binned S3 log file to. There will be one file per object key. - file_processing_limit : int, optional + file_limit : int, optional The maximum number of files to process per call. 
""" started_tracking_file_path = binned_s3_logs_folder_path / "binned_log_file_paths_started.txt" @@ -53,7 +53,7 @@ def bin_all_reduced_s3_logs_by_object_key( ) completed = completed or set() - reduced_s3_log_files = list(set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed)[:file_processing_limit] + reduced_s3_log_files = list(set(reduced_s3_logs_folder_path.rglob("*.tsv")) - completed)[:file_limit] for reduced_s3_log_file in tqdm.tqdm( iterable=reduced_s3_log_files, total=len(reduced_s3_log_files), diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index d2dd7d7..fffd8eb 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -100,7 +100,7 @@ def _reduce_all_dandi_raw_s3_logs_cli( type=click.Path(writable=True), ) @click.option( - "--file_processing_limit", + "--file_limit", help="The maximum number of files to process per call.", required=False, type=int, @@ -109,12 +109,12 @@ def _reduce_all_dandi_raw_s3_logs_cli( def _bin_all_reduced_s3_logs_by_object_key_cli( reduced_s3_logs_folder_path: str, binned_s3_logs_folder_path: str, - file_processing_limit: int | None, + file_limit: int | None, ) -> None: bin_all_reduced_s3_logs_by_object_key( reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, binned_s3_logs_folder_path=binned_s3_logs_folder_path, - file_processing_limit=file_processing_limit, + file_limit=file_limit, ) return None From 8dbd722c16265ac1c580705204756e0d729c9b9d Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 14:41:10 -0400 Subject: [PATCH 37/55] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b565105..729f34a 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ bin_all_reduced_s3_logs_by_object_key \ The next step, that should also be updated regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets. 
```bash -map_binned_logs_to_dandisets \ +map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path < binned S3 logs folder path > \ --dandiset_logs_folder_path < mapped Dandiset logs folder > ``` @@ -138,7 +138,7 @@ map_binned_logs_to_dandisets \ For example, on Drogon: ```bash -map_binned_logs_to_dandisets \ +map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path /mnt/backup/dandi/binned-dandiarchive-logs \ --dandiset_logs_folder_path /mnt/backup/dandi/mapped-dandiset-logs ``` From b6f5f9737017cba62e3cb4cc1dcb0a39dd3fb4ae Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 14:43:19 -0400 Subject: [PATCH 38/55] add extra progress bar for binning --- .../_bin_all_reduced_s3_logs_by_object_key.py | 10 +++++++++- .../_map_binned_s3_logs_to_dandisets.py | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 167eaca..774cf5a 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -82,7 +82,15 @@ def bin_all_reduced_s3_logs_by_object_key( } del binned_data_frame - for object_key, data in object_keys_to_data.items(): + for object_key, data in tqdm.tqdm( + iterable=object_keys_to_data.items(), + total=len(object_keys_to_data), + desc="Writing binned logs...", + position=1, + leave=False, + mininterval=3.0, + smoothing=0, + ): object_key_as_path = pathlib.Path(object_key) binned_s3_log_file_path = ( binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv" diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index b7eda04..067131c 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -37,6 +37,8 @@ def map_binned_s3_logs_to_dandisets( ) raise ValueError(message) # pragma: no cover + # TODO: cache all applicable DANDI API calls + client = dandi.dandiapi.DandiAPIClient() ip_hash_to_region = _load_ip_hash_to_region_cache() From 5925438c6ca383f0a9496bfa5b4d7c23c85d6a67 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 14:46:13 -0400 Subject: [PATCH 39/55] skip ipv6 gcp --- src/dandi_s3_log_parser/_ip_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 16fd1b2..f7ce301 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -121,7 +121,11 @@ def _get_cidr_address_ranges(*, service_name: str) -> list[str]: return aws_cidr_addresses case "GCP": gcp_cidr_request = requests.get(url="https://www.gstatic.com/ipranges/cloud.json").json() - gcp_cidr_addresses = [prefix["ipv4Prefix"] for prefix in gcp_cidr_request["prefixes"]] + gcp_cidr_addresses = [ + prefix["ipv4Prefix"] + for prefix in gcp_cidr_request["prefixes"] + if "ipv4Prefix" in prefix # Not handling IPv6 yet + ] return gcp_cidr_addresses case "Azure": From 5af93c59d2543a00d0cf3fbadb29b4f8154f50cb Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 15:16:48 -0400 Subject: [PATCH 40/55] add services to cache --- src/dandi_s3_log_parser/_ip_utils.py | 17 +++++++++-------- .../_map_binned_s3_logs_to_dandisets.py | 2 ++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git 
a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index f7ce301..d11cf85 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -39,6 +39,14 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str raise ValueError(message) # pragma: no cover ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"]) + # Probably a legitimate user, so fetch the geographic region + ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest() + + # Early return for speed + lookup_result = ip_hash_to_region.get(ip_hash) + if lookup_result is not None: + return lookup_result + # Determine if IP address belongs to GitHub, AWS, Google, or known VPNs # Azure not yet easily doable; keep an eye on # https://learn.microsoft.com/en-us/answers/questions/1410071/up-to-date-azure-public-api-to-get-azure-ip-ranges @@ -51,16 +59,9 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address) for cidr_address in cidr_addresses ): + ip_hash_to_region[ip_hash] = service_name return service_name - # Probably a legitimate user, so fetch the geographic region - ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest() - - # Early return for speed - lookup_result = ip_hash_to_region.get(ip_hash) - if lookup_result is not None: - return lookup_result - # Log errors in IP fetching # Lines cannot be covered without testing on a real IP try: # pragma: no cover diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 067131c..4958af2 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -39,6 +39,8 @@ def map_binned_s3_logs_to_dandisets( # TODO: cache all applicable DANDI API calls + # TODO: add mtime record for binned files to determine if update is needed + client = dandi.dandiapi.DandiAPIClient() ip_hash_to_region = _load_ip_hash_to_region_cache() From a43572a89df21b98bcce76a6508b7e4757a6c29a Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 15:32:34 -0400 Subject: [PATCH 41/55] add extra test case and debug --- src/dandi_s3_log_parser/_s3_log_file_reducer.py | 14 +++++++++++--- .../reduction_example_2/raw_logs/2022/04/06.log | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 3f34aef..c9ed5a3 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -195,10 +195,18 @@ def _fast_dandi_reduce_raw_s3_log_line( first_post_quote_block = raw_s3_log_line.split('" ')[1].split(" ") http_status_code = first_post_quote_block[0] bytes_sent = first_post_quote_block[2] - if len(first_post_quote_block) != 7 or not http_status_code.isdigit() or not bytes_sent.isdigit(): - return _reduce_raw_s3_log_line(raw_s3_log_line=raw_s3_log_line, task_id=task_id) - elif http_status_code[0] != "2": + if http_status_code.isdigit() and len(http_status_code) == 3 and http_status_code[0] != "2": return None + elif len(first_post_quote_block) != 7 or not http_status_code.isdigit() or not bytes_sent.isdigit(): + from ._dandi_s3_log_file_reducer import _get_default_dandi_object_key_handler + + return _reduce_raw_s3_log_line( + 
raw_s3_log_line=raw_s3_log_line, + operation_type=operation_type, + excluded_ips=excluded_ips, + object_key_handler=_get_default_dandi_object_key_handler(), + task_id=task_id, + ) # Forget about timezone for fast case timestamp = datetime.datetime.strptime("".join(split_by_space[2:3]), "[%d/%b/%Y:%H:%M:%S").isoformat() diff --git a/tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log b/tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log index cfd32b8..9bf70d2 100644 --- a/tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log +++ b/tests/test_reduction/examples/reduction_example_2/raw_logs/2022/04/06.log @@ -2,3 +2,4 @@ 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:05:06:35 +0000] 192.0.2.0 - J42N2W7ET0EC03CV REST.GET.OBJECT blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 "GET /blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991 HTTP/1.1" 206 - 512 171408 53 52 "-" "-" - DX8oFoKQx0o5V3lwEuWBxF5p2fSXrwINj0rnxmas0YgjWuPqYLK/vnW60Txh23K93aahe0IFw2c= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:12:29:11 +0000] 192.0.2.0 - MJH1XJ8DHPSZFND7 REST.GET.OBJECT / "GET //?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?> HTTP/1.1" 404 NoSuchKey 272 - 9 - "https://dandiarchive.s3.amazonaws.com//?s=index/\think\template\driver\file/write&cacheFile=robots.php&content=xbshell1%201),%20array(1),%20$ch[1].$ch[3].$ch[4]);?>" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)" - V9t1ypjyDY4plW1QdEvZxgIn2dEET3gncqHpXCat9UyAups5FXGyiU0kcrI2fWZmTh66E67H/tI= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - - 8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2022:23:06:42 +0000] 192.0.2.0 - NWC7V1KE70QZYJ5Q REST.GET.OBJECT blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629 "GET /blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629?versionId=yn5YAJiwT36Rv78jGYLM71GZumWL.QWn HTTP/1.1" 200 - 1443 1443 35 35 "-" "git-annex/8.20211028-g1c76278" yn5YAJiwT36Rv78jGYLM71GZumWL.QWn ojBg2QLVTSTWsCAe1HoC6IBNLUSPmWH276FdsedhZ/4CQ67DWuZQHcXXB9XUJxYKpnPHpJyBjMM= - ECDHE-RSA-AES128-GCM-SHA256 - dandiarchive.s3.amazonaws.com TLSv1.2 - +8787a3c41bf7ce0d54359d9348ad5b08e16bd5bb8ae5aa4e1508b435773a066e dandiarchive [06/Apr/2024:16:10:20 +0000] 192.0.2.0 - FNA3VGASCWKR5MNR REST.GET.OBJECT zarr/2cae90ba-cfc5-4500-9f10-adf9533413ee/0/1/1/3 "GET /zarr/2cae90ba-cfc5-4500-9f10-adf9533413ee/0/1/1/3 HTTP/1.1" 304 - - 250340 9 - "https://neuroglancer-demo.appspot.com/" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" - 1qTeefmpM7pUYhtFtIBchp/7fo6GH8wcuVRvij3qjzGd/hB/Q2eHLeVeXIv6lJn9tv+18hHYAkk= - TLS_AES_128_GCM_SHA256 - dandiarchive.s3.amazonaws.com TLSv1.3 - - From 89c0bd5681d5ad5715b6f1784f2976641609967e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 15:46:34 -0400 Subject: [PATCH 42/55] add helper function for in cidr --- src/dandi_s3_log_parser/_ip_utils.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index d11cf85..b3a472b 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -56,8 +56,7 @@ def 
get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str
             cidr_addresses = _get_cidr_address_ranges(service_name=service_name)
 
             if any(
-            ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address)
-            for cidr_address in cidr_addresses
+            _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address) for cidr_address in cidr_addresses
         ):
             ip_hash_to_region[ip_hash] = service_name
             return service_name
@@ -146,6 +145,26 @@ def _get_cidr_address_ranges(*, service_name: str) -> list[str]:
             raise ValueError(f"Service name '{service_name}' is not supported!")  # pragma: no cover
 
 
+def _is_ip_address_in_cidr(*, ip_address: str, cidr_address: str) -> bool:
+    """
+    Check if an IP address is within a CIDR range.
+
+    Should be faster than the syntactically simpler:
+
+    ```python
+    ipaddress.ip_address(ip_address) in ipaddress.ip_network(cidr_address)
+    ```
+    """
+    ip_network = ipaddress.ip_network(address=cidr_address)
+    binary_network_address = int(ip_network.network_address)
+    binary_network_mask = int(ip_network.netmask)
+
+    binary_ip = int(ipaddress.ip_address(address=ip_address))
+
+    in_network = (binary_ip & binary_network_mask) == binary_network_address
+    return in_network
+
+
 def _load_ip_hash_to_region_cache() -> dict[str, str]:
     """Load the IP hash to region cache from disk."""
     if not _IP_HASH_TO_REGION_FILE_PATH.exists():

From b3a9bccec120d27cab2d8d041c53d0616042496f Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 15:51:21 -0400
Subject: [PATCH 43/55] add helper function for in cidr

---
 src/dandi_s3_log_parser/_ip_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py
index b3a472b..173052f 100644
--- a/src/dandi_s3_log_parser/_ip_utils.py
+++ b/src/dandi_s3_log_parser/_ip_utils.py
@@ -43,7 +43,7 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str
     ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest()
 
     # Early return for speed
-    lookup_result = ip_hash_to_region.get(ip_hash)
+    lookup_result = ip_hash_to_region.get(ip_hash, None)
     if lookup_result is not None:
         return lookup_result

From 041abb91edc7fcc2965b54b5c66ece5c2fb99b28 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:37:21 -0400
Subject: [PATCH 44/55] add extra cache

---
 .../_command_line_interface.py                |  7 +++
 src/dandi_s3_log_parser/_config.py            |  5 +-
 src/dandi_s3_log_parser/_globals.py           |  2 +
 src/dandi_s3_log_parser/_ip_utils.py          | 62 +++++++++++++------
 .../_map_binned_s3_logs_to_dandisets.py       | 24 +++++--
 5 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index fffd8eb..dfe9f77 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -133,6 +133,13 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     required=True,
     type=click.Path(writable=False),
 )
+@click.option(
+    "--dandiset_limit",
+    help="The maximum number of Dandisets to process per call.",
+    required=False,
+    type=int,
+    default=None,
+)
 def _map_binned_s3_logs_to_dandisets_cli(
     binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
 ) -> None:
diff --git a/src/dandi_s3_log_parser/_config.py b/src/dandi_s3_log_parser/_config.py
index 8c950ef..f876426 100644
--- a/src/dandi_s3_log_parser/_config.py
+++
b/src/dandi_s3_log_parser/_config.py @@ -1,10 +1,7 @@ import pathlib -REQUEST_TYPES = ("GET", "PUT", "HEAD") - -REQUEST_TYPES = ("GET", "PUT", "HEAD") - DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH = pathlib.Path.home() / ".dandi_s3_log_parser" DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH.mkdir(exist_ok=True) _IP_HASH_TO_REGION_FILE_PATH = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "ip_hash_to_region.yaml" +_IP_HASH_NOT_IN_SERVICES_FILE_PATH = DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH / "ip_hash_not_in_services.yaml" diff --git a/src/dandi_s3_log_parser/_globals.py b/src/dandi_s3_log_parser/_globals.py index 92ee4c9..b7b5985 100644 --- a/src/dandi_s3_log_parser/_globals.py +++ b/src/dandi_s3_log_parser/_globals.py @@ -105,3 +105,5 @@ _FullLogLine = collections.namedtuple("FullLogLine", _S3_LOG_FIELDS) _S3_LOG_REGEX = re.compile(pattern=r'"([^"]+)"|\[([^]]+)]|([^ ]+)') + +_KNOWN_SERVICES = ("GitHub", "AWS", "GCP", "VPN") # Azure has problems; see _ip_utils.py for more info diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 173052f..59ed2bb 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -5,18 +5,23 @@ import ipaddress import os import traceback +from typing import Literal import ipinfo import requests import yaml from ._config import ( + _IP_HASH_NOT_IN_SERVICES_FILE_PATH, _IP_HASH_TO_REGION_FILE_PATH, ) from ._error_collection import _collect_error +from ._globals import _KNOWN_SERVICES -def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str]) -> str | None: +def get_region_from_ip_address( + ip_address: str, ip_hash_to_region: dict[str, str], ip_hash_not_in_services: dict[str, bool] +) -> str | None: """ If the parsed S3 logs are meant to be shared openly, the remote IP could be used to directly identify individuals. 
@@ -51,15 +56,17 @@ def get_region_from_ip_address(ip_address: str, ip_hash_to_region: dict[str, str # Azure not yet easily doable; keep an eye on # https://learn.microsoft.com/en-us/answers/questions/1410071/up-to-date-azure-public-api-to-get-azure-ip-ranges # and others, maybe it will change in the future - known_services = ["GitHub", "AWS", "GCP", "VPN"] - for service_name in known_services: - cidr_addresses = _get_cidr_address_ranges(service_name=service_name) - - if any( - _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address) for cidr_address in cidr_addresses - ): - ip_hash_to_region[ip_hash] = service_name - return service_name + if ip_hash_not_in_services.get(ip_hash, None) is None: + for service_name in _KNOWN_SERVICES: + cidr_addresses = _get_cidr_address_ranges(service_name=service_name) + + if any( + _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address) + for cidr_address in cidr_addresses + ): + ip_hash_to_region[ip_hash] = service_name + return service_name + ip_hash_not_in_services[ip_hash] = True # Log errors in IP fetching # Lines cannot be covered without testing on a real IP @@ -165,16 +172,33 @@ def _is_ip_address_in_cidr(*, ip_address: str, cidr_address: str) -> bool: return in_network -def _load_ip_hash_to_region_cache() -> dict[str, str]: +def _load_ip_hash_to_cache(*, name: Literal["region", "services"]) -> dict[str, str] | dict[str, bool]: """Load the IP hash to region cache from disk.""" - if not _IP_HASH_TO_REGION_FILE_PATH.exists(): - return {} # pragma: no cover - - with open(file=_IP_HASH_TO_REGION_FILE_PATH) as stream: - return yaml.load(stream=stream, Loader=yaml.SafeLoader) + match name: + case "region": + if not _IP_HASH_TO_REGION_FILE_PATH.exists(): + return {} # pragma: no cover + + with open(file=_IP_HASH_TO_REGION_FILE_PATH) as stream: + return yaml.load(stream=stream, Loader=yaml.SafeLoader) + case "services": + if not _IP_HASH_NOT_IN_SERVICES_FILE_PATH.exists(): + return {} # pragma: no cover + + with open(file=_IP_HASH_NOT_IN_SERVICES_FILE_PATH) as stream: + return yaml.load(stream=stream, Loader=yaml.SafeLoader) + case _: + raise ValueError(f"Name '{name}' is not recognized!") # pragma: no cover -def _save_ip_hash_to_region_cache(*, ip_hash_to_region: dict[str, str]) -> None: +def _save_ip_hash_cache(*, name: Literal["region", "services"], ip_cache: dict[str, str] | dict[str, bool]) -> None: """Save the IP hash to region cache to disk.""" - with open(file=_IP_HASH_TO_REGION_FILE_PATH, mode="w") as stream: - yaml.dump(data=ip_hash_to_region, stream=stream) + match name: + case "region": + with open(file=_IP_HASH_TO_REGION_FILE_PATH, mode="w") as stream: + yaml.dump(data=ip_cache, stream=stream) + case "services": + with open(file=_IP_HASH_NOT_IN_SERVICES_FILE_PATH, mode="w") as stream: + yaml.dump(data=ip_cache, stream=stream) + case _: + raise ValueError(f"Name '{name}' is not recognized!") # pragma: no cover diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 4958af2..c0b0c68 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -6,12 +6,14 @@ import tqdm from pydantic import DirectoryPath, validate_call -from ._ip_utils import _load_ip_hash_to_region_cache, get_region_from_ip_address +from ._ip_utils import _load_ip_hash_cache, _save_ip_hash_cache, get_region_from_ip_address @validate_call def map_binned_s3_logs_to_dandisets( - 
binned_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath + binned_s3_logs_folder_path: DirectoryPath, + dandiset_logs_folder_path: DirectoryPath, + dandiset_limit: int | None = None, ) -> None: """ Iterate over all dandisets and create a single .tsv per dandiset version containing reduced log for all assets. @@ -24,6 +26,8 @@ def map_binned_s3_logs_to_dandisets( The path to the folder containing the reduced S3 log files. dandiset_logs_folder_path : DirectoryPath The path to the folder where the mapped logs will be saved. + dandiset_limit : int, optional + The maximum number of Dandisets to process per call. """ if "IPINFO_CREDENTIALS" not in os.environ: message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!" @@ -43,8 +47,9 @@ def map_binned_s3_logs_to_dandisets( client = dandi.dandiapi.DandiAPIClient() - ip_hash_to_region = _load_ip_hash_to_region_cache() - current_dandisets = list(client.get_dandisets()) + ip_hash_to_region = _load_ip_hash_cache(name="region") + ip_hash_not_in_services = _load_ip_hash_cache(name="services") + current_dandisets = list(client.get_dandisets())[:dandiset_limit] for dandiset in tqdm.tqdm( iterable=current_dandisets, total=len(current_dandisets), @@ -59,8 +64,12 @@ def map_binned_s3_logs_to_dandisets( dandiset_logs_folder_path=dandiset_logs_folder_path, client=client, ip_hash_to_region=ip_hash_to_region, + ip_hash_not_in_services=ip_hash_not_in_services, ) + _save_ip_hash_cache(name="region", ip_cache=ip_hash_to_region) + _save_ip_hash_cache(name="services", ip_cache=ip_hash_not_in_services) + def _map_reduced_logs_to_dandiset( dandiset: dandi.dandiapi.RemoteDandiset, @@ -68,6 +77,7 @@ def _map_reduced_logs_to_dandiset( dandiset_logs_folder_path: pathlib.Path, client: dandi.dandiapi.DandiAPIClient, ip_hash_to_region: dict[str, str], + ip_hash_not_in_services: dict[str, bool], ) -> None: dandiset_id = dandiset.identifier @@ -96,7 +106,11 @@ def _map_reduced_logs_to_dandiset( reduced_s3_log = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) reduced_s3_log["filename"] = [asset.path] * len(reduced_s3_log) reduced_s3_log["region"] = [ - get_region_from_ip_address(ip_address=ip_address, ip_hash_to_region=ip_hash_to_region) + get_region_from_ip_address( + ip_address=ip_address, + ip_hash_to_region=ip_hash_to_region, + ip_hash_not_in_services=ip_hash_not_in_services, + ) for ip_address in reduced_s3_log["ip_address"] ] From 40cb4e927b6e0211692e150bfbc1d41e8d13283c Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 16:38:26 -0400 Subject: [PATCH 45/55] add extra cache --- src/dandi_s3_log_parser/_ip_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 59ed2bb..1bab1e4 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -172,7 +172,7 @@ def _is_ip_address_in_cidr(*, ip_address: str, cidr_address: str) -> bool: return in_network -def _load_ip_hash_to_cache(*, name: Literal["region", "services"]) -> dict[str, str] | dict[str, bool]: +def _load_ip_hash_cache(*, name: Literal["region", "services"]) -> dict[str, str] | dict[str, bool]: """Load the IP hash to region cache from disk.""" match name: case "region": From 89022a42c7692ffea041918e552637cf4c455188 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Wed, 21 Aug 2024 16:39:11 -0400 Subject: [PATCH 46/55] add extra cache --- 
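Note: this patch only threads `dandiset_limit` through the CLI signature; the value is applied downstream
as the slice `list(client.get_dandisets())[:dandiset_limit]`, where a `None` bound means "take everything".
A minimal sketch of that slicing semantics follows; the Dandiset IDs are placeholders, not real data.

```python
# Minimal sketch of the `[:dandiset_limit]` behavior; IDs below are placeholders.
dandiset_ids = ["000003", "000004", "000005"]

# CLI default: dandiset_limit is None, so the slice keeps the full list.
assert dandiset_ids[:None] == dandiset_ids

# With `--dandiset_limit 2`, only the first two Dandisets are processed per call.
assert dandiset_ids[:2] == ["000003", "000004"]
```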
 src/dandi_s3_log_parser/_command_line_interface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index dfe9f77..9751d86 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -141,7 +141,7 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     default=None,
 )
 def _map_binned_s3_logs_to_dandisets_cli(
-    binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path
+    binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, dandiset_limit: int | None
 ) -> None:
     map_binned_s3_logs_to_dandisets(
         binned_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path

From 16040164c7e4bd10fd3f0f8f52108e190fde3f64 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:39:54 -0400
Subject: [PATCH 47/55] add extra cache

---
 src/dandi_s3_log_parser/_command_line_interface.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 9751d86..d50d037 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -141,10 +141,14 @@ def _bin_all_reduced_s3_logs_by_object_key_cli(
     default=None,
 )
 def _map_binned_s3_logs_to_dandisets_cli(
-    binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, dandiset_limit: int | None
+    binned_s3_logs_folder_path: pathlib.Path,
+    dandiset_logs_folder_path: pathlib.Path,
+    dandiset_limit: int | None,
 ) -> None:
     map_binned_s3_logs_to_dandisets(
-        binned_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path
+        binned_s3_logs_folder_path=binned_s3_logs_folder_path,
+        dandiset_logs_folder_path=dandiset_logs_folder_path,
+        dandiset_limit=dandiset_limit,
     )
     return None


From 74df297723d43c686929238044d65466d5f69ec7 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:50:19 -0400
Subject: [PATCH 48/55] try this again

---
 src/dandi_s3_log_parser/_ip_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py
index 1bab1e4..3243723 100644
--- a/src/dandi_s3_log_parser/_ip_utils.py
+++ b/src/dandi_s3_log_parser/_ip_utils.py
@@ -61,7 +61,8 @@ def get_region_from_ip_address(
             cidr_addresses = _get_cidr_address_ranges(service_name=service_name)

             if any(
-                _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address)
+                # _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address)
+                ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address)
                 for cidr_address in cidr_addresses
             ):
                 ip_hash_to_region[ip_hash] = service_name

From 85ff066760908beb89f80dd1c03d4ca789f956fd Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:52:54 -0400
Subject: [PATCH 49/55] fix skip condition in reducer

---
 src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
index 36545dd..e66a09f 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
@@ -74,7 +74,7 @@ def reduce_all_dandi_raw_s3_logs(
         relative_s3_log_file_path
         for relative_s3_log_file_path in relative_s3_log_file_paths
         if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists()
-        and relative_s3_log_file_path.parent.parent not in years_to_reduce
+        and relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

     # The .rglob is not naturally sorted; shuffle for more uniform progress updates

From 1694ac4f989b7f2ac69ef458d66ddc6c1ab4035d Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:55:01 -0400
Subject: [PATCH 50/55] remove other helper

---
 src/dandi_s3_log_parser/_ip_utils.py | 21 ---------------------
 1 file changed, 21 deletions(-)

diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py
index 3243723..387fa26 100644
--- a/src/dandi_s3_log_parser/_ip_utils.py
+++ b/src/dandi_s3_log_parser/_ip_utils.py
@@ -61,7 +61,6 @@ def get_region_from_ip_address(
             cidr_addresses = _get_cidr_address_ranges(service_name=service_name)

             if any(
-                # _is_ip_address_in_cidr(ip_address=ip_address, cidr_address=cidr_address)
                 ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address)
                 for cidr_address in cidr_addresses
             ):
@@ -153,26 +152,6 @@ def _get_cidr_address_ranges(*, service_name: str) -> list[str]:
     raise ValueError(f"Service name '{service_name}' is not supported!")  # pragma: no cover


-def _is_ip_address_in_cidr(*, ip_address: str, cidr_address: str) -> bool:
-    """
-    Check if an IP address is within a CIDR range.
-
-    Should be faster than the syntactically simpler:
-
-    ```python
-    ipaddress.ip_address(ip_address) in ipaddress.ip_network(cidr_range)
-    ```
-    """
-    ip_network = ipaddress.ip_network(address=cidr_address)
-    binary_network_address = int(ip_network.network_address)
-    binary_network_mask = int(ip_network.netmask)
-
-    binary_ip = int(ipaddress.ip_address(address=ip_address))
-
-    in_network = (binary_ip & binary_network_mask) == binary_network_address
-    return in_network
-
-
 def _load_ip_hash_cache(*, name: Literal["region", "services"]) -> dict[str, str] | dict[str, bool]:
     """Load the IP hash to region cache from disk."""
     match name:

From e170cd282047aae07e4f2c848917900bb8a027d9 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 16:58:40 -0400
Subject: [PATCH 51/55] remove other helper

---
 src/dandi_s3_log_parser/_s3_log_file_reducer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py
index c9ed5a3..97fdbdf 100644
--- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py
@@ -146,9 +146,6 @@ def object_key_handler(*, object_key: str) -> str:
         is not None
     ]

-    if len(reduced_s3_log_lines) == 0:
-        return None
-
     # TODO: generalize header to rely on the selected fields and ensure order matches
     header = "timestamp\tip_address\tobject_key\tbytes_sent\n"
     with open(file=reduced_s3_log_file_path, mode="w") as io:

From 121a97488e0c409615191d33c4e73ac43d750f51 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 17:06:18 -0400
Subject: [PATCH 52/55] debugging non-skip

---
 src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 9 +++++++++
 src/dandi_s3_log_parser/_s3_log_file_reducer.py | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
index e66a09f..b5ee149 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
@@ -77,6 +77,15 @@ def reduce_all_dandi_raw_s3_logs(
         and relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

+    print(f"{relative_s3_log_file_paths_to_reduce=}")
+
+    test = [
+        not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists()
+        for relative_s3_log_file_path in relative_s3_log_file_paths
+        if relative_s3_log_file_path.parent.parent.name in years_to_reduce
+    ]
+    print(f"{test=}")
+
     # The .rglob is not naturally sorted; shuffle for more uniform progress updates
     random.shuffle(relative_s3_log_file_paths_to_reduce)

diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py
index 97fdbdf..d565c47 100644
--- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py
@@ -147,7 +147,7 @@ def object_key_handler(*, object_key: str) -> str:
     ]

     # TODO: generalize header to rely on the selected fields and ensure order matches
-    header = "timestamp\tip_address\tobject_key\tbytes_sent\n"
+    header = "timestamp\tip_address\tobject_key\tbytes_sent\n" if len(reduced_s3_log_lines) != 0 else ""
     with open(file=reduced_s3_log_file_path, mode="w") as io:
         io.write(header)
         io.writelines(reduced_s3_log_lines)

From 9730cabe9f1096695b84ddcf1985bbd9186373e3 Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 17:07:27 -0400
Subject: [PATCH 53/55] debugging non-skip

---
 src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
index b5ee149..df4c573 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
@@ -80,7 +80,10 @@ def reduce_all_dandi_raw_s3_logs(
     print(f"{relative_s3_log_file_paths_to_reduce=}")

     test = [
-        not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists()
+        (
+            reduced_s3_logs_folder_path / relative_s3_log_file_path,
+            not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists(),
+        )
         for relative_s3_log_file_path in relative_s3_log_file_paths
         if relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

From c6f712df24e7c4dfd72a35774d756788b8d4c0cb Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 17:14:10 -0400
Subject: [PATCH 54/55] debugging non-skip

---
 src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
index df4c573..9e38151 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
@@ -73,7 +73,9 @@ def reduce_all_dandi_raw_s3_logs(
     relative_s3_log_file_paths_to_reduce = [
         relative_s3_log_file_path
        for relative_s3_log_file_path in relative_s3_log_file_paths
-        if not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists()
+        if not (
+            reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv"
+        ).exists()
         and relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

@@ -81,8 +83,10 @@ def reduce_all_dandi_raw_s3_logs(
     test = [
         (
-            reduced_s3_logs_folder_path / relative_s3_log_file_path,
-            not (reduced_s3_logs_folder_path / relative_s3_log_file_path).exists(),
+            reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv",
+            not (
+                reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv"
+            ).exists(),
         )
         for relative_s3_log_file_path in relative_s3_log_file_paths
         if relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

From 9ed0a5be319ed6753a6f2684ba6c296b2f51543f Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Wed, 21 Aug 2024 17:14:52 -0400
Subject: [PATCH 55/55] debugging non-skip

---
 .../_dandi_s3_log_file_reducer.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
index 9e38151..f9518ef 100644
--- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
+++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py
@@ -79,20 +79,6 @@ def reduce_all_dandi_raw_s3_logs(
         and relative_s3_log_file_path.parent.parent.name in years_to_reduce
     ]

-    print(f"{relative_s3_log_file_paths_to_reduce=}")
-
-    test = [
-        (
-            reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv",
-            not (
-                reduced_s3_logs_folder_path / relative_s3_log_file_path.parent / f"{relative_s3_log_file_path.stem}.tsv"
-            ).exists(),
-        )
-        for relative_s3_log_file_path in relative_s3_log_file_paths
-        if relative_s3_log_file_path.parent.parent.name in years_to_reduce
-    ]
-    print(f"{test=}")
-
     # The .rglob is not naturally sorted; shuffle for more uniform progress updates
     random.shuffle(relative_s3_log_file_paths_to_reduce)
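
The resumability fix that closes out this series reduces to a path-suffix detail: each raw log
`YYYY/MM/DD.log` is written out as `YYYY/MM/DD.tsv`, so the skip condition must check for the `.tsv`
name in the reduced folder. A minimal standalone sketch of the final condition follows; the folder
path, years, and relative paths are placeholders, not values from the patches above.

```python
import pathlib

# Placeholder inputs; only the path arithmetic mirrors the patched condition.
reduced_s3_logs_folder_path = pathlib.Path("/data/reduced_s3_logs")
years_to_reduce = {"2022"}
relative_s3_log_file_paths = [pathlib.Path("2022/04/06.log"), pathlib.Path("2020/01/01.log")]

relative_s3_log_file_paths_to_reduce = [
    relative_path
    for relative_path in relative_s3_log_file_paths
    # Raw '2022/04/06.log' reduces to '2022/04/06.tsv'; checking the raw name against
    # the reduced folder (the pre-fix behavior) never matched, so no completed file
    # was ever skipped on resume.
    if not (reduced_s3_logs_folder_path / relative_path.parent / f"{relative_path.stem}.tsv").exists()
    and relative_path.parent.parent.name in years_to_reduce  # 'YYYY/MM/DD.log' -> 'YYYY'
]
print(relative_s3_log_file_paths_to_reduce)  # only the 2022 file qualifies
```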