From 3dec8de3fb41e710f7c6af357f5a1139ba45e98a Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Fri, 16 Aug 2024 01:32:05 -0400
Subject: [PATCH 1/3] debugs and --help formatting

---
 pyproject.toml                 |  2 +-
 .../_command_line_interface.py | 38 +++++++++----------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0cfc7e4..6b0477d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
 
 [project.scripts]
 reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-reduce_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_logs_cli"
+reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli"
 map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
 find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli"
 
diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 00a4038..265bf12 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -12,7 +12,7 @@
     reduce_dandi_raw_s3_log,
 )
 from ._dandiset_mapper import map_reduced_logs_to_dandisets
-from .testing._helpers import find_random_example_line
+from .testing import find_random_example_line
 
 
 @click.command(name="reduce_all_dandi_raw_s3_logs")
@@ -44,13 +44,13 @@
 )
 @click.option(
     "--maximum_buffer_size_in_mb",
-    help=""""
-The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
-    source text files.
-    Actual total RAM usage will be higher due to overhead and caching.
-    Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
-    greater than one.
-""",
+    help=(
+        "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
+        "source text files. "
+        "Actual total RAM usage will be higher due to overhead and caching. "
+        "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
+        "greater than one."
+    ),
     required=False,
     type=click.IntRange(min=1),  # Bare minimum of 1 MB
     default=1_000,  # 1 GB recommended
@@ -99,13 +99,13 @@ def _reduce_all_dandi_raw_s3_logs_cli(
 )
 @click.option(
     "--maximum_buffer_size_in_mb",
-    help=""""
-The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
-    source text files.
-    Actual total RAM usage will be higher due to overhead and caching.
-    Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
-    greater than one.
-""",
+    help=(
+        "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
+        "source text files. "
+        "Actual total RAM usage will be higher due to overhead and caching. "
+        "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
+        "greater than one."
+    ),
     required=False,
     type=click.IntRange(min=1),  # Bare minimum of 1 MB
     default=1_000,  # 1 GB recommended
@@ -167,11 +167,9 @@ def _map_reduced_logs_to_dandisets_cli(
 )
 @click.option(
     "--maximum_lines_per_request_type",
     help=(
-        """The maximum number of lines to randomly sample for each request type.
-The default is 5.
-
-These lines are always found chronologically from the start of the file.
-"""
+        "The maximum number of lines to randomly sample for each request type. "
+        "The default is 5. "
+        "These lines are always found chronologically from the start of the file."
     ),
     required=False,
     type=click.IntRange(min=2),
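Note on PATCH 1/3: the reworded `--maximum_buffer_size_in_mb` help describes a total RAM budget that is divided across workers. A minimal sketch of that arithmetic, assuming an even integer split; the variable names are illustrative, not the package's internals:

```python
# Sketch of the buffer budget described in the --maximum_buffer_size_in_mb help text.
# Assumes an even split across workers; names are hypothetical, not the package's internals.
maximum_buffer_size_in_mb = 1_000  # the CLI default (~1 GB total)
maximum_number_of_workers = 4  # any value greater than one triggers the split

# The total budget is divided over the workers, per the help text
buffer_size_per_worker_in_mb = maximum_buffer_size_in_mb // maximum_number_of_workers

print(buffer_size_per_worker_in_mb)  # 250 -> roughly 250 MB of buffer per worker
```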
From 56ac42ccf7c3cbcb26b14606c571e6979f5a5fbd Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Fri, 16 Aug 2024 01:40:59 -0400
Subject: [PATCH 2/3] add docstrings to inits

---
 README.md                                   |  2 +-
 src/dandi_s3_log_parser/__init__.py         | 19 ++++++++++++++++++-
 .../_command_line_interface.py              |  2 ++
 src/dandi_s3_log_parser/testing/__init__.py |  2 ++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 53c4d33..6da30b3 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ For example, on Drogon:
 
 ```bash
 reduce_dandi_raw_s3_log \
-  --raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
+  --raw_s3_log_file_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
   --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
   --excluded_ips < Drogons IP >
 ```

diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py
index 40b8c51..cca6acc 100644
--- a/src/dandi_s3_log_parser/__init__.py
+++ b/src/dandi_s3_log_parser/__init__.py
@@ -1,4 +1,21 @@
-"""Outermost exposed imports; including global environment variables."""
+"""
+DANDI S3 log parser
+===================
+
+Extraction of minimal information from consolidated raw S3 logs for public sharing and plotting.
+
+Developed for the DANDI Archive.
+
+A few summary facts as of 2024:
+
+- A single line of a raw S3 log file can be between 400-1000+ bytes.
+- Some of the busiest daily logs on the archive can have around 5,014,386 lines.
+- There are more than 6 TB of log files collected in total.
+- This parser reduces that total to around 20 GB of essential information.
+
+The reduced information is then additionally mapped to currently available assets in persistent published Dandiset
+versions and current drafts, which only comprise around 100 MB of the original data.
+"""
 
 from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
 from ._s3_log_file_parser import parse_raw_s3_log

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 265bf12..3f08b99 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -196,3 +196,5 @@ def _find_random_example_line_cli(
         seed=seed,
     )
     print(example_line)
+
+    return None

diff --git a/src/dandi_s3_log_parser/testing/__init__.py b/src/dandi_s3_log_parser/testing/__init__.py
index 9c903ee..6a4c2b8 100644
--- a/src/dandi_s3_log_parser/testing/__init__.py
+++ b/src/dandi_s3_log_parser/testing/__init__.py
@@ -1,3 +1,5 @@
+"""Helper functions related to testing and fetching examples from existing log files."""
+
 from ._helpers import find_random_example_line
 
 __all__ = ["find_random_example_line"]
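Note on PATCH 2/3: with the `testing/__init__.py` re-export above, `find_random_example_line` is importable from the subpackage rather than the private `_helpers` module, matching the import change in PATCH 1/3. A sketch of the public import path follows; the keyword names are assumptions inferred from the CLI options shown in this series, not a confirmed signature:

```python
from dandi_s3_log_parser.testing import find_random_example_line

# Keyword names below are inferred from the CLI options in this patch series
# (raw_s3_log_file_path, maximum_lines_per_request_type, seed); treat them as assumptions.
example_line = find_random_example_line(
    raw_s3_log_file_path="/mnt/backup/dandi/dandiarchive-logs/2024/08/17.log",
    maximum_lines_per_request_type=5,  # the default per the reworded help text
    seed=0,
)
print(example_line)
```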
+""" from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._s3_log_file_parser import parse_raw_s3_log diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 265bf12..3f08b99 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -196,3 +196,5 @@ def _find_random_example_line_cli( seed=seed, ) print(example_line) + + return None diff --git a/src/dandi_s3_log_parser/testing/__init__.py b/src/dandi_s3_log_parser/testing/__init__.py index 9c903ee..6a4c2b8 100644 --- a/src/dandi_s3_log_parser/testing/__init__.py +++ b/src/dandi_s3_log_parser/testing/__init__.py @@ -1,3 +1,5 @@ +"""Helper functions related to testing and fetching examples from existing log files.""" + from ._helpers import find_random_example_line __all__ = ["find_random_example_line"] From aa4512a3de73ec0e7330b3c9ceb84be72ef2072e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Fri, 16 Aug 2024 01:42:33 -0400 Subject: [PATCH 3/3] add todo --- src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index 3f3d71e..ddb2e11 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -72,6 +72,7 @@ def reduce_all_dandi_raw_s3_logs( # The .rglob is not naturally sorted; shuffle for more uniform progress updates random.shuffle(daily_raw_s3_log_file_paths) + # TODO: add better reporting units to all TQDMs (lines / s, files / s, etc.) if maximum_number_of_workers == 1: for raw_s3_log_file_path in tqdm.tqdm( iterable=daily_raw_s3_log_file_paths,