Improve --help formatting #42

Merged · 3 commits · Aug 16, 2024
README.md (1 addition, 1 deletion)
@@ -81,7 +81,7 @@ For example, on Drogon:

```bash
reduce_dandi_raw_s3_log \
-    --raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
+    --raw_s3_log_file_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
    --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
    --excluded_ips < Drogons IP >
```
pyproject.toml (1 addition, 1 deletion)
@@ -38,7 +38,7 @@ classifiers = [

[project.scripts]
reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-reduce_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_logs_cli"
+reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli"
map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli"

src/dandi_s3_log_parser/__init__.py (18 additions, 1 deletion)
@@ -1,4 +1,21 @@
"""Outermost exposed imports; including global environment variables."""
"""
DANDI S3 log parser
===================

Extraction of minimal information from consolidated raw S3 logs for public sharing and plotting.

Developed for the DANDI Archive.

A few summary facts as of 2024:

- A single line of a raw S3 log file can be between 400-1000+ bytes.
- Some of the busiest daily logs on the archive can have around 5,014,386 lines.
- There are more than 6 TB of log files collected in total.
- This parser reduces that total to around 20 GB of essential information.

The reduced information is then additionally mapped to currently available assets in persistent published Dandiset
versions and current drafts, which only comprise around 100 MB of the original data.
"""

from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
from ._s3_log_file_parser import parse_raw_s3_log
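For orientation, here is a minimal sketch of consuming the package-level names visible in this hunk. Only the two re-exports shown above are assumed to exist; nothing else about their behavior is taken from this diff.

```python
# A minimal sketch, assuming only the two re-exports visible in the hunk above.
from dandi_s3_log_parser import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, parse_raw_s3_log

# DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH is a module-level constant from ._config;
# printing it is a harmless smoke test of the install.
print(DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH)
```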
src/dandi_s3_log_parser/_command_line_interface.py (20 additions, 20 deletions)
@@ -12,7 +12,7 @@
reduce_dandi_raw_s3_log,
)
from ._dandiset_mapper import map_reduced_logs_to_dandisets
-from .testing._helpers import find_random_example_line
+from .testing import find_random_example_line


@click.command(name="reduce_all_dandi_raw_s3_logs")
@@ -44,13 +44,13 @@
)
@click.option(
"--maximum_buffer_size_in_mb",
help=""""
The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
source text files.
Actual total RAM usage will be higher due to overhead and caching.
Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
greater than one.
""",
help=(
"The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
"source text files. "
"Actual total RAM usage will be higher due to overhead and caching. "
"Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
"greater than one."
),
required=False,
type=click.IntRange(min=1), # Bare minimum of 1 MB
default=1_000, # 1 GB recommended
Expand Down Expand Up @@ -99,13 +99,13 @@ def _reduce_all_dandi_raw_s3_logs_cli(
)
@click.option(
"--maximum_buffer_size_in_mb",
help=""""
The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
source text files.
Actual total RAM usage will be higher due to overhead and caching.
Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
greater than one.
""",
help=(
"The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
"source text files. "
"Actual total RAM usage will be higher due to overhead and caching. "
"Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
"greater than one."
),
required=False,
type=click.IntRange(min=1), # Bare minimum of 1 MB
default=1_000, # 1 GB recommended
@@ -167,11 +167,9 @@ def _map_reduced_logs_to_dandisets_cli(
@click.option(
"--maximum_lines_per_request_type",
help=(
"""The maximum number of lines to randomly sample for each request type.
The default is 5.

These lines are always found chronologically from the start of the file.
"""
"The maximum number of lines to randomly sample for each request type. "
"The default is 5. "
"These lines are always found chronologically from the start of the file."
),
required=False,
type=click.IntRange(min=2),
@@ -198,3 +196,5 @@ def _find_random_example_line_cli(
seed=seed,
)
    print(example_line)
+
+    return None
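The reworded help string states that the buffer budget is split across workers when `maximum_number_of_workers` is greater than one. Below is a rough sketch of that arithmetic, assuming even floor division and decimal megabytes; the helper name is hypothetical and not part of this diff.

```python
# Hypothetical helper illustrating the help text's "splits this total amount
# over the maximum number of workers" behavior. Assumes 1 MB == 10**6 bytes.
def buffer_bytes_per_worker(maximum_buffer_size_in_mb: int, maximum_number_of_workers: int) -> int:
    total_bytes = maximum_buffer_size_in_mb * 10**6
    return total_bytes // max(maximum_number_of_workers, 1)

# With the CLI default of 1_000 MB spread over, say, 4 workers,
# each worker's buffer would be capped at roughly 250 MB.
assert buffer_bytes_per_worker(1_000, 4) == 250_000_000
```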
src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ def reduce_all_dandi_raw_s3_logs(
# The .rglob is not naturally sorted; shuffle for more uniform progress updates
random.shuffle(daily_raw_s3_log_file_paths)

+    # TODO: add better reporting units to all TQDMs (lines / s, files / s, etc.)
if maximum_number_of_workers == 1:
for raw_s3_log_file_path in tqdm.tqdm(
iterable=daily_raw_s3_log_file_paths,
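The new TODO asks for better reporting units on the progress bars; tqdm already supports this through its `unit` keyword. A sketch of what that could look like, with a hypothetical folder path standing in for the real iterable:

```python
import pathlib

import tqdm

# Sketch only: tqdm's `unit` keyword yields "file/s"-style rate reporting,
# which is what the TODO above asks for. The folder path is hypothetical.
daily_raw_s3_log_file_paths = list(pathlib.Path("/mnt/backup/dandi/dandiarchive-logs").rglob("*.log"))
for raw_s3_log_file_path in tqdm.tqdm(
    iterable=daily_raw_s3_log_file_paths,
    desc="Reducing daily raw S3 logs",
    unit="file",
):
    pass  # per-file reduction would run here
```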
src/dandi_s3_log_parser/testing/__init__.py (2 additions, 0 deletions)
@@ -1,3 +1,5 @@
"""Helper functions related to testing and fetching examples from existing log files."""

from ._helpers import find_random_example_line

__all__ = ["find_random_example_line"]