From 3dec8de3fb41e710f7c6af357f5a1139ba45e98a Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Fri, 16 Aug 2024 01:32:05 -0400
Subject: [PATCH 1/3] debugs and --help formatting

---
 pyproject.toml                 |  2 +-
 .../_command_line_interface.py | 38 +++++++++----------
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0cfc7e4..6b0477d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
 
 [project.scripts]
 reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-reduce_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_logs_cli"
+reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli"
 map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
 find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli"
 
diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 00a4038..265bf12 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -12,7 +12,7 @@
     reduce_dandi_raw_s3_log,
 )
 from ._dandiset_mapper import map_reduced_logs_to_dandisets
-from .testing._helpers import find_random_example_line
+from .testing import find_random_example_line
 
 
 @click.command(name="reduce_all_dandi_raw_s3_logs")
@@ -44,13 +44,13 @@
 )
 @click.option(
     "--maximum_buffer_size_in_mb",
-    help=""""
-The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
-    source text files.
-    Actual total RAM usage will be higher due to overhead and caching.
-    Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
-    greater than one.
-""",
+    help=(
+        "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
+        "source text files. "
+        "Actual total RAM usage will be higher due to overhead and caching. "
+        "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
+        "greater than one."
+    ),
     required=False,
     type=click.IntRange(min=1),  # Bare minimum of 1 MB
     default=1_000,  # 1 GB recommended
@@ -99,13 +99,13 @@ def _reduce_all_dandi_raw_s3_logs_cli(
 )
 @click.option(
     "--maximum_buffer_size_in_mb",
-    help=""""
-The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
-    source text files.
-    Actual total RAM usage will be higher due to overhead and caching.
-    Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
-    greater than one.
-""",
+    help=(
+        "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
+        "source text files. "
+        "Actual total RAM usage will be higher due to overhead and caching. "
+        "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
+        "greater than one."
+    ),
     required=False,
     type=click.IntRange(min=1),  # Bare minimum of 1 MB
     default=1_000,  # 1 GB recommended
@@ -167,11 +167,9 @@ def _map_reduced_logs_to_dandisets_cli(
 )
 @click.option(
     "--maximum_lines_per_request_type",
     help=(
-        """The maximum number of lines to randomly sample for each request type.
-The default is 5.
-
-These lines are always found chronologically from the start of the file.
-"""
+        "The maximum number of lines to randomly sample for each request type. "
+        "The default is 5. "
+        "These lines are always found chronologically from the start of the file."
     ),
     required=False,
     type=click.IntRange(min=2),
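Note on PATCH 1/3: the reworded `--maximum_buffer_size_in_mb` help describes a total RAM budget that is divided across workers. A minimal sketch of that arithmetic, assuming an even integer split; the variable names are illustrative, not the package's internals:

```python
# Sketch of the buffer budget described in the --maximum_buffer_size_in_mb help text.
# Assumes an even split across workers; names are hypothetical, not the package's internals.
maximum_buffer_size_in_mb = 1_000  # the CLI default (~1 GB total)
maximum_number_of_workers = 4  # any value greater than one triggers the split

# The total budget is divided over the workers, per the help text
buffer_size_per_worker_in_mb = maximum_buffer_size_in_mb // maximum_number_of_workers

print(buffer_size_per_worker_in_mb)  # 250 -> roughly 250 MB of buffer per worker
```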
From 56ac42ccf7c3cbcb26b14606c571e6979f5a5fbd Mon Sep 17 00:00:00 2001
From: CodyCBakerPhD
Date: Fri, 16 Aug 2024 01:40:59 -0400
Subject: [PATCH 2/3] add docstrings to inits

---
 README.md                                   |  2 +-
 src/dandi_s3_log_parser/__init__.py         | 19 ++++++++++++++++++-
 .../_command_line_interface.py              |  2 ++
 src/dandi_s3_log_parser/testing/__init__.py |  2 ++
 4 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 53c4d33..6da30b3 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,7 @@ For example, on Drogon:
 
 ```bash
 reduce_dandi_raw_s3_log \
-  --raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
+  --raw_s3_log_file_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
   --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
   --excluded_ips < Drogons IP >
 ```

diff --git a/src/dandi_s3_log_parser/__init__.py b/src/dandi_s3_log_parser/__init__.py
index 40b8c51..cca6acc 100644
--- a/src/dandi_s3_log_parser/__init__.py
+++ b/src/dandi_s3_log_parser/__init__.py
@@ -1,4 +1,21 @@
-"""Outermost exposed imports; including global environment variables."""
+"""
+DANDI S3 log parser
+===================
+
+Extraction of minimal information from consolidated raw S3 logs for public sharing and plotting.
+
+Developed for the DANDI Archive.
+
+A few summary facts as of 2024:
+
+- A single line of a raw S3 log file can be between 400-1000+ bytes.
+- Some of the busiest daily logs on the archive can have around 5,014,386 lines.
+- There are more than 6 TB of log files collected in total.
+- This parser reduces that total to around 20 GB of essential information.
+
+The reduced information is then additionally mapped to currently available assets in persistent published Dandiset
+versions and current drafts, which only comprise around 100 MB of the original data.
+"""
 
 from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
 from ._s3_log_file_parser import parse_raw_s3_log

diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py
index 265bf12..3f08b99 100644
--- a/src/dandi_s3_log_parser/_command_line_interface.py
+++ b/src/dandi_s3_log_parser/_command_line_interface.py
@@ -196,3 +196,5 @@ def _find_random_example_line_cli(
         seed=seed,
     )
     print(example_line)
+
+    return None

diff --git a/src/dandi_s3_log_parser/testing/__init__.py b/src/dandi_s3_log_parser/testing/__init__.py
index 9c903ee..6a4c2b8 100644
--- a/src/dandi_s3_log_parser/testing/__init__.py
+++ b/src/dandi_s3_log_parser/testing/__init__.py
@@ -1,3 +1,5 @@
+"""Helper functions related to testing and fetching examples from existing log files."""
+
 from ._helpers import find_random_example_line
 
 __all__ = ["find_random_example_line"]
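Note on PATCH 2/3: with the `testing/__init__.py` re-export above, `find_random_example_line` is importable from the subpackage rather than the private `_helpers` module, matching the import change in PATCH 1/3. A sketch of the public import path follows; the keyword names are assumptions inferred from the CLI options shown in this series, not a confirmed signature:

```python
from dandi_s3_log_parser.testing import find_random_example_line

# Keyword names below are inferred from the CLI options in this patch series
# (raw_s3_log_file_path, maximum_lines_per_request_type, seed); treat them as assumptions.
example_line = find_random_example_line(
    raw_s3_log_file_path="/mnt/backup/dandi/dandiarchive-logs/2024/08/17.log",
    maximum_lines_per_request_type=5,  # the default per the reworded help text
    seed=0,
)
print(example_line)
```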
+""" from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH from ._s3_log_file_parser import parse_raw_s3_log diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 265bf12..3f08b99 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -196,3 +196,5 @@ def _find_random_example_line_cli( seed=seed, ) print(example_line) + + return None diff --git a/src/dandi_s3_log_parser/testing/__init__.py b/src/dandi_s3_log_parser/testing/__init__.py index 9c903ee..6a4c2b8 100644 --- a/src/dandi_s3_log_parser/testing/__init__.py +++ b/src/dandi_s3_log_parser/testing/__init__.py @@ -1,3 +1,5 @@ +"""Helper functions related to testing and fetching examples from existing log files.""" + from ._helpers import find_random_example_line __all__ = ["find_random_example_line"] From aa4512a3de73ec0e7330b3c9ceb84be72ef2072e Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Fri, 16 Aug 2024 01:42:33 -0400 Subject: [PATCH 3/3] add todo --- src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index 3f3d71e..ddb2e11 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -72,6 +72,7 @@ def reduce_all_dandi_raw_s3_logs( # The .rglob is not naturally sorted; shuffle for more uniform progress updates random.shuffle(daily_raw_s3_log_file_paths) + # TODO: add better reporting units to all TQDMs (lines / s, files / s, etc.) if maximum_number_of_workers == 1: for raw_s3_log_file_path in tqdm.tqdm( iterable=daily_raw_s3_log_file_paths,