Improve --help formatting #42

Merged · 3 commits · Aug 16, 2024
README.md (1 addition, 1 deletion)
@@ -81,7 +81,7 @@ For example, on Drogon:

```bash
reduce_dandi_raw_s3_log \
-    --raw_s3_log_folder_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
+    --raw_s3_log_file_path /mnt/backup/dandi/dandiarchive-logs/2024/08/17.log \
    --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-cody/parsed_8_15_2024/REST_GET_OBJECT_per_asset_id \
    --excluded_ips < Drogons IP >
```
pyproject.toml (1 addition, 1 deletion)
@@ -38,7 +38,7 @@ classifiers = [

[project.scripts]
reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-reduce_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_logs_cli"
+reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli"
map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli"

src/dandi_s3_log_parser/__init__.py (18 additions, 1 deletion)
@@ -1,4 +1,21 @@
"""Outermost exposed imports; including global environment variables."""
"""
DANDI S3 log parser
===================

Extraction of minimal information from consolidated raw S3 logs for public sharing and plotting.

Developed for the DANDI Archive.

A few summary facts as of 2024:

- A single line of a raw S3 log file can be between 400-1000+ bytes.
- Some of the busiest daily logs on the archive can have around 5,014,386 lines.
- There are more than 6 TB of log files collected in total.
- This parser reduces that total to around 20 GB of essential information.

The reduced information is then additionally mapped to currently available assets in persistent published Dandiset
versions and current drafts, which only comprise around 100 MB of the original data.
"""

from ._config import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH
from ._s3_log_file_parser import parse_raw_s3_log
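For orientation, here is a minimal sketch of consuming the package-level names visible in this hunk. Only the two re-exports shown above are assumed to exist; nothing else about their behavior is taken from this diff.

```python
# A minimal sketch, assuming only the two re-exports visible in the hunk above.
from dandi_s3_log_parser import DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH, parse_raw_s3_log

# DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH is a module-level constant from ._config;
# printing it is a harmless smoke test of the install.
print(DANDI_S3_LOG_PARSER_BASE_FOLDER_PATH)
```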
src/dandi_s3_log_parser/_command_line_interface.py (20 additions, 20 deletions)
@@ -12,7 +12,7 @@
reduce_dandi_raw_s3_log,
)
from ._dandiset_mapper import map_reduced_logs_to_dandisets
-from .testing._helpers import find_random_example_line
+from .testing import find_random_example_line


@click.command(name="reduce_all_dandi_raw_s3_logs")
@@ -44,13 +44,13 @@
)
@click.option(
"--maximum_buffer_size_in_mb",
help=""""
The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
source text files.
Actual total RAM usage will be higher due to overhead and caching.
Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
greater than one.
""",
help=(
"The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
"source text files. "
"Actual total RAM usage will be higher due to overhead and caching. "
"Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
"greater than one."
),
required=False,
type=click.IntRange(min=1), # Bare minimum of 1 MB
default=1_000, # 1 GB recommended
Expand Down Expand Up @@ -99,13 +99,13 @@ def _reduce_all_dandi_raw_s3_logs_cli(
)
@click.option(
"--maximum_buffer_size_in_mb",
help=""""
The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the
source text files.
Actual total RAM usage will be higher due to overhead and caching.
Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is
greater than one.
""",
help=(
"The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
"source text files. "
"Actual total RAM usage will be higher due to overhead and caching. "
"Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
"greater than one."
),
required=False,
type=click.IntRange(min=1), # Bare minimum of 1 MB
default=1_000, # 1 GB recommended
@@ -167,11 +167,9 @@ def _map_reduced_logs_to_dandisets_cli(
@click.option(
"--maximum_lines_per_request_type",
help=(
"""The maximum number of lines to randomly sample for each request type.
The default is 5.

These lines are always found chronologically from the start of the file.
"""
"The maximum number of lines to randomly sample for each request type. "
"The default is 5. "
"These lines are always found chronologically from the start of the file."
),
required=False,
type=click.IntRange(min=2),
@@ -198,3 +196,5 @@ def _find_random_example_line_cli(
seed=seed,
)
    print(example_line)
+
+    return None
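The reworded help string states that the buffer budget is split across workers when `maximum_number_of_workers` is greater than one. Below is a rough sketch of that arithmetic, assuming even floor division and decimal megabytes; the helper name is hypothetical and not part of this diff.

```python
# Hypothetical helper illustrating the help text's "splits this total amount
# over the maximum number of workers" behavior. Assumes 1 MB == 10**6 bytes.
def buffer_bytes_per_worker(maximum_buffer_size_in_mb: int, maximum_number_of_workers: int) -> int:
    total_bytes = maximum_buffer_size_in_mb * 10**6
    return total_bytes // max(maximum_number_of_workers, 1)

# With the CLI default of 1_000 MB spread over, say, 4 workers,
# each worker's buffer would be capped at roughly 250 MB.
assert buffer_bytes_per_worker(1_000, 4) == 250_000_000
```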
src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py (1 addition, 0 deletions)
@@ -72,6 +72,7 @@ def reduce_all_dandi_raw_s3_logs(
# The .rglob is not naturally sorted; shuffle for more uniform progress updates
random.shuffle(daily_raw_s3_log_file_paths)

+    # TODO: add better reporting units to all TQDMs (lines / s, files / s, etc.)
if maximum_number_of_workers == 1:
for raw_s3_log_file_path in tqdm.tqdm(
iterable=daily_raw_s3_log_file_paths,
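The new TODO asks for better reporting units on the progress bars; tqdm already supports this through its `unit` keyword. A sketch of what that could look like, with a hypothetical folder path standing in for the real iterable:

```python
import pathlib

import tqdm

# Sketch only: tqdm's `unit` keyword yields "file/s"-style rate reporting,
# which is what the TODO above asks for. The folder path is hypothetical.
daily_raw_s3_log_file_paths = list(pathlib.Path("/mnt/backup/dandi/dandiarchive-logs").rglob("*.log"))
for raw_s3_log_file_path in tqdm.tqdm(
    iterable=daily_raw_s3_log_file_paths,
    desc="Reducing daily raw S3 logs",
    unit="file",
):
    pass  # per-file reduction would run here
```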
src/dandi_s3_log_parser/testing/__init__.py (2 additions, 0 deletions)
@@ -1,3 +1,5 @@
"""Helper functions related to testing and fetching examples from existing log files."""

from ._helpers import find_random_example_line

__all__ = ["find_random_example_line"]