Make process resumable and enhance IP classifier #54

Merged: 56 commits into main from make_resumable on Aug 21, 2024

Commits (56):
ed19f6e  refactor for simplicity and resumability (Aug 20, 2024)
f5076e7  refactor for simplicity and resumability (Aug 20, 2024)
6a4853b  refactor for simplicity and resumability (Aug 20, 2024)
fb08027  refactor for simplicity and resumability (Aug 20, 2024)
caf030c  fixes; adapting tests (Aug 20, 2024)
b7a83f5  fix (Aug 20, 2024)
95e4bda  fix (Aug 20, 2024)
fb48dbb  fix (Aug 20, 2024)
93bfd3f  fix (Aug 20, 2024)
661e1a7  fix (Aug 20, 2024)
dc8c6d9  fix (Aug 20, 2024)
38ec7e6  fix (Aug 20, 2024)
2e863b6  fix (Aug 20, 2024)
60b6ae5  adjusting tests (Aug 21, 2024)
46f3375  debug reduction tests (Aug 21, 2024)
3bfba48  get binning mostly there (Aug 21, 2024)
442b5e3  fix (Aug 21, 2024)
1f095e6  debug binning (Aug 21, 2024)
ae3b352  debug binning (Aug 21, 2024)
e6432be  update mapping; debug year span (Aug 21, 2024)
d3b4c80  debug skip protocol (Aug 21, 2024)
8ad2c88  improve default folder creation (Aug 21, 2024)
cb5e7ff  improve default folder creation (Aug 21, 2024)
08c1786  improve default folder creation (Aug 21, 2024)
e6c68fd  improve default folder creation (Aug 21, 2024)
4518145  add CLI for binning (Aug 21, 2024)
4fcbe88  update argument name; enhance readme (Aug 21, 2024)
344b09b  add binning tracking (Aug 21, 2024)
d5ed6da  reformat (Aug 21, 2024)
bb16fd4  reformat (Aug 21, 2024)
ae5b057  reformat (Aug 21, 2024)
0000b2a  adjust name (Aug 21, 2024)
4bc5924  add progress bar to binning (Aug 21, 2024)
c442696  add a file buffer to binning (Aug 21, 2024)
d2830da  add CIDR regions (Aug 21, 2024)
297e4ff  make argument simpler (Aug 21, 2024)
8dbd722  update readme (Aug 21, 2024)
b6f5f97  add extra progress bar for binning (Aug 21, 2024)
5925438  skip ipv6 gcp (Aug 21, 2024)
5af93c5  add services to cache (Aug 21, 2024)
a43572a  add extra test case and debug (Aug 21, 2024)
89c0bd5  add helper function for in cidr (Aug 21, 2024)
b3a9bcc  add helper function for in cidr (Aug 21, 2024)
041abb9  add extra cache (Aug 21, 2024)
40cb4e9  add extra cache (Aug 21, 2024)
89022a4  add extra cache (Aug 21, 2024)
1604016  add extra cache (Aug 21, 2024)
74df297  try this again (Aug 21, 2024)
85ff066  fix skip condition in reducer (Aug 21, 2024)
1694ac4  remove other helper (Aug 21, 2024)
e170cd2  remove other helper (Aug 21, 2024)
121a974  debugging non-skip (Aug 21, 2024)
9730cab  debugging non-skip (Aug 21, 2024)
c6f712d  debugging non-skip (Aug 21, 2024)
9ed0a5b  debugging non-skip (Aug 21, 2024)
75579e9  Merge branch 'main' into make_resumable (CodyCBakerPhD, Aug 21, 2024)
Changes from 1 commit:

commit ed19f6e13dfe1c898b16fc1791c4bd7cea2bd595
refactor for simplicity and resumability
CodyCBakerPhD committed Aug 20, 2024
Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
pyproject.toml (4 changes: 1 addition, 3 deletions)
```diff
@@ -10,7 +10,7 @@ packages = ["src/dandi_s3_log_parser"]

 [project]
 name = "dandi_s3_log_parser"
-version="0.3.0"
+version="0.4.0"
 authors = [
   { name="Cody Baker", email="[email protected]" },
 ]
@@ -38,9 +38,7 @@ classifiers = [

 [project.scripts]
 reduce_all_dandi_raw_s3_logs = "dandi_s3_log_parser._command_line_interface:_reduce_all_dandi_raw_s3_logs_cli"
-reduce_dandi_raw_s3_log = "dandi_s3_log_parser._command_line_interface:_reduce_dandi_raw_s3_log_cli"
 map_reduced_logs_to_dandisets = "dandi_s3_log_parser._command_line_interface:_map_reduced_logs_to_dandisets_cli"
-find_random_example_line = "dandi_s3_log_parser._command_line_interface:_find_random_example_line_cli"
```
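For reference, each `[project.scripts]` entry maps an installed command name to a Python callable. Below is a minimal sketch, assuming only standard console-script mechanics, of what the surviving `reduce_all_dandi_raw_s3_logs` entry point resolves to; the wrapper itself is hypothetical, but the import path is copied from the table above.

```python
# Hypothetical stand-in for the console script generated on install for
# reduce_all_dandi_raw_s3_logs; the import path comes from [project.scripts].
from dandi_s3_log_parser._command_line_interface import _reduce_all_dandi_raw_s3_logs_cli

if __name__ == "__main__":
    # click commands are callable directly; options are parsed from sys.argv.
    _reduce_all_dandi_raw_s3_logs_cli()
```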
src/dandi_s3_log_parser/_command_line_interface.py (112 changes: 5 additions, 107 deletions)
```diff
@@ -2,23 +2,19 @@

 import collections
 import pathlib
-from typing import Literal

 import click

-from ._config import REQUEST_TYPES
 from ._dandi_s3_log_file_reducer import (
     reduce_all_dandi_raw_s3_logs,
-    reduce_dandi_raw_s3_log,
 )
 from ._dandiset_mapper import map_reduced_logs_to_dandisets
-from .testing import find_random_example_line


 @click.command(name="reduce_all_dandi_raw_s3_logs")
 @click.option(
-    "--base_raw_s3_logs_folder_path",
-    help="The path to the base folder containing all raw S3 log files.",
+    "--raw_s3_logs_folder_path",
+    help="The path to the folder containing all raw S3 log files.",
     required=True,
     type=click.Path(writable=False),
 )
```
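The rename above only touches the option declaration and its help text. As a minimal sketch, assuming click's standard behavior, here is how an option declared with `click.Path(writable=False)` arrives in the command function; the `demo` command is hypothetical and not part of this package.

```python
import click

@click.command(name="demo")
@click.option(
    "--raw_s3_logs_folder_path",
    help="The path to the folder containing all raw S3 log files.",
    required=True,
    type=click.Path(writable=False),  # write access not required; value arrives as a plain string
)
def demo(raw_s3_logs_folder_path: str) -> None:
    # Echo the received path to show the option-to-parameter mapping.
    click.echo(raw_s3_logs_folder_path)

if __name__ == "__main__":
    demo()
```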
```diff
@@ -56,7 +52,7 @@
     default=None,
 )
 def _reduce_all_dandi_raw_s3_logs_cli(
-    base_raw_s3_logs_folder_path: str,
+    raw_s3_logs_folder_path: str,
     reduced_s3_logs_folder_path: str,
     maximum_number_of_workers: int,
     maximum_buffer_size_in_mb: int,
```
```diff
@@ -69,65 +65,14 @@ def _reduce_all_dandi_raw_s3_logs_cli(
     maximum_buffer_size_in_bytes = maximum_buffer_size_in_mb * 10**6

     reduce_all_dandi_raw_s3_logs(
-        base_raw_s3_logs_folder_path=base_raw_s3_logs_folder_path,
+        raw_s3_logs_folder_path=raw_s3_logs_folder_path,
         reduced_s3_logs_folder_path=reduced_s3_logs_folder_path,
         maximum_number_of_workers=maximum_number_of_workers,
         maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
         excluded_ips=handled_excluded_ips,
     )


-@click.command(name="reduce_dandi_raw_s3_log")
-@click.option(
-    "--raw_s3_log_file_path",
-    help="The path to the raw S3 log file to be reduced.",
-    required=True,
-    type=click.Path(writable=False),
-)
-@click.option(
-    "--reduced_s3_logs_folder_path",
-    help="The path to write each reduced S3 log file to. There will be one file per handled asset ID.",
-    required=True,
-    type=click.Path(writable=True),
-)
-@click.option(
-    "--maximum_buffer_size_in_mb",
-    help=(
-        "The theoretical maximum amount of RAM (in MB) to use on each buffer iteration when reading from the "
-        "source text files. "
-        "Actual total RAM usage will be higher due to overhead and caching. "
-        "Automatically splits this total amount over the maximum number of workers if `maximum_number_of_workers` is "
-        "greater than one."
-    ),
-    required=False,
-    type=click.IntRange(min=1),  # Bare minimum of 1 MB
-    default=1_000,  # 1 GB recommended
-)
-@click.option(
-    "--excluded_ips",
-    help="A comma-separated list of IP addresses to exclude from reduction.",
-    required=False,
-    type=str,
-    default=None,
-)
-def _reduce_dandi_raw_s3_log_cli(
-    raw_s3_log_file_path: str,
-    reduced_s3_logs_folder_path: str,
-    excluded_ips: str | None,
-    maximum_buffer_size_in_mb: int,
-) -> None:
-    split_excluded_ips = excluded_ips.split(",") if excluded_ips is not None else list()
-    handled_excluded_ips = collections.defaultdict(bool) if len(split_excluded_ips) != 0 else None
-    for excluded_ip in split_excluded_ips:
-        handled_excluded_ips[excluded_ip] = True
-    maximum_buffer_size_in_bytes = maximum_buffer_size_in_mb * 10**6
-
-    reduce_dandi_raw_s3_log(
-        raw_s3_log_file_path=raw_s3_log_file_path,
-        reduced_s3_logs_folder_path=reduced_s3_logs_folder_path,
-        maximum_buffer_size_in_bytes=maximum_buffer_size_in_bytes,
-        excluded_ips=handled_excluded_ips,
-    )
-    return None


 @click.command(name="map_reduced_logs_to_dandisets")
```
```diff
@@ -150,51 +95,4 @@ def _map_reduced_logs_to_dandisets_cli(
         reduced_s3_logs_folder_path=reduced_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path
     )


-@click.command(name="find_random_example_line")
-@click.option(
-    "--raw_s3_log_folder_path",
-    help="The path to the folder containing the raw S3 log files.",
-    required=True,
-    type=click.Path(writable=False),
-)
-@click.option(
-    "--request_type",
-    help="The type of request to filter for.",
-    required=True,
-    type=click.Choice(REQUEST_TYPES),
-)
-@click.option(
-    "--maximum_lines_per_request_type",
-    help=(
-        "The maximum number of lines to randomly sample for each request type. "
-        "The default is 5. "
-        "These lines are always found chronologically from the start of the file."
-    ),
-    required=False,
-    type=click.IntRange(min=2),
-    default=100,
-)
-@click.option(
-    "--seed",
-    help="The seed to use for the random number generator. The default is 0.",
-    required=False,
-    type=click.IntRange(min=0),
-    default=0,
-)
-def _find_random_example_line_cli(
-    raw_s3_log_folder_path: str | pathlib.Path,
-    request_type: Literal[REQUEST_TYPES],
-    maximum_lines_per_request_type: int = 5,
-    seed: int = 0,
-) -> None:
-    """Find a randomly chosen line from a folder of raw S3 log files to serve as an example for testing purposes."""
-    example_line = find_random_example_line(
-        raw_s3_log_folder_path=raw_s3_log_folder_path,
-        request_type=request_type,
-        maximum_lines_per_request_type=maximum_lines_per_request_type,
-        seed=seed,
-    )
-    print(example_line)
-
-    return None
```
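After this commit, only `reduce_all_dandi_raw_s3_logs` and `map_reduced_logs_to_dandisets` remain as console scripts. Below is a hedged sketch of exercising the folder-level command with click's test runner; the paths are placeholders, and any options not shown in this diff are assumed to have usable defaults.

```python
from click.testing import CliRunner

from dandi_s3_log_parser._command_line_interface import _reduce_all_dandi_raw_s3_logs_cli

runner = CliRunner()
result = runner.invoke(
    _reduce_all_dandi_raw_s3_logs_cli,
    [
        "--raw_s3_logs_folder_path", "/data/raw_logs",          # placeholder path
        "--reduced_s3_logs_folder_path", "/data/reduced_logs",  # placeholder path
        "--excluded_ips", "192.0.2.1",
    ],
)
print(result.exit_code)
print(result.output)
```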