From 5c1d444456903a8041224973da02425d97b51108 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 13:08:38 -0400 Subject: [PATCH 01/12] add splitting logic based on object type --- .../_map_binned_s3_logs_to_dandisets.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index c0b0c68..5f5e703 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -1,5 +1,6 @@ import os import pathlib +from typing import Literal import dandi.dandiapi import pandas @@ -13,6 +14,7 @@ def map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path: DirectoryPath, dandiset_logs_folder_path: DirectoryPath, + object_type: Literal["blobs", "zarr"], dandiset_limit: int | None = None, ) -> None: """ @@ -26,6 +28,8 @@ def map_binned_s3_logs_to_dandisets( The path to the folder containing the reduced S3 log files. dandiset_logs_folder_path : DirectoryPath The path to the folder where the mapped logs will be saved. + object_type : one of "blobs" or "zarr" + The type of objects to map the logs to, as determined by the parents of the object keys. dandiset_limit : int, optional The maximum number of Dandisets to process per call. """ @@ -62,6 +66,7 @@ def map_binned_s3_logs_to_dandisets( dandiset=dandiset, reduced_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, + object_type=object_type, client=client, ip_hash_to_region=ip_hash_to_region, ip_hash_not_in_services=ip_hash_not_in_services, @@ -75,6 +80,7 @@ def _map_reduced_logs_to_dandiset( dandiset: dandi.dandiapi.RemoteDandiset, reduced_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, + object_type: Literal["blobs", "zarr"], client: dandi.dandiapi.DandiAPIClient, ip_hash_to_region: dict[str, str], ip_hash_not_in_services: dict[str, bool], @@ -91,6 +97,11 @@ def _map_reduced_logs_to_dandiset( asset_suffixes = pathlib.Path(asset.path).suffixes is_asset_zarr = ".zarr" in asset_suffixes + if is_asset_zarr and object_type == "blobs": + continue + if not is_asset_zarr and object_type == "zarr": + continue + if is_asset_zarr: blob_id = asset.zarr reduced_s3_log_file_path = reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" From 318da4dfb2859d0a7ff87cce494367b397a06dd5 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 13:12:58 -0400 Subject: [PATCH 02/12] add cli and add to file name --- src/dandi_s3_log_parser/_command_line_interface.py | 6 ++++++ src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index d50d037..362510f 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -133,6 +133,12 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( required=True, type=click.Path(writable=False), ) +@click.option( + "--object_type", + help="The type of objects to map the logs to, as determined by the parents of the object keys.", + required=True, + type=click.Choice(["blobs", "zarr"]), +) @click.option( "--dandiset_limit", help="The maximum number of Dandisets to process per call.", diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 5f5e703..802bee9 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -137,5 +137,5 @@ def _map_reduced_logs_to_dandiset( dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id dandiset_log_folder_path.mkdir(exist_ok=True) - version_file_path = dandiset_log_folder_path / f"{version_id}.tsv" + version_file_path = dandiset_log_folder_path / f"{version_id}_{object_type}.tsv" mapped_log.to_csv(path_or_buf=version_file_path, mode="w", sep="\t", header=True, index=True) From 463494ba0a758b7700685a50d5e82f75b43b1d95 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 13:15:30 -0400 Subject: [PATCH 03/12] add cli and add to file name --- src/dandi_s3_log_parser/_command_line_interface.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 362510f..4c2a5e8 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -2,6 +2,7 @@ import collections import pathlib +from typing import Literal import click @@ -149,11 +150,13 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( def _map_binned_s3_logs_to_dandisets_cli( binned_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, + object_type: Literal["blobs", "zarr"], dandiset_limit: int | None, ) -> None: map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, + object_type=object_type, dandiset_limit=dandiset_limit, ) From 48b2b83034e8b78c2d8deb4ff7326047fc75182c Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Thu, 22 Aug 2024 13:17:38 -0400 Subject: [PATCH 04/12] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 43a1488..533ccf7 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,8 @@ The next step, which should also be updated regularly (daily-weekly), is to iter ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path < binned S3 logs folder path > \ - --dandiset_logs_folder_path < mapped Dandiset logs folder > + --dandiset_logs_folder_path < mapped Dandiset logs folder > \ + --object_type < blobs or zarr > ``` For example, on Drogon: @@ -146,10 +147,11 @@ For example, on Drogon: ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-binned \ - --dandiset_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped + --dandiset_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped \ + --object_type blobs ``` -In the summer of 2024, this process took less than ?? hours to run and in the current design should be run fresh regularly to keep the logs up to date. +In the summer of 2024, this process took less than ?? hours to run without any activate caches and in the current design should be run fresh regularly to keep the logs up to date. The caches that accumulate over time should help speed up the process over repeated calls. From c2ea25512999317e64b02c3224cfada90d3ffc3b Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 16:40:30 -0400 Subject: [PATCH 05/12] split activity per dandi file and add temporally binned summary --- .../_map_binned_s3_logs_to_dandisets.py | 61 +++++++++++++------ 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 802bee9..d2e5dd4 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -3,6 +3,7 @@ from typing import Literal import dandi.dandiapi +import natsort import pandas import tqdm from pydantic import DirectoryPath, validate_call @@ -18,7 +19,9 @@ def map_binned_s3_logs_to_dandisets( dandiset_limit: int | None = None, ) -> None: """ - Iterate over all dandisets and create a single .tsv per dandiset version containing reduced log for all assets. + Iterate over all dandisets and create a single .tsv per asset per dandiset version. + + Also creates a summary file per dandiset that has binned activity per day. Requires the `ipinfo` environment variables to be set (`IPINFO_CREDENTIALS` and `IP_HASH_SALT`). @@ -62,7 +65,7 @@ def map_binned_s3_logs_to_dandisets( mininterval=5.0, smoothing=0, ): - _map_reduced_logs_to_dandiset( + _map_binneded_logs_to_dandiset( dandiset=dandiset, reduced_s3_logs_folder_path=binned_s3_logs_folder_path, dandiset_logs_folder_path=dandiset_logs_folder_path, @@ -76,7 +79,7 @@ def map_binned_s3_logs_to_dandisets( _save_ip_hash_cache(name="services", ip_cache=ip_hash_not_in_services) -def _map_reduced_logs_to_dandiset( +def _map_binneded_logs_to_dandiset( dandiset: dandi.dandiapi.RemoteDandiset, reduced_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, @@ -86,15 +89,19 @@ def _map_reduced_logs_to_dandiset( ip_hash_not_in_services: dict[str, bool], ) -> None: dandiset_id = dandiset.identifier + dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id for version in dandiset.get_versions(): version_id = version.identifier + dandiset_version_log_folder_path = dandiset_log_folder_path / version_id dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id) - all_reduced_s3_logs = [] + all_activity_for_version = [] for asset in dandiset_version.get_assets(): - asset_suffixes = pathlib.Path(asset.path).suffixes + dandi_filename = asset.path.name + + asset_suffixes = pathlib.Path(dandi_filename).suffixes is_asset_zarr = ".zarr" in asset_suffixes if is_asset_zarr and object_type == "blobs": @@ -114,28 +121,44 @@ def _map_reduced_logs_to_dandiset( if not reduced_s3_log_file_path.exists(): continue # No reduced logs found (possible asset was never accessed); skip to next asset - reduced_s3_log = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) - reduced_s3_log["filename"] = [asset.path] * len(reduced_s3_log) - reduced_s3_log["region"] = [ + reduced_s3_log_binned_by_blob_id = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) + + reduced_s3_log_binned_by_blob_id["region"] = [ get_region_from_ip_address( ip_address=ip_address, ip_hash_to_region=ip_hash_to_region, ip_hash_not_in_services=ip_hash_not_in_services, ) - for ip_address in reduced_s3_log["ip_address"] + for ip_address in reduced_s3_log_binned_by_blob_id["ip_address"] ] - reordered_reduced_s3_log = reduced_s3_log.reindex(columns=("filename", "timestamp", "bytes_sent", "region")) - all_reduced_s3_logs.append(reordered_reduced_s3_log) + reordered_reduced_s3_log = reduced_s3_log_binned_by_blob_id.reindex( + columns=("timestamp", "bytes_sent", "region") + ) + reordered_reduced_s3_log.sort_values(by="timestamp", key=natsort.natsort_keygen()) + reordered_reduced_s3_log.index = range(len(reordered_reduced_s3_log)) + + dandiset_version_log_folder_path.mkdir(parents=True, exist_ok=True) + version_asset_file_path = dandiset_version_log_folder_path / f"{dandi_filename}.tsv" + reordered_reduced_s3_log.to_csv( + path_or_buf=version_asset_file_path, mode="w", sep="\t", header=True, index=True + ) - if len(all_reduced_s3_logs) == 0: + all_activity_for_version.append(reordered_reduced_s3_log) + + if len(all_activity_for_version) == 0: continue # No reduced logs found (possible dandiset version was never accessed); skip to next version - mapped_log = pandas.concat(objs=all_reduced_s3_logs, ignore_index=True) - mapped_log.sort_values(by="timestamp") - mapped_log.index = range(len(mapped_log)) + mapped_log = pandas.concat(objs=all_activity_for_version, ignore_index=True) + mapped_log["date"] = [entry[:10] for entry in mapped_log["timestamp"]] + + mapped_log_aggregated = mapped_log.groupby("date", as_index=False)["bytes_sent"].agg([list, "sum"]) + mapped_log_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) + + mapped_log_binned_per_day = mapped_log_aggregated.reindex(columns=("date", "bytes_sent")) + mapped_log_binned_per_day.sort_values(by="date", key=natsort.natsort_keygen()) + + summary_file_path = dandiset_version_log_folder_path / "summary.tsv" + mapped_log_binned_per_day.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True) - dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id - dandiset_log_folder_path.mkdir(exist_ok=True) - version_file_path = dandiset_log_folder_path / f"{version_id}_{object_type}.tsv" - mapped_log.to_csv(path_or_buf=version_file_path, mode="w", sep="\t", header=True, index=True) + return None From 81f045a36921b097ecfb628966ffb5feef4818b0 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 16:42:46 -0400 Subject: [PATCH 06/12] split activity per dandi file and add temporally binned summary --- src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index d2e5dd4..ba8e18b 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -99,11 +99,11 @@ def _map_binneded_logs_to_dandiset( all_activity_for_version = [] for asset in dandiset_version.get_assets(): - dandi_filename = asset.path.name + asset_as_path = pathlib.Path(asset.path) + dandi_filename = asset_as_path.name + asset_suffixes = asset_as_path.suffixes - asset_suffixes = pathlib.Path(dandi_filename).suffixes is_asset_zarr = ".zarr" in asset_suffixes - if is_asset_zarr and object_type == "blobs": continue if not is_asset_zarr and object_type == "zarr": From 6ba25576367303773725f3849a67f824772077dc Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 17:12:52 -0400 Subject: [PATCH 07/12] adjusting tests --- .../_map_binned_s3_logs_to_dandisets.py | 2 +- .../expected_output/000003/0.210812.1448.tsv | 3 - ...es-YutaMouse20-140327_behavior+ecephys.tsv | 3 + .../000003/0.210812.1448/summary.tsv | 3 + .../expected_output/000003/0.230629.1955.tsv | 3 - ...es-YutaMouse20-140327_behavior+ecephys.tsv | 3 + .../000003/0.230629.1955/summary.tsv | 3 + .../expected_output/000003/draft.tsv | 3 - ...es-YutaMouse20-140327_behavior+ecephys.tsv | 3 + .../expected_output/000003/draft/summary.tsv | 3 + .../expected_output/000013/0.220126.2143.tsv | 3 - ...31015_obj-odx8px_behavior+icephys+ogen.tsv | 3 + .../000013/0.220126.2143/summary.tsv | 3 + .../expected_output/000013/draft.tsv | 3 - ...31015_obj-odx8px_behavior+icephys+ogen.tsv | 3 + .../expected_output/000013/draft/summary.tsv | 3 + .../expected_output/000108/draft.tsv | 2 - ...sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv | 2 + .../expected_output/000108/draft/summary.tsv | 2 + ...st_map_all_reduced_s3_logs_to_dandisets.py | 57 +++++++++++-------- .../test_bin_reduced_s3_logs_by_object_key.py | 1 + 21 files changed, 70 insertions(+), 41 deletions(-) delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index ba8e18b..b6c6484 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -100,7 +100,7 @@ def _map_binneded_logs_to_dandiset( all_activity_for_version = [] for asset in dandiset_version.get_assets(): asset_as_path = pathlib.Path(asset.path) - dandi_filename = asset_as_path.name + dandi_filename = asset_as_path.stem asset_suffixes = asset_as_path.suffixes is_asset_zarr = ".zarr" in asset_suffixes diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv deleted file mode 100644 index 427dfc3..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown -1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv new file mode 100644 index 0000000..2275f2e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2021-04-24T12:03:05 1443 unknown +1 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv new file mode 100644 index 0000000..194af9f --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2021-04-24 1443 +1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv deleted file mode 100644 index 427dfc3..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown -1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv new file mode 100644 index 0000000..2275f2e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2021-04-24T12:03:05 1443 unknown +1 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv new file mode 100644 index 0000000..194af9f --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2021-04-24 1443 +1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv deleted file mode 100644 index af8e92c..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv +++ /dev/null @@ -1,2 +0,0 @@ - filename timestamp bytes_sent region -0 sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 2023-01-01T22:42:58 1526223 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv new file mode 100644 index 0000000..de89c72 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2023-01-01T22:42:58 1526223 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv new file mode 100644 index 0000000..840e3cf --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv @@ -0,0 +1,2 @@ + date bytes_sent +0 2023-01-01 1526223 diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index 6d2b6a3..37f3461 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -11,46 +11,57 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): file_parent = pathlib.Path(__file__).parent examples_folder_path = file_parent / "examples" / "mapped_to_dandisets_example_0" - reduced_s3_logs_folder_path = examples_folder_path / "binned_logs" - dandiset_logs_folder_path = tmpdir + example_binned_s3_logs_folder_path = examples_folder_path / "binned_logs" - dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( - binned_s3_logs_folder_path=reduced_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, - ) + test_dandiset_logs_folder_path = tmpdir expected_output_folder_path = examples_folder_path / "expected_output" + dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( + binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, + dandiset_logs_folder_path=test_dandiset_logs_folder_path, + object_type="blobs", + ) + dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( + binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, + dandiset_logs_folder_path=test_dandiset_logs_folder_path, + object_type="zarr", + ) + # Ensure to extra folders were created test_dandiset_id_folder_paths = [ - dandiset_id_folder_path.stem for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() + dandiset_id_folder_path.stem for dandiset_id_folder_path in test_dandiset_logs_folder_path.iterdir() ] expected_dandiset_id_folder_paths = [ dandiset_id_folder_path.stem for dandiset_id_folder_path in expected_output_folder_path.iterdir() ] assert set(test_dandiset_id_folder_paths) == set(expected_dandiset_id_folder_paths) - test_dandiset_version_id_file_paths = { - f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() - for version_id_file_path in dandiset_id_folder_path.iterdir() + # test_dandiset_version_id_file_paths = { + # f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path + # for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() + # for version_id_file_path in dandiset_id_folder_path.iterdir() + # } + # expected_dandiset_version_id_file_paths = { + # f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path + # for dandiset_id_folder_path in expected_output_folder_path.iterdir() + # for version_id_file_path in dandiset_id_folder_path.iterdir() + # } + test_file_paths = { + path.relative_to(test_dandiset_logs_folder_path): path for path in test_dandiset_logs_folder_path.rglob("*.tsv") } - expected_dandiset_version_id_file_paths = { - f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - for dandiset_id_folder_path in expected_output_folder_path.iterdir() - for version_id_file_path in dandiset_id_folder_path.iterdir() + expected_file_paths = { + path.relative_to(expected_output_folder_path): path for path in expected_output_folder_path.rglob("*.tsv") } - assert set(test_dandiset_version_id_file_paths.keys()) == set(expected_dandiset_version_id_file_paths.keys()) + assert set(test_file_paths.keys()) == set(expected_file_paths.keys()) - for expected_version_id_file_path in expected_dandiset_version_id_file_paths.values(): + for expected_file_path in expected_file_paths.values(): # Pandas assertion makes no reference to the file being tested when it fails - print(expected_version_id_file_path) + print(expected_file_path) - test_version_id_file_path = ( - dandiset_logs_folder_path / expected_version_id_file_path.parent.name / expected_version_id_file_path.name - ) + test_file_path = test_dandiset_logs_folder_path / expected_file_path.parent.name / expected_file_path.name - test_mapped_log = pandas.read_table(filepath_or_buffer=test_version_id_file_path, index_col=0) - expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_version_id_file_path, index_col=0) + test_mapped_log = pandas.read_table(filepath_or_buffer=test_file_path, index_col=0) + expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_file_path, index_col=0) pandas.testing.assert_frame_equal(left=test_mapped_log, right=expected_mapped_log) diff --git a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py index 8e0b0d2..73593f7 100644 --- a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py +++ b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py @@ -25,6 +25,7 @@ def test_bin_reduced_s3_logs_by_object_key_example_0(tmpdir: py.path.local) -> N ) for expected_binned_s3_log_file_path in expected_binned_s3_log_file_paths: + # Pandas assertion makes no reference to the file being tested when it fails print(f"Testing binning of {expected_binned_s3_log_file_path}...") relative_file_path = expected_binned_s3_log_file_path.relative_to(expected_binned_s3_logs_folder_path) From 085c9136007595b668acecb8de653c80b8ee33f8 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 17:17:52 -0400 Subject: [PATCH 08/12] fix in place --- src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index b6c6484..5ea9f1e 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -135,7 +135,7 @@ def _map_binneded_logs_to_dandiset( reordered_reduced_s3_log = reduced_s3_log_binned_by_blob_id.reindex( columns=("timestamp", "bytes_sent", "region") ) - reordered_reduced_s3_log.sort_values(by="timestamp", key=natsort.natsort_keygen()) + reordered_reduced_s3_log.sort_values(by="timestamp", key=natsort.natsort_keygen(), inplace=True) reordered_reduced_s3_log.index = range(len(reordered_reduced_s3_log)) dandiset_version_log_folder_path.mkdir(parents=True, exist_ok=True) @@ -156,7 +156,7 @@ def _map_binneded_logs_to_dandiset( mapped_log_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) mapped_log_binned_per_day = mapped_log_aggregated.reindex(columns=("date", "bytes_sent")) - mapped_log_binned_per_day.sort_values(by="date", key=natsort.natsort_keygen()) + mapped_log_binned_per_day.sort_values(by="date", key=natsort.natsort_keygen(), inplace=True) summary_file_path = dandiset_version_log_folder_path / "summary.tsv" mapped_log_binned_per_day.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True) From ec8cb2b7ce048b8a8e385faf377adfa3c2c9db9c Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 21:26:06 -0400 Subject: [PATCH 09/12] fix zarr filenames --- .../_bin_all_reduced_s3_logs_by_object_key.py | 2 +- src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py | 2 +- .../test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 8162b0e..20b51fe 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -101,7 +101,7 @@ def bin_all_reduced_s3_logs_by_object_key( ): object_key_as_path = pathlib.Path(object_key) binned_s3_log_file_path = ( - binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv" + binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.name}.tsv" ) binned_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 5ea9f1e..49e6484 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -100,8 +100,8 @@ def _map_binneded_logs_to_dandiset( all_activity_for_version = [] for asset in dandiset_version.get_assets(): asset_as_path = pathlib.Path(asset.path) - dandi_filename = asset_as_path.stem asset_suffixes = asset_as_path.suffixes + dandi_filename = asset_as_path.name.rstrip("".join(asset_suffixes)) is_asset_zarr = ".zarr" in asset_suffixes if is_asset_zarr and object_type == "blobs": diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index 37f3461..f0f13bf 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -30,10 +30,10 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): # Ensure to extra folders were created test_dandiset_id_folder_paths = [ - dandiset_id_folder_path.stem for dandiset_id_folder_path in test_dandiset_logs_folder_path.iterdir() + dandiset_id_folder_path.name for dandiset_id_folder_path in test_dandiset_logs_folder_path.iterdir() ] expected_dandiset_id_folder_paths = [ - dandiset_id_folder_path.stem for dandiset_id_folder_path in expected_output_folder_path.iterdir() + dandiset_id_folder_path.name for dandiset_id_folder_path in expected_output_folder_path.iterdir() ] assert set(test_dandiset_id_folder_paths) == set(expected_dandiset_id_folder_paths) From 11b9d8e9ff75fb07666e77f719e5418c53724a25 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 22:40:08 -0400 Subject: [PATCH 10/12] fix zarr filenames --- src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 49e6484..2ff778d 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -101,7 +101,7 @@ def _map_binneded_logs_to_dandiset( for asset in dandiset_version.get_assets(): asset_as_path = pathlib.Path(asset.path) asset_suffixes = asset_as_path.suffixes - dandi_filename = asset_as_path.name.rstrip("".join(asset_suffixes)) + dandi_filename = asset_as_path.name.removesuffix("".join(asset_suffixes)) is_asset_zarr = ".zarr" in asset_suffixes if is_asset_zarr and object_type == "blobs": From 02a2f65bd1f48ef4ee2b33e007b5781de37ef994 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 23:20:45 -0400 Subject: [PATCH 11/12] fix zarr filenames --- .../test_map_all_reduced_s3_logs_to_dandisets.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index f0f13bf..b703954 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -56,10 +56,12 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): assert set(test_file_paths.keys()) == set(expected_file_paths.keys()) for expected_file_path in expected_file_paths.values(): - # Pandas assertion makes no reference to the file being tested when it fails - print(expected_file_path) + relative_file_path = expected_file_path.relative_to(expected_output_folder_path) + test_file_path = test_dandiset_logs_folder_path / relative_file_path - test_file_path = test_dandiset_logs_folder_path / expected_file_path.parent.name / expected_file_path.name + # Pandas assertion makes no reference to the file being tested when it fails + print(f"{test_file_path=}") + print(f"{expected_file_path=}") test_mapped_log = pandas.read_table(filepath_or_buffer=test_file_path, index_col=0) expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_file_path, index_col=0) From 797820cbc06c649bbbd5cfcbee8f922270865288 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Thu, 22 Aug 2024 23:30:32 -0400 Subject: [PATCH 12/12] update name to match --- README.md | 12 ++++---- .../_command_line_interface.py | 6 ++-- .../_map_binned_s3_logs_to_dandisets.py | 14 ++++----- ...st_map_all_reduced_s3_logs_to_dandisets.py | 30 ++++--------------- 4 files changed, 23 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 533ccf7..ffcdb9b 100644 --- a/README.md +++ b/README.md @@ -129,16 +129,16 @@ bin_all_reduced_s3_logs_by_object_key \ --file_limit 20 ``` -In the summer of 2024, this process took less than 5 hours to bin all 170 GB of reduced log data. +In the summer of 2024, this process took less than 5 hours to bin all 170 GB of reduced logs into the 80 GB of data per object key. ### Mapping -The next step, which should also be updated regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets. +The next step, which is also the step to re-run and release regularly, is to iterate through all current versions of all Dandisets, mapping the binned logs to their corresponding file paths as seen on the archive. ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path < binned S3 logs folder path > \ - --dandiset_logs_folder_path < mapped Dandiset logs folder > \ + --mapped_s3_logs_folder_path < mapped Dandiset logs folder > \ --object_type < blobs or zarr > ``` @@ -147,11 +147,13 @@ For example, on Drogon: ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-binned \ - --dandiset_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped \ + --mapped_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped \ --object_type blobs ``` -In the summer of 2024, this process took less than ?? hours to run without any activate caches and in the current design should be run fresh regularly to keep the logs up to date. The caches that accumulate over time should help speed up the process over repeated calls. +In the summer of 2024, this `blobs` process took less than 12 hours to run with one worker (could easily be parallelized in the future) without any activate caches. The caches that accumulate over time help speed up the process over repeated calls; a fresh run with caches only took less than ?? hours. + +`zarr` is likely to take longer, but the general process is the same. diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index 4c2a5e8..bf5ea80 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -129,7 +129,7 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( type=click.Path(writable=False), ) @click.option( - "--dandiset_logs_folder_path", + "--mapped_s3_logs_folder_path", help="", required=True, type=click.Path(writable=False), @@ -149,13 +149,13 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( ) def _map_binned_s3_logs_to_dandisets_cli( binned_s3_logs_folder_path: pathlib.Path, - dandiset_logs_folder_path: pathlib.Path, + mapped_s3_logs_folder_path: pathlib.Path, object_type: Literal["blobs", "zarr"], dandiset_limit: int | None, ) -> None: map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path=binned_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, + mapped_s3_logs_folder_path=mapped_s3_logs_folder_path, object_type=object_type, dandiset_limit=dandiset_limit, ) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 2ff778d..1160d91 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -14,7 +14,7 @@ @validate_call def map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path: DirectoryPath, - dandiset_logs_folder_path: DirectoryPath, + mapped_s3_logs_folder_path: DirectoryPath, object_type: Literal["blobs", "zarr"], dandiset_limit: int | None = None, ) -> None: @@ -29,7 +29,7 @@ def map_binned_s3_logs_to_dandisets( ---------- binned_s3_logs_folder_path : DirectoryPath The path to the folder containing the reduced S3 log files. - dandiset_logs_folder_path : DirectoryPath + mapped_s3_logs_folder_path : DirectoryPath The path to the folder where the mapped logs will be saved. object_type : one of "blobs" or "zarr" The type of objects to map the logs to, as determined by the parents of the object keys. @@ -67,8 +67,8 @@ def map_binned_s3_logs_to_dandisets( ): _map_binneded_logs_to_dandiset( dandiset=dandiset, - reduced_s3_logs_folder_path=binned_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, + binneded_s3_logs_folder_path=binned_s3_logs_folder_path, + dandiset_logs_folder_path=mapped_s3_logs_folder_path, object_type=object_type, client=client, ip_hash_to_region=ip_hash_to_region, @@ -81,7 +81,7 @@ def map_binned_s3_logs_to_dandisets( def _map_binneded_logs_to_dandiset( dandiset: dandi.dandiapi.RemoteDandiset, - reduced_s3_logs_folder_path: pathlib.Path, + binneded_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, object_type: Literal["blobs", "zarr"], client: dandi.dandiapi.DandiAPIClient, @@ -111,11 +111,11 @@ def _map_binneded_logs_to_dandiset( if is_asset_zarr: blob_id = asset.zarr - reduced_s3_log_file_path = reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" + reduced_s3_log_file_path = binneded_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" else: blob_id = asset.blob reduced_s3_log_file_path = ( - reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" + binneded_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) if not reduced_s3_log_file_path.exists(): diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index b703954..1ad0811 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -13,42 +13,24 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): examples_folder_path = file_parent / "examples" / "mapped_to_dandisets_example_0" example_binned_s3_logs_folder_path = examples_folder_path / "binned_logs" - test_dandiset_logs_folder_path = tmpdir + test_mapped_s3_logs_folder_path = tmpdir expected_output_folder_path = examples_folder_path / "expected_output" dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, - dandiset_logs_folder_path=test_dandiset_logs_folder_path, + mapped_s3_logs_folder_path=test_mapped_s3_logs_folder_path, object_type="blobs", ) dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, - dandiset_logs_folder_path=test_dandiset_logs_folder_path, + mapped_s3_logs_folder_path=test_mapped_s3_logs_folder_path, object_type="zarr", ) - # Ensure to extra folders were created - test_dandiset_id_folder_paths = [ - dandiset_id_folder_path.name for dandiset_id_folder_path in test_dandiset_logs_folder_path.iterdir() - ] - expected_dandiset_id_folder_paths = [ - dandiset_id_folder_path.name for dandiset_id_folder_path in expected_output_folder_path.iterdir() - ] - assert set(test_dandiset_id_folder_paths) == set(expected_dandiset_id_folder_paths) - - # test_dandiset_version_id_file_paths = { - # f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - # for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() - # for version_id_file_path in dandiset_id_folder_path.iterdir() - # } - # expected_dandiset_version_id_file_paths = { - # f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - # for dandiset_id_folder_path in expected_output_folder_path.iterdir() - # for version_id_file_path in dandiset_id_folder_path.iterdir() - # } test_file_paths = { - path.relative_to(test_dandiset_logs_folder_path): path for path in test_dandiset_logs_folder_path.rglob("*.tsv") + path.relative_to(test_mapped_s3_logs_folder_path): path + for path in test_mapped_s3_logs_folder_path.rglob("*.tsv") } expected_file_paths = { path.relative_to(expected_output_folder_path): path for path in expected_output_folder_path.rglob("*.tsv") @@ -57,7 +39,7 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): for expected_file_path in expected_file_paths.values(): relative_file_path = expected_file_path.relative_to(expected_output_folder_path) - test_file_path = test_dandiset_logs_folder_path / relative_file_path + test_file_path = test_mapped_s3_logs_folder_path / relative_file_path # Pandas assertion makes no reference to the file being tested when it fails print(f"{test_file_path=}")