diff --git a/README.md b/README.md index 43a1488..ffcdb9b 100644 --- a/README.md +++ b/README.md @@ -129,16 +129,17 @@ bin_all_reduced_s3_logs_by_object_key \ --file_limit 20 ``` -In the summer of 2024, this process took less than 5 hours to bin all 170 GB of reduced log data. +In the summer of 2024, this process took less than 5 hours to bin all 170 GB of reduced logs into the 80 GB of data per object key. ### Mapping -The next step, which should also be updated regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets. +The next step, which is also the step to re-run and release regularly, is to iterate through all current versions of all Dandisets, mapping the binned logs to their corresponding file paths as seen on the archive. ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path < binned S3 logs folder path > \ - --dandiset_logs_folder_path < mapped Dandiset logs folder > + --mapped_s3_logs_folder_path < mapped Dandiset logs folder > \ + --object_type < blobs or zarr > ``` For example, on Drogon: @@ -146,10 +147,13 @@ For example, on Drogon: ```bash map_binned_s3_logs_to_dandisets \ --binned_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-binned \ - --dandiset_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped + --mapped_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-mapped \ + --object_type blobs ``` -In the summer of 2024, this process took less than ?? hours to run and in the current design should be run fresh regularly to keep the logs up to date. +In the summer of 2024, this `blobs` process took less than 12 hours to run with one worker (could easily be parallelized in the future) without any active caches. The caches that accumulate over time help speed up the process over repeated calls; a fresh run with caches only took less than ?? hours. + +`zarr` is likely to take longer, but the general process is the same. 
diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 8162b0e..20b51fe 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -101,7 +101,7 @@ def bin_all_reduced_s3_logs_by_object_key( ): object_key_as_path = pathlib.Path(object_key) binned_s3_log_file_path = ( - binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.stem}.tsv" + binned_s3_logs_folder_path / object_key_as_path.parent / f"{object_key_as_path.name}.tsv" ) binned_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) diff --git a/src/dandi_s3_log_parser/_command_line_interface.py b/src/dandi_s3_log_parser/_command_line_interface.py index d50d037..bf5ea80 100644 --- a/src/dandi_s3_log_parser/_command_line_interface.py +++ b/src/dandi_s3_log_parser/_command_line_interface.py @@ -2,6 +2,7 @@ import collections import pathlib +from typing import Literal import click @@ -128,11 +129,17 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( type=click.Path(writable=False), ) @click.option( - "--dandiset_logs_folder_path", + "--mapped_s3_logs_folder_path", help="", required=True, type=click.Path(writable=False), ) +@click.option( + "--object_type", + help="The type of objects to map the logs to, as determined by the parents of the object keys.", + required=True, + type=click.Choice(["blobs", "zarr"]), +) @click.option( "--dandiset_limit", help="The maximum number of Dandisets to process per call.", @@ -142,12 +149,14 @@ def _bin_all_reduced_s3_logs_by_object_key_cli( ) def _map_binned_s3_logs_to_dandisets_cli( binned_s3_logs_folder_path: pathlib.Path, - dandiset_logs_folder_path: pathlib.Path, + mapped_s3_logs_folder_path: pathlib.Path, + object_type: Literal["blobs", "zarr"], dandiset_limit: int | None, ) -> None: map_binned_s3_logs_to_dandisets( 
binned_s3_logs_folder_path=binned_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, + mapped_s3_logs_folder_path=mapped_s3_logs_folder_path, + object_type=object_type, dandiset_limit=dandiset_limit, ) diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index c0b0c68..1160d91 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -1,7 +1,9 @@ import os import pathlib +from typing import Literal import dandi.dandiapi +import natsort import pandas import tqdm from pydantic import DirectoryPath, validate_call @@ -12,11 +14,14 @@ @validate_call def map_binned_s3_logs_to_dandisets( binned_s3_logs_folder_path: DirectoryPath, - dandiset_logs_folder_path: DirectoryPath, + mapped_s3_logs_folder_path: DirectoryPath, + object_type: Literal["blobs", "zarr"], dandiset_limit: int | None = None, ) -> None: """ - Iterate over all dandisets and create a single .tsv per dandiset version containing reduced log for all assets. + Iterate over all dandisets and create a single .tsv per asset per dandiset version. + + Also creates a summary file per dandiset that has binned activity per day. Requires the `ipinfo` environment variables to be set (`IPINFO_CREDENTIALS` and `IP_HASH_SALT`). @@ -24,8 +29,10 @@ def map_binned_s3_logs_to_dandisets( ---------- binned_s3_logs_folder_path : DirectoryPath The path to the folder containing the reduced S3 log files. - dandiset_logs_folder_path : DirectoryPath + mapped_s3_logs_folder_path : DirectoryPath The path to the folder where the mapped logs will be saved. + object_type : one of "blobs" or "zarr" + The type of objects to map the logs to, as determined by the parents of the object keys. dandiset_limit : int, optional The maximum number of Dandisets to process per call. 
""" @@ -58,10 +65,11 @@ def map_binned_s3_logs_to_dandisets( mininterval=5.0, smoothing=0, ): - _map_reduced_logs_to_dandiset( + _map_binneded_logs_to_dandiset( dandiset=dandiset, - reduced_s3_logs_folder_path=binned_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, + binneded_s3_logs_folder_path=binned_s3_logs_folder_path, + dandiset_logs_folder_path=mapped_s3_logs_folder_path, + object_type=object_type, client=client, ip_hash_to_region=ip_hash_to_region, ip_hash_not_in_services=ip_hash_not_in_services, @@ -71,60 +79,86 @@ def map_binned_s3_logs_to_dandisets( _save_ip_hash_cache(name="services", ip_cache=ip_hash_not_in_services) -def _map_reduced_logs_to_dandiset( +def _map_binneded_logs_to_dandiset( dandiset: dandi.dandiapi.RemoteDandiset, - reduced_s3_logs_folder_path: pathlib.Path, + binneded_s3_logs_folder_path: pathlib.Path, dandiset_logs_folder_path: pathlib.Path, + object_type: Literal["blobs", "zarr"], client: dandi.dandiapi.DandiAPIClient, ip_hash_to_region: dict[str, str], ip_hash_not_in_services: dict[str, bool], ) -> None: dandiset_id = dandiset.identifier + dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id for version in dandiset.get_versions(): version_id = version.identifier + dandiset_version_log_folder_path = dandiset_log_folder_path / version_id dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id) - all_reduced_s3_logs = [] + all_activity_for_version = [] for asset in dandiset_version.get_assets(): - asset_suffixes = pathlib.Path(asset.path).suffixes + asset_as_path = pathlib.Path(asset.path) + asset_suffixes = asset_as_path.suffixes + dandi_filename = asset_as_path.name.removesuffix("".join(asset_suffixes)) + is_asset_zarr = ".zarr" in asset_suffixes + if is_asset_zarr and object_type == "blobs": + continue + if not is_asset_zarr and object_type == "zarr": + continue if is_asset_zarr: blob_id = asset.zarr - reduced_s3_log_file_path = reduced_s3_logs_folder_path / "zarr" 
/ f"{blob_id}.tsv" + reduced_s3_log_file_path = binneded_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" else: blob_id = asset.blob reduced_s3_log_file_path = ( - reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" + binneded_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) if not reduced_s3_log_file_path.exists(): continue # No reduced logs found (possible asset was never accessed); skip to next asset - reduced_s3_log = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) - reduced_s3_log["filename"] = [asset.path] * len(reduced_s3_log) - reduced_s3_log["region"] = [ + reduced_s3_log_binned_by_blob_id = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) + + reduced_s3_log_binned_by_blob_id["region"] = [ get_region_from_ip_address( ip_address=ip_address, ip_hash_to_region=ip_hash_to_region, ip_hash_not_in_services=ip_hash_not_in_services, ) - for ip_address in reduced_s3_log["ip_address"] + for ip_address in reduced_s3_log_binned_by_blob_id["ip_address"] ] - reordered_reduced_s3_log = reduced_s3_log.reindex(columns=("filename", "timestamp", "bytes_sent", "region")) - all_reduced_s3_logs.append(reordered_reduced_s3_log) + reordered_reduced_s3_log = reduced_s3_log_binned_by_blob_id.reindex( + columns=("timestamp", "bytes_sent", "region") + ) + reordered_reduced_s3_log.sort_values(by="timestamp", key=natsort.natsort_keygen(), inplace=True) + reordered_reduced_s3_log.index = range(len(reordered_reduced_s3_log)) + + dandiset_version_log_folder_path.mkdir(parents=True, exist_ok=True) + version_asset_file_path = dandiset_version_log_folder_path / f"{dandi_filename}.tsv" + reordered_reduced_s3_log.to_csv( + path_or_buf=version_asset_file_path, mode="w", sep="\t", header=True, index=True + ) - if len(all_reduced_s3_logs) == 0: + all_activity_for_version.append(reordered_reduced_s3_log) + + if len(all_activity_for_version) == 0: continue # No reduced logs found 
(possible dandiset version was never accessed); skip to next version - mapped_log = pandas.concat(objs=all_reduced_s3_logs, ignore_index=True) - mapped_log.sort_values(by="timestamp") - mapped_log.index = range(len(mapped_log)) + mapped_log = pandas.concat(objs=all_activity_for_version, ignore_index=True) + mapped_log["date"] = [entry[:10] for entry in mapped_log["timestamp"]] + + mapped_log_aggregated = mapped_log.groupby("date", as_index=False)["bytes_sent"].agg([list, "sum"]) + mapped_log_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) + + mapped_log_binned_per_day = mapped_log_aggregated.reindex(columns=("date", "bytes_sent")) + mapped_log_binned_per_day.sort_values(by="date", key=natsort.natsort_keygen(), inplace=True) + + summary_file_path = dandiset_version_log_folder_path / "summary.tsv" + mapped_log_binned_per_day.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True) - dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id - dandiset_log_folder_path.mkdir(exist_ok=True) - version_file_path = dandiset_log_folder_path / f"{version_id}.tsv" - mapped_log.to_csv(path_or_buf=version_file_path, mode="w", sep="\t", header=True, index=True) + return None diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv deleted file mode 100644 index 5b0b45b..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16T02:21:12 512 unknown -1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04T05:06:35 512 unknown diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv new file mode 100644 index 0000000..8cf6189 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 512 unknown +1 2022-05-04T05:06:35 512 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv new file mode 100644 index 0000000..7285c44 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2022-03-16 512 +1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv deleted file mode 100644 index 427dfc3..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown -1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv new file mode 100644 index 0000000..2275f2e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2021-04-24T12:03:05 1443 unknown +1 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv new file mode 100644 index 0000000..194af9f --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2021-04-24 1443 +1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv deleted file mode 100644 index 427dfc3..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft.tsv +++ /dev/null @@ -1,3 +0,0 @@ - filename timestamp bytes_sent region -0 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-04-24T12:03:05 1443 unknown -1 sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2021-12-31T23:06:42 1443 unknown diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv new file mode 100644 index 0000000..2275f2e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.tsv @@ -0,0 +1,3 @@ + timestamp bytes_sent region +0 2021-04-24T12:03:05 1443 unknown +1 2021-12-31T23:06:42 1443 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv new file mode 100644 index 0000000..194af9f --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv @@ -0,0 +1,3 @@ + date bytes_sent +0 2021-04-24 1443 +1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv deleted file mode 100644 index af8e92c..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft.tsv +++ /dev/null @@ -1,2 +0,0 @@ - filename timestamp bytes_sent region -0 sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 2023-01-01T22:42:58 1526223 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv 
b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv new file mode 100644 index 0000000..de89c72 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2023-01-01T22:42:58 1526223 unknown diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv new file mode 100644 index 0000000..840e3cf --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv @@ -0,0 +1,2 @@ + date bytes_sent +0 2023-01-01 1526223 diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index 6d2b6a3..1ad0811 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -11,46 +11,41 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): file_parent = pathlib.Path(__file__).parent examples_folder_path = file_parent / "examples" / "mapped_to_dandisets_example_0" - reduced_s3_logs_folder_path = examples_folder_path / "binned_logs" - dandiset_logs_folder_path = tmpdir + example_binned_s3_logs_folder_path = examples_folder_path / "binned_logs" - dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( - binned_s3_logs_folder_path=reduced_s3_logs_folder_path, - dandiset_logs_folder_path=dandiset_logs_folder_path, - ) + test_mapped_s3_logs_folder_path = tmpdir expected_output_folder_path = 
examples_folder_path / "expected_output" - # Ensure to extra folders were created - test_dandiset_id_folder_paths = [ - dandiset_id_folder_path.stem for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() - ] - expected_dandiset_id_folder_paths = [ - dandiset_id_folder_path.stem for dandiset_id_folder_path in expected_output_folder_path.iterdir() - ] - assert set(test_dandiset_id_folder_paths) == set(expected_dandiset_id_folder_paths) - - test_dandiset_version_id_file_paths = { - f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - for dandiset_id_folder_path in dandiset_logs_folder_path.iterdir() - for version_id_file_path in dandiset_id_folder_path.iterdir() + dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( + binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, + mapped_s3_logs_folder_path=test_mapped_s3_logs_folder_path, + object_type="blobs", + ) + dandi_s3_log_parser.map_binned_s3_logs_to_dandisets( + binned_s3_logs_folder_path=example_binned_s3_logs_folder_path, + mapped_s3_logs_folder_path=test_mapped_s3_logs_folder_path, + object_type="zarr", + ) + + test_file_paths = { + path.relative_to(test_mapped_s3_logs_folder_path): path + for path in test_mapped_s3_logs_folder_path.rglob("*.tsv") } - expected_dandiset_version_id_file_paths = { - f"{version_id_file_path.parent.name}/{version_id_file_path.name}": version_id_file_path - for dandiset_id_folder_path in expected_output_folder_path.iterdir() - for version_id_file_path in dandiset_id_folder_path.iterdir() + expected_file_paths = { + path.relative_to(expected_output_folder_path): path for path in expected_output_folder_path.rglob("*.tsv") } - assert set(test_dandiset_version_id_file_paths.keys()) == set(expected_dandiset_version_id_file_paths.keys()) + assert set(test_file_paths.keys()) == set(expected_file_paths.keys()) - for expected_version_id_file_path in expected_dandiset_version_id_file_paths.values(): - # Pandas assertion makes no reference 
to the file being tested when it fails - print(expected_version_id_file_path) + for expected_file_path in expected_file_paths.values(): + relative_file_path = expected_file_path.relative_to(expected_output_folder_path) + test_file_path = test_mapped_s3_logs_folder_path / relative_file_path - test_version_id_file_path = ( - dandiset_logs_folder_path / expected_version_id_file_path.parent.name / expected_version_id_file_path.name - ) + # Pandas assertion makes no reference to the file being tested when it fails + print(f"{test_file_path=}") + print(f"{expected_file_path=}") - test_mapped_log = pandas.read_table(filepath_or_buffer=test_version_id_file_path, index_col=0) - expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_version_id_file_path, index_col=0) + test_mapped_log = pandas.read_table(filepath_or_buffer=test_file_path, index_col=0) + expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_file_path, index_col=0) pandas.testing.assert_frame_equal(left=test_mapped_log, right=expected_mapped_log) diff --git a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py index 8e0b0d2..73593f7 100644 --- a/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py +++ b/tests/test_binning/test_bin_reduced_s3_logs_by_object_key.py @@ -25,6 +25,7 @@ def test_bin_reduced_s3_logs_by_object_key_example_0(tmpdir: py.path.local) -> N ) for expected_binned_s3_log_file_path in expected_binned_s3_log_file_paths: + # Pandas assertion makes no reference to the file being tested when it fails print(f"Testing binning of {expected_binned_s3_log_file_path}...") relative_file_path = expected_binned_s3_log_file_path.relative_to(expected_binned_s3_logs_folder_path)