diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index bd9f565..e930aee 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -113,7 +113,7 @@ def _map_binned_logs_to_dandiset( all_activity_across_versions_per_blob_id_by_day = dict() blob_id_to_asset_path = dict() - all_activity_across_versions_by_blob_id = dict() + total_bytes_across_versions_by_blob_id = dict() dandiset_versions = list(dandiset.get_versions()) for version in tqdm.tqdm( iterable=dandiset_versions, @@ -130,8 +130,8 @@ def _map_binned_logs_to_dandiset( dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id) - all_activity_for_version_by_day = [] - all_activity_for_version_by_asset_path = dict() + reduced_s3_logs_per_day = [] + total_bytes_per_asset_path = dict() dandiset_version_assets = list(dandiset_version.get_assets()) for asset in tqdm.tqdm( iterable=dandiset_version_assets, @@ -192,55 +192,45 @@ def _map_binned_logs_to_dandiset( reordered_reduced_s3_log["date"] = [entry[:10] for entry in reordered_reduced_s3_log["timestamp"]] reordered_reduced_s3_log_binned_per_day = reordered_reduced_s3_log.reindex(columns=("date", "bytes_sent")) - all_activity_for_version_by_day.append(reordered_reduced_s3_log_binned_per_day) + reduced_s3_logs_per_day.append(reordered_reduced_s3_log_binned_per_day) all_activity_across_versions_per_blob_id_by_day[blob_id] = reordered_reduced_s3_log_binned_per_day total_bytes = sum(reduced_s3_log_binned_by_blob_id["bytes_sent"]) - all_activity_for_version_by_asset_path[asset.path] = total_bytes + total_bytes_per_asset_path[asset.path] = total_bytes blob_id_to_asset_path[blob_id] = asset.path - all_activity_across_versions_by_blob_id[blob_id] = total_bytes + total_bytes_across_versions_by_blob_id[blob_id] = total_bytes - if len(all_activity_for_version_by_day) == 0: + if len(reduced_s3_logs_per_day) == 0: continue # No activity found (possible dandiset version was never accessed); skip to next version - aggregated_activity_for_version_by_day = _aggregate_activity_by_day( - reduced_s3_logs_per_day=all_activity_for_version_by_day - ) - aggregated_activity_for_version_by_asset = _aggregate_activity_by_asset( - total_bytes_per_asset_path=all_activity_for_version_by_asset_path + version_summary_by_day_file_path = dandiset_version_log_folder_path / "version_summary_by_day.tsv" + _write_aggregated_activity_by_day( + reduced_s3_logs_per_day=reduced_s3_logs_per_day, file_path=version_summary_by_day_file_path ) - version_summary_by_day_file_path = dandiset_version_log_folder_path / "version_summary_by_day.tsv" version_summary_by_asset_file_path = dandiset_version_log_folder_path / "version_summary_by_asset.tsv" - aggregated_activity_for_version_by_day.to_csv( - path_or_buf=version_summary_by_day_file_path, mode="w", sep="\t", header=True, index=False - ) - aggregated_activity_for_version_by_asset.to_csv( - path_or_buf=version_summary_by_asset_file_path, mode="w", sep="\t", header=True, index=False + _write_aggregated_activity_by_asset( + total_bytes_per_asset_path=total_bytes_per_asset_path, file_path=version_summary_by_asset_file_path ) if len(all_activity_across_versions_per_blob_id_by_day) == 0: return None # No activity found (possible dandiset was never accessed); skip to next version - all_activity_across_versions_by_asset = collections.defaultdict(int) + # Single path across versions could have been replaced at various points by a new blob + total_bytes_across_versions_by_asset = collections.defaultdict(int) for blob_id, asset_path in blob_id_to_asset_path.items(): - all_activity_across_versions_by_asset[asset_path] += all_activity_across_versions_by_blob_id[blob_id] + total_bytes_across_versions_by_asset[asset_path] += total_bytes_across_versions_by_blob_id[blob_id] - aggregated_activity_for_dandiset_by_day = _aggregate_activity_by_day( - reduced_s3_logs_per_day=all_activity_across_versions_per_blob_id_by_day.values() - ) - aggregated_activity_for_dandiset_by_asset = _aggregate_activity_by_asset( - total_bytes_per_asset_path=all_activity_across_versions_by_asset + dandiset_summary_by_day_file_path = dandiset_log_folder_path / "dandiset_summary_by_day.tsv" + _write_aggregated_activity_by_day( + reduced_s3_logs_per_day=all_activity_across_versions_per_blob_id_by_day.values(), + file_path=dandiset_summary_by_day_file_path, ) - dandiset_summary_by_day_file_path = dandiset_log_folder_path / "dandiset_summary_by_day.tsv" dandiset_summary_by_asset_file_path = dandiset_log_folder_path / "dandiset_summary_by_asset.tsv" - aggregated_activity_for_dandiset_by_day.to_csv( - path_or_buf=dandiset_summary_by_day_file_path, mode="w", sep="\t", header=True, index=False - ) - aggregated_activity_for_dandiset_by_asset.to_csv( - path_or_buf=dandiset_summary_by_asset_file_path, mode="w", sep="\t", header=True, index=False + _write_aggregated_activity_by_asset( + total_bytes_per_asset_path=total_bytes_across_versions_by_asset, file_path=dandiset_summary_by_asset_file_path ) return None @@ -262,7 +252,27 @@ def _aggregate_activity_by_asset(total_bytes_per_asset_path: dict[str, int]) -> aggregated_activity_by_asset = pandas.DataFrame( data=[list(total_bytes_per_asset_path.keys()), list(total_bytes_per_asset_path.values())] ).T - aggregated_activity_by_asset.rename(columns={"0": "asset_path", "1": "bytes_sent"}, inplace=True) - aggregated_activity_by_asset.sort_values(by="bytes_sent", inplace=True) + aggregated_activity_by_asset.rename(columns={0: "asset_path", 1: "bytes_sent"}, inplace=True) + aggregated_activity_by_asset.sort_values(by="bytes_sent", ascending=False, inplace=True) return aggregated_activity_by_asset + + +def _write_aggregated_activity_by_day( + reduced_s3_logs_per_day: Iterable[pandas.DataFrame], file_path: pathlib.Path +) -> None: + aggregated_activity_for_version_by_day = _aggregate_activity_by_day(reduced_s3_logs_per_day=reduced_s3_logs_per_day) + aggregated_activity_for_version_by_day.to_csv(path_or_buf=file_path, mode="w", sep="\t", header=True, index=False) + + return None + + +def _write_aggregated_activity_by_asset(total_bytes_per_asset_path: dict[str, int], file_path: pathlib.Path) -> None: + aggregated_activity_for_dandiset_by_asset = _aggregate_activity_by_asset( + total_bytes_per_asset_path=total_bytes_per_asset_path + ) + aggregated_activity_for_dandiset_by_asset.to_csv( + path_or_buf=file_path, mode="w", sep="\t", header=True, index=False + ) + + return None diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv index 9117d48..18cda11 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv @@ -1,3 +1,3 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 +asset_path bytes_sent sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv index 9117d48..18cda11 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv @@ -1,3 +1,3 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 +asset_path bytes_sent sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv index 4313612..3e94027 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 +2022-03-16 1746 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv index 9117d48..18cda11 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv @@ -1,3 +1,3 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 +asset_path bytes_sent sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv index 4313612..3e94027 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 +2022-03-16 1746 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv index 9117d48..18cda11 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv @@ -1,3 +1,3 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 +asset_path bytes_sent sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv index 4313612..3e94027 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 +2022-03-16 1746 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv index 9117d48..6cba408 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv @@ -1,3 +1,2 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 -sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv index 4313612..8538320 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 -2022-05-04 512 +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv index 9117d48..6cba408 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv @@ -1,3 +1,2 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 -sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv index 4313612..8538320 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 -2022-05-04 512 +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv index 9117d48..6cba408 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv @@ -1,3 +1,2 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 -sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv index 4313612..8538320 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv @@ -1,3 +1,3 @@ date bytes_sent -2022-03-16 512 -2022-05-04 512 +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv index 9117d48..4265ef4 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv @@ -1,3 +1,2 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 -sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +asset_path bytes_sent +sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv index 4313612..7579108 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv @@ -1,3 +1,2 @@ date bytes_sent -2022-03-16 512 -2022-05-04 512 +2023-01-01 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv index 9117d48..4265ef4 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv @@ -1,3 +1,2 @@ -asset bytes_sent -sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 -sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +asset_path bytes_sent +sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223