Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
CodyCBakerPhD committed Sep 12, 2024
1 parent 2d1a0d8 commit 05d6393
Show file tree
Hide file tree
Showing 17 changed files with 71 additions and 67 deletions.
76 changes: 43 additions & 33 deletions src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _map_binned_logs_to_dandiset(

all_activity_across_versions_per_blob_id_by_day = dict()
blob_id_to_asset_path = dict()
all_activity_across_versions_by_blob_id = dict()
total_bytes_across_versions_by_blob_id = dict()
dandiset_versions = list(dandiset.get_versions())
for version in tqdm.tqdm(
iterable=dandiset_versions,
Expand All @@ -130,8 +130,8 @@ def _map_binned_logs_to_dandiset(

dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)

all_activity_for_version_by_day = []
all_activity_for_version_by_asset_path = dict()
reduced_s3_logs_per_day = []
total_bytes_per_asset_path = dict()
dandiset_version_assets = list(dandiset_version.get_assets())
for asset in tqdm.tqdm(
iterable=dandiset_version_assets,
Expand Down Expand Up @@ -192,55 +192,45 @@ def _map_binned_logs_to_dandiset(

reordered_reduced_s3_log["date"] = [entry[:10] for entry in reordered_reduced_s3_log["timestamp"]]
reordered_reduced_s3_log_binned_per_day = reordered_reduced_s3_log.reindex(columns=("date", "bytes_sent"))
all_activity_for_version_by_day.append(reordered_reduced_s3_log_binned_per_day)
reduced_s3_logs_per_day.append(reordered_reduced_s3_log_binned_per_day)
all_activity_across_versions_per_blob_id_by_day[blob_id] = reordered_reduced_s3_log_binned_per_day

total_bytes = sum(reduced_s3_log_binned_by_blob_id["bytes_sent"])
all_activity_for_version_by_asset_path[asset.path] = total_bytes
total_bytes_per_asset_path[asset.path] = total_bytes

blob_id_to_asset_path[blob_id] = asset.path
all_activity_across_versions_by_blob_id[blob_id] = total_bytes
total_bytes_across_versions_by_blob_id[blob_id] = total_bytes

if len(all_activity_for_version_by_day) == 0:
if len(reduced_s3_logs_per_day) == 0:
continue # No activity found (possible dandiset version was never accessed); skip to next version

aggregated_activity_for_version_by_day = _aggregate_activity_by_day(
reduced_s3_logs_per_day=all_activity_for_version_by_day
)
aggregated_activity_for_version_by_asset = _aggregate_activity_by_asset(
total_bytes_per_asset_path=all_activity_for_version_by_asset_path
version_summary_by_day_file_path = dandiset_version_log_folder_path / "version_summary_by_day.tsv"
_write_aggregated_activity_by_day(
reduced_s3_logs_per_day=reduced_s3_logs_per_day, file_path=version_summary_by_day_file_path
)

version_summary_by_day_file_path = dandiset_version_log_folder_path / "version_summary_by_day.tsv"
version_summary_by_asset_file_path = dandiset_version_log_folder_path / "version_summary_by_asset.tsv"
aggregated_activity_for_version_by_day.to_csv(
path_or_buf=version_summary_by_day_file_path, mode="w", sep="\t", header=True, index=False
)
aggregated_activity_for_version_by_asset.to_csv(
path_or_buf=version_summary_by_asset_file_path, mode="w", sep="\t", header=True, index=False
_write_aggregated_activity_by_asset(
total_bytes_per_asset_path=total_bytes_per_asset_path, file_path=version_summary_by_asset_file_path
)

if len(all_activity_across_versions_per_blob_id_by_day) == 0:
return None # No activity found (possible dandiset was never accessed); skip to next version

all_activity_across_versions_by_asset = collections.defaultdict(int)
# Single path across versions could have been replaced at various points by a new blob
total_bytes_across_versions_by_asset = collections.defaultdict(int)
for blob_id, asset_path in blob_id_to_asset_path.items():
all_activity_across_versions_by_asset[asset_path] += all_activity_across_versions_by_blob_id[blob_id]
total_bytes_across_versions_by_asset[asset_path] += total_bytes_across_versions_by_blob_id[blob_id]

aggregated_activity_for_dandiset_by_day = _aggregate_activity_by_day(
reduced_s3_logs_per_day=all_activity_across_versions_per_blob_id_by_day.values()
)
aggregated_activity_for_dandiset_by_asset = _aggregate_activity_by_asset(
total_bytes_per_asset_path=all_activity_across_versions_by_asset
dandiset_summary_by_day_file_path = dandiset_log_folder_path / "dandiset_summary_by_day.tsv"
_write_aggregated_activity_by_day(
reduced_s3_logs_per_day=all_activity_across_versions_per_blob_id_by_day.values(),
file_path=dandiset_summary_by_day_file_path,
)

dandiset_summary_by_day_file_path = dandiset_log_folder_path / "dandiset_summary_by_day.tsv"
dandiset_summary_by_asset_file_path = dandiset_log_folder_path / "dandiset_summary_by_asset.tsv"
aggregated_activity_for_dandiset_by_day.to_csv(
path_or_buf=dandiset_summary_by_day_file_path, mode="w", sep="\t", header=True, index=False
)
aggregated_activity_for_dandiset_by_asset.to_csv(
path_or_buf=dandiset_summary_by_asset_file_path, mode="w", sep="\t", header=True, index=False
_write_aggregated_activity_by_asset(
total_bytes_per_asset_path=total_bytes_across_versions_by_asset, file_path=dandiset_summary_by_asset_file_path
)

return None
Expand All @@ -262,7 +252,27 @@ def _aggregate_activity_by_asset(total_bytes_per_asset_path: dict[str, int]) ->
aggregated_activity_by_asset = pandas.DataFrame(
data=[list(total_bytes_per_asset_path.keys()), list(total_bytes_per_asset_path.values())]
).T
aggregated_activity_by_asset.rename(columns={"0": "asset_path", "1": "bytes_sent"}, inplace=True)
aggregated_activity_by_asset.sort_values(by="bytes_sent", inplace=True)
aggregated_activity_by_asset.rename(columns={0: "asset_path", 1: "bytes_sent"}, inplace=True)
aggregated_activity_by_asset.sort_values(by="bytes_sent", ascending=False, inplace=True)

return aggregated_activity_by_asset


def _write_aggregated_activity_by_day(
    reduced_s3_logs_per_day: Iterable[pandas.DataFrame], file_path: pathlib.Path
) -> None:
    """
    Aggregate per-day activity and save it as a tab-separated summary file.

    The frames are handed to `_aggregate_activity_by_day` and whatever table it
    returns is written to disk with a header row and without the index column.

    Parameters
    ----------
    reduced_s3_logs_per_day : iterable of pandas.DataFrame
        The reduced log frames to aggregate; forwarded unchanged to
        `_aggregate_activity_by_day`.
    file_path : pathlib.Path
        Destination of the TSV file. Opened in write mode, so any existing
        file at this location is overwritten.
    """
    daily_summary = _aggregate_activity_by_day(reduced_s3_logs_per_day=reduced_s3_logs_per_day)

    # Tab-separated output with a header row; the index is intentionally not written.
    daily_summary.to_csv(path_or_buf=file_path, mode="w", sep="\t", header=True, index=False)


def _write_aggregated_activity_by_asset(total_bytes_per_asset_path: dict[str, int], file_path: pathlib.Path) -> None:
    """
    Aggregate per-asset activity and save it as a tab-separated summary file.

    The mapping is handed to `_aggregate_activity_by_asset` and whatever table
    it returns is written to disk with a header row and without the index column.

    Parameters
    ----------
    total_bytes_per_asset_path : dict of str to int
        Total bytes keyed by asset path; forwarded unchanged to
        `_aggregate_activity_by_asset`.
    file_path : pathlib.Path
        Destination of the TSV file. Opened in write mode, so any existing
        file at this location is overwritten.
    """
    asset_summary = _aggregate_activity_by_asset(total_bytes_per_asset_path=total_bytes_per_asset_path)

    # Tab-separated output with a header row; the index is intentionally not written.
    asset_summary.to_csv(path_or_buf=file_path, mode="w", sep="\t", header=True, index=False)
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
asset_path bytes_sent
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
asset_path bytes_sent
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-03-16 1746
2022-05-04 512
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
asset_path bytes_sent
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-03-16 1746
2022-05-04 512
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
asset_path bytes_sent
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-03-16 1746
2022-05-04 512
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
asset_path bytes_sent
sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-05-04 512
2021-04-24 1443
2021-12-31 1443
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
asset_path bytes_sent
sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-05-04 512
2021-04-24 1443
2021-12-31 1443
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
asset_path bytes_sent
sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
date bytes_sent
2022-03-16 512
2022-05-04 512
2021-04-24 1443
2021-12-31 1443
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
asset_path bytes_sent
sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
date bytes_sent
2022-03-16 512
2022-05-04 512
2023-01-01 1526223
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
asset bytes_sent
sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024
sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234
asset_path bytes_sent
sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223

0 comments on commit 05d6393

Please sign in to comment.