Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final debugs and polish #43

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions src/dandi_s3_log_parser/_dandiset_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,33 +67,32 @@ def _map_reduced_logs_to_dandiset(
) -> None:
dandiset_id = dandiset.identifier

dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id

for version in dandiset.get_versions():
version_id = version.identifier

dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id)

all_reduced_logs = []
for asset in dandiset_version.get_assets():
asset_id = asset.identifier
asset_suffixes = pathlib.Path(asset.path).suffixes
is_asset_zarr = ".zarr" in asset_suffixes

blob_or_zarr = "blobs" if ".zarr" not in asset_suffixes else "zarr"
blob_id = asset.blob if not is_asset_zarr else asset.zarr
blobs_or_zarr = "blobs" if not is_asset_zarr else "zarr"

reduced_log_file_path = reduced_s3_logs_folder_path / f"{blob_or_zarr}_{asset_id}.tsv"
reduced_log_file_path = reduced_s3_logs_folder_path / f"{blobs_or_zarr}_{blob_id}.tsv"

if not reduced_log_file_path.exists():
continue # No reduced logs found (possibly the asset was never accessed); skip to next asset

reduced_log = pandas.read_table(filepath_or_buffer=reduced_log_file_path, header=0)
reduced_log["asset_id"] = [asset_id] * len(reduced_log)
reduced_log["filename"] = [asset.path] * len(reduced_log)
reduced_log["region"] = [
get_region_from_ip_address(ip_address=ip_address, ip_hash_to_region=ip_hash_to_region)
for ip_address in reduced_log["ip_address"]
]

reordered_reduced_log = reduced_log.reindex(columns=("asset_id", "timestamp", "bytes_sent", "region"))
reordered_reduced_log = reduced_log.reindex(columns=("filename", "timestamp", "bytes_sent", "region"))
all_reduced_logs.append(reordered_reduced_log)

if len(all_reduced_logs) == 0:
Expand All @@ -103,6 +102,7 @@ def _map_reduced_logs_to_dandiset(
mapped_log.sort_values(by="timestamp")
mapped_log.index = range(len(mapped_log))

dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id
dandiset_log_folder_path.mkdir(exist_ok=True)
version_file_path = dandiset_log_folder_path / f"{version_id}.tsv"
mapped_log.to_csv(version_file_path, mode="w", sep="\t", header=True, index=True)
mapped_log.to_csv(path_or_buf=version_file_path, mode="w", sep="\t", header=True, index=True)
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset_id timestamp bytes_sent region
0 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-03-16 02:21:12 512 unknown
1 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-05-04 05:06:35 512 unknown
filename timestamp bytes_sent region
0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown
1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset_id timestamp bytes_sent region
0 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-03-16 02:21:12 512 unknown
1 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-05-04 05:06:35 512 unknown
filename timestamp bytes_sent region
0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown
1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset_id timestamp bytes_sent region
0 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-03-16 02:21:12 512 unknown
1 5e9e92e1-f044-4aa0-ab47-1cfcb8899348 2022-05-04 05:06:35 512 unknown
filename timestamp bytes_sent region
0 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-03-16 02:21:12 512 unknown
1 sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 2022-05-04 05:06:35 512 unknown
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset_id timestamp bytes_sent region
0 cbcf1d6d-7f64-4d1f-8692-75e09e177ca6 2021-04-24 12:03:05 1443 unknown
1 cbcf1d6d-7f64-4d1f-8692-75e09e177ca6 2021-12-31 23:06:42 1443 unknown
filename timestamp bytes_sent region
0 sub-anm106211/sub-anm106211_ses-20100925_behavior+icephys.nwb 2021-04-24 12:03:05 1443 unknown
1 sub-anm106211/sub-anm106211_ses-20100925_behavior+icephys.nwb 2021-12-31 23:06:42 1443 unknown
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
asset_id timestamp bytes_sent region
0 cbcf1d6d-7f64-4d1f-8692-75e09e177ca6 2021-04-24 12:03:05 1443 unknown
1 cbcf1d6d-7f64-4d1f-8692-75e09e177ca6 2021-12-31 23:06:42 1443 unknown
filename timestamp bytes_sent region
0 sub-anm106211/sub-anm106211_ses-20100925_behavior+icephys.nwb 2021-04-24 12:03:05 1443 unknown
1 sub-anm106211/sub-anm106211_ses-20100925_behavior+icephys.nwb 2021-12-31 23:06:42 1443 unknown
Loading