From b59ae1d14cda41ebdf5aa8f73f4c043723ae5804 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:35:54 -0400 Subject: [PATCH] Improvements to summaries (#68) * test binning skips * debugging * no fix needed * add more summaries * fix tests * add region aggregation * add subregions to services --- README.md | 3 +- pyproject.toml | 2 +- .../_bin_all_reduced_s3_logs_by_object_key.py | 6 +- .../_dandi_s3_log_file_reducer.py | 10 +- src/dandi_s3_log_parser/_ip_utils.py | 101 ++++++++---- .../_map_binned_s3_logs_to_dandisets.py | 148 +++++++++++++++--- .../18b9a7e7-cdef-4c11-9944-702546a14eaa.tsv | 2 + .../58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv | 6 +- .../cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv | 6 +- .../cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv | 4 +- ...utaMouse20-140327_behavior+ecephys_nwb.tsv | 2 +- ...utaMouse37-150610_behavior+ecephys_nwb.tsv | 2 + .../000003/0.210812.1448/summary.tsv | 3 - .../version_summary_by_asset.tsv | 3 + .../0.210812.1448/version_summary_by_day.tsv | 3 + .../version_summary_by_region.tsv | 4 + ...utaMouse20-140327_behavior+ecephys_nwb.tsv | 2 +- ...utaMouse37-150610_behavior+ecephys_nwb.tsv | 2 + .../000003/0.230629.1955/summary.tsv | 3 - .../version_summary_by_asset.tsv | 3 + .../0.230629.1955/version_summary_by_day.tsv | 3 + .../version_summary_by_region.tsv | 4 + .../000003/dandiset_summary_by_asset.tsv | 3 + .../000003/dandiset_summary_by_day.tsv | 3 + .../000003/dandiset_summary_by_region.tsv | 4 + ...utaMouse20-140327_behavior+ecephys_nwb.tsv | 2 +- ...utaMouse37-150610_behavior+ecephys_nwb.tsv | 2 + .../expected_output/000003/draft/summary.tsv | 3 - .../000003/draft/version_summary_by_asset.tsv | 3 + .../000003/draft/version_summary_by_day.tsv | 3 + .../draft/version_summary_by_region.tsv | 4 + .../000013/0.220126.2143/summary.tsv | 3 - .../version_summary_by_asset.tsv | 2 + .../0.220126.2143/version_summary_by_day.tsv | 3 + .../version_summary_by_region.tsv | 2 + 
.../000013/dandiset_summary_by_asset.tsv | 2 + .../000013/dandiset_summary_by_day.tsv | 3 + .../000013/dandiset_summary_by_region.tsv | 2 + .../expected_output/000013/draft/summary.tsv | 3 - .../000013/draft/version_summary_by_asset.tsv | 2 + .../000013/draft/version_summary_by_day.tsv | 3 + .../draft/version_summary_by_region.tsv | 2 + .../000108/dandiset_summary_by_asset.tsv | 2 + .../000108/dandiset_summary_by_day.tsv | 2 + .../000108/dandiset_summary_by_region.tsv | 2 + .../expected_output/000108/draft/summary.tsv | 2 - .../000108/draft/version_summary_by_asset.tsv | 2 + .../000108/draft/version_summary_by_day.tsv | 2 + .../draft/version_summary_by_region.tsv | 2 + ...st_map_all_reduced_s3_logs_to_dandisets.py | 14 +- 50 files changed, 309 insertions(+), 95 deletions(-) create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/18b/9a7/18b9a7e7-cdef-4c11-9944-702546a14eaa.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_region.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv delete mode 100644 
test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_region.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_region.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_region.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv create mode 100644 
test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_region.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_region.tsv delete mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_region.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_region.tsv delete mode 100644 
test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_day.tsv create mode 100644 test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_region.tsv diff --git a/README.md b/README.md index 73d2522..e2472b9 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ The process is designed to be easily parallelized and interruptible, meaning tha ### 2. **Binning** -To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for fast lookup. +To make the mapping to Dandisets more efficient, the reduced logs are binned by their object keys (asset blob IDs) for fast lookup. Zarr assets specifically group by the parent blob ID, *e.g.*, a request for `zarr/abcdefg/group1/dataset1/0` will be binned by `zarr/abcdefg`. This step reduces the total file sizes from step (1) even further by reducing repeated object keys, though it does create a large number of small files. @@ -126,7 +126,6 @@ The `--file_processing_limit < integer >` flag can be used to limit the number o bin_all_reduced_s3_logs_by_object_key \ --reduced_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-reduced \ --binned_s3_logs_folder_path /mnt/backup/dandi/dandiarchive-logs-binned \ - --file_limit 20 ``` In the summer of 2024, this process took less than 5 hours to bin all 170 GB of reduced logs into the 80 GB of data per object key. 
diff --git a/pyproject.toml b/pyproject.toml index 6bb4a24..d58f17b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ packages = ["src/dandi_s3_log_parser"] [project] name = "dandi_s3_log_parser" -version="0.4.0" +version="0.4.1" authors = [ { name="Cody Baker", email="cody.c.baker.phd@gmail.com" }, ] diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py index 20b51fe..7d9498e 100644 --- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py +++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py @@ -57,11 +57,12 @@ def bin_all_reduced_s3_logs_by_object_key( for reduced_s3_log_file in tqdm.tqdm( iterable=reduced_s3_log_files, total=len(reduced_s3_log_files), - desc="Binning reduced logs...", + desc="Binning reduced logs", position=0, leave=True, mininterval=3.0, smoothing=0, + unit="file", ): if reduced_s3_log_file.stat().st_size == 0: with open(file=started_tracking_file_path, mode="a") as io: @@ -93,11 +94,12 @@ def bin_all_reduced_s3_logs_by_object_key( for object_key, data in tqdm.tqdm( iterable=object_keys_to_data.items(), total=len(object_keys_to_data), - desc="Writing binned logs...", + desc=f"Binning {reduced_s3_log_file}", position=1, leave=False, mininterval=3.0, smoothing=0, + unit="asset", ): object_key_as_path = pathlib.Path(object_key) binned_s3_log_file_path = ( diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index f9518ef..1311625 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -85,15 +85,15 @@ def reduce_all_dandi_raw_s3_logs( fields_to_reduce = ["object_key", "timestamp", "bytes_sent", "ip_address"] object_key_parents_to_reduce = ["blobs", "zarr"] line_buffer_tqdm_kwargs = dict(position=1, leave=False) - # TODO: add better reporting units to all TQDMs 
(lines / s, files / s, etc.) if maximum_number_of_workers == 1: for relative_s3_log_file_path in tqdm.tqdm( iterable=relative_s3_log_file_paths_to_reduce, total=len(relative_s3_log_file_paths_to_reduce), - desc="Parsing log files...", + desc="Parsing log files", position=0, leave=True, smoothing=0, # Use true historical average, not moving average since shuffling makes it more uniform + unit="file", ): raw_s3_log_file_path = raw_s3_logs_folder_path / relative_s3_log_file_path reduced_s3_log_file_path = ( @@ -144,6 +144,7 @@ def reduce_all_dandi_raw_s3_logs( leave=True, mininterval=3.0, smoothing=0, # Use true historical average, not moving average since shuffling makes it more uniform + unit="file", ) for future in progress_bar_iterable: future.result() # This is the call that finally triggers the deployment to the workers @@ -177,7 +178,10 @@ def _multi_worker_reduce_dandi_raw_s3_log( object_key_parents_to_reduce = ["blobs", "zarr"] object_key_handler = _get_default_dandi_object_key_handler() line_buffer_tqdm_kwargs = dict( - position=worker_index + 1, leave=False, desc=f"Parsing line buffers on worker {worker_index + 1}..." 
+ position=worker_index + 1, + leave=False, + desc=f"Parsing line buffers on worker {worker_index + 1}...", + unit="buffer", ) reduce_raw_s3_log( diff --git a/src/dandi_s3_log_parser/_ip_utils.py b/src/dandi_s3_log_parser/_ip_utils.py index 387fa26..af6fed9 100644 --- a/src/dandi_s3_log_parser/_ip_utils.py +++ b/src/dandi_s3_log_parser/_ip_utils.py @@ -44,10 +44,10 @@ def get_region_from_ip_address( raise ValueError(message) # pragma: no cover ip_hash_salt = bytes.fromhex(os.environ["IP_HASH_SALT"]) - # Probably a legitimate user, so fetch the geographic region + # Hash for anonymization within the cache ip_hash = hashlib.sha1(string=bytes(ip_address, "utf-8") + ip_hash_salt).hexdigest() - # Early return for speed + # Early return from the cache for faster performance lookup_result = ip_hash_to_region.get(ip_hash, None) if lookup_result is not None: return lookup_result @@ -55,17 +55,28 @@ def get_region_from_ip_address( # Determine if IP address belongs to GitHub, AWS, Google, or known VPNs # Azure not yet easily doable; keep an eye on # https://learn.microsoft.com/en-us/answers/questions/1410071/up-to-date-azure-public-api-to-get-azure-ip-ranges - # and others, maybe it will change in the future + # maybe it will change in the future if ip_hash_not_in_services.get(ip_hash, None) is None: for service_name in _KNOWN_SERVICES: - cidr_addresses = _get_cidr_address_ranges(service_name=service_name) - - if any( - ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address) - for cidr_address in cidr_addresses - ): - ip_hash_to_region[ip_hash] = service_name - return service_name + cidr_addresses_and_subregions = _get_cidr_address_ranges_and_subregions(service_name=service_name) + + matched_cidr_address_and_subregion = next( + ( + (cidr_address, subregion) + for cidr_address, subregion in cidr_addresses_and_subregions + if ipaddress.ip_address(address=ip_address) in ipaddress.ip_network(address=cidr_address) + ), + None, + ) + if 
matched_cidr_address_and_subregion is not None: + region_service_string = service_name + + subregion = matched_cidr_address_and_subregion[1] + if subregion is not None: + region_service_string += f"/{subregion}" + + ip_hash_to_region[ip_hash] = region_service_string + return region_service_string ip_hash_not_in_services[ip_hash] = True # Log errors in IP fetching @@ -105,41 +116,67 @@ def get_region_from_ip_address( @functools.lru_cache -def _get_cidr_address_ranges(*, service_name: str) -> list[str]: +def _get_cidr_address_ranges_and_subregions(*, service_name: str) -> list[tuple[str, str | None]]: + cidr_request = _request_cidr_range(service_name=service_name) match service_name: case "GitHub": - github_cidr_request = requests.get(url="https://api.github.com/meta").json() skip_keys = ["domains", "ssh_key_fingerprints", "verifiable_password_authentication", "ssh_keys"] - keys = set(github_cidr_request.keys()) - set(skip_keys) - github_cidr_addresses = [ - cidr_address + keys = set(cidr_request.keys()) - set(skip_keys) + github_cidr_addresses_and_subregions = [ + (cidr_address, None) for key in keys - for cidr_address in github_cidr_request[key] + for cidr_address in cidr_request[key] if "::" not in cidr_address # Skip IPv6 ] - return github_cidr_addresses + return github_cidr_addresses_and_subregions # Note: these endpoints also return the 'locations' of the specific subnet, such as 'us-east-2' case "AWS": - aws_cidr_request = requests.get(url="https://ip-ranges.amazonaws.com/ip-ranges.json").json() - aws_cidr_addresses = [prefix["ip_prefix"] for prefix in aws_cidr_request["prefixes"]] + aws_cidr_addresses_and_subregions = [ + (prefix["ip_prefix"], prefix.get("region", None)) for prefix in cidr_request["prefixes"] + ] - return aws_cidr_addresses + return aws_cidr_addresses_and_subregions case "GCP": - gcp_cidr_request = requests.get(url="https://www.gstatic.com/ipranges/cloud.json").json() - gcp_cidr_addresses = [ - prefix["ipv4Prefix"] - for prefix in 
gcp_cidr_request["prefixes"] + gcp_cidr_addresses_and_subregions = [ + (prefix["ipv4Prefix"], prefix.get("scope", None)) + for prefix in cidr_request["prefixes"] if "ipv4Prefix" in prefix # Not handling IPv6 yet ] - return gcp_cidr_addresses + return gcp_cidr_addresses_and_subregions case "Azure": raise NotImplementedError("Azure CIDR address fetching is not yet implemented!") # pragma: no cover + case "VPN": + vpn_cidr_addresses_and_subregions = [(cidr_address, None) for cidr_address in cidr_request] + + return vpn_cidr_addresses_and_subregions + case _: + raise ValueError(f"Service name '{service_name}' is not supported!") # pragma: no cover + + +@functools.lru_cache +def _request_cidr_range(service_name: str) -> dict: + """Cache (in-memory) the requests to external services.""" + match service_name: + case "GitHub": + github_cidr_request = requests.get(url="https://api.github.com/meta").json() + + return github_cidr_request + case "AWS": + aws_cidr_request = requests.get(url="https://ip-ranges.amazonaws.com/ip-ranges.json").json() + + return aws_cidr_request + case "GCP": + gcp_cidr_request = requests.get(url="https://www.gstatic.com/ipranges/cloud.json").json() + + return gcp_cidr_request + case "Azure": + raise NotImplementedError("Azure CIDR address fetching is not yet implemented!") case "VPN": # Very nice public and maintained listing! Hope this stays stable. 
- vpn_cidr_addresses = ( + vpn_cidr_request = ( requests.get( url="https://raw.githubusercontent.com/josephrocca/is-vpn/main/vpn-or-datacenter-ipv4-ranges.txt" ) @@ -147,7 +184,7 @@ def _get_cidr_address_ranges(*, service_name: str) -> list[str]: .splitlines() ) - return vpn_cidr_addresses + return vpn_cidr_request case _: raise ValueError(f"Service name '{service_name}' is not supported!") # pragma: no cover @@ -157,16 +194,16 @@ def _load_ip_hash_cache(*, name: Literal["region", "services"]) -> dict[str, str match name: case "region": if not _IP_HASH_TO_REGION_FILE_PATH.exists(): - return {} # pragma: no cover + return dict() # pragma: no cover with open(file=_IP_HASH_TO_REGION_FILE_PATH) as stream: - return yaml.load(stream=stream, Loader=yaml.SafeLoader) + return yaml.load(stream=stream, Loader=yaml.SafeLoader) or dict() case "services": if not _IP_HASH_NOT_IN_SERVICES_FILE_PATH.exists(): - return {} # pragma: no cover + return dict() # pragma: no cover with open(file=_IP_HASH_NOT_IN_SERVICES_FILE_PATH) as stream: - return yaml.load(stream=stream, Loader=yaml.SafeLoader) + return yaml.load(stream=stream, Loader=yaml.SafeLoader) or dict() case _: raise ValueError(f"Name '{name}' is not recognized!") # pragma: no cover diff --git a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py index 8727250..9f0d3ac 100644 --- a/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py +++ b/src/dandi_s3_log_parser/_map_binned_s3_logs_to_dandisets.py @@ -1,5 +1,7 @@ +import collections import os import pathlib +from typing import Iterable import dandi.dandiapi import natsort @@ -39,17 +41,17 @@ def map_binned_s3_logs_to_dandisets( The maximum number of Dandisets to process per call. Useful for quick testing. 
""" - if "IPINFO_CREDENTIALS" not in os.environ: + if "IPINFO_CREDENTIALS" not in os.environ: # pragma: no cover message = "The environment variable 'IPINFO_CREDENTIALS' must be set to import `dandi_s3_log_parser`!" - raise ValueError(message) # pragma: no cover + raise ValueError(message) - if "IP_HASH_SALT" not in os.environ: + if "IP_HASH_SALT" not in os.environ: # pragma: no cover message = ( "The environment variable 'IP_HASH_SALT' must be set to import `dandi_s3_log_parser`! " "To retrieve the value, set a temporary value to this environment variable " "and then use the `get_hash_salt` helper function and set it to the correct value." ) - raise ValueError(message) # pragma: no cover + raise ValueError(message) if excluded_dandisets is not None and restrict_to_dandisets is not None: message = "Only one of `exclude_dandisets` or `restrict_to_dandisets` can be passed, not both!" @@ -81,6 +83,7 @@ def map_binned_s3_logs_to_dandisets( leave=True, mininterval=5.0, smoothing=0, + unit="dandiset", ): _map_binned_logs_to_dandiset( dandiset=dandiset, @@ -108,31 +111,37 @@ def _map_binned_logs_to_dandiset( dandiset_id = dandiset.identifier dandiset_log_folder_path = dandiset_logs_folder_path / dandiset_id + all_reduced_s3_logs_per_blob_id = dict() + blob_id_to_asset_path = dict() + total_bytes_across_versions_by_blob_id = dict() dandiset_versions = list(dandiset.get_versions()) for version in tqdm.tqdm( iterable=dandiset_versions, total=len(dandiset_versions), - desc=f"Mapping Dandiset {dandiset_id} versions...", + desc=f"Mapping Dandiset {dandiset_id} versions", position=1, leave=False, mininterval=5.0, smoothing=0, + unit="version", ): version_id = version.identifier dandiset_version_log_folder_path = dandiset_log_folder_path / version_id dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id) - all_activity_for_version = [] + reduced_s3_logs_per_day = [] + total_bytes_per_asset_path = dict() dandiset_version_assets = 
list(dandiset_version.get_assets()) for asset in tqdm.tqdm( iterable=dandiset_version_assets, total=len(dandiset_version_assets), - desc="Mapping assets...", + desc=f"Mapping {dandiset_id}/{version}", position=2, leave=False, mininterval=5.0, smoothing=0, + unit="asset", ): asset_as_path = pathlib.Path(asset.path) asset_suffixes = asset_as_path.suffixes @@ -182,26 +191,121 @@ def _map_binned_logs_to_dandiset( ) reordered_reduced_s3_log["date"] = [entry[:10] for entry in reordered_reduced_s3_log["timestamp"]] + reduced_s3_logs_per_day.append(reordered_reduced_s3_log) + all_reduced_s3_logs_per_blob_id[blob_id] = reordered_reduced_s3_log - reordered_reduced_s3_log_aggregated = reordered_reduced_s3_log.groupby("date", as_index=False)[ - "bytes_sent" - ].agg([list, "sum"]) - reordered_reduced_s3_log_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) + total_bytes = sum(reduced_s3_log_binned_by_blob_id["bytes_sent"]) + total_bytes_per_asset_path[asset.path] = total_bytes - reordered_reduced_s3_log_binned_per_day = reordered_reduced_s3_log_aggregated.reindex( - columns=("date", "bytes_sent") - ) - reordered_reduced_s3_log_binned_per_day.sort_values(by="date", key=natsort.natsort_keygen(), inplace=True) + blob_id_to_asset_path[blob_id] = asset.path + total_bytes_across_versions_by_blob_id[blob_id] = total_bytes + + if len(reduced_s3_logs_per_day) == 0: + continue # No activity found (possible dandiset version was never accessed); skip to next version + + version_summary_by_day_file_path = dandiset_version_log_folder_path / "version_summary_by_day.tsv" + _write_aggregated_activity_by_day( + reduced_s3_logs_per_day=reduced_s3_logs_per_day, file_path=version_summary_by_day_file_path + ) + + version_summary_by_region_file_path = dandiset_version_log_folder_path / "version_summary_by_region.tsv" + _write_aggregated_activity_by_region( + reduced_s3_logs_per_day=reduced_s3_logs_per_day, file_path=version_summary_by_region_file_path + ) + + 
version_summary_by_asset_file_path = dandiset_version_log_folder_path / "version_summary_by_asset.tsv" + _write_aggregated_activity_by_asset( + total_bytes_per_asset_path=total_bytes_per_asset_path, file_path=version_summary_by_asset_file_path + ) + + if len(all_reduced_s3_logs_per_blob_id) == 0: + return None # No activity found in any version (possible dandiset was never accessed); no dandiset-level summaries to write + + # Single path across versions could have been replaced at various points by a new blob + total_bytes_across_versions_by_asset = collections.defaultdict(int) + for blob_id, asset_path in blob_id_to_asset_path.items(): + total_bytes_across_versions_by_asset[asset_path] += total_bytes_across_versions_by_blob_id[blob_id] + + dandiset_summary_by_day_file_path = dandiset_log_folder_path / "dandiset_summary_by_day.tsv" + _write_aggregated_activity_by_day( + reduced_s3_logs_per_day=all_reduced_s3_logs_per_blob_id.values(), + file_path=dandiset_summary_by_day_file_path, + ) + + dandiset_summary_by_region_file_path = dandiset_log_folder_path / "dandiset_summary_by_region.tsv" + _write_aggregated_activity_by_region( + reduced_s3_logs_per_day=all_reduced_s3_logs_per_blob_id.values(), + file_path=dandiset_summary_by_region_file_path, + ) + + dandiset_summary_by_asset_file_path = dandiset_log_folder_path / "dandiset_summary_by_asset.tsv" + _write_aggregated_activity_by_asset( + total_bytes_per_asset_path=total_bytes_across_versions_by_asset, file_path=dandiset_summary_by_asset_file_path + ) + + return None + + +def _aggregate_activity_by_day(reduced_s3_logs_per_day: Iterable[pandas.DataFrame]) -> pandas.DataFrame: + all_reduced_s3_logs = pandas.concat(objs=reduced_s3_logs_per_day, ignore_index=True) + all_reduced_s3_logs_clipped = all_reduced_s3_logs.reindex(columns=("date", "bytes_sent")) - all_activity_for_version.append(reordered_reduced_s3_log_binned_per_day) + pre_aggregated = all_reduced_s3_logs_clipped.groupby(by="date", as_index=False)["bytes_sent"].agg([list, "sum"]) + 
pre_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) + pre_aggregated.sort_values(by="date", key=natsort.natsort_keygen(), inplace=True) - if len(all_activity_for_version) == 0: - continue # No reduced logs found (possible dandiset version was never accessed); skip to next version + aggregated_activity_by_day = pre_aggregated.reindex(columns=("date", "bytes_sent")) + + return aggregated_activity_by_day + + +def _aggregate_activity_by_region(reduced_s3_logs_per_day: Iterable[pandas.DataFrame]) -> pandas.DataFrame: + all_reduced_s3_logs = pandas.concat(objs=reduced_s3_logs_per_day, ignore_index=True) + all_reduced_s3_logs_clipped = all_reduced_s3_logs.reindex(columns=("region", "bytes_sent")) + + pre_aggregated = all_reduced_s3_logs_clipped.groupby(by="region", as_index=False)["bytes_sent"].agg([list, "sum"]) + pre_aggregated.rename(columns={"sum": "bytes_sent"}, inplace=True) + pre_aggregated.sort_values(by="bytes_sent", ascending=False, inplace=True) + + aggregated_activity_by_region = pre_aggregated.reindex(columns=("region", "bytes_sent")) + + return aggregated_activity_by_region + + +def _aggregate_activity_by_asset(total_bytes_per_asset_path: dict[str, int]) -> pandas.DataFrame: + aggregated_activity_by_asset = pandas.DataFrame( + data=[list(total_bytes_per_asset_path.keys()), list(total_bytes_per_asset_path.values())] + ).T + aggregated_activity_by_asset.rename(columns={0: "asset_path", 1: "bytes_sent"}, inplace=True) + aggregated_activity_by_asset.sort_values(by="bytes_sent", ascending=False, inplace=True) + + return aggregated_activity_by_asset + + +def _write_aggregated_activity_by_day( + reduced_s3_logs_per_day: Iterable[pandas.DataFrame], file_path: pathlib.Path +) -> None: + aggregated_activity_by_day = _aggregate_activity_by_day(reduced_s3_logs_per_day=reduced_s3_logs_per_day) + aggregated_activity_by_day.to_csv(path_or_buf=file_path, mode="w", sep="\t", header=True, index=False) + + return None + + +def 
_write_aggregated_activity_by_region( + reduced_s3_logs_per_day: Iterable[pandas.DataFrame], file_path: pathlib.Path +) -> None: + aggregated_activity_by_region = _aggregate_activity_by_region(reduced_s3_logs_per_day=reduced_s3_logs_per_day) + aggregated_activity_by_region.to_csv(path_or_buf=file_path, mode="w", sep="\t", header=True, index=False) + + return None - summary_logs = pandas.concat(objs=all_activity_for_version, ignore_index=True) - summary_logs.sort_values(by="date", key=natsort.natsort_keygen(), inplace=True) - summary_file_path = dandiset_version_log_folder_path / "summary.tsv" - summary_logs.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True) +def _write_aggregated_activity_by_asset(total_bytes_per_asset_path: dict[str, int], file_path: pathlib.Path) -> None: + aggregated_activity_for_dandiset_by_asset = _aggregate_activity_by_asset( + total_bytes_per_asset_path=total_bytes_per_asset_path + ) + aggregated_activity_for_dandiset_by_asset.to_csv( + path_or_buf=file_path, mode="w", sep="\t", header=True, index=False + ) return None diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/18b/9a7/18b9a7e7-cdef-4c11-9944-702546a14eaa.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/18b/9a7/18b9a7e7-cdef-4c11-9944-702546a14eaa.tsv new file mode 100644 index 0000000..e8fdac0 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/18b/9a7/18b9a7e7-cdef-4c11-9944-702546a14eaa.tsv @@ -0,0 +1,2 @@ +timestamp bytes_sent ip_address +2022-03-16T02:21:12 1234 18.220.4.80 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv index b684d88..5d18444 100644 --- 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv @@ -1,3 +1,3 @@ -timestamp bytes_sent ip_address line_index -2022-03-16T02:21:12 512 192.0.2.0 1 -2022-05-04T05:06:35 512 192.0.2.0 1 +timestamp bytes_sent ip_address +2022-03-16T02:21:12 512 192.0.2.0 +2022-05-04T05:06:35 512 104.155.168.90 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv index c3a1e3e..b73a118 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv @@ -1,3 +1,3 @@ -timestamp bytes_sent ip_address line_index -2021-04-24T12:03:05 1443 192.0.2.0 0 -2021-12-31T23:06:42 1443 192.0.2.0 0 +timestamp bytes_sent ip_address +2021-04-24T12:03:05 1443 192.0.2.0 +2021-12-31T23:06:42 1443 192.0.2.0 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv index 89c526f..7baa3af 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/binned_logs/zarr/cb65c877-882b-4554-8fa1-8f4e986e13a6.tsv @@ -1,2 +1,2 @@ -timestamp bytes_sent ip_address 
line_index -2023-01-01T22:42:58 1526223 192.0.2.0 2 +timestamp bytes_sent ip_address +2023-01-01T22:42:58 1526223 192.0.2.0 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv index 8cf6189..fc9edee 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv @@ -1,3 +1,3 @@ timestamp bytes_sent region 0 2022-03-16T02:21:12 512 unknown -1 2022-05-04T05:06:35 512 unknown +1 2022-05-04T05:06:35 512 GCP/us-central1 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv new file mode 100644 index 0000000..dbfd8b5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 1234 AWS/us-east-2 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv deleted file mode 100644 index 
7285c44..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/summary.tsv +++ /dev/null @@ -1,3 +0,0 @@ - date bytes_sent -0 2022-03-16 512 -1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv new file mode 100644 index 0000000..18cda11 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_asset.tsv @@ -0,0 +1,3 @@ +asset_path bytes_sent +sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_day.tsv new file mode 100644 index 0000000..3e94027 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2022-03-16 1746 +2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_region.tsv new file mode 100644 index 0000000..9e3a5f5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.210812.1448/version_summary_by_region.tsv @@ -0,0 +1,4 @@ +region bytes_sent 
+AWS/us-east-2 1234 +GCP/us-central1 512 +unknown 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv index 8cf6189..fc9edee 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv @@ -1,3 +1,3 @@ timestamp bytes_sent region 0 2022-03-16T02:21:12 512 unknown -1 2022-05-04T05:06:35 512 unknown +1 2022-05-04T05:06:35 512 GCP/us-central1 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv new file mode 100644 index 0000000..dbfd8b5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 1234 AWS/us-east-2 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv deleted file mode 100644 index 7285c44..0000000 --- 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/summary.tsv +++ /dev/null @@ -1,3 +0,0 @@ - date bytes_sent -0 2022-03-16 512 -1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv new file mode 100644 index 0000000..18cda11 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_asset.tsv @@ -0,0 +1,3 @@ +asset_path bytes_sent +sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv new file mode 100644 index 0000000..3e94027 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2022-03-16 1746 +2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_region.tsv new file mode 100644 index 0000000..9e3a5f5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/0.230629.1955/version_summary_by_region.tsv @@ -0,0 +1,4 @@ +region bytes_sent +AWS/us-east-2 1234 
+GCP/us-central1 512 +unknown 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv new file mode 100644 index 0000000..18cda11 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_asset.tsv @@ -0,0 +1,3 @@ +asset_path bytes_sent +sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv new file mode 100644 index 0000000..3e94027 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2022-03-16 1746 +2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_region.tsv new file mode 100644 index 0000000..9e3a5f5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/dandiset_summary_by_region.tsv @@ -0,0 +1,4 @@ +region bytes_sent +AWS/us-east-2 1234 +GCP/us-central1 512 +unknown 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv 
b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv index 8cf6189..fc9edee 100644 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys_nwb.tsv @@ -1,3 +1,3 @@ timestamp bytes_sent region 0 2022-03-16T02:21:12 512 unknown -1 2022-05-04T05:06:35 512 unknown +1 2022-05-04T05:06:35 512 GCP/us-central1 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv new file mode 100644 index 0000000..dbfd8b5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys_nwb.tsv @@ -0,0 +1,2 @@ + timestamp bytes_sent region +0 2022-03-16T02:21:12 1234 AWS/us-east-2 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv deleted file mode 100644 index 7285c44..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/summary.tsv +++ /dev/null @@ -1,3 +0,0 @@ - date bytes_sent -0 2022-03-16 512 -1 2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv 
b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv new file mode 100644 index 0000000..18cda11 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_asset.tsv @@ -0,0 +1,3 @@ +asset_path bytes_sent +sub-YutaMouse37/sub-YutaMouse37_ses-YutaMouse37-150610_behavior+ecephys.nwb 1234 +sub-YutaMouse20/sub-YutaMouse20_ses-YutaMouse20-140327_behavior+ecephys.nwb 1024 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv new file mode 100644 index 0000000..3e94027 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2022-03-16 1746 +2022-05-04 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_region.tsv new file mode 100644 index 0000000..9e3a5f5 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000003/draft/version_summary_by_region.tsv @@ -0,0 +1,4 @@ +region bytes_sent +AWS/us-east-2 1234 +GCP/us-central1 512 +unknown 512 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv deleted file mode 100644 index 194af9f..0000000 --- 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/summary.tsv +++ /dev/null @@ -1,3 +0,0 @@ - date bytes_sent -0 2021-04-24 1443 -1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv new file mode 100644 index 0000000..6cba408 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_asset.tsv @@ -0,0 +1,2 @@ +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv new file mode 100644 index 0000000..8538320 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_region.tsv new file mode 100644 index 0000000..748a0af --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/0.220126.2143/version_summary_by_region.tsv @@ -0,0 +1,2 @@ +region bytes_sent +unknown 2886 diff --git 
a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv new file mode 100644 index 0000000..6cba408 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_asset.tsv @@ -0,0 +1,2 @@ +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv new file mode 100644 index 0000000..8538320 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_region.tsv new file mode 100644 index 0000000..748a0af --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/dandiset_summary_by_region.tsv @@ -0,0 +1,2 @@ +region bytes_sent +unknown 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv deleted file mode 100644 index 194af9f..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/summary.tsv +++ /dev/null @@ 
-1,3 +0,0 @@ - date bytes_sent -0 2021-04-24 1443 -1 2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv new file mode 100644 index 0000000..6cba408 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_asset.tsv @@ -0,0 +1,2 @@ +asset_path bytes_sent +sub-anm215592/sub-anm215592_ses-20131015_obj-odx8px_behavior+icephys+ogen.nwb 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv new file mode 100644 index 0000000..8538320 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_day.tsv @@ -0,0 +1,3 @@ +date bytes_sent +2021-04-24 1443 +2021-12-31 1443 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_region.tsv new file mode 100644 index 0000000..748a0af --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000013/draft/version_summary_by_region.tsv @@ -0,0 +1,2 @@ +region bytes_sent +unknown 2886 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv new file mode 100644 index 
0000000..4265ef4 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_asset.tsv @@ -0,0 +1,2 @@ +asset_path bytes_sent +sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv new file mode 100644 index 0000000..7579108 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_day.tsv @@ -0,0 +1,2 @@ +date bytes_sent +2023-01-01 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_region.tsv new file mode 100644 index 0000000..13cd95e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/dandiset_summary_by_region.tsv @@ -0,0 +1,2 @@ +region bytes_sent +unknown 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv deleted file mode 100644 index 840e3cf..0000000 --- a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/summary.tsv +++ /dev/null @@ -1,2 +0,0 @@ - date bytes_sent -0 2023-01-01 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv 
b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv new file mode 100644 index 0000000..4265ef4 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_asset.tsv @@ -0,0 +1,2 @@ +asset_path bytes_sent +sub-MITU01/ses-20220317h10m43s39/micr/sub-MITU01_ses-20220317h10m43s39_sample-21_stain-LEC_run-1_chunk-1_SPIM.ome.zarr 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_day.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_day.tsv new file mode 100644 index 0000000..7579108 --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_day.tsv @@ -0,0 +1,2 @@ +date bytes_sent +2023-01-01 1526223 diff --git a/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_region.tsv b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_region.tsv new file mode 100644 index 0000000..13cd95e --- /dev/null +++ b/test_live_services/test_mapping/examples/mapped_to_dandisets_example_0/expected_output/000108/draft/version_summary_by_region.tsv @@ -0,0 +1,2 @@ +region bytes_sent +unknown 1526223 diff --git a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py index d2b3f6b..ce61755 100644 --- a/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py +++ b/test_live_services/test_mapping/test_map_all_reduced_s3_logs_to_dandisets.py @@ -35,11 +35,15 @@ def test_map_all_reduced_s3_logs_to_dandisets(tmpdir: py.path.local): relative_file_path 
= expected_file_path.relative_to(expected_output_folder_path) test_file_path = test_mapped_s3_logs_folder_path / relative_file_path - # Pandas assertion makes no reference to the file being tested when it fails - print(f"{test_file_path=}") - print(f"{expected_file_path=}") - test_mapped_log = pandas.read_table(filepath_or_buffer=test_file_path, index_col=0) expected_mapped_log = pandas.read_table(filepath_or_buffer=expected_file_path, index_col=0) - pandas.testing.assert_frame_equal(left=test_mapped_log, right=expected_mapped_log) + # Pandas assertion makes no reference to the case being tested when it fails + try: + pandas.testing.assert_frame_equal(left=test_mapped_log, right=expected_mapped_log) + except AssertionError as exception: + message = ( + f"\n\nTest file path: {test_file_path}\nExpected file path: {expected_file_path}\n\n" + f"{str(exception)}\n\n" + ) + raise AssertionError(message)