From 683c2811acc93372fcd337bfedc3417814509535 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Fri, 16 Aug 2024 15:16:36 -0400 Subject: [PATCH] Post-meeting fixes 1 (#50) * adjust workflows; adjust test cases; add logic; refactor asset handler * remove from main testing * debug * fix CI * fix CI * fix CI --------- Co-authored-by: CodyCBakerPhD --- .github/workflows/deploy_daily_tests.yml | 7 ++- .../deploy_tests_on_pull_request.yml | 9 +++- .github/workflows/testing.yml | 51 +++++++++++++++++++ ...ting_dev.yml => testing_live_services.yml} | 2 +- pyproject.toml | 2 +- .../_dandi_s3_log_file_reducer.py | 40 ++++++++++----- src/dandi_s3_log_parser/_dandiset_mapper.py | 32 +++++++----- .../_s3_log_file_reducer.py | 14 +++-- .../expected_output/000003/0.210812.1448.tsv | 0 .../expected_output/000003/0.230629.1955.tsv | 0 .../expected_output/000003/draft.tsv | 0 .../expected_output/000013/0.220126.2143.tsv | 0 .../expected_output/000013/draft.tsv | 0 .../58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv | 0 .../cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv | 0 .../test_map_reduced_logs_to_all_dandisets.py | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv} | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv} | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv} | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv} | 0 ...s_11ec8933-1456-4942-922b-94e5878bb991.tsv | 3 -- .../0801d996-200e-4173-ab49-d1784427e96a.tsv} | 0 .../11ec8933-1456-4942-922b-94e5878bb991.tsv} | 0 .../a7b032b8-1e31-429f-975f-52a28cec6629.tsv} | 0 tests/test_reduce_all_dandi_raw_s3_logs.py | 8 +-- ...t_reduce_all_dandi_raw_s3_logs_parallel.py | 10 ++-- tests/test_reduce_dandi_raw_s3_log.py | 9 ++-- .../test_reduce_dandi_raw_s3_log_bad_lines.py | 9 ++-- 28 files changed, 143 insertions(+), 53 deletions(-) create mode 100644 .github/workflows/testing.yml rename .github/workflows/{testing_dev.yml => testing_live_services.yml} (95%) rename {tests => test_live_services}/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv (100%) rename {tests => test_live_services}/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv (100%) rename {tests => test_live_services}/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv (100%) rename {tests => test_live_services}/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv (100%) rename {tests => test_live_services}/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv (100%) rename tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv => test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv (100%) rename tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv => test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv (100%) rename {tests => test_live_services}/test_map_reduced_logs_to_all_dandisets.py (100%) rename tests/examples/reduced_example_0/expected_output/{blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv => blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv} (100%) rename tests/examples/reduced_example_0/expected_output/{blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv => blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv} (100%) rename tests/examples/{mapped_to_dandiset_example_0/reduced_logs/blobs_not_a_real_id.tsv => reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv} (100%) rename tests/examples/reduced_example_1/expected_output/{blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv => blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv} (100%) delete mode 100644 tests/examples/reduced_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv rename tests/examples/reduced_example_2/expected_output/{blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv => blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv} (100%) rename tests/examples/reduced_example_2/expected_output/{blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv => blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv} (100%) rename tests/examples/reduced_example_2/expected_output/{blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv => blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv} (100%) diff --git a/.github/workflows/deploy_daily_tests.yml b/.github/workflows/deploy_daily_tests.yml index 2207945..4cc5f95 100644 --- a/.github/workflows/deploy_daily_tests.yml +++ b/.github/workflows/deploy_daily_tests.yml @@ -12,7 +12,12 @@ concurrency: jobs: DailyTests: - uses: ./.github/workflows/testing_dev.yml + uses: ./.github/workflows/testing.yml + secrets: + CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }} + + LiveServices: + uses: ./.github/workflows/testing_live_services.yml secrets: IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }} IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }} diff --git a/.github/workflows/deploy_tests_on_pull_request.yml b/.github/workflows/deploy_tests_on_pull_request.yml index dd6c825..086e124 100644 --- a/.github/workflows/deploy_tests_on_pull_request.yml +++ b/.github/workflows/deploy_tests_on_pull_request.yml @@ -9,8 +9,13 @@ concurrency: jobs: - DevTests: - uses: ./.github/workflows/testing_dev.yml + Tests: + uses: ./.github/workflows/testing.yml + secrets: + CODECOV_CREDENTIALS: ${{ secrets.CODECOV_CREDENTIALS }} + + LiveServices: + uses: ./.github/workflows/testing_live_services.yml secrets: IP_HASH_SALT: ${{ secrets.IP_HASH_SALT }} IPINFO_CREDENTIALS: ${{ secrets.IPINFO_CREDENTIALS }} diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml new file mode 100644 index 0000000..1aa758b --- /dev/null +++ b/.github/workflows/testing.yml @@ -0,0 +1,51 @@ +name: Dev tests +on: + workflow_call: + secrets: + CODECOV_CREDENTIALS: + required: true + +jobs: + + run: + # Will read on PR dashboard as 'Deploy / DevTests / ubuntu' + # Action dashboard identified by 'Dev tests' + # Requirement settings identified as 'DevTests / ubuntu' + name: ubuntu + runs-on: ubuntu-latest + strategy: + fail-fast: false + + steps: + - uses: actions/checkout@v4 + - run: git fetch --prune --unshallow --tags + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Global Setup + run: | + python -m pip install -U pip + pip install pytest-cov + + - name: Install local checkout + run: pip install --no-cache-dir . + + - name: Display installed packages and their sources for debugging + run: pip list + + - name: Run pytest with coverage and printout coverage for debugging + run: | + pytest tests -vv -rsx --cov=dandi_s3_log_parser --cov-report xml:./coverage.xml + cat ./coverage.xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_CREDENTIALS }} + file: ./coverage.xml + flags: unittests + name: codecov-umbrella + fail_ci_if_error: true + verbose: true diff --git a/.github/workflows/testing_dev.yml b/.github/workflows/testing_live_services.yml similarity index 95% rename from .github/workflows/testing_dev.yml rename to .github/workflows/testing_live_services.yml index 27544ee..fa78bfb 100644 --- a/.github/workflows/testing_dev.yml +++ b/.github/workflows/testing_live_services.yml @@ -62,7 +62,7 @@ jobs: - name: Run pytest with coverage and printout coverage for debugging run: | - pytest -vv -rsx --cov=dandi_s3_log_parser --cov-report xml:./coverage.xml + pytest test_live_services -vv -rsx --cov=dandi_s3_log_parser --cov-report xml:./coverage.xml cat ./coverage.xml - name: Upload coverage to Codecov diff --git a/pyproject.toml b/pyproject.toml index 6b0477d..e273be6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ packages = ["src/dandi_s3_log_parser"] [project] name = "dandi_s3_log_parser" -version="0.2.0" +version="0.3.0" authors = [ { name="Cody Baker", email="cody.c.baker.phd@gmail.com" }, ] diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py index d8dc64d..a7f7b6f 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_reducer.py @@ -139,15 +139,17 @@ def reduce_all_dandi_raw_s3_logs( print("\n\nParallel parsing complete!\n\n") - for per_worker_temporary_folder_path in tqdm.tqdm( - iterable=per_worker_temporary_folder_paths, - desc="Merging results across workers...", - total=len(per_worker_temporary_folder_paths), - position=0, - leave=True, - mininterval=2.0, + for worker_index, per_worker_temporary_folder_path in enumerate( + tqdm.tqdm( + iterable=per_worker_temporary_folder_paths, + desc="Merging results across workers...", + total=len(per_worker_temporary_folder_paths), + position=0, + leave=True, + mininterval=2.0, + ) ): - per_worker_reduced_s3_log_file_paths = list(per_worker_temporary_folder_path.iterdir()) + per_worker_reduced_s3_log_file_paths = list(per_worker_temporary_folder_path.rglob("*.tsv")) assert ( len(per_worker_reduced_s3_log_file_paths) != 0 ), f"No files found in {per_worker_temporary_folder_path}!" @@ -160,20 +162,26 @@ def reduce_all_dandi_raw_s3_logs( leave=False, mininterval=2.0, ): - merged_temporary_file_path = reduced_s3_logs_folder_path / per_worker_reduced_s3_log_file_path.name + merge_target_file_path = reduced_s3_logs_folder_path / per_worker_reduced_s3_log_file_path.relative_to( + per_worker_temporary_folder_path + ) parsed_s3_log = pandas.read_table(filepath_or_buffer=per_worker_reduced_s3_log_file_path, header=0) - header = False if merged_temporary_file_path.exists() else True + merge_target_file_path_exists = merge_target_file_path.exists() + if not merge_target_file_path_exists and not merge_target_file_path.parent.exists(): + merge_target_file_path.parent.mkdir(exist_ok=True, parents=True) + + header = False if merge_target_file_path_exists else True parsed_s3_log.to_csv( - path_or_buf=merged_temporary_file_path, + path_or_buf=merge_target_file_path, mode="a", sep="\t", header=header, index=False, ) - print("\n\n") + shutil.rmtree(path=temporary_base_folder_path) # Function cannot be covered because the line calls occur on subprocesses @@ -302,6 +310,12 @@ def asset_id_handler(*, raw_asset_id: str) -> str: def _get_default_dandi_asset_id_handler() -> Callable: def asset_id_handler(*, raw_asset_id: str) -> str: split_by_slash = raw_asset_id.split("/") - return split_by_slash[0] + "_" + split_by_slash[-1] + + asset_type = split_by_slash[0] + if asset_type == "zarr": + zarr_blob_form = "/".join(split_by_slash[:2]) + return zarr_blob_form + + return raw_asset_id return asset_id_handler diff --git a/src/dandi_s3_log_parser/_dandiset_mapper.py b/src/dandi_s3_log_parser/_dandiset_mapper.py index 737a04f..a2fab6d 100644 --- a/src/dandi_s3_log_parser/_dandiset_mapper.py +++ b/src/dandi_s3_log_parser/_dandiset_mapper.py @@ -72,33 +72,37 @@ def _map_reduced_logs_to_dandiset( dandiset_version = client.get_dandiset(dandiset_id=dandiset_id, version_id=version_id) - all_reduced_logs = [] + all_reduced_s3_logs = [] for asset in dandiset_version.get_assets(): asset_suffixes = pathlib.Path(asset.path).suffixes is_asset_zarr = ".zarr" in asset_suffixes - blob_id = asset.blob if not is_asset_zarr else asset.zarr - blobs_or_zarr = "blobs" if not is_asset_zarr else "zarr" + if is_asset_zarr: + blob_id = asset.zarr + reduced_s3_log_file_path = reduced_s3_logs_folder_path / "zarr" / f"{blob_id}.tsv" + else: + blob_id = asset.blob + reduced_s3_log_file_path = ( + reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" + ) - reduced_log_file_path = reduced_s3_logs_folder_path / f"{blobs_or_zarr}_{blob_id}.tsv" - - if not reduced_log_file_path.exists(): + if not reduced_s3_log_file_path.exists(): continue # No reduced logs found (possible asset was never accessed); skip to next asset - reduced_log = pandas.read_table(filepath_or_buffer=reduced_log_file_path, header=0) - reduced_log["filename"] = [asset.path] * len(reduced_log) - reduced_log["region"] = [ + reduced_s3_log = pandas.read_table(filepath_or_buffer=reduced_s3_log_file_path, header=0) + reduced_s3_log["filename"] = [asset.path] * len(reduced_s3_log) + reduced_s3_log["region"] = [ get_region_from_ip_address(ip_address=ip_address, ip_hash_to_region=ip_hash_to_region) - for ip_address in reduced_log["ip_address"] + for ip_address in reduced_s3_log["ip_address"] ] - reordered_reduced_log = reduced_log.reindex(columns=("filename", "timestamp", "bytes_sent", "region")) - all_reduced_logs.append(reordered_reduced_log) + reordered_reduced_s3_log = reduced_s3_log.reindex(columns=("filename", "timestamp", "bytes_sent", "region")) + all_reduced_s3_logs.append(reordered_reduced_s3_log) - if len(all_reduced_logs) == 0: + if len(all_reduced_s3_logs) == 0: continue # No reduced logs found (possible dandiset version was never accessed); skip to next version - mapped_log = pandas.concat(objs=all_reduced_logs, ignore_index=True) + mapped_log = pandas.concat(objs=all_reduced_s3_logs, ignore_index=True) mapped_log.sort_values(by="timestamp") mapped_log.index = range(len(mapped_log)) diff --git a/src/dandi_s3_log_parser/_s3_log_file_reducer.py b/src/dandi_s3_log_parser/_s3_log_file_reducer.py index 4ee98a1..7aa5bc2 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_reducer.py +++ b/src/dandi_s3_log_parser/_s3_log_file_reducer.py @@ -44,7 +44,7 @@ def reduce_raw_s3_log( ---------- raw_s3_log_file_path : str or pathlib.Path The path to the raw S3 log file. - reduced_s3_log_folder_path : str or pathlib.Path + reduced_s3_logs_folder_path : str or pathlib.Path The path to write each reduced S3 log file to. There will be one file per handled asset ID. mode : "w" or "a", default: "a" @@ -94,12 +94,18 @@ def asset_id_handler(*, raw_asset_id: str) -> str: ) for handled_asset_id, reduced_logs_per_handled_asset_id in reduced_and_binned_logs.items(): - parsed_s3_log_file_path = reduced_s3_logs_folder_path / f"{handled_asset_id}.tsv" + handled_asset_id_path = pathlib.Path(handled_asset_id) + blob_id = handled_asset_id_path.stem + reduced_s3_log_file_path = reduced_s3_logs_folder_path / handled_asset_id_path.parent / f"{blob_id}.tsv" + + reduced_log_file_exists = reduced_s3_log_file_path.exists() + if not reduced_log_file_exists and not reduced_s3_log_file_path.parent.exists(): + reduced_s3_log_file_path.parent.mkdir(exist_ok=True, parents=True) data_frame = pandas.DataFrame(data=reduced_logs_per_handled_asset_id) - header = False if parsed_s3_log_file_path.exists() is True and mode == "a" else True - data_frame.to_csv(path_or_buf=parsed_s3_log_file_path, mode=mode, sep="\t", header=header, index=False) + header = False if reduced_log_file_exists is True and mode == "a" else True + data_frame.to_csv(path_or_buf=reduced_s3_log_file_path, mode=mode, sep="\t", header=header, index=False) def _get_reduced_and_binned_log_lines( diff --git a/tests/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.210812.1448.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/0.230629.1955.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000003/draft.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/0.220126.2143.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/expected_output/000013/draft.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/58c/537/58c53789-eec4-4080-ad3b-207cf2a1cac9.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv b/test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv rename to test_live_services/examples/mapped_to_dandiset_example_0/reduced_logs/blobs/cad/9e8/cad9e87d-9154-464c-91d6-2f08f2a9c354.tsv diff --git a/tests/test_map_reduced_logs_to_all_dandisets.py b/test_live_services/test_map_reduced_logs_to_all_dandisets.py similarity index 100% rename from tests/test_map_reduced_logs_to_all_dandisets.py rename to test_live_services/test_map_reduced_logs_to_all_dandisets.py diff --git a/tests/examples/reduced_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv similarity index 100% rename from tests/examples/reduced_example_0/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv rename to tests/examples/reduced_example_0/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv diff --git a/tests/examples/reduced_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv similarity index 100% rename from tests/examples/reduced_example_0/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv rename to tests/examples/reduced_example_0/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv diff --git a/tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_not_a_real_id.tsv b/tests/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv similarity index 100% rename from tests/examples/mapped_to_dandiset_example_0/reduced_logs/blobs_not_a_real_id.tsv rename to tests/examples/reduced_example_1/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv diff --git a/tests/examples/reduced_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv similarity index 100% rename from tests/examples/reduced_example_1/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv rename to tests/examples/reduced_example_1/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv diff --git a/tests/examples/reduced_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/reduced_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv deleted file mode 100644 index 570c480..0000000 --- a/tests/examples/reduced_example_1/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv +++ /dev/null @@ -1,3 +0,0 @@ -timestamp bytes_sent ip_address line_index -2022-03-16 02:21:12 512 192.0.2.0 1 -2022-05-04 05:06:35 512 192.0.2.0 1 diff --git a/tests/examples/reduced_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv b/tests/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv similarity index 100% rename from tests/examples/reduced_example_2/expected_output/blobs_0801d996-200e-4173-ab49-d1784427e96a.tsv rename to tests/examples/reduced_example_2/expected_output/blobs/080/1d9/0801d996-200e-4173-ab49-d1784427e96a.tsv diff --git a/tests/examples/reduced_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv b/tests/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv similarity index 100% rename from tests/examples/reduced_example_2/expected_output/blobs_11ec8933-1456-4942-922b-94e5878bb991.tsv rename to tests/examples/reduced_example_2/expected_output/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991.tsv diff --git a/tests/examples/reduced_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv b/tests/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv similarity index 100% rename from tests/examples/reduced_example_2/expected_output/blobs_a7b032b8-1e31-429f-975f-52a28cec6629.tsv rename to tests/examples/reduced_example_2/expected_output/blobs/a7b/032/a7b032b8-1e31-429f-975f-52a28cec6629.tsv diff --git a/tests/test_reduce_all_dandi_raw_s3_logs.py b/tests/test_reduce_all_dandi_raw_s3_logs.py index 9e9ffd6..32757fd 100644 --- a/tests/test_reduce_all_dandi_raw_s3_logs.py +++ b/tests/test_reduce_all_dandi_raw_s3_logs.py @@ -21,7 +21,7 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: base_raw_s3_logs_folder_path=examples_folder_path, reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.iterdir()) + test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) number_of_output_files = len(test_output_file_paths) assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" @@ -32,15 +32,17 @@ def test_reduce_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: number_of_output_files == expected_number_of_output_files ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.iterdir()] + expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) + + blob_id = test_parsed_s3_log_file_path.stem expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) diff --git a/tests/test_reduce_all_dandi_raw_s3_logs_parallel.py b/tests/test_reduce_all_dandi_raw_s3_logs_parallel.py index c6aa6e0..2045792 100644 --- a/tests/test_reduce_all_dandi_raw_s3_logs_parallel.py +++ b/tests/test_reduce_all_dandi_raw_s3_logs_parallel.py @@ -22,9 +22,7 @@ def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, maximum_number_of_workers=2, ) - test_output_file_paths = [ - path for path in test_reduced_s3_logs_folder_path.iterdir() if path.is_file() - ] # Skip .temp + test_output_file_paths = [path for path in test_reduced_s3_logs_folder_path.rglob("*.tsv")] number_of_output_files = len(test_output_file_paths) assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" @@ -35,15 +33,17 @@ def test_reduce_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) number_of_output_files == expected_number_of_output_files ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.iterdir()] + expected_asset_ids = [file_path.stem for file_path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) + + blob_id = test_parsed_s3_log_file_path.stem expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) diff --git a/tests/test_reduce_dandi_raw_s3_log.py b/tests/test_reduce_dandi_raw_s3_log.py index d750589..0f37892 100644 --- a/tests/test_reduce_dandi_raw_s3_log.py +++ b/tests/test_reduce_dandi_raw_s3_log.py @@ -27,7 +27,7 @@ def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None: raw_s3_log_file_path=example_raw_s3_log_file_path, reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.iterdir()) + test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) number_of_output_files = len(test_output_file_paths) assert number_of_output_files != 0, f"Test expected_output folder ({test_reduced_s3_logs_folder_path}) is empty!" @@ -38,15 +38,18 @@ def test_reduce_dandi_raw_s3_log_example_0(tmpdir: py.path.local) -> None: number_of_output_files == expected_number_of_output_files ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.iterdir() if path.is_file()] + expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) + + blob_id = test_parsed_s3_log_file_path.stem expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) diff --git a/tests/test_reduce_dandi_raw_s3_log_bad_lines.py b/tests/test_reduce_dandi_raw_s3_log_bad_lines.py index 92a866d..f8c06d9 100644 --- a/tests/test_reduce_dandi_raw_s3_log_bad_lines.py +++ b/tests/test_reduce_dandi_raw_s3_log_bad_lines.py @@ -29,23 +29,26 @@ def test_reduce_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local) -> None: raw_s3_log_file_path=example_raw_s3_log_file_path, reduced_s3_logs_folder_path=test_reduced_s3_logs_folder_path, ) - test_output_file_paths = list(test_reduced_s3_logs_folder_path.iterdir()) + test_output_file_paths = list(test_reduced_s3_logs_folder_path.rglob("*.tsv")) number_of_output_files = len(test_output_file_paths) expected_number_of_output_files = 3 assert number_of_output_files == expected_number_of_output_files - expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.iterdir() if path.is_file()] + expected_asset_ids = [path.stem for path in expected_reduced_s3_logs_folder_path.rglob("*.tsv")] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids ), f"Asset ID {test_parsed_s3_log_file_path.stem} not found in expected asset IDs!" test_parsed_s3_log = pandas.read_table(filepath_or_buffer=test_parsed_s3_log_file_path) + + blob_id = test_parsed_s3_log_file_path.stem expected_parsed_s3_log_file_path = ( - expected_reduced_s3_logs_folder_path / f"{test_parsed_s3_log_file_path.stem}.tsv" + expected_reduced_s3_logs_folder_path / "blobs" / blob_id[:3] / blob_id[3:6] / f"{blob_id}.tsv" ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path) + pandas.testing.assert_frame_equal(left=test_parsed_s3_log, right=expected_parsed_s3_log) post_test_error_folder_contents = list(error_folder.iterdir()) if error_folder.exists() else list()