From 3c2a1660cb037dd348c99cab214c43b484a13c3c Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:25:32 -0400 Subject: [PATCH] Release limit and shuffle (#30) * release limit and shuffle * debug --------- Co-authored-by: CodyCBakerPhD --- src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py | 5 ++++- tests/test_parse_all_dandi_raw_s3_logs.py | 1 - tests/test_parse_all_dandi_raw_s3_logs_parallel.py | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index 2db203a..07a48c4 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -5,6 +5,7 @@ import importlib.metadata import os import pathlib +import random import shutil import traceback import uuid @@ -81,8 +82,10 @@ def asset_id_handler(*, raw_asset_id: str) -> str: split_by_slash = raw_asset_id.split("/") return split_by_slash[0] + "_" + split_by_slash[-1] + # The .rglob is not naturally sorted; shuffle for more uniform progress updates daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_files - daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths)[:10] + daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths) + random.shuffle(daily_raw_s3_log_file_paths) if maximum_number_of_workers == 1: for raw_s3_log_file_path in tqdm.tqdm( diff --git a/tests/test_parse_all_dandi_raw_s3_logs.py b/tests/test_parse_all_dandi_raw_s3_logs.py index 71df631..5dc7301 100644 --- a/tests/test_parse_all_dandi_raw_s3_logs.py +++ b/tests/test_parse_all_dandi_raw_s3_logs.py @@ -44,7 +44,6 @@ def test_parse_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - # Sometimes the order of line parsings is different; unsure why this is not deterministic 
test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") diff --git a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py index 472c7f8..f1a828e 100644 --- a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py +++ b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py @@ -45,7 +45,6 @@ def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) - ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - # Parallelization might have merged things out of deterministic order based on whichever worker finished first test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")