From 8655bbba1da63f89bed55c781cef03f7fdaa6a59 Mon Sep 17 00:00:00 2001 From: CodyCBakerPhD Date: Sun, 11 Aug 2024 22:29:48 -0400 Subject: [PATCH] debugging --- .../_dandi_s3_log_file_parser.py | 6 +++--- src/dandi_s3_log_parser/_order_parsed_logs.py | 2 +- src/dandi_s3_log_parser/_s3_log_file_parser.py | 7 +++---- tests/test_parse_dandi_raw_s3_log.py | 4 ++-- tests/test_parse_dandi_raw_s3_log_bad_lines.py | 15 +++++---------- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index fbe4583..0ca5d46 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -94,10 +94,10 @@ def asset_id_handler(*, raw_asset_id: str) -> str: split_by_slash = raw_asset_id.split("/") return split_by_slash[0] + "_" + split_by_slash[-1] - daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - # Workaround to particular issue with current repo storage structure on Drogon - daily_raw_s3_log_file_paths.remove(pathlib.Path("/mnt/backup/dandi/dandiarchive-logs/stats/start-end.log")) + daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - set( + [pathlib.Path("/mnt/backup/dandi/dandiarchive-logs/stats/start-end.log")] + ) if maximum_number_of_workers == 1: for raw_s3_log_file_path in tqdm.tqdm( diff --git a/src/dandi_s3_log_parser/_order_parsed_logs.py b/src/dandi_s3_log_parser/_order_parsed_logs.py index c2dc623..0031c42 100644 --- a/src/dandi_s3_log_parser/_order_parsed_logs.py +++ b/src/dandi_s3_log_parser/_order_parsed_logs.py @@ -24,7 +24,7 @@ def order_parsed_logs( unordered_file_paths = list(unordered_parsed_s3_log_folder_path.glob("*.tsv")) for unordered_parsed_s3_log_file_path in tqdm.tqdm( - iterable=unordered_parsed_s3_log_folder_path.glob("*.tsv"), + iterable=unordered_file_paths, total=len(unordered_file_paths), desc="Ordering parsed logs...", position=0, diff --git a/src/dandi_s3_log_parser/_s3_log_file_parser.py b/src/dandi_s3_log_parser/_s3_log_file_parser.py index d0bae29..01ce1c8 100644 --- a/src/dandi_s3_log_parser/_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_s3_log_file_parser.py @@ -86,12 +86,9 @@ def asset_id_handler(*, raw_asset_id: str) -> str: if order_results is True: # Create a fresh temporary directory in the home folder and then fresh subfolders for each job temporary_base_folder_path = parsed_s3_log_folder_path / ".temp" + shutil.rmtree(path=temporary_base_folder_path, ignore_errors=True) temporary_base_folder_path.mkdir(exist_ok=True) - # Clean up any previous tasks that failed to clean themselves up - for previous_task_folder_path in temporary_base_folder_path.iterdir(): - shutil.rmtree(path=previous_task_folder_path, ignore_errors=True) - task_id = str(uuid.uuid4())[:5] temporary_folder_path = temporary_base_folder_path / task_id temporary_folder_path.mkdir(exist_ok=True) @@ -108,6 +105,8 @@ def asset_id_handler(*, raw_asset_id: str) -> str: ip_hash_to_region_file_path=ip_hash_to_region_file_path, ) + print(reduced_logs) + reduced_logs_binned_by_unparsed_asset = dict() for reduced_log in reduced_logs: raw_asset_id = reduced_log.asset_id diff --git a/tests/test_parse_dandi_raw_s3_log.py b/tests/test_parse_dandi_raw_s3_log.py index 329b180..23dbd53 100644 --- a/tests/test_parse_dandi_raw_s3_log.py +++ b/tests/test_parse_dandi_raw_s3_log.py @@ -24,7 +24,7 @@ def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): dandi_s3_log_parser.parse_dandi_raw_s3_log( raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + test_output_file_paths = [path for path in test_parsed_s3_log_folder_path.iterdir() if path.is_file()] number_of_output_files = len(test_output_file_paths) assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" @@ -35,7 +35,7 @@ def test_parse_dandi_raw_s3_log_example_0(tmpdir: py.path.local): number_of_output_files == expected_number_of_output_files ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + expected_asset_ids = [path.stem for path in expected_parsed_s3_log_folder_path.iterdir() if path.is_file()] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids diff --git a/tests/test_parse_dandi_raw_s3_log_bad_lines.py b/tests/test_parse_dandi_raw_s3_log_bad_lines.py index 9ae2718..5c34f13 100644 --- a/tests/test_parse_dandi_raw_s3_log_bad_lines.py +++ b/tests/test_parse_dandi_raw_s3_log_bad_lines.py @@ -6,7 +6,7 @@ import dandi_s3_log_parser -def test_parse_dandi_raw_s3_log_example_2(tmpdir: py.path.local): +def test_parse_dandi_raw_s3_log_bad_lines(tmpdir: py.path.local): """ 'ordered_example_2' contains the basic test cases as well as a collection of 'bad lines' contributed over time. """ @@ -26,18 +26,13 @@ def test_parse_dandi_raw_s3_log_example_2(tmpdir: py.path.local): dandi_s3_log_parser.parse_dandi_raw_s3_log( raw_s3_log_file_path=example_raw_s3_log_file_path, parsed_s3_log_folder_path=test_parsed_s3_log_folder_path ) - test_output_file_paths = list(test_parsed_s3_log_folder_path.iterdir()) + test_output_file_paths = [path for path in test_parsed_s3_log_folder_path.iterdir() if path.is_file()] number_of_output_files = len(test_output_file_paths) - assert number_of_output_files != 0, f"Test expected_output folder ({test_parsed_s3_log_folder_path}) is empty!" + expected_number_of_output_files = 3 + assert number_of_output_files == expected_number_of_output_files - # Increment this over time as more examples are added - expected_number_of_output_files = 2 - assert ( - number_of_output_files == expected_number_of_output_files - ), f"The number of asset files ({number_of_output_files}) does not match expectation!" - - expected_asset_ids = [file_path.stem for file_path in expected_parsed_s3_log_folder_path.iterdir()] + expected_asset_ids = [path.stem for path in expected_parsed_s3_log_folder_path.iterdir() if path.is_file()] for test_parsed_s3_log_file_path in test_output_file_paths: assert ( test_parsed_s3_log_file_path.stem in expected_asset_ids