catalystneuro · CodyCBakerPhD · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024
diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
@@ -5,6 +5,7 @@
 import importlib.metadata
 import os
 import pathlib
+import random
 import shutil
 import traceback
 import uuid
@@ -81,8 +82,10 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
         split_by_slash = raw_asset_id.split("/")
         return split_by_slash[0] + "_" + split_by_slash[-1]
 
+    # The .rglob is not naturally sorted; shuffle for more uniform progress updates
     daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_files
-    daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths)[:10]
+    daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths)
+    random.shuffle(daily_raw_s3_log_file_paths)
 
     if maximum_number_of_workers == 1:
         for raw_s3_log_file_path in tqdm.tqdm(

diff --git a/tests/test_parse_all_dandi_raw_s3_logs.py b/tests/test_parse_all_dandi_raw_s3_logs.py
@@ -44,7 +44,6 @@ def test_parse_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None:
         )
         expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0)
 
-        # Sometimes the order of line parsings is different; unsure why this is not deterministic
         test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
         expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")
 

diff --git a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py
@@ -45,7 +45,6 @@ def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) -
         )
         expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0)
 
-        # Parallelization might have merged things out of deterministic order based on whichever worker finished first
         test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
         expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")