From 3c2a1660cb037dd348c99cab214c43b484a13c3c Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 12 Aug 2024 13:25:32 -0400 Subject: [PATCH] Release limit and shuffle (#30) * release limit and shuffle * debug --------- Co-authored-by: CodyCBakerPhD --- src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py | 5 ++++- tests/test_parse_all_dandi_raw_s3_logs.py | 1 - tests/test_parse_all_dandi_raw_s3_logs_parallel.py | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py index 2db203a..07a48c4 100644 --- a/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py +++ b/src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py @@ -5,6 +5,7 @@ import importlib.metadata import os import pathlib +import random import shutil import traceback import uuid @@ -81,8 +82,10 @@ def asset_id_handler(*, raw_asset_id: str) -> str: split_by_slash = raw_asset_id.split("/") return split_by_slash[0] + "_" + split_by_slash[-1] + # The .rglob is not naturally sorted; shuffle for more uniform progress updates daily_raw_s3_log_file_paths = set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_files - daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths)[:10] + daily_raw_s3_log_file_paths = list(daily_raw_s3_log_file_paths) + random.shuffle(daily_raw_s3_log_file_paths) if maximum_number_of_workers == 1: for raw_s3_log_file_path in tqdm.tqdm( diff --git a/tests/test_parse_all_dandi_raw_s3_logs.py b/tests/test_parse_all_dandi_raw_s3_logs.py index 71df631..5dc7301 100644 --- a/tests/test_parse_all_dandi_raw_s3_logs.py +++ b/tests/test_parse_all_dandi_raw_s3_logs.py @@ -44,7 +44,6 @@ def test_parse_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None: ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - # Sometimes the order of line parsings is different; unsure why this is not deterministic 
test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp") diff --git a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py index 472c7f8..f1a828e 100644 --- a/tests/test_parse_all_dandi_raw_s3_logs_parallel.py +++ b/tests/test_parse_all_dandi_raw_s3_logs_parallel.py @@ -45,7 +45,6 @@ def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) - ) expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0) - # Parallelization might have merged things out of deterministic order based on whichever worker finished first test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp") expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")