Skip to content

Commit

Permalink
Release limit and shuffle (#30)
Browse files Browse the repository at this point in the history
* release limit and shuffle

* debug

---------

Co-authored-by: CodyCBakerPhD <[email protected]>
  • Loading branch information
CodyCBakerPhD and CodyCBakerPhD authored Aug 12, 2024
1 parent 98c26a4 commit 3c2a166
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 3 deletions.
4 changes: 3 additions & 1 deletion src/dandi_s3_log_parser/_dandi_s3_log_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import importlib.metadata
import os
import pathlib
import random
import shutil
import traceback
import uuid
Expand Down Expand Up @@ -81,8 +82,9 @@ def asset_id_handler(*, raw_asset_id: str) -> str:
split_by_slash = raw_asset_id.split("/")
return split_by_slash[0] + "_" + split_by_slash[-1]

# The .rglob is not naturally sorted; shuffle for more uniform progress updates.
# NOTE(review): random.shuffle is in-place and returns None, so it must be
# applied to the list that is actually used afterwards — shuffling a temporary
# copy (random.shuffle(list(...))) has no effect. Also removed the leftover
# debug truncation ([:10]) that silently limited processing to 10 log files.
daily_raw_s3_log_file_paths = list(set(base_raw_s3_log_folder_path.rglob(pattern="*.log")) - excluded_log_files)
random.shuffle(daily_raw_s3_log_file_paths)

if maximum_number_of_workers == 1:
for raw_s3_log_file_path in tqdm.tqdm(
Expand Down
1 change: 0 additions & 1 deletion tests/test_parse_all_dandi_raw_s3_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ def test_parse_all_dandi_raw_s3_logs_example_1(tmpdir: py.path.local) -> None:
)
expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0)

# Sometimes the order of line parsings is different; unsure why this is not deterministic
test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")

Expand Down
1 change: 0 additions & 1 deletion tests/test_parse_all_dandi_raw_s3_logs_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ def test_parse_all_dandi_raw_s3_logs_example_0_parallel(tmpdir: py.path.local) -
)
expected_parsed_s3_log = pandas.read_table(filepath_or_buffer=expected_parsed_s3_log_file_path, index_col=0)

# Parallelization might have merged things out of deterministic order based on whichever worker finished first
test_parsed_s3_log = test_parsed_s3_log.sort_values(by="timestamp")
expected_parsed_s3_log = expected_parsed_s3_log.sort_values(by="timestamp")

Expand Down

0 comments on commit 3c2a166

Please sign in to comment.