diff --git a/README.md b/README.md
index c1ec5ce..d350d46 100644
--- a/README.md
+++ b/README.md
@@ -98,6 +98,8 @@ reduce_all_dandi_raw_s3_logs \
   --excluded_ips < Drogons IP >
 ```
 
+In the summer of 2024, this process took less than 10 hours to process all 6 TB of raw log data (using 3 workers at 3 GB buffer size).
+
 ### Binning
 
 To bin:
@@ -127,6 +129,8 @@ bin_all_reduced_s3_logs_by_object_key \
   --file_limit 20
 ```
 
+In the summer of 2024, this process took less than ?? hours to bin all ?? GB of reduced log data.
+
 ### Mapping
 
 The next step, that should also be updated regularly (daily-weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets.
@@ -145,6 +149,9 @@ map_binned_s3_logs_to_dandisets \
   --dandiset_logs_folder_path /mnt/backup/dandi/mapped-dandiset-logs
 ```
 
+In the summer of 2024, this process took less than ?? hours to run; in the current design, it should be re-run regularly to keep the logs up to date.
+
+
 ## Submit line decoding errors
diff --git a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
index abf84f8..64b4977 100644
--- a/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
+++ b/src/dandi_s3_log_parser/_bin_all_reduced_s3_logs_by_object_key.py
@@ -63,12 +63,11 @@ def bin_all_reduced_s3_logs_by_object_key(
         mininterval=3.0,
         smoothing=0,
     ):
-        with open(file=started_tracking_file_path, mode="a") as started_tracking_file:
-            started_tracking_file.write(f"{reduced_s3_log_file}: 1\n")
-
         if reduced_s3_log_file.stat().st_size == 0:
-            with open(file=completed_tracking_file_path, mode="a") as started_tracking_file:
-                started_tracking_file.write(f"{reduced_s3_log_file}\n")
+            with open(file=started_tracking_file_path, mode="a") as io:
+                io.write(f"{reduced_s3_log_file}: 1\n")
+            with open(file=completed_tracking_file_path, mode="a") as io:
+                io.write(f"{reduced_s3_log_file}\n")
 
             continue
 
@@ -88,6 +87,9 @@ def bin_all_reduced_s3_logs_by_object_key(
         }
         del binned_data_frame
 
+        with open(file=started_tracking_file_path, mode="a") as io:
+            io.write(f"{reduced_s3_log_file}: 1\n")
+
         for object_key, data in tqdm.tqdm(
             iterable=object_keys_to_data.items(),
             total=len(object_keys_to_data),
@@ -108,5 +110,5 @@ def bin_all_reduced_s3_logs_by_object_key(
             header = False if binned_s3_log_file_path.exists() else True
             data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False)
 
-        with open(file=completed_tracking_file_path, mode="a") as started_tracking_file:
-            started_tracking_file.write(f"{reduced_s3_log_file}\n")
+        with open(file=completed_tracking_file_path, mode="a") as io:
+            io.write(f"{reduced_s3_log_file}\n")
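For readers skimming the `_bin_all_reduced_s3_logs_by_object_key.py` hunks: the change moves the write to the "started" tracking file so a file is only recorded as started once its contents have actually been loaded (or immediately, for empty files), while the "completed" tracking file is still appended to when a file finishes. Below is a minimal, self-contained sketch of that resumable-processing pattern; the `process_files` function, the `started.txt`/`completed.txt` file names, and the stand-in workload are illustrative assumptions, not the package's actual API.

```python
# Minimal sketch (illustrative only) of the started/completed tracking-file
# pattern in bin_all_reduced_s3_logs_by_object_key: append a record when work
# on a file genuinely begins and another when it finishes, so an interrupted
# run can be resumed by skipping files already marked as completed.
import pathlib


def process_files(files: list[pathlib.Path], tracking_folder: pathlib.Path) -> None:
    # Hypothetical tracking file names; the real package manages its own paths.
    started_tracking_file_path = tracking_folder / "started.txt"
    completed_tracking_file_path = tracking_folder / "completed.txt"

    completed = set()
    if completed_tracking_file_path.exists():
        completed = set(completed_tracking_file_path.read_text().splitlines())

    for file_path in files:
        if str(file_path) in completed:
            continue  # already handled by a previous (possibly interrupted) run

        if file_path.stat().st_size == 0:
            # Empty file: nothing to process, so mark it started and completed at once.
            with open(file=started_tracking_file_path, mode="a") as io:
                io.write(f"{file_path}: 1\n")
            with open(file=completed_tracking_file_path, mode="a") as io:
                io.write(f"{file_path}\n")
            continue

        contents = file_path.read_text()  # stand-in for loading and binning the reduced log

        # Only record "started" once the expensive load has succeeded, mirroring
        # the relocation of this write in the diff above.
        with open(file=started_tracking_file_path, mode="a") as io:
            io.write(f"{file_path}: 1\n")

        # ... per-object-key output would be appended here ...
        del contents

        with open(file=completed_tracking_file_path, mode="a") as io:
            io.write(f"{file_path}\n")
```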