
Commit

enhance README with time estimates; properly time the writing of start/stop for binning
CodyCBakerPhD committed Aug 22, 2024
1 parent 4a315dc commit 566ee1d
Showing 2 changed files with 16 additions and 7 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -98,6 +98,8 @@ reduce_all_dandi_raw_s3_logs \
--excluded_ips < Drogons IP >
```

In summer of 2024, this process took less than 10 hours to process all 6 TB of raw log data (using 3 workers at 3 GB buffer size).
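For a rough sense of what "3 workers at 3 GB buffer size" means in practice, here is an illustrative sketch (not the actual `reduce_all_dandi_raw_s3_logs` implementation; the raw-log folder path and the per-file helper are assumptions):

```python
# Illustrative only -- not the actual reduce_all_dandi_raw_s3_logs implementation.
# Three worker processes each stream one raw log file in ~3 GB chunks so memory stays bounded.
import pathlib
from concurrent.futures import ProcessPoolExecutor

MAXIMUM_BUFFER_SIZE_IN_BYTES = 3 * 10**9  # roughly 3 GB of lines held in memory per worker


def reduce_one_raw_log(raw_log_file_path: pathlib.Path) -> None:
    """Placeholder for the per-file reduction step (parse lines, keep only the fields of interest)."""
    with open(file=raw_log_file_path, mode="r") as io:
        while buffered_lines := io.readlines(MAXIMUM_BUFFER_SIZE_IN_BYTES):
            pass  # parse and reduce this buffered chunk, then append to the reduced output


if __name__ == "__main__":
    raw_log_file_paths = list(pathlib.Path("/mnt/backup/dandi/raw-s3-logs").rglob("*.log"))  # assumed location
    with ProcessPoolExecutor(max_workers=3) as executor:
        list(executor.map(reduce_one_raw_log, raw_log_file_paths))
```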

### Binning

To bin:
@@ -127,6 +129,8 @@ bin_all_reduced_s3_logs_by_object_key \
--file_limit 20
```

In summer of 2024, this process took less than ?? hours to bin all ?? GB of reduced log data.

### Mapping

The next step, which should also be run regularly (daily to weekly), is to iterate through all current versions of all Dandisets, mapping the reduced logs to their assets.
@@ -145,6 +149,9 @@ map_binned_s3_logs_to_dandisets \
--dandiset_logs_folder_path /mnt/backup/dandi/mapped-dandiset-logs
```

In summer of 2024, this process took less than ?? hours to run. In the current design, it should be rerun on a regular basis to keep the logs up to date.
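As a rough outline of what this mapping step does, here is a conceptual sketch (the Dandiset/asset lookup is stubbed out, and the binned-logs folder path is an assumption; this is not the `map_binned_s3_logs_to_dandisets` internals):

```python
# Conceptual sketch only -- the asset lookup is a stub, not the real map_binned_s3_logs_to_dandisets internals.
import pathlib

import pandas

binned_s3_logs_folder_path = pathlib.Path("/mnt/backup/dandi/binned-s3-logs")       # assumed location
dandiset_logs_folder_path = pathlib.Path("/mnt/backup/dandi/mapped-dandiset-logs")  # from the command above


def iterate_assets_of_all_dandiset_versions():
    """Hypothetical placeholder: yield (dandiset_id, version_id, asset_path, object_key) tuples.

    The real pipeline would query the DANDI API for every version of every Dandiset.
    """
    yield from ()


for dandiset_id, version_id, asset_path, object_key in iterate_assets_of_all_dandiset_versions():
    binned_s3_log_file_path = binned_s3_logs_folder_path / f"{object_key}.tsv"
    if not binned_s3_log_file_path.exists():
        continue  # no recorded access for this asset

    data_frame = pandas.read_csv(filepath_or_buffer=binned_s3_log_file_path, sep="\t")
    mapped_file_path = dandiset_logs_folder_path / dandiset_id / version_id / f"{asset_path}.tsv"
    mapped_file_path.parent.mkdir(parents=True, exist_ok=True)
    data_frame.to_csv(path_or_buf=mapped_file_path, sep="\t", index=False)
```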



## Submit line decoding errors

9 changes: 9 additions & 7 deletions < file defining bin_all_reduced_s3_logs_by_object_key >
@@ -63,12 +63,11 @@ def bin_all_reduced_s3_logs_by_object_key(
mininterval=3.0,
smoothing=0,
):
with open(file=started_tracking_file_path, mode="a") as started_tracking_file:
started_tracking_file.write(f"{reduced_s3_log_file}: 1\n")

if reduced_s3_log_file.stat().st_size == 0:
with open(file=completed_tracking_file_path, mode="a") as started_tracking_file:
started_tracking_file.write(f"{reduced_s3_log_file}\n")
with open(file=started_tracking_file_path, mode="a") as io:
io.write(f"{reduced_s3_log_file}: 1\n")
with open(file=completed_tracking_file_path, mode="a") as io:
io.write(f"{reduced_s3_log_file}\n")

continue

@@ -88,6 +87,9 @@ def bin_all_reduced_s3_logs_by_object_key(
}
del binned_data_frame

with open(file=started_tracking_file_path, mode="a") as io:
io.write(f"{reduced_s3_log_file}: 1\n")

for object_key, data in tqdm.tqdm(
iterable=object_keys_to_data.items(),
total=len(object_keys_to_data),
@@ -108,5 +110,5 @@ def bin_all_reduced_s3_logs_by_object_key(
header = False if binned_s3_log_file_path.exists() else True
data_frame.to_csv(path_or_buf=binned_s3_log_file_path, mode="a", sep="\t", header=header, index=False)

with open(file=completed_tracking_file_path, mode="a") as started_tracking_file:
started_tracking_file.write(f"{reduced_s3_log_file}\n")
with open(file=completed_tracking_file_path, mode="a") as io:
io.write(f"{reduced_s3_log_file}\n")
