use zipstores for intermediate files
- define a separate work folder (workn5) for bigstitcher input (n5) and output (zarr),
  since these are not zipstores
- hopefully on graham, can use SLURM_TMPDIR for workn5, and project
  space for bids and work (if zipstores enabled)
akhanf committed Oct 27, 2024
1 parent 4562271 commit d3860ba
Showing 10 changed files with 103 additions and 100 deletions.
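
The change applied throughout the diffs below: when use_zipstore is enabled, intermediate arrays are written into a single .zarr.zip archive through zarr's ZipStore, rather than as a chunk-per-file DirectoryStore. A minimal sketch of that write path, with a dummy array and a made-up filename (it mirrors the edit to apply_basic_flatfield_corr_zarr.py at the bottom of this commit):

```python
import dask.array as da
import zarr

# dummy array standing in for a real tile stack
arr = da.zeros((1, 1, 256, 256, 256), chunks=(1, 1, 128, 128, 128))

use_zipstore = True
if use_zipstore:
    # one file on disk: friendlier to inode quotas on shared project space
    store = zarr.storage.ZipStore("SPIM.zarr.zip", dimension_separator="/", mode="w")
else:
    # conventional chunk-per-file layout
    store = zarr.storage.DirectoryStore("SPIM.zarr", dimension_separator="/")

da.to_zarr(arr, store, overwrite=True)

if use_zipstore:
    store.close()  # finalizes the zip central directory; zarr docs advise closing ZipStores after writing
```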
17 changes: 11 additions & 6 deletions config/config.yml
@@ -1,15 +1,16 @@
 samples: 'config/samples.tsv'
 
 root: 'bids' # can use a s3:// or gcs:// prefix to write output to cloud storage
-work: 'work'
+work: 'work' #intermediate files, uses zipstores if enabled
+workn5: 'workn5' #where the bigstitcher temp files go (not using zipstores)
 
 remote_creds: '~/.config/gcloud/application_default_credentials.json' #this is needed so we can pass creds to container
 
-use_zipstore: False #if True, produce SPIM.ome.zarr.zip instead of SPIM.ome.zarr
+use_zipstore: True #if True, produce SPIM.ome.zarr.zip instead of SPIM.ome.zarr, and zarr.zip for intermediate files
 
 #total resources available, used to set rule resources
-total_cores: 32
-total_mem_mb: 128000
+total_cores: 32
+total_mem_mb: 128000
 
 #import wildcards: tilex, tiley, channel, zslice (and prefix - unused)
 import_blaze:
@@ -31,10 +32,14 @@ basic_flatfield_corr:
   smoothness_flatfield: 1.0
   smoothness_darkfield: 1.0
   sparse_cost_darkfield: 0.01
+  output_chunks:
+    - 128
+    - 128
+    - 128
 
 
 bigstitcher:
-  use_interestpoints: False
+  use_interestpoints: True
   interest_points:
     downsample_xy: 4
     min_intensity: 0
@@ -72,7 +77,7 @@ bigstitcher:
     block_size_factor_z: 32
 
 ome_zarr:
-  desc: stitchedflatcorr
+  desc: stitchedraw
   max_downsampling_layers: 5 # e.g. 4 levels: { 0: orig, 1: ds2, 2: ds4, 3: ds8, 4: ds16}
   rechunk_size: #z, y, x
     - 1
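
Since the Snakefile expands environment variables in these paths (next file), the layout the commit message hopes for on graham could be expressed directly in the config. A sketch only: the $SLURM_TMPDIR path below is an assumption taken from the commit message, not a shipped default:

```yaml
root: 'bids'                    # final outputs on project space
work: 'work'                    # intermediates, written as .zarr.zip when use_zipstore is on
workn5: '$SLURM_TMPDIR/workn5'  # node-local scratch for the BigStitcher n5/zarr temp files
use_zipstore: True
```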
1 change: 1 addition & 0 deletions workflow/Snakefile
@@ -15,6 +15,7 @@ container: config["containers"]["spimprep"]
 # use expandvars so we can use e.g. '$SLURM_TMPDIR'
 root = os.path.expandvars(config["root"])
 work = os.path.expandvars(config["work"])
+workn5 = os.path.expandvars(config["workn5"])
 resampled = Path(root) / "derivatives" / "resampled"
26 changes: 14 additions & 12 deletions workflow/rules/bigstitcher.smk
@@ -7,15 +7,17 @@ rule zarr_to_bdv:
             sample="{sample}",
             acq="{acq}",
             desc="{desc}",
-            suffix="SPIM.zarr",
+            suffix="SPIM.{ext}".format(
+                ext="zarr.zip" if config["use_zipstore"] else "zarr"
+            ),
         ),
         metadata_json=rules.copy_blaze_metadata.output.metadata_json,
     params:
         max_downsampling_layers=5,
         temp_h5=str(
             Path(
                 bids(
-                    root=work,
+                    root=workn5,
                     subject="{subject}",
                     datatype="micr",
                     sample="{sample}",
@@ -29,7 +31,7 @@ rule zarr_to_bdv:
         temp_xml=str(
             Path(
                 bids(
-                    root=work,
+                    root=workn5,
                     subject="{subject}",
                     datatype="micr",
                     sample="{sample}",
@@ -44,7 +46,7 @@ rule zarr_to_bdv:
         bdv_n5=temp(
             directory(
                 bids(
-                    root=work,
+                    root=workn5,
                     subject="{subject}",
                     datatype="micr",
                     sample="{sample}",
@@ -56,7 +58,7 @@ rule zarr_to_bdv:
         ),
         bdv_xml=temp(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -119,7 +121,7 @@ rule bigstitcher_stitching:
     output:
         dataset_xml=temp(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -193,7 +195,7 @@ rule bigstitcher_detect_interestpoints:
     output:
         dataset_xml=temp(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -252,7 +254,7 @@ rule bigstitcher_match_interestpoints:
     output:
         dataset_xml=temp(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -320,7 +322,7 @@ rule bigstitcher_solver:
     output:
         dataset_xml=temp(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -368,7 +370,7 @@ rule bigstitcher_solver:
 rule bigstitcher_fusion:
     input:
         dataset_n5=bids(
-            root=work,
+            root=workn5,
             subject="{subject}",
             datatype="micr",
             sample="{sample}",
@@ -377,7 +379,7 @@ rule bigstitcher_fusion:
             suffix="bdv.n5",
         ),
         dataset_xml=bids(
-            root=work,
+            root=workn5,
             subject="{subject}",
             datatype="micr",
             sample="{sample}",
@@ -410,7 +412,7 @@ rule bigstitcher_fusion:
         zarr=temp(
             directory(
                 bids(
-                    root=work,
+                    root=workn5,
                     subject="{subject}",
                     datatype="micr",
                     sample="{sample}",
11 changes: 9 additions & 2 deletions workflow/rules/common.smk
@@ -82,6 +82,13 @@ def get_extension_ome_zarr():
     return "ome.zarr"
 
 
+def intermediate_zarr(path):
+    if config["use_zipstore"]:
+        return temp(path + ".zarr.zip")
+    else:
+        return temp(directory(path + ".zarr"))
+
+
 # targets
 def get_all_targets():
     targets = []
@@ -302,12 +309,12 @@ def get_output_ome_zarr(acq_type):
     if config["use_zipstore"]:
         return {
             "zarr": bids(
-                root=work,
+                root=root,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
                 acq=f"{{acq,[a-zA-Z0-9]*{acq_type}[a-zA-Z0-9]*}}",
-                suffix="SPIM.ome.zarr",
+                suffix="SPIM.ome.zarr.zip",
             )
         }
     else:
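
Anything consuming these intermediates has to branch on which extension the helper chose. A minimal reader sketch handling both forms (the name open_intermediate is hypothetical; the suffix check mirrors the one added to apply_basic_flatfield_corr_zarr.py below):

```python
from pathlib import Path

import dask.array as da
import zarr


def open_intermediate(path):
    """Open a .zarr or .zarr.zip intermediate as a dask array."""
    if Path(path).suffix == ".zip":
        # zip archive members are immutable, so open read-only
        store = zarr.storage.ZipStore(path, mode="r")
    else:
        store = zarr.storage.DirectoryStore(path)
    return da.from_zarr(store)
```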
62 changes: 32 additions & 30 deletions workflow/rules/flatfield_corr.smk
@@ -2,15 +2,17 @@
 rule fit_basic_flatfield_corr:
     """ BaSiC flatfield correction"""
     input:
-        zarr=lambda wildcards: bids(
-            root=work,
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
-            suffix="SPIM.zarr",
-        ).format(**wildcards),
+        zarr=lambda wildcards: intermediate_zarr(
+            bids(
+                root=work,
+                subject="{subject}",
+                datatype="micr",
+                sample="{sample}",
+                acq="{acq}",
+                desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
+                suffix="SPIM",
+            ).format(**wildcards)
+        ),
     params:
         channel=lambda wildcards: get_stains(wildcards).index(wildcards.stain),
         max_n_images=config["basic_flatfield_corr"]["max_n_images"],
@@ -64,34 +66,34 @@ rule fit_basic_flatfield_corr:
 rule apply_basic_flatfield_corr:
     """ apply BaSiC flatfield correction """
     input:
-        zarr=lambda wildcards: bids(
-            root=work,
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
-            suffix="SPIM.zarr",
-        ).format(**wildcards),
+        zarr=lambda wildcards: intermediate_zarr(
+            bids(
+                root=work,
+                subject="{subject}",
+                datatype="micr",
+                sample="{sample}",
+                acq="{acq}",
+                desc="rawfromgcs" if sample_is_remote(wildcards) else "raw",
+                suffix="SPIM",
+            ).format(**wildcards)
+        ),
         model_dirs=lambda wildcards: expand(
             rules.fit_basic_flatfield_corr.output.model_dir,
             stain=get_stains(wildcards),
             allow_missing=True,
         ),
     params:
-        out_chunks=[128, 128, 128],
+        out_chunks=config["basic_flatfield_corr"]["output_chunks"],
     output:
-        zarr=temp(
-            directory(
-                bids(
-                    root=work,
-                    subject="{subject}",
-                    datatype="micr",
-                    sample="{sample}",
-                    acq="{acq}",
-                    desc="flatcorr",
-                    suffix="SPIM.zarr",
-                )
-            )
+        zarr=intermediate_zarr(
+            bids(
+                root=work,
+                subject="{subject}",
+                datatype="micr",
+                sample="{sample}",
+                acq="{acq}",
+                desc="flatcorr",
+                suffix="SPIM",
+            )
         ),
     benchmark:
22 changes: 11 additions & 11 deletions workflow/rules/import.smk
@@ -193,18 +193,18 @@ rule tif_to_zarr:
         metadata_json=rules.copy_blaze_metadata.output.metadata_json,
     params:
         intensity_rescaling=config["import_blaze"]["intensity_rescaling"],
-        do_rechunking=True if config["ome_zarr"]["desc"][-3:] == "raw" else False,
+        out_chunks=config["basic_flatfield_corr"]["output_chunks"],
     output:
-        zarr=temp(
-            directory(
-                bids(
-                    root=work,
-                    subject="{subject}",
-                    datatype="micr",
-                    sample="{sample}",
-                    acq="{acq}",
-                    desc="raw",
-                    suffix="SPIM.zarr",
-                )
-            )
+        zarr=intermediate_zarr(
+            bids(
+                root=work,
+                subject="{subject}",
+                datatype="micr",
+                sample="{sample}",
+                acq="{acq}",
+                desc="raw",
+                suffix="SPIM",
+            )
         ),
     benchmark:
37 changes: 1 addition & 36 deletions workflow/rules/ome_zarr.smk
@@ -5,7 +5,7 @@ rule zarr_to_ome_zarr:
         **get_storage_creds(),
         zarr=lambda wildcards: expand(
             bids(
-                root=work,
+                root=workn5,
                 subject="{subject}",
                 datatype="micr",
                 sample="{sample}",
@@ -89,41 +89,6 @@ rule tif_stacks_to_ome_zarr:
         "../scripts/tif_stacks_to_ome_zarr.py"
 
 
-rule ome_zarr_to_zipstore:
-    """ use 7zip to create a zipstore """
-    input:
-        zarr=bids(
-            root=work,
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            suffix="SPIM.ome.zarr",
-        ),
-    output:
-        zarr_zip=bids(
-            root=root,
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            suffix="SPIM.ome.zarr.zip",
-        ),
-    log:
-        bids(
-            root="logs",
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            suffix="log.txt",
-        ),
-    group:
-        "preproc"
-    shell:
-        "7z a -mx0 -tzip {output.zarr_zip} {input.zarr}/. &> {log}"
-
-
 rule ome_zarr_to_nii:
     input:
         **get_storage_creds(),
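
The removed rule repacked a finished OME-Zarr with 7z at compression level 0 (-mx0); writing through zarr's ZipStore reaches the same end state without the extra copy. For what it's worth, ZipStore in zarr-python v2 also stores members uncompressed by default, so the on-disk result should be comparable (a note based on the library default, not on anything in this commit):

```python
import zipfile

import zarr

# zarr-python v2 defaults to compression=zipfile.ZIP_STORED (no compression),
# matching the 7z -mx0 behaviour of the removed rule
store = zarr.storage.ZipStore("SPIM.ome.zarr.zip", mode="w", compression=zipfile.ZIP_STORED)
```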
10 changes: 9 additions & 1 deletion workflow/scripts/apply_basic_flatfield_corr_zarr.py
@@ -2,6 +2,7 @@
 from basicpy import BaSiC
 from pathlib import Path
 import numpy as np
+import zarr
 from skimage.transform import resize
 import dask.array as da
 from dask.diagnostics import ProgressBar
@@ -49,5 +50,12 @@ def apply_basic_parallel(x):
 #stack along chans
 arr_stacked = da.stack(chan_arr_list,axis=1).rechunk([1,1] + snakemake.params.out_chunks)
 
+#now we can do the computation itself, storing to zarr
+if Path(snakemake.output.zarr).suffix == '.zip':
+    store = zarr.storage.ZipStore(snakemake.output.zarr,dimension_separator='/',mode='w')
+else:
+    store = zarr.storage.DirectoryStore(snakemake.output.zarr,dimension_separator='/')
+
+
 with ProgressBar():
-    da.to_zarr(arr_stacked,snakemake.output.zarr,overwrite=True,dimension_separator='/')
+    da.to_zarr(arr_stacked,store,overwrite=True)
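
A quick way to sanity-check a .zarr.zip this script produces (the filename is illustrative):

```python
import zipfile

with zipfile.ZipFile("sub-01_sample-brain_acq-blaze_desc-flatcorr_SPIM.zarr.zip") as zf:
    print(len(zf.namelist()), "members")  # chunk files plus .zarray/.zattrs metadata
    assert zf.testzip() is None  # testzip() returns the first corrupt member, or None if all CRCs pass
```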