use bigstitcher-spark exclusively instead of fiji
- can consider cleaning up repo and deps because of this, maybe once we
  test it out on more samples
- also fixes an ome_zarr_to_nii bug when dealing with ome.zarr.zip
akhanf committed Sep 25, 2024
1 parent d1fa55c commit f60d1e0
Showing 10 changed files with 109 additions and 82 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -63,7 +63,7 @@ or for snakemake<8.0, use:
 snakemake -c all --use-singularity
 ```
 
-Note: if you run the workflow on a system with large memory, you will need to set the heap size for the stitching and fusion rules. This can be done with e.g.: `--set-resources bigstitcher:mem_mb=60000 fuse_dataset:mem_mb=100000`
+Note: if you run the workflow on a system with large memory, you will need to set the heap size for the stitching and fusion rules. This can be done with e.g.: `--set-resources bigstitcher_spark_stitching:mem_mb=60000 bigstitcher_spark_fusion:mem_mb=100000`
 
 7. If you want to run the workflow using a batch job submission server, please see the executor plugins here: https://snakemake.github.io/snakemake-plugin-catalog/
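For context, a complete invocation using the renamed rule resources might look like the following sketch (memory values are the README's examples, not requirements):

```
snakemake -c all --use-singularity \
  --set-resources bigstitcher_spark_stitching:mem_mb=60000 \
                  bigstitcher_spark_fusion:mem_mb=100000
```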
28 changes: 13 additions & 15 deletions config/config.yml
@@ -6,7 +6,7 @@ work: 'work'
 
 remote_creds: '~/.config/gcloud/application_default_credentials.json' #this is needed so we can pass creds to container
 
-write_ome_zarr_direct: False #use this to skip writing the final zarr output to work first and copying afterwards -- useful when work is not a fast local disk
+write_ome_zarr_direct: True #use this to skip writing the final zarr output to work first and copying afterwards -- useful when work is not a fast local disk
 
 cores_per_rule: 32
 
@@ -36,18 +36,18 @@ bigstitcher:
     downsample_in_x: 4
     downsample_in_y: 4
     downsample_in_z: 1
-    method: "phase_corr"
-    methods:
-      phase_corr: "Phase Correlation"
-      optical_flow: "Lucas-Kanade"
+    method: "phase_corr" #unused
+    methods: #unused
+      phase_corr: "Phase Correlation"
+      optical_flow: "Lucas-Kanade"
   filter_pairwise_shifts:
-    enabled: 1
-    min_r: 0.2
-    max_shift_total: 50
+    enabled: 1 #unused
+    min_r: 0.7
+    max_shift_total: 50
   global_optimization:
     enabled: 1
-    method: ONE_ROUND_SIMPLE
-    methods:
+    method: TWO_ROUND_ITERATIVE
+    methods: #unused, only for reference
       ONE_ROUND_SIMPLE: "One-Round"
       ONE_ROUND_ITERATIVE: "One-Round with iterative dropping of bad links"
       TWO_ROUND_SIMPLE: "Two-Round using metadata to align unconnected Tiles"
@@ -64,7 +64,7 @@ bigstitcher:
     block_size_factor_z: 32
 
 ome_zarr:
-  desc: sparkstitchedflatcorr
+  desc: sparkstitchedraw #flatcorr
   max_downsampling_layers: 5 # e.g. 4 levels: { 0: orig, 1: ds2, 2: ds4, 3: ds8, 4: ds16}
   rechunk_size: #z, y, x
     - 1
@@ -100,7 +100,7 @@ ome_zarr:
     id: 0
     name: spim
   version: "0.4"
-  use_zipstore: True #if True, produce SPIM.ome.zarr.zip instead of SPIM.ome.zarr
+  use_zipstore: False #if True, produce SPIM.ome.zarr.zip instead of SPIM.ome.zarr
 
 nifti:
   levels: #cannot be higher than max_downsampling_layers in ome_zarr
@@ -155,7 +155,5 @@ report:
 
 
 containers:
-  #spimprep: 'docker://khanlab/spimprep-deps:main'
-  spimprep: '/local/scratch/spimprep-deps_spark.sif'
-  # updated.sif'
+  spimprep: 'docker://khanlab/spimprep-deps:main'
 
2 changes: 1 addition & 1 deletion workflow/Snakefile
@@ -33,7 +33,7 @@ rule all:
     input:
         get_all_targets(),
         get_bids_toplevel_targets(),
-        # get_qc_targets(), #need to skip this if using prestitched
+        get_qc_targets(), #need to skip this if using prestitched
     localrule: True
 
 
84 changes: 57 additions & 27 deletions workflow/rules/bigstitcher.smk
@@ -93,7 +93,7 @@ rule zarr_to_bdv:
     script:
         "../scripts/zarr_to_n5_bdv.py"
 
-
+"""
 rule bigstitcher:
     input:
         dataset_n5=rules.zarr_to_bdv.output.bdv_n5,
@@ -112,7 +112,7 @@ rule bigstitcher:
                 sample="{sample}",
                 acq="{acq}",
                 desc="{desc}",
-                suffix="bigstitcherproc.sh",
+                suffix="bigstitcherfiji.sh",
             )
         ),
         dataset_xml=temp(
@@ -123,13 +123,13 @@ rule bigstitcher:
                 sample="{sample}",
                 acq="{acq}",
                 desc="{desc}",
-                suffix="bigstitcher.xml",
+                suffix="bigstitcherfiji.xml",
             )
         ),
     benchmark:
         bids(
             root="benchmarks",
-            datatype="bigstitcherproc",
+            datatype="bigstitcherfiji",
             subject="{subject}",
             sample="{sample}",
             acq="{acq}",
@@ -139,7 +139,7 @@ rule bigstitcher:
     log:
         bids(
             root="logs",
-            datatype="bigstitcherproc",
+            datatype="bigstitcherfiji",
             subject="{subject}",
             sample="{sample}",
             acq="{acq}",
@@ -159,19 +159,30 @@ rule bigstitcher:
         " {params.fiji_launcher_cmd} && "
         " echo ' -macro {input.ijm} \"{params.macro_args}\"' >> {output.launcher} "
         " && {output.launcher} |& tee {log} && {params.rm_old_xml}"
+"""
 
 
 rule bigstitcher_spark_stitching:
     input:
         dataset_n5=rules.zarr_to_bdv.output.bdv_n5,
         dataset_xml=rules.zarr_to_bdv.output.bdv_xml,
     params:
-        downsampling='--downsampling={dsx},{dsy},{dsz}'.format(dsx=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_x'],
-            dsy=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_y'],
-            dsz=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_z']),
-        min_r='--minR={min_r}'.format(min_r=config['bigstitcher']['filter_pairwise_shifts']['min_r']),
-        max_shift='--maxShiftTotal={max_shift}'.format(max_shift=config['bigstitcher']['filter_pairwise_shifts']['max_shift_total']),
-        mem_gb=lambda wildcards, resources: '{mem_gb}'.format(mem_gb=int(resources.mem_mb/1000))
+        downsampling="--downsampling={dsx},{dsy},{dsz}".format(
+            dsx=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_x"],
+            dsy=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_y"],
+            dsz=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_z"],
+        ),
+        min_r="--minR={min_r}".format(
+            min_r=config["bigstitcher"]["filter_pairwise_shifts"]["min_r"]
+        ),
+        max_shift="--maxShiftTotal={max_shift}".format(
+            max_shift=config["bigstitcher"]["filter_pairwise_shifts"][
+                "max_shift_total"
+            ]
+        ),
+        mem_gb=lambda wildcards, resources: "{mem_gb}".format(
+            mem_gb=int(resources.mem_mb / 1000)
+        ),
+        rm_old_xml=lambda wildcards, output: f"rm -f {output.dataset_xml}~?",
     output:
         dataset_xml=temp(
             bids(
@@ -215,19 +215,27 @@ rule bigstitcher_spark_stitching:
     shell:
         "cp {input.dataset_xml} {output.dataset_xml} && "
         "stitching {params.mem_gb} {threads} -x {output.dataset_xml} "
-        " {params.min_r} {params.downsampling} "
+        " {params.min_r} {params.downsampling} {params.max_shift} && "
+        "{params.rm_old_xml}"
 
 
 rule bigstitcher_spark_solver:
     input:
         dataset_n5=rules.zarr_to_bdv.output.bdv_n5,
         dataset_xml=rules.bigstitcher_spark_stitching.output.dataset_xml,
     params:
-        downsampling='--downsampling={dsx},{dsy},{dsz}'.format(dsx=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_x'],
-            dsy=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_y'],
-            dsz=config['bigstitcher']['calc_pairwise_shifts']['downsample_in_z']),
-        method='--method={method}'.format(method=config['bigstitcher']['global_optimization']['method']),
-        mem_gb=lambda wildcards, resources: '{mem_gb}'.format(mem_gb=int(resources.mem_mb/1000))
+        downsampling="--downsampling={dsx},{dsy},{dsz}".format(
+            dsx=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_x"],
+            dsy=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_y"],
+            dsz=config["bigstitcher"]["calc_pairwise_shifts"]["downsample_in_z"],
+        ),
+        method="--method={method}".format(
+            method=config["bigstitcher"]["global_optimization"]["method"]
+        ),
+        mem_gb=lambda wildcards, resources: "{mem_gb}".format(
+            mem_gb=int(resources.mem_mb / 1000)
+        ),
+        rm_old_xml=lambda wildcards, output: f"rm -f {output.dataset_xml}~?",
     output:
         dataset_xml=temp(
             bids(
@@ -271,12 +271,13 @@ rule bigstitcher_spark_solver:
     shell:
         "cp {input.dataset_xml} {output.dataset_xml} && "
         "solver {params.mem_gb} {threads} -x {output.dataset_xml} "
-        " -s STITCHING --lambda 0.1 " #lambda 0.1 is default (can expose this if needed)
-        " {params.method} "
-
+        " -s STITCHING --lambda 0.1 "
+        " {params.method} && "
+        "{params.rm_old_xml}"
+    #lambda 0.1 is default (can expose this if needed)
 
 
-rule fuse_dataset:
+rule bigstitcher_fusion:
     input:
         dataset_n5=bids(
             root=work,
@@ -294,7 +314,11 @@ rule fuse_dataset:
             sample="{sample}",
             acq="{acq}",
             desc="{desc}",
-            suffix="bigstitcher{}.xml".format('solver' if config['bigstitcher']['global_optimization']['enabled'] else 'stitching'),
+            suffix="bigstitcher{}.xml".format(
+                "solver"
+                if config["bigstitcher"]["global_optimization"]["enabled"]
+                else "stitching"
+            ),
         ),
         ijm=Path(workflow.basedir) / "macros" / "FuseImageMacroZarr.ijm",
     params:
@@ -363,7 +387,7 @@ rule fuse_dataset:
         " && {output.launcher} |& tee {log}"
 
 
-rule fuse_dataset_spark:
+rule bigstitcher_spark_fusion:
     input:
         dataset_n5=bids(
             root=work,
@@ -381,7 +405,11 @@ rule fuse_dataset_spark:
             sample="{sample}",
             acq="{acq}",
             desc="{desc}",
-            suffix="bigstitcher{}.xml".format('solver' if config['bigstitcher']['global_optimization']['enabled'] else 'stitching'),
+            suffix="bigstitcher{}.xml".format(
+                "solver"
+                if config["bigstitcher"]["global_optimization"]["enabled"]
+                else "stitching"
+            ),
         ),
         ijm=Path(workflow.basedir) / "macros" / "FuseImageMacroZarr.ijm",
     params:
@@ -398,7 +426,9 @@ rule fuse_dataset_spark:
             bsfy=config["bigstitcher"]["fuse_dataset"]["block_size_factor_y"],
             bsfz=config["bigstitcher"]["fuse_dataset"]["block_size_factor_z"],
         ),
-        mem_gb=lambda wildcards, resources: '{mem_gb}'.format(mem_gb=int(resources.mem_mb/1000))
+        mem_gb=lambda wildcards, resources: "{mem_gb}".format(
+            mem_gb=int(resources.mem_mb / 1000)
+        ),
     output:
         zarr=temp(
             directory(
@@ -440,7 +470,7 @@ rule fuse_dataset_spark:
         config["containers"]["spimprep"]
     resources:
         runtime=30,
-        mem_mb=40000,
+        mem_mb=30000,
     threads: config["cores_per_rule"]
     group:
         "preproc"
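For reference, the commands these two Spark rules render would look roughly like the sketch below; the XML path and the positional memory (GB) and thread arguments are illustrative, while the flags and values mirror the params and config defaults above:

```
# pairwise shift calculation (bigstitcher-spark)
stitching 60 32 -x dataset.xml \
    --minR=0.7 --downsampling=4,4,1 --maxShiftTotal=50

# global optimization over the computed shifts
solver 60 32 -x dataset.xml \
    -s STITCHING --lambda 0.1 --method=TWO_ROUND_ITERATIVE
```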
7 changes: 0 additions & 7 deletions workflow/rules/common.smk
@@ -368,13 +368,6 @@ def get_output_ome_zarr(acq_type):
     }
 
 
-def get_output_ome_zarr_as_input(wildcards):
-    if 'blaze' in wildcards.acq:
-        return get_output_ome_zarr('blaze')
-    elif 'prestitched' in wildcards.acq:
-        return get_output_ome_zarr('prestitched')
-
-
 def get_storage_creds():
     """for rules that deal with remote storage directly"""
     protocol = Path(config["root"]).protocol
15 changes: 8 additions & 7 deletions workflow/rules/ome_zarr.smk
@@ -117,14 +117,15 @@ rule ome_zarr_to_zipstore:
 
 rule ome_zarr_to_nii:
     input:
-        zarr=bids(
-            root=root,
-            subject="{subject}",
-            datatype="micr",
-            sample="{sample}",
-            acq="{acq}",
-            suffix="SPIM.{ext}".format(ext=get_extension_ome_zarr())),
         **get_storage_creds(),
+        zarr=bids(
+            root=root,
+            subject="{subject}",
+            datatype="micr",
+            sample="{sample}",
+            acq="{acq}",
+            suffix="SPIM.{ext}".format(ext=get_extension_ome_zarr()),
+        ),
     params:
         channel_index=lambda wildcards: get_stains(wildcards).index(wildcards.stain),
         uri=get_output_ome_zarr_uri(),
18 changes: 16 additions & 2 deletions workflow/rules/qc.smk
@@ -73,8 +73,15 @@ rule generate_flatfield_qc:
 rule generate_whole_slice_qc:
     "Generates an html file to view whole slices from preprocessed images"
     input:
-        unpack(get_output_ome_zarr_as_input),
         **get_storage_creds(),
+        zarr=bids(
+            root=root,
+            subject="{subject}",
+            datatype="micr",
+            sample="{sample}",
+            acq="{acq}",
+            suffix="SPIM.{ext}".format(ext=get_extension_ome_zarr()),
+        ),
         ws_html=config["report"]["resources"]["ws_html"],
     params:
         ws_s_start=config["report"]["whole_slice_viewer"]["slice_start"],
@@ -112,8 +119,15 @@ rule generate_volume_qc:
 rule generate_volume_qc:
     "Generates an html file to view the volume rendered image"
     input:
-        unpack(get_output_ome_zarr_as_input),
         **get_storage_creds(),
+        zarr=bids(
+            root=root,
+            subject="{subject}",
+            datatype="micr",
+            sample="{sample}",
+            acq="{acq}",
+            suffix="SPIM.{ext}".format(ext=get_extension_ome_zarr()),
+        ),
         vol_viewer_dir=config["report"]["resources"]["vol_viewer_dir"],
     params:
         uri=get_output_ome_zarr_uri(),
8 changes: 5 additions & 3 deletions workflow/scripts/generate_volume_qc.py
@@ -14,8 +14,6 @@
 # where html file should be written
 html_dest = snakemake.output.html
 
-# inputted ome-zarr path
-ome_data = snakemake.input.zarr
-
 # move volume renderer into the subjects directory
 copy_tree(snakemake.input.vol_viewer_dir, resource_dir)
@@ -28,7 +26,11 @@
     fs_args={}
 
 fs = get_fsspec(uri,**fs_args)
-store = zarr.storage.FSStore(Path(uri).path,fs=fs,dimension_separator='/',mode='r')
+
+if Path(uri).suffix == '.zip':
+    store = zarr.storage.ZipStore(Path(uri).path,dimension_separator='/',mode='r')
+else:
+    store = zarr.storage.FSStore(Path(uri).path,fs=fs,dimension_separator='/',mode='r')
 darr = da.from_zarr(store,component='/5')
 