Commit 5d0c321
fix for reading data from a folder
- doesn't symlink or copy it, it just uses it as input for the downstream rules
- if the input is an archive, only then will the extract rule run (see the sketch below)
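
In concrete terms, the new behavior hinges on what kind of path the user configures. A minimal sketch of the decision, with a helper name and example path that are purely illustrative:

```python
import tarfile
from pathlib import Path


def describe_input(dataset_path: str) -> str:
    """Illustrative helper: report how the workflow treats the configured path."""
    p = Path(dataset_path)
    if p.is_dir():
        # a folder is used in place, no symlink or copy
        return "folder: used directly as input to downstream rules"
    if p.is_file() and tarfile.is_tarfile(p):
        # an archive is unpacked by the extract rule first
        return "archive: the extract rule unpacks it before downstream rules run"
    return f"unsupported input: {p}"


print(describe_input("."))  # any existing folder prints the first message
```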
akhanf committed Feb 20, 2024
1 parent fb7a31b commit 5d0c321
Showing 5 changed files with 39 additions and 16 deletions.
1 change: 0 additions & 1 deletion workflow/Snakefile
@@ -49,4 +49,3 @@ include: "rules/flatfield_corr.smk"
include: "rules/bigstitcher.smk"
include: "rules/ome_zarr.smk"
include: "rules/bids.smk"

8 changes: 4 additions & 4 deletions workflow/rules/bids.smk
@@ -3,8 +3,8 @@ rule raw_dataset_desc:
        dd=config["bids"]["raw"],
    output:
        json=Path(root) / "dataset_description.json",
    log:
        'logs/dd_raw.log'
    log:
        "logs/dd_raw.log",
    run:
        import json

@@ -17,8 +17,8 @@ rule resampled_dataset_desc:
        dd=config["bids"]["resampled"],
    output:
        json=Path(resampled) / "dataset_description.json",
    log:
        'logs/dd_raw.log'
    log:
        "logs/dd_raw.log",
    run:
        import json

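Both run: blocks are truncated after the import json line in this view. For reference only, a minimal sketch of what writing a dataset_description.json can look like (field values and the output path are placeholders, not the workflow's actual metadata):

```python
import json
from pathlib import Path

# placeholder metadata; the real values come from the workflow's config
dd = {"Name": "example dataset", "BIDSVersion": "1.9.0", "DatasetType": "raw"}
out_json = Path("bids") / "dataset_description.json"

# write the dataset_description.json expected at the dataset root
out_json.parent.mkdir(parents=True, exist_ok=True)
with open(out_json, "w") as f:
    json.dump(dd, f, indent=4)
```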
28 changes: 23 additions & 5 deletions workflow/rules/common.smk
@@ -40,18 +40,36 @@ def get_all_targets():
    return targets


def get_input_dataset(wildcards):
    """returns path to extracted dataset or path to provided input folder"""
    in_dataset = get_dataset_path(wildcards)

    dataset_path = Path(get_dataset_path(wildcards))
    suffix = dataset_path.suffix

    if dataset_path.is_dir():
        # we have a directory already, just point to it
        return str(dataset_path)

    elif tarfile.is_tarfile(dataset_path):
        # dataset was a tar file, so point to the extracted folder
        return rules.get_dataset.output.ome_dir.format(**wildcards)

    else:
        print(f"unsupported input: {dataset_path}")


# import
def cmd_get_dataset(wildcards, input, output):
def cmd_extract_dataset(wildcards, input, output):
    cmds = []
    import tarfile

    # supports tar, tar.gz, tgz, zip, or folder name
    # supports tar, tar.gz, tgz, or folder name
    dataset_path = Path(input.dataset_path)
    suffix = dataset_path.suffix
    if dataset_path.is_dir():
        # we have a directory:
        # return command to copy folder
        cmds.append(f"ln -sr {input} {output}")
        # we have a directory
        print("input directory not copied/extracted by this rule")

    elif tarfile.is_tarfile(dataset_path):
        # we have a tar file
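The remainder of cmd_extract_dataset is collapsed in this view. As a rough, self-contained sketch of how such a command builder can finish for the archive case (the function name suffix and exact shell commands are illustrative, not necessarily the repository's own):

```python
import tarfile
from pathlib import Path


def cmd_extract_dataset_sketch(dataset_path: str, output_dir: str) -> str:
    """Illustrative only: build the shell command string an extract rule would run."""
    cmds = []
    path = Path(dataset_path)

    if path.is_dir():
        # folders are consumed in place by downstream rules; nothing to extract
        print("input directory not copied/extracted by this rule")
    elif tarfile.is_tarfile(path):
        # archives are unpacked into the rule's output directory
        cmds.append(f"mkdir -p {output_dir}")
        cmds.append(f"tar -xf {path} -C {output_dir}")
    else:
        print(f"unsupported input: {path}")

    return " && ".join(cmds)
```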
8 changes: 4 additions & 4 deletions workflow/rules/import.smk
@@ -1,9 +1,9 @@

rule get_dataset:
rule extract_dataset:
    input:
        dataset_path=get_dataset_path,
    params:
        cmd=cmd_get_dataset,
        cmd=cmd_extract_dataset,
    output:
        ome_dir=temp(
            directory(
@@ -36,7 +36,7 @@ rule get_dataset:

rule raw_to_metadata:
    input:
        ome_dir=rules.get_dataset.output.ome_dir,
        ome_dir=get_input_dataset,
    params:
        in_tif_pattern=lambda wildcards, input: os.path.join(
            input.ome_dir,
@@ -82,7 +82,7 @@ rule tif_to_zarr:
    output shape is (tiles,channels,z,y,x), with the 2d
    images as the chunks"""
    input:
        ome_dir=rules.get_dataset.output.ome_dir,
        ome_dir=get_input_dataset,
        metadata_json=rules.raw_to_metadata.output.metadata_json,
    params:
        in_tif_pattern=lambda wildcards, input: os.path.join(
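The net effect in import.smk is that downstream rules take ome_dir from the get_input_dataset input function instead of hard-wiring the extract rule's output, so extraction only enters the DAG when the input is actually an archive. A compact, generic sketch of that pattern (rule names, wildcard names, and the config key are made up for illustration):

```python
# Generic Snakemake sketch of the folder-or-archive input pattern
from pathlib import Path
import tarfile


def pick_input_dir(wildcards):
    """Return the user's folder directly, or the extract rule's output for archives."""
    dataset_path = Path(config["dataset_path"])
    if dataset_path.is_dir():
        # folder supplied: use it in place, extract never runs
        return str(dataset_path)
    elif tarfile.is_tarfile(dataset_path):
        # archive supplied: depend on the extract rule's output directory
        return rules.extract.output.out_dir.format(**wildcards)


rule extract:
    input:
        config["dataset_path"],
    output:
        out_dir=temp(directory("extracted/{sample}")),
    shell:
        "mkdir -p {output.out_dir} && tar -xf {input} -C {output.out_dir}"


rule downstream:
    input:
        ome_dir=pick_input_dir,
    output:
        "results/{sample}.done",
    shell:
        "touch {output}"
```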
10 changes: 8 additions & 2 deletions workflow/scripts/raw_to_metadata.py
@@ -5,10 +5,16 @@
from itertools import product
from snakemake.io import glob_wildcards


in_tif_pattern = snakemake.params.in_tif_pattern

#add a wildcard constraint to ensure no
#subfolders get parsed (ie don't match anything with / in it):
prefix_constraint=r'[^/]+'
in_tif_pattern_constrained = in_tif_pattern.replace('{prefix}',f'{{prefix,{prefix_constraint}}}')

#parse the filenames to get number of channels, tiles etc..
prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern)
prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern_constrained)

tiles_x = sorted(list(set(tilex)))
tiles_y = sorted(list(set(tiley)))
@@ -19,7 +25,7 @@
#read in series metadata from first file
in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0])

raw_tif = tifffile.TiffFile(in_tif)
raw_tif = tifffile.TiffFile(in_tif,mode='r')

axes = raw_tif.series[0].get_axes()
shape = raw_tif.series[0].get_shape()
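The {prefix,[^/]+} constraint added above keeps glob_wildcards from letting {prefix} swallow path separators, so tifs in subfolders no longer match the pattern. A small, self-contained sketch of the effect, using a throwaway layout and pattern (not the workflow's actual naming scheme):

```python
import tempfile
from pathlib import Path

from snakemake.io import glob_wildcards

# throwaway layout: one tif at the top level, one inside a subfolder
root = Path(tempfile.mkdtemp())
(root / "sampleA_ch0.tif").touch()
(root / "sub").mkdir()
(root / "sub" / "sampleB_ch1.tif").touch()

pattern = str(root / "{prefix}_ch{channel}.tif")

# unconstrained: {prefix} can span '/', so the subfolder file matches too
print(sorted(glob_wildcards(pattern).prefix))  # ['sampleA', 'sub/sampleB']

# constrained: [^/]+ stops {prefix} at the path separator
constrained = pattern.replace("{prefix}", "{prefix,[^/]+}")
print(sorted(glob_wildcards(constrained).prefix))  # ['sampleA']
```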
