Commit 5d0c321
fix for reading data from a folder
- doesn't symlink or copy it, it just uses it as input for the downstream rules
- if the input is an archive, only then will the extract rule run (see the sketch below)
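
In concrete terms, the new behavior hinges on what kind of path the user configures. A minimal sketch of the decision, with a helper name and example path that are purely illustrative:

```python
import tarfile
from pathlib import Path


def describe_input(dataset_path: str) -> str:
    """Illustrative helper: report how the workflow treats the configured path."""
    p = Path(dataset_path)
    if p.is_dir():
        # a folder is used in place, no symlink or copy
        return "folder: used directly as input to downstream rules"
    if p.is_file() and tarfile.is_tarfile(p):
        # an archive is unpacked by the extract rule first
        return "archive: the extract rule unpacks it before downstream rules run"
    return f"unsupported input: {p}"


print(describe_input("."))  # any existing folder prints the first message
```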
akhanf committed Feb 20, 2024
1 parent fb7a31b commit 5d0c321
Showing 5 changed files with 39 additions and 16 deletions.
1 change: 0 additions & 1 deletion workflow/Snakefile
@@ -49,4 +49,3 @@ include: "rules/flatfield_corr.smk"
include: "rules/bigstitcher.smk"
include: "rules/ome_zarr.smk"
include: "rules/bids.smk"

8 changes: 4 additions & 4 deletions workflow/rules/bids.smk
@@ -3,8 +3,8 @@ rule raw_dataset_desc:
        dd=config["bids"]["raw"],
    output:
        json=Path(root) / "dataset_description.json",
    log:
        'logs/dd_raw.log'
    log:
        "logs/dd_raw.log",
    run:
        import json

@@ -17,8 +17,8 @@ rule resampled_dataset_desc:
        dd=config["bids"]["resampled"],
    output:
        json=Path(resampled) / "dataset_description.json",
    log:
        'logs/dd_raw.log'
    log:
        "logs/dd_raw.log",
    run:
        import json

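Both run: blocks are truncated after the import json line in this view. For reference only, a minimal sketch of what writing a dataset_description.json can look like (field values and the output path are placeholders, not the workflow's actual metadata):

```python
import json
from pathlib import Path

# placeholder metadata; the real values come from the workflow's config
dd = {"Name": "example dataset", "BIDSVersion": "1.9.0", "DatasetType": "raw"}
out_json = Path("bids") / "dataset_description.json"

# write the dataset_description.json expected at the dataset root
out_json.parent.mkdir(parents=True, exist_ok=True)
with open(out_json, "w") as f:
    json.dump(dd, f, indent=4)
```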
28 changes: 23 additions & 5 deletions workflow/rules/common.smk
@@ -40,18 +40,36 @@ def get_all_targets():
    return targets


def get_input_dataset(wildcards):
    """returns path to extracted dataset or path to provided input folder"""
    in_dataset = get_dataset_path(wildcards)

    dataset_path = Path(get_dataset_path(wildcards))
    suffix = dataset_path.suffix

    if dataset_path.is_dir():
        # we have a directory already, just point to it
        return str(dataset_path)

    elif tarfile.is_tarfile(dataset_path):
        # dataset was a tar file, so point to the extracted folder
        return rules.get_dataset.output.ome_dir.format(**wildcards)

    else:
        print(f"unsupported input: {dataset_path}")


# import
def cmd_get_dataset(wildcards, input, output):
def cmd_extract_dataset(wildcards, input, output):
    cmds = []
    import tarfile

    # supports tar, tar.gz, tgz, zip, or folder name
    # supports tar, tar.gz, tgz, or folder name
    dataset_path = Path(input.dataset_path)
    suffix = dataset_path.suffix
    if dataset_path.is_dir():
        # we have a directory:
        # return command to copy folder
        cmds.append(f"ln -sr {input} {output}")
        # we have a directory
        print("input directory not copied/extracted by this rule")

    elif tarfile.is_tarfile(dataset_path):
        # we have a tar file
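The remainder of cmd_extract_dataset is collapsed in this view. As a rough, self-contained sketch of how such a command builder can finish for the archive case (the function name suffix and exact shell commands are illustrative, not necessarily the repository's own):

```python
import tarfile
from pathlib import Path


def cmd_extract_dataset_sketch(dataset_path: str, output_dir: str) -> str:
    """Illustrative only: build the shell command string an extract rule would run."""
    cmds = []
    path = Path(dataset_path)

    if path.is_dir():
        # folders are consumed in place by downstream rules; nothing to extract
        print("input directory not copied/extracted by this rule")
    elif tarfile.is_tarfile(path):
        # archives are unpacked into the rule's output directory
        cmds.append(f"mkdir -p {output_dir}")
        cmds.append(f"tar -xf {path} -C {output_dir}")
    else:
        print(f"unsupported input: {path}")

    return " && ".join(cmds)
```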
8 changes: 4 additions & 4 deletions workflow/rules/import.smk
@@ -1,9 +1,9 @@

rule get_dataset:
rule extract_dataset:
    input:
        dataset_path=get_dataset_path,
    params:
        cmd=cmd_get_dataset,
        cmd=cmd_extract_dataset,
    output:
        ome_dir=temp(
            directory(
@@ -36,7 +36,7 @@ rule get_dataset:

rule raw_to_metadata:
    input:
        ome_dir=rules.get_dataset.output.ome_dir,
        ome_dir=get_input_dataset,
    params:
        in_tif_pattern=lambda wildcards, input: os.path.join(
            input.ome_dir,
@@ -82,7 +82,7 @@ rule tif_to_zarr:
    output shape is (tiles,channels,z,y,x), with the 2d
    images as the chunks"""
    input:
        ome_dir=rules.get_dataset.output.ome_dir,
        ome_dir=get_input_dataset,
        metadata_json=rules.raw_to_metadata.output.metadata_json,
    params:
        in_tif_pattern=lambda wildcards, input: os.path.join(
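The net effect in import.smk is that downstream rules take ome_dir from the get_input_dataset input function instead of hard-wiring the extract rule's output, so extraction only enters the DAG when the input is actually an archive. A compact, generic sketch of that pattern (rule names, wildcard names, and the config key are made up for illustration):

```python
# Generic Snakemake sketch of the folder-or-archive input pattern
from pathlib import Path
import tarfile


def pick_input_dir(wildcards):
    """Return the user's folder directly, or the extract rule's output for archives."""
    dataset_path = Path(config["dataset_path"])
    if dataset_path.is_dir():
        # folder supplied: use it in place, extract never runs
        return str(dataset_path)
    elif tarfile.is_tarfile(dataset_path):
        # archive supplied: depend on the extract rule's output directory
        return rules.extract.output.out_dir.format(**wildcards)


rule extract:
    input:
        config["dataset_path"],
    output:
        out_dir=temp(directory("extracted/{sample}")),
    shell:
        "mkdir -p {output.out_dir} && tar -xf {input} -C {output.out_dir}"


rule downstream:
    input:
        ome_dir=pick_input_dir,
    output:
        "results/{sample}.done",
    shell:
        "touch {output}"
```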
10 changes: 8 additions & 2 deletions workflow/scripts/raw_to_metadata.py
@@ -5,10 +5,16 @@
from itertools import product
from snakemake.io import glob_wildcards


in_tif_pattern = snakemake.params.in_tif_pattern

#add a wildcard constraint to ensure no
#subfolders get parsed (ie don't match anything with / in it):
prefix_constraint=r'[^/]+'
in_tif_pattern_constrained = in_tif_pattern.replace('{prefix}',f'{{prefix,{prefix_constraint}}}')

#parse the filenames to get number of channels, tiles etc..
prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern)
prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern_constrained)

tiles_x = sorted(list(set(tilex)))
tiles_y = sorted(list(set(tiley)))
@@ -19,7 +25,7 @@
#read in series metadata from first file
in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0])

raw_tif = tifffile.TiffFile(in_tif)
raw_tif = tifffile.TiffFile(in_tif,mode='r')

axes = raw_tif.series[0].get_axes()
shape = raw_tif.series[0].get_shape()
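The {prefix,[^/]+} constraint added above keeps glob_wildcards from letting {prefix} swallow path separators, so tifs in subfolders no longer match the pattern. A small, self-contained sketch of the effect, using a throwaway layout and pattern (not the workflow's actual naming scheme):

```python
import tempfile
from pathlib import Path

from snakemake.io import glob_wildcards

# throwaway layout: one tif at the top level, one inside a subfolder
root = Path(tempfile.mkdtemp())
(root / "sampleA_ch0.tif").touch()
(root / "sub").mkdir()
(root / "sub" / "sampleB_ch1.tif").touch()

pattern = str(root / "{prefix}_ch{channel}.tif")

# unconstrained: {prefix} can span '/', so the subfolder file matches too
print(sorted(glob_wildcards(pattern).prefix))  # ['sampleA', 'sub/sampleB']

# constrained: [^/]+ stops {prefix} at the path separator
constrained = pattern.replace("{prefix}", "{prefix,[^/]+}")
print(sorted(glob_wildcards(constrained).prefix))  # ['sampleA']
```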
