From 5d0c321141d11ca55afdc10f04cc1efbd73694c7 Mon Sep 17 00:00:00 2001
From: Ali Khan
Date: Tue, 20 Feb 2024 15:26:13 -0500
Subject: [PATCH] fix for reading data from a folder

- an input folder is no longer symlinked or copied; it is used directly as
  input for the downstream rules
- the extract rule now runs only when the input is an archive
---
 workflow/Snakefile                  |  1 -
 workflow/rules/bids.smk             |  8 ++++----
 workflow/rules/common.smk           | 27 ++++++++++++++++++++++-----
 workflow/rules/import.smk           |  8 ++++----
 workflow/scripts/raw_to_metadata.py | 10 ++++++++--
 5 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index f8f3a24..46dc46c 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -49,4 +49,3 @@ include: "rules/flatfield_corr.smk"
 include: "rules/bigstitcher.smk"
 include: "rules/ome_zarr.smk"
 include: "rules/bids.smk"
-
diff --git a/workflow/rules/bids.smk b/workflow/rules/bids.smk
index 585162d..1354951 100644
--- a/workflow/rules/bids.smk
+++ b/workflow/rules/bids.smk
@@ -3,8 +3,8 @@ rule raw_dataset_desc:
         dd=config["bids"]["raw"],
     output:
         json=Path(root) / "dataset_description.json",
-    log:
-        'logs/dd_raw.log'
+    log:
+        "logs/dd_raw.log",
     run:
         import json
 
@@ -17,8 +17,8 @@ rule resampled_dataset_desc:
         dd=config["bids"]["resampled"],
     output:
         json=Path(resampled) / "dataset_description.json",
-    log:
-        'logs/dd_raw.log'
+    log:
+        "logs/dd_resampled.log",
     run:
         import json
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 75df2db..2616ea3 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -40,18 +40,35 @@ def get_all_targets():
     return targets
 
 
+def get_input_dataset(wildcards):
+    """returns the path to the extracted dataset, or to the provided input folder"""
+    import tarfile
+
+    dataset_path = Path(get_dataset_path(wildcards))
+
+    if dataset_path.is_dir():
+        # we already have a directory, just point downstream rules to it
+        return str(dataset_path)
+
+    elif tarfile.is_tarfile(dataset_path):
+        # dataset is a tar archive, so point to the extracted folder
+        return rules.extract_dataset.output.ome_dir.format(**wildcards)
+
+    else:
+        raise ValueError(f"unsupported input: {dataset_path}")
+
+
 # import
-def cmd_get_dataset(wildcards, input, output):
+def cmd_extract_dataset(wildcards, input, output):
     cmds = []
 
     import tarfile
 
-    # supports tar, tar.gz, tgz, zip, or folder name
+    # supports tar, tar.gz, tgz, or folder name
    dataset_path = Path(input.dataset_path)
     suffix = dataset_path.suffix
     if dataset_path.is_dir():
-        # we have a directory:
-        # return command to copy folder
-        cmds.append(f"ln -sr {input} {output}")
+        # we have a directory; nothing to copy or extract here
+        print("input directory not copied/extracted by this rule")
     elif tarfile.is_tarfile(dataset_path):
         # we have a tar file
diff --git a/workflow/rules/import.smk b/workflow/rules/import.smk
index 357073a..50cc684 100644
--- a/workflow/rules/import.smk
+++ b/workflow/rules/import.smk
@@ -1,9 +1,9 @@
 
-rule get_dataset:
+rule extract_dataset:
     input:
         dataset_path=get_dataset_path,
     params:
-        cmd=cmd_get_dataset,
+        cmd=cmd_extract_dataset,
     output:
         ome_dir=temp(
             directory(
@@ -36,7 +36,7 @@ rule get_dataset:
 
 rule raw_to_metadata:
     input:
-        ome_dir=rules.get_dataset.output.ome_dir,
+        ome_dir=get_input_dataset,
     params:
         in_tif_pattern=lambda wildcards, input: os.path.join(
             input.ome_dir,
@@ -82,7 +82,7 @@ rule tif_to_zarr:
 
     output shape is (tiles,channels,z,y,x), with the 2d images as the chunks"""
     input:
-        ome_dir=rules.get_dataset.output.ome_dir,
+        ome_dir=get_input_dataset,
         metadata_json=rules.raw_to_metadata.output.metadata_json,
     params:
         in_tif_pattern=lambda wildcards, input: os.path.join(
diff --git a/workflow/scripts/raw_to_metadata.py b/workflow/scripts/raw_to_metadata.py
index 5449c4c..b6f5664 100644
--- a/workflow/scripts/raw_to_metadata.py
+++ b/workflow/scripts/raw_to_metadata.py
@@ -5,10 +5,16 @@ from itertools import product
 
 from snakemake.io import glob_wildcards
 
+
 in_tif_pattern = snakemake.params.in_tif_pattern
 
+# add a wildcard constraint to ensure no subfolders get parsed
+# (i.e. don't match anything with "/" in it):
+prefix_constraint = r'[^/]+'
+in_tif_pattern_constrained = in_tif_pattern.replace('{prefix}', f'{{prefix,{prefix_constraint}}}')
+
 #parse the filenames to get number of channels, tiles etc..
-prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern)
+prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern_constrained)
 
 tiles_x = sorted(list(set(tilex)))
 tiles_y = sorted(list(set(tiley)))
@@ -19,7 +25,7 @@
 
 #read in series metadata from first file
 in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0])
-raw_tif = tifffile.TiffFile(in_tif)
+raw_tif = tifffile.TiffFile(in_tif,mode='r')
 
 axes = raw_tif.series[0].get_axes()
 shape = raw_tif.series[0].get_shape()
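
Aside (not part of the patch): two minimal sketches of the behaviour the
changes rely on. The helper name resolve_input(), the file names, and the
paths below are made up for illustration.

The first sketch restates the dispatch that get_input_dataset() performs,
assuming the extract rule's output folder is already known:

    import tarfile
    from pathlib import Path


    def resolve_input(dataset_path: str, extracted_dir: str) -> str:
        """Return the folder that downstream rules should read from."""
        path = Path(dataset_path)
        if path.is_dir():
            # plain folder: used in place, never symlinked or copied
            return str(path)
        if tarfile.is_tarfile(path):
            # archive: downstream rules read the extract rule's output
            return extracted_dir
        raise ValueError(f"unsupported input: {path}")


    # a folder input resolves to itself, an archive to the extraction target:
    # resolve_input("/data/sample1", "work/extracted")     -> "/data/sample1"
    # resolve_input("/data/sample1.tar", "work/extracted") -> "work/extracted"

The second sketch shows why raw_to_metadata.py now constrains {prefix}:
glob_wildcards() gives every wildcard the default regex ".+", which also
matches "/", so an unconstrained {prefix} happily swallows subfolder paths:

    from snakemake.io import glob_wildcards

    files = ["acq1_chan0_z0000.tif", "nested/acq2_chan0_z0000.tif"]

    # unconstrained: the nested file matches, with prefix "nested/acq2"
    loose = glob_wildcards("{prefix}_chan{channel}_z{zslice}.tif", files=files)
    print(loose.prefix)  # ['acq1', 'nested/acq2']

    # constrained to [^/]+: anything containing "/" no longer matches
    strict = glob_wildcards("{prefix,[^/]+}_chan{channel}_z{zslice}.tif", files=files)
    print(strict.prefix)  # ['acq1']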