From 52b04f204ee1c4b6f0392a8823a318cdb52a654c Mon Sep 17 00:00:00 2001 From: Ali Khan Date: Thu, 12 Sep 2024 18:09:43 -0400 Subject: [PATCH] updates for compatibility with input zstacks Blaze microscope now outputs data with one tif file per zstack. This adds compatibility for it. TODO: test full end-to-end --- config/config.yml | 2 +- workflow/lib/dask_image.py | 45 ++++++++++++++++ workflow/rules/import.smk | 9 ---- workflow/scripts/blaze_to_metadata.py | 64 ++++++++++++++++++----- workflow/scripts/blaze_to_metadata_gcs.py | 59 ++++++++++++++++----- workflow/scripts/tif_to_zarr.py | 40 ++++++++++---- workflow/scripts/tif_to_zarr_gcs.py | 53 +++++++++++++++++-- 7 files changed, 221 insertions(+), 51 deletions(-) create mode 100644 workflow/lib/dask_image.py diff --git a/config/config.yml b/config/config.yml index edc16ba..41ec75f 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,6 +1,5 @@ datasets: 'config/datasets.tsv' - root: 'bids' # can use a s3:// or gcs:// prefix to write output to cloud storage work: 'work' @@ -13,6 +12,7 @@ cores_per_rule: 32 #import wildcards: tilex, tiley, channel, zslice (and prefix - unused) import_blaze: raw_tif_pattern: "{prefix}_Blaze[{tilex} x {tiley}]_C{channel}_xyz-Table Z{zslice}.ome.tif" + raw_tif_pattern_zstack: "{prefix}_Blaze[{tilex} x {tiley}]_C{channel}.ome.tif" intensity_rescaling: 0.5 #raw images seem to be at the upper end of uint16 (over-saturated) -- causes wrapping issues when adjusting with flatfield correction etc. this rescales the raw data as it imports it.. import_prestitched: diff --git a/workflow/lib/dask_image.py b/workflow/lib/dask_image.py new file mode 100644 index 0000000..dbc06c3 --- /dev/null +++ b/workflow/lib/dask_image.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import os +from glob import glob + +try: + import tifffile +except (AttributeError, ImportError): + pass + +from dask.array.core import Array +from dask.base import tokenize + + +def add_leading_dimension(x): + return x[None, ...] + +def imread(fn, page): + return tifffile.imread(fn,key=page) + + +def imread_pages(filename, preprocess=None): + + + tif = tifffile.TiffFile(filename) + pages = [i for i in range(len(tif.pages))] + name = "imread-%s" % tokenize(filename, os.path.getmtime(filename)) + + sample = tif.pages[0].asarray() + + if preprocess: + sample = preprocess(sample) + + keys = [(name, i) + (0,) * len(sample.shape) for i in pages] + if preprocess: + values = [ + (add_leading_dimension, (preprocess, (imread, filename,i))) for i in pages + ] + else: + values = [(add_leading_dimension, (imread, filename,i)) for i in pages] + dsk = dict(zip(keys, values)) + + chunks = ((1,) * len(pages),) + tuple((d,) for d in sample.shape) + + return Array(dsk, name, chunks, sample.dtype) diff --git a/workflow/rules/import.smk b/workflow/rules/import.smk index 8a9074a..3906b36 100644 --- a/workflow/rules/import.smk +++ b/workflow/rules/import.smk @@ -79,11 +79,6 @@ rule blaze_to_metadata_gcs: rule blaze_to_metadata: input: ome_dir=get_input_dataset, - params: - in_tif_pattern=lambda wildcards, input: os.path.join( - input.ome_dir, - config["import_blaze"]["raw_tif_pattern"], - ), output: metadata_json=temp( bids( @@ -197,10 +192,6 @@ rule tif_to_zarr: ome_dir=get_input_dataset, metadata_json=rules.copy_blaze_metadata.output.metadata_json, params: - in_tif_pattern=lambda wildcards, input: os.path.join( - input.ome_dir, - config["import_blaze"]["raw_tif_pattern"], - ), intensity_rescaling=config["import_blaze"]["intensity_rescaling"], output: zarr=temp( diff --git a/workflow/scripts/blaze_to_metadata.py b/workflow/scripts/blaze_to_metadata.py index 608158e..196dac5 100644 --- a/workflow/scripts/blaze_to_metadata.py +++ b/workflow/scripts/blaze_to_metadata.py @@ -4,25 +4,48 @@ import re from itertools import product from snakemake.io import glob_wildcards +from glob import glob +from pathlib import Path +import os -in_tif_pattern = snakemake.params.in_tif_pattern +tif_files = glob(f"{snakemake.input.ome_dir}/*.tif") + +#check first tif file to see if it is zstack or not: +if 'xyz-Table Z' in Path(tif_files[0]).name: + is_zstack=False +else: + is_zstack=True + +if is_zstack: + in_tif_pattern = os.path.join(snakemake.input.ome_dir,snakemake.config["import_blaze"]["raw_tif_pattern_zstack"]) +else: + in_tif_pattern = os.path.join(snakemake.input.ome_dir,snakemake.config["import_blaze"]["raw_tif_pattern"]) #add a wildcard constraint to ensure no #subfolders get parsed (ie don't match anything with / in it): prefix_constraint=r'[^/]+' in_tif_pattern_constrained = in_tif_pattern.replace('{prefix}',f'{{prefix,{prefix_constraint}}}') + #parse the filenames to get number of channels, tiles etc.. -prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern_constrained) +if is_zstack: + prefix, tilex, tiley, channel = glob_wildcards(in_tif_pattern_constrained,tif_files) +else: + prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern_constrained,tif_files) tiles_x = sorted(list(set(tilex))) tiles_y = sorted(list(set(tiley))) channels = sorted(list(set(channel))) -zslices = sorted(list(set(zslice))) prefixes = sorted(list(set(prefix))) -#read in series metadata from first file -in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0]) +if not is_zstack: + zslices = sorted(list(set(zslice))) + + #read in series metadata from first file + in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0]) + +else: + in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0]) raw_tif = tifffile.TiffFile(in_tif,mode='r') @@ -37,12 +60,18 @@ custom_metadata = ome_dict['OME']['Image']['ca:CustomAttributes'] - #read tile configuration from the microscope metadata if axes == 'CZYX': - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" elif axes == 'ZYX': - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" + tile_pattern = re.compile(tile_config_pattern) @@ -58,23 +87,29 @@ chunks.append(chunk) for line in custom_metadata['TileConfiguration']['@TileConfiguration'].split(' ')[1:]: - d = re.search(tile_pattern,line).groupdict() chunk = map_tiles_to_chunk[d['tilex']+d['tiley']] # want the key to have chunk instad of tilex,tiley, so map to that first - - #key is: tile-{chunk}_chan-{channel}_z-{zslice} - key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" + + if is_zstack: + key = f"tile-{chunk}_chan-{d['channel']}_z-0" + else: + #key is: tile-{chunk}_chan-{channel}_z-{zslice} + key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" map_x[key] = float(d['x']) map_y[key] = float(d['y']) - map_z[key] = float(d['z']) + if is_zstack: + map_z[key] = float(0) + else: + map_z[key] = float(d['z']) metadata={} metadata['tiles_x'] = tiles_x metadata['tiles_y'] = tiles_y metadata['channels'] = channels -metadata['zslices'] = zslices +if not is_zstack: + metadata['zslices'] = zslices metadata['prefixes'] = prefixes metadata['chunks'] = chunks metadata['axes'] = axes @@ -88,6 +123,7 @@ metadata['ome_full_metadata'] = ome_dict metadata['PixelSize'] = [ metadata['physical_size_z']/1000.0, metadata['physical_size_y']/1000.0, metadata['physical_size_x']/1000.0 ] #zyx since OME-Zarr is ZYX metadata['PixelSizeUnits'] = 'mm' +metadata['is_zstack'] = is_zstack #write metadata to json with open(snakemake.output.metadata_json, 'w') as fp: diff --git a/workflow/scripts/blaze_to_metadata_gcs.py b/workflow/scripts/blaze_to_metadata_gcs.py index 7817d87..cff5302 100644 --- a/workflow/scripts/blaze_to_metadata_gcs.py +++ b/workflow/scripts/blaze_to_metadata_gcs.py @@ -5,12 +5,11 @@ import os from itertools import product from snakemake.io import glob_wildcards +from pathlib import Path import gcsfs from lib.cloud_io import get_fsspec dataset_uri = snakemake.params.dataset_path -in_tif_pattern = snakemake.params.in_tif_pattern - gcsfs_opts={'project': snakemake.params.storage_provider_settings['gcs'].get_settings().project, 'token': snakemake.input.creds} @@ -18,18 +17,38 @@ tifs = fs.glob(f"{dataset_uri}/*.tif") +#check first tif file to see if it is zstack or not: +if 'xyz-Table Z' in Path(tifs[0]).name: + is_zstack=False +else: + is_zstack=True + +if is_zstack: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] +else: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] + + + #parse the filenames to get number of channels, tiles etc.. -prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern,files=tifs) +if is_zstack: + prefix, tilex, tiley, channel = glob_wildcards(in_tif_pattern,files=tifs) +else: + prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern,files=tifs) + zslices = sorted(list(set(zslice))) tiles_x = sorted(list(set(tilex))) tiles_y = sorted(list(set(tiley))) channels = sorted(list(set(channel))) -zslices = sorted(list(set(zslice))) prefixes = sorted(list(set(prefix))) +if not is_zstack: + zslices = sorted(list(set(zslice))) -#read in series metadata from first file -in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0]) + #read in series metadata from first file + in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0],zslice=zslices[0]) +else: + in_tif = in_tif_pattern.format(tilex=tiles_x[0],tiley=tiles_y[0],prefix=prefixes[0],channel=channels[0]) with fs.open(f"gcs://{in_tif}", 'rb') as tif_file: raw_tif = tifffile.TiffFile(tif_file,mode='r') @@ -49,9 +68,15 @@ #read tile configuration from the microscope metadata if axes == 'CZYX': - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" elif axes == 'ZYX': - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" tile_pattern = re.compile(tile_config_pattern) @@ -71,19 +96,28 @@ d = re.search(tile_pattern,line).groupdict() chunk = map_tiles_to_chunk[d['tilex']+d['tiley']] # want the key to have chunk instad of tilex,tiley, so map to that first - #key is: tile-{chunk}_chan-{channel}_z-{zslice} - key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" + if is_zstack: + key = f"tile-{chunk}_chan-{d['channel']}_z-0" + else: + #key is: tile-{chunk}_chan-{channel}_z-{zslice} + key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" map_x[key] = float(d['x']) map_y[key] = float(d['y']) - map_z[key] = float(d['z']) + if is_zstack: + map_z[key] = float(0) + else: + map_z[key] = float(d['z']) + + metadata={} metadata['tiles_x'] = tiles_x metadata['tiles_y'] = tiles_y metadata['channels'] = channels -metadata['zslices'] = zslices +if not is_zstack: + metadata['zslices'] = zslices metadata['prefixes'] = prefixes metadata['chunks'] = chunks metadata['axes'] = axes @@ -97,6 +131,7 @@ metadata['ome_full_metadata'] = ome_dict metadata['PixelSize'] = [ metadata['physical_size_z']/1000.0, metadata['physical_size_y']/1000.0, metadata['physical_size_x']/1000.0 ] #zyx since OME-Zarr is ZYX metadata['PixelSizeUnits'] = 'mm' +metadata['is_zstack'] = is_zstack #write metadata to json with open(snakemake.output.metadata_json, 'w') as fp: diff --git a/workflow/scripts/tif_to_zarr.py b/workflow/scripts/tif_to_zarr.py index bfd4a86..d00d461 100644 --- a/workflow/scripts/tif_to_zarr.py +++ b/workflow/scripts/tif_to_zarr.py @@ -1,7 +1,9 @@ import tifffile +import os import json import dask.array as da -import dask.array.image +from dask.array.image import imread as imread_tifs +from lib.dask_image import imread_pages from itertools import product from dask.diagnostics import ProgressBar @@ -21,14 +23,24 @@ def single_imread(*args): return tifffile.imread(*args,key=0) -#use tif pattern but replace the [ and ] with [[] and []] so glob doesn't choke -in_tif_glob = replace_square_brackets(str(snakemake.params.in_tif_pattern)) - #read metadata json with open(snakemake.input.metadata_json) as fp: metadata = json.load(fp) + +is_zstack = metadata['is_zstack'] + +if is_zstack: + in_tif_pattern = os.path.join(snakemake.input.ome_dir,snakemake.config["import_blaze"]["raw_tif_pattern_zstack"]) +else: + in_tif_pattern = os.path.join(snakemake.input.ome_dir,snakemake.config["import_blaze"]["raw_tif_pattern"]) + + +#use tif pattern but replace the [ and ] with [[] and []] so glob doesn't choke +in_tif_glob = replace_square_brackets(str(in_tif_pattern)) + + #TODO: put these in top-level metadata for easier access.. size_x=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeX'] size_y=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeY'] @@ -36,31 +48,39 @@ def single_imread(*args): size_c=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeC'] size_tiles=len(metadata['tiles_x'])*len(metadata['tiles_y']) - #now get the first channel and first zslice tif tiles=[] for i_tile,(tilex,tiley) in enumerate(product(metadata['tiles_x'],metadata['tiles_y'])): - + print(f'tile {tilex}x{tiley}, {i_tile}') zstacks=[] for i_chan,channel in enumerate(metadata['channels']): - - zstacks.append(dask.array.image.imread(in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*'), imread=single_imread)) - + print(f'channel {i_chan}') + + if is_zstack: + tif_file = in_tif_pattern.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel) + + zstacks.append(imread_pages(tif_file)) + print(zstacks[-1].shape) + else: + zstacks.append(imread_tifs(in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*'), imread=single_imread)) #have list of zstack dask arrays for the tile, one for each channel #stack them up and append to list of tiles tiles.append(da.stack(zstacks)) + #now we have list of tiles, each a dask array #stack them up to get our final array darr = da.stack(tiles) +print(darr.shape) +print(darr.chunksize) + #rescale intensities, and recast darr = darr * snakemake.params.intensity_rescaling darr = darr.astype('uint16') - #now we can do the computation itself, storing to zarr print('writing images to zarr with dask') with ProgressBar(): diff --git a/workflow/scripts/tif_to_zarr_gcs.py b/workflow/scripts/tif_to_zarr_gcs.py index 01a9af1..9c234c2 100644 --- a/workflow/scripts/tif_to_zarr_gcs.py +++ b/workflow/scripts/tif_to_zarr_gcs.py @@ -4,6 +4,7 @@ import dask.array.image from itertools import product from dask.diagnostics import ProgressBar +from lib.dask_image import imread_pages import gcsfs gcsfs_opts={'project': snakemake.params.storage_provider_settings['gcs'].get_settings().project, @@ -20,6 +21,19 @@ def replace_square_brackets(pattern): pattern = pattern.replace('##RIGHTBRACKET##','[]]') return pattern +def get_zstack_metadata(gcs_uri,fs): + with fs.open(gcs_uri, 'rb') as file: + tif = tifffile.TiffFile(file) + pages = tif.pages + n_z = len(pages) + sample = tif.pages[0].asarray() + return {'n_z': n_z, 'shape': sample.shape, 'dtype': sample.dtype} + + +def get_tiff_num_pages(fs,gcs_uri): + with fs.open(gcs_uri, 'rb') as file: + return len(tifffile.TiffFile(file).pages) + def read_tiff_slice(fs,gcs_uri, key=0): """Read a single TIFF slice from GCS.""" with fs.open(gcs_uri, 'rb') as file: @@ -37,16 +51,38 @@ def build_zstack(gcs_uris,fs): # Convert the list of delayed objects into a Dask array return da.stack([da.from_delayed(lazy_array, shape=sample_array.shape, dtype=dtype) for lazy_array in lazy_arrays], axis=0) +def build_zstack_from_single(gcs_uri,zstack_metadata,fs): + """Build a z-stack from a single GCS URI """ + pages = [i for i in range(zstack_metadata['n_z'])] + lazy_arrays = [ + dask.delayed(read_tiff_slice)(fs,gcs_uri,page) for page in pages + ] + + # Convert the list of delayed objects into a Dask array + return da.stack([da.from_delayed(lazy_array, shape=zstack_metadata['shape'], dtype=zstack_metadata['dtype']) for lazy_array in lazy_arrays], axis=0) -#use tif pattern but replace the [ and ] with [[] and []] so glob doesn't choke -in_tif_glob = replace_square_brackets(str(snakemake.params.in_tif_pattern)) #read metadata json with open(snakemake.input.metadata_json) as fp: metadata = json.load(fp) + +is_zstack = metadata['is_zstack'] + +if is_zstack: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] +else: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] + + + +#use tif pattern but replace the [ and ] with [[] and []] so glob doesn't choke +in_tif_glob = replace_square_brackets(str(in_tif_pattern)) + + + #TODO: put these in top-level metadata for easier access.. size_x=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeX'] size_y=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeY'] @@ -56,14 +92,21 @@ def build_zstack(gcs_uris,fs): #now get the first channel and first zslice tif + +if is_zstack: + #prepopulate this since its the same for all tiles, and takes time to query + zstack_metadata = get_zstack_metadata('gcs://'+in_tif_pattern.format(tilex=metadata['tiles_x'][0],tiley=metadata['tiles_y'][0],prefix=metadata['prefixes'][0],channel=metadata['channels'][0]),fs=fs) + tiles=[] for i_tile,(tilex,tiley) in enumerate(product(metadata['tiles_x'],metadata['tiles_y'])): zstacks=[] for i_chan,channel in enumerate(metadata['channels']): - - - zstacks.append(build_zstack(fs.glob('gcs://'+in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*')),fs=fs)) + + if is_zstack: + zstacks.append(build_zstack_from_single('gcs://'+in_tif_pattern.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel),fs=fs,zstack_metadata=zstack_metadata)) + else: + zstacks.append(build_zstack(fs.glob('gcs://'+in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*')),fs=fs)) #have list of zstack dask arrays for the tile, one for each channel