From a2ce2186633811c642a2f5272c2f447cf6dfe80c Mon Sep 17 00:00:00 2001 From: Ali Khan Date: Tue, 1 Oct 2024 13:05:21 -0400 Subject: [PATCH] update the tif_to_zarr_gcs for tifzstacks on google cloud --- workflow/scripts/blaze_to_metadata_gcs.py | 139 +++++++++++++--------- workflow/scripts/tif_to_zarr_gcs.py | 99 +++++++++------ 2 files changed, 147 insertions(+), 91 deletions(-) diff --git a/workflow/scripts/blaze_to_metadata_gcs.py b/workflow/scripts/blaze_to_metadata_gcs.py index fac9747..b652170 100644 --- a/workflow/scripts/blaze_to_metadata_gcs.py +++ b/workflow/scripts/blaze_to_metadata_gcs.py @@ -23,25 +23,44 @@ else: is_zstack=True -if is_zstack: - in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] +#now check to see if there is just single tile +if ' x ' in Path(tifs[0]).name: + is_tiled=True else: - in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] + is_tiled=False + + + +if is_tiled: + if is_zstack: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] + else: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] +else: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_notile"] #parse the filenames to get number of channels, tiles etc.. -if is_zstack: - prefix, tilex, tiley, channel = glob_wildcards(in_tif_pattern,files=tifs) +if is_tiled: + if is_zstack: + prefix, tilex, tiley, channel = glob_wildcards(in_tif_pattern,files=tifs) + else: + prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern,files=tifs) else: - prefix, tilex, tiley, channel, zslice = glob_wildcards(in_tif_pattern,files=tifs) - zslices = sorted(list(set(zslice))) + prefix, channel, zslice = glob_wildcards(in_tif_pattern,files=tifs) + +if is_tiled: + tiles_x = sorted(list(set(tilex))) + tiles_y = sorted(list(set(tiley))) -tiles_x = sorted(list(set(tilex))) -tiles_y = sorted(list(set(tiley))) channels = sorted(list(set(channel))) prefixes = sorted(list(set(prefix))) +if not is_zstack: + zslices = sorted(list(set(zslice))) + + if not is_zstack: zslices = sorted(list(set(zslice))) @@ -68,70 +87,80 @@ #read tile configuration from the microscope metadata if axes == 'CZYX': - if is_zstack: - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+)\)" - else: - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" -elif axes == 'ZYX': - if is_zstack: - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+)\)" - else: - tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" - -tile_pattern = re.compile(tile_config_pattern) - -#put it in 3 maps, one for each coord, indexed by tilex, tiley, channel, and aslice -map_x=dict() -map_y=dict() -map_z=dict() - -map_tiles_to_chunk=dict() -chunks = [] -for chunk,(tilex,tiley) in enumerate(product(tiles_x,tiles_y)): - map_tiles_to_chunk[tilex+tiley] = chunk - chunks.append(chunk) - -for line in custom_metadata['TileConfiguration']['@TileConfiguration'].split(' ')[1:]: - - d = re.search(tile_pattern,line).groupdict() - chunk = map_tiles_to_chunk[d['tilex']+d['tiley']] # want the key to have chunk instad of tilex,tiley, so map to that first - - if is_zstack: - key = f"tile-{chunk}_chan-{d['channel']}_z-0000" + if is_tiled: + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+),(?P\S+), (?P\S+)\)" else: - #key is: tile-{chunk}_chan-{channel}_z-{zslice} - key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" + print('single tile, axes=CZYX') - map_x[key] = float(d['x']) - map_y[key] = float(d['y']) - if is_zstack: - map_z[key] = float(0) +elif axes == 'ZYX': + if is_tiled: + if is_zstack: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+)\)" + else: + tile_config_pattern=r"Blaze\[(?P[0-9]+) x (?P[0-9]+)\]_C(?P[0-9]+)_xyz-Table Z(?P[0-9]+).ome.tif;;\((?P\S+), (?P\S+), (?P\S+)\)" else: - map_z[key] = float(d['z']) - - - + print('single tile, axes=ZYX') + +if is_tiled: + tile_pattern = re.compile(tile_config_pattern) + + #put it in 3 maps, one for each coord, indexed by tilex, tiley, channel, and aslice + map_x=dict() + map_y=dict() + map_z=dict() + + map_tiles_to_chunk=dict() + chunks = [] + for chunk,(tilex,tiley) in enumerate(product(tiles_x,tiles_y)): + map_tiles_to_chunk[tilex+tiley] = chunk + chunks.append(chunk) + + for line in custom_metadata['TileConfiguration']['@TileConfiguration'].split(' ')[1:]: + + d = re.search(tile_pattern,line).groupdict() + chunk = map_tiles_to_chunk[d['tilex']+d['tiley']] # want the key to have chunk instad of tilex,tiley, so map to that first + + if is_zstack: + key = f"tile-{chunk}_chan-{d['channel']}_z-0000" + else: + #key is: tile-{chunk}_chan-{channel}_z-{zslice} + key = f"tile-{chunk}_chan-{d['channel']}_z-{d['zslice']}" + + map_x[key] = float(d['x']) + map_y[key] = float(d['y']) + if is_zstack: + map_z[key] = float(0) + else: + map_z[key] = float(d['z']) + metadata={} -metadata['tiles_x'] = tiles_x -metadata['tiles_y'] = tiles_y +if is_tiled: + metadata['tiles_x'] = tiles_x + metadata['tiles_y'] = tiles_y + metadata['chunks'] = chunks + metadata['lookup_tile_offset_x'] = map_x + metadata['lookup_tile_offset_y'] = map_y + metadata['lookup_tile_offset_z'] = map_z + metadata['channels'] = channels + if not is_zstack: metadata['zslices'] = zslices metadata['prefixes'] = prefixes -metadata['chunks'] = chunks metadata['axes'] = axes metadata['shape'] = shape metadata['physical_size_x'] = float(physical_size_x) metadata['physical_size_y'] = float(physical_size_y) metadata['physical_size_z'] = float(physical_size_z) -metadata['lookup_tile_offset_x'] = map_x -metadata['lookup_tile_offset_y'] = map_y -metadata['lookup_tile_offset_z'] = map_z metadata['ome_full_metadata'] = ome_dict metadata['PixelSize'] = [ metadata['physical_size_z']/1000.0, metadata['physical_size_y']/1000.0, metadata['physical_size_x']/1000.0 ] #zyx since OME-Zarr is ZYX metadata['PixelSizeUnits'] = 'mm' metadata['is_zstack'] = is_zstack +metadata['is_tiled'] = is_tiled #write metadata to json with open(snakemake.output.metadata_json, 'w') as fp: diff --git a/workflow/scripts/tif_to_zarr_gcs.py b/workflow/scripts/tif_to_zarr_gcs.py index 9c234c2..ca9ea11 100644 --- a/workflow/scripts/tif_to_zarr_gcs.py +++ b/workflow/scripts/tif_to_zarr_gcs.py @@ -1,6 +1,7 @@ import tifffile import json import dask.array as da +from dask.delayed import delayed import dask.array.image from itertools import product from dask.diagnostics import ProgressBar @@ -21,14 +22,6 @@ def replace_square_brackets(pattern): pattern = pattern.replace('##RIGHTBRACKET##','[]]') return pattern -def get_zstack_metadata(gcs_uri,fs): - with fs.open(gcs_uri, 'rb') as file: - tif = tifffile.TiffFile(file) - pages = tif.pages - n_z = len(pages) - sample = tif.pages[0].asarray() - return {'n_z': n_z, 'shape': sample.shape, 'dtype': sample.dtype} - def get_tiff_num_pages(fs,gcs_uri): with fs.open(gcs_uri, 'rb') as file: @@ -39,6 +32,19 @@ def read_tiff_slice(fs,gcs_uri, key=0): with fs.open(gcs_uri, 'rb') as file: return tifffile.imread(file, key=key) +def read_page_as_numpy(tif_file_uri, page, fs=None): + """Gets a single page (i.e., 2D image) from a tif file z-stack stored in a cloud URI.""" + + # Open the file from the cloud storage using fsspec + with fs.open(tif_file_uri, 'rb') as f: + # Read the file content into a buffer + file_buffer = f.read() + + # Use pyvips to read from the buffer + return pyvips.Image.new_from_buffer(file_buffer, "", page=page).numpy() + + + def build_zstack(gcs_uris,fs): """Build a z-stack from a list of GCS URIs.""" lazy_arrays = [ @@ -70,11 +76,16 @@ def build_zstack_from_single(gcs_uri,zstack_metadata,fs): is_zstack = metadata['is_zstack'] +is_tiled = metadata['is_tiled'] -if is_zstack: - in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] +if is_tiled: + if is_zstack: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_zstack"] + else: + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] else: - in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern"] + in_tif_pattern = snakemake.config["import_blaze"]["raw_tif_pattern_notile"] + @@ -84,39 +95,55 @@ def build_zstack_from_single(gcs_uri,zstack_metadata,fs): #TODO: put these in top-level metadata for easier access.. -size_x=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeX'] -size_y=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeY'] -size_z=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeZ'] -size_c=metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeC'] -size_tiles=len(metadata['tiles_x'])*len(metadata['tiles_y']) +size_x=int(metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeX']) +size_y=int(metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeY']) +size_z=int(metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeZ']) +size_c=int(metadata['ome_full_metadata']['OME']['Image']['Pixels']['@SizeC']) -#now get the first channel and first zslice tif -if is_zstack: - #prepopulate this since its the same for all tiles, and takes time to query - zstack_metadata = get_zstack_metadata('gcs://'+in_tif_pattern.format(tilex=metadata['tiles_x'][0],tiley=metadata['tiles_y'][0],prefix=metadata['prefixes'][0],channel=metadata['channels'][0]),fs=fs) +if is_tiled: -tiles=[] -for i_tile,(tilex,tiley) in enumerate(product(metadata['tiles_x'],metadata['tiles_y'])): - - zstacks=[] - for i_chan,channel in enumerate(metadata['channels']): - - if is_zstack: - zstacks.append(build_zstack_from_single('gcs://'+in_tif_pattern.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel),fs=fs,zstack_metadata=zstack_metadata)) - else: - zstacks.append(build_zstack(fs.glob('gcs://'+in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*')),fs=fs)) + size_tiles=len(metadata['tiles_x'])*len(metadata['tiles_y']) + + #now get the first channel and first zslice tif + + + tiles=[] + for i_tile,(tilex,tiley) in enumerate(product(metadata['tiles_x'],metadata['tiles_y'])): + + print(f'tile {tilex}x{tiley}, {i_tile}') + zstacks=[] + for i_chan,channel in enumerate(metadata['channels']): + print(f'channel {i_chan}') + if is_zstack: + + tif_file = in_tif_pattern.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel) + + pages=[] + #read each page + for i_z in range(size_z): + pages.append(da.from_delayed(delayed(read_page_as_numpy)('gcs://'+tif_file,i_z),shape=(size_y,size_x),dtype='uint16')) + + zstacks.append(da.stack(pages)) - #have list of zstack dask arrays for the tile, one for each channel - #stack them up and append to list of tiles - tiles.append(da.stack(zstacks)) + else: + zstacks.append(build_zstack(fs.glob('gcs://'+in_tif_glob.format(tilex=tilex,tiley=tiley,prefix=metadata['prefixes'][0],channel=channel,zslice='*')),fs=fs)) + + #have list of zstack dask arrays for the tile, one for each channel + #stack them up and append to list of tiles + tiles.append(da.stack(zstacks)) + + + #now we have list of tiles, each a dask array + #stack them up to get our final array + darr = da.stack(tiles) + +else: + print("single tile data not supported for tif_to_zarr_gcs") -#now we have list of tiles, each a dask array -#stack them up to get our final array -darr = da.stack(tiles) #rescale intensities, and recast darr = darr * snakemake.params.intensity_rescaling