From 5284522639ce52aa8e656c24a38519eeac88f091 Mon Sep 17 00:00:00 2001 From: Ali Khan Date: Tue, 8 Oct 2024 08:12:11 -0500 Subject: [PATCH] change naming of dataset to samples (#55) Semantically makes more sense since the bids "datasets" contain SPIM samples. Note, we still use the word dataset in other local context, e.g. for bigstitcher (dataset.xml, fuse_dataset) or ome_zarr multiscales dataset. But at least this resolves the conflict in the direct inputs and outputs of the workflow. Changes many files in the workflow, but nothing functionally changes with this PR.. --- .gitignore | 4 + README.md | 2 +- config/config.yml | 2 +- config/datasets.tsv | 3 - config/samples.tsv | 3 + pyproject.toml | 14 ++-- testing/dryrun_tests/datasets_local.tsv | 2 - .../{datasets_gcs.tsv => samples_gcs.tsv} | 2 +- testing/dryrun_tests/samples_local.tsv | 2 + workflow/Snakefile | 4 +- workflow/rules/bids.smk | 2 +- workflow/rules/common.smk | 83 +++++++++---------- workflow/rules/flatfield_corr.smk | 4 +- workflow/rules/import.smk | 18 ++-- workflow/rules/ome_zarr.smk | 2 +- workflow/rules/qc.smk | 4 +- workflow/scripts/blaze_to_metadata_gcs.py | 4 +- workflow/scripts/create_samples_tsv.py | 2 +- workflow/scripts/generate_aggregate_qc.py | 8 +- 19 files changed, 85 insertions(+), 80 deletions(-) delete mode 100644 config/datasets.tsv create mode 100644 config/samples.tsv delete mode 100644 testing/dryrun_tests/datasets_local.tsv rename testing/dryrun_tests/{datasets_gcs.tsv => samples_gcs.tsv} (52%) create mode 100644 testing/dryrun_tests/samples_local.tsv diff --git a/.gitignore b/.gitignore index efca3e5..81f458c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ resources/** logs/** .snakemake .snakemake/** +__pycache__ +.ipynb_checkpoints +benchmarks +*.swp diff --git a/README.md b/README.md index 018def7..4f28778 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ python3.11 -m venv venv source venv/bin/activate ``` -3. Update the `config/datasets.tsv` spreadsheet to point to your dataset(s). Each dataset's tif files should be in it's own folder or tar file, with no other tif files. Enter the path to each dataset in the `dataset_path` column. The first three columns identify the subject, sample, acquisition, which become part of the resulting filenames (BIDS naming). The `stain_0` and `stain_1` identify what stains were used for each channel. Use `autof` to indicate the autofluorescence channel. If you have a different number of stains you can add or remove these columns. If your samples have different numbers of stains, you can leave values blank or use `n/a` to indicate that a sample does not have a particular stain. +3. Update the `config/samples.tsv` spreadsheet to point to your sample(s). Each sample's tif files should be in it's own folder or tar file, with no other tif files. Enter the path to each sample in the `sample_path` column. The first three columns identify the subject, sample, acquisition, which become part of the resulting filenames (BIDS naming). The `stain_0` and `stain_1` identify what stains were used for each channel. Use `autof` to indicate the autofluorescence channel. If you have a different number of stains you can add or remove these columns. If your samples have different numbers of stains, you can leave values blank or use `n/a` to indicate that a sample does not have a particular stain. Note: The acquisition value must contain either `blaze` or `prestitched`, and defines which workflow will be used. E.g. for LifeCanvas data that is already stitched, you need to include `prestitched` in the acquisition flag. diff --git a/config/config.yml b/config/config.yml index cf69864..edd15fb 100644 --- a/config/config.yml +++ b/config/config.yml @@ -1,4 +1,4 @@ -datasets: 'config/datasets.tsv' +samples: 'config/samples.tsv' root: 'bids' # can use a s3:// or gcs:// prefix to write output to cloud storage work: 'work' diff --git a/config/datasets.tsv b/config/datasets.tsv deleted file mode 100644 index 99bb4c8..0000000 --- a/config/datasets.tsv +++ /dev/null @@ -1,3 +0,0 @@ -subject sample acq stain_0 stain_1 dataset_path -mouse1 brain blaze1x abeta autof .test/dryrun/data -lifecanvas1 brain prestitched PI BetaAmyloid .test/dryrun/data diff --git a/config/samples.tsv b/config/samples.tsv new file mode 100644 index 0000000..e2b4614 --- /dev/null +++ b/config/samples.tsv @@ -0,0 +1,3 @@ +subject sample acq stain_0 stain_1 stain_2 sample_path +mouse1 brain blaze Lectin PI Abeta .test/dryrun/data +lifecanvas1 brain prestitched PI Abeta n/a .test/dryrun/data diff --git a/pyproject.toml b/pyproject.toml index 54f6961..cbabd88 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,12 +29,14 @@ xmltodict = "^0.13.0" tifffile = "^2024.5.10" [tool.poe.tasks] -test_localin_gcsout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root='gcs://khanlab-lightsheet/data/test_bids'" -test_localin_localout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root=bids" -test_gcsin_gcsout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_gcs.tsv root='gcs://khanlab-lightsheet/data/test_bids'" -test_gcsin_localout = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_gcs.tsv root=bids" -test_localin_localout_zipstore = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root=bids use_zipstore=True" -test_localin_gcsout_zipstore = "snakemake --dry-run --config datasets=testing/dryrun_tests/datasets_local.tsv root='gcs://khanlab-lightsheet/data/test_bids' use_zipstore=True" +test_localin_gcsout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root='gcs://khanlab-lightsheet/data/test_bids'" +test_localin_localout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root=bids" +test_gcsin_gcsout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_gcs.tsv root='gcs://khanlab-lightsheet/data/test_bids'" +test_gcsin_localout = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_gcs.tsv root=bids" +test_localin_localout_zipstore = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root=bids use_zipstore=True" +test_localin_gcsout_zipstore = "snakemake --dry-run --config samples=testing/dryrun_tests/samples_local.tsv root='gcs://khanlab-lightsheet/data/test_bids' use_zipstore=True" +test_gcsout=["test_localin_gcsout", "test_gcsin_gcsout", "test_localin_gcsout_zipstore"] +test_localout=["test_localin_localout", "test_gcsin_localout", "test_localin_localout_zipstore"] [build-system] requires = ["poetry-core"] diff --git a/testing/dryrun_tests/datasets_local.tsv b/testing/dryrun_tests/datasets_local.tsv deleted file mode 100644 index 49d6b29..0000000 --- a/testing/dryrun_tests/datasets_local.tsv +++ /dev/null @@ -1,2 +0,0 @@ -subject sample acq stain_0 stain_1 stain_2 dataset_path -mouse1 brain blaze Lectin PI Abeta .test/dryrun/data diff --git a/testing/dryrun_tests/datasets_gcs.tsv b/testing/dryrun_tests/samples_gcs.tsv similarity index 52% rename from testing/dryrun_tests/datasets_gcs.tsv rename to testing/dryrun_tests/samples_gcs.tsv index 98e634e..c920e1b 100644 --- a/testing/dryrun_tests/datasets_gcs.tsv +++ b/testing/dryrun_tests/samples_gcs.tsv @@ -1,2 +1,2 @@ -subject sample acq stain_0 stain_1 stain_2 dataset_path +subject sample acq stain_0 stain_1 stain_2 sample_path mouse1 brain blaze Lectin PI Abeta gcs://my-bucket/test_data diff --git a/testing/dryrun_tests/samples_local.tsv b/testing/dryrun_tests/samples_local.tsv new file mode 100644 index 0000000..98ebf5b --- /dev/null +++ b/testing/dryrun_tests/samples_local.tsv @@ -0,0 +1,2 @@ +subject sample acq stain_0 stain_1 stain_2 sample_path +mouse1 brain blaze Lectin PI Abeta .test/dryrun/data diff --git a/workflow/Snakefile b/workflow/Snakefile index 70b5ced..30834a5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -21,9 +21,9 @@ resampled = Path(root) / "derivatives" / "resampled" # this is needed to use the latest bids spec with the pre-release snakebids set_bids_spec("v0_11_0") -# read datasets tsv +# read samples tsv dtype = defaultdict(lambda: str, num_tiles=int) -datasets = pd.read_csv(config["datasets"], sep="\t", dtype=dtype) +samples = pd.read_csv(config["samples"], sep="\t", dtype=dtype) include: "rules/common.smk" diff --git a/workflow/rules/bids.smk b/workflow/rules/bids.smk index cb4de00..427a937 100644 --- a/workflow/rules/bids.smk +++ b/workflow/rules/bids.smk @@ -53,7 +53,7 @@ rule bids_samples_json: rule create_samples_tsv: params: - datasets_df=datasets, + samples_df=samples, output: tsv=bids_toplevel(root, "samples.tsv"), log: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 1c2495f..799416e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -85,7 +85,7 @@ def get_extension_ome_zarr(): # targets def get_all_targets(): targets = [] - for i in range(len(datasets)): + for i in range(len(samples)): targets.extend( expand_bids( root=root, @@ -95,9 +95,9 @@ def get_all_targets(): acq="{acq}", suffix="SPIM.{extension}", expand_kwargs=dict( - subject=datasets.loc[i, "subject"], - sample=datasets.loc[i, "sample"], - acq=datasets.loc[i, "acq"], + subject=samples.loc[i, "subject"], + sample=samples.loc[i, "sample"], + acq=samples.loc[i, "acq"], extension=[get_extension_ome_zarr(), "json"], ), ) @@ -113,9 +113,9 @@ def get_all_targets(): stain="{stain}", suffix="SPIM.nii", expand_kwargs=dict( - subject=datasets.loc[i, "subject"], - sample=datasets.loc[i, "sample"], - acq=datasets.loc[i, "acq"], + subject=samples.loc[i, "subject"], + sample=samples.loc[i, "sample"], + acq=samples.loc[i, "acq"], level=config["nifti"]["levels"], stain=get_stains_by_row(i), ), @@ -127,12 +127,12 @@ def get_all_targets(): def get_all_subj_html(wildcards): htmls = [] - for i in range(len(datasets)): + for i in range(len(samples)): html = "{root}/qc/sub-{subject}_sample-{sample}_acq-{acq}/subject.html".format( root=root, - subject=datasets.loc[i, "subject"], - sample=datasets.loc[i, "sample"], - acq=datasets.loc[i, "acq"], + subject=samples.loc[i, "subject"], + sample=samples.loc[i, "sample"], + acq=samples.loc[i, "acq"], ) htmls.append(remote_file(html)) @@ -157,51 +157,50 @@ def get_qc_targets(): return targets -def dataset_is_remote(wildcards): - return is_remote_gcs(Path(get_dataset_path(wildcards))) +def sample_is_remote(wildcards): + return is_remote_gcs(Path(get_sample_path(wildcards))) -def get_input_dataset(wildcards): - """returns path to extracted dataset or path to provided input folder""" - dataset_path = Path(get_dataset_path(wildcards)) - suffix = dataset_path.suffix +def get_input_sample(wildcards): + """returns path to extracted sample or path to provided input folder""" + sample_path = Path(get_sample_path(wildcards)) - if is_remote_gcs(dataset_path): + if is_remote_gcs(sample_path): return rules.cp_from_gcs.output.ome_dir.format(**wildcards) - if dataset_path.is_dir(): - return get_dataset_path_remote(wildcards) + if sample_path.is_dir(): + return get_sample_path_remote(wildcards) - elif tarfile.is_tarfile(dataset_path): - # dataset was a tar file, so point to the extracted folder - return rules.extract_dataset.output.ome_dir.format(**wildcards) + elif tarfile.is_tarfile(sample_path): + # sample was a tar file, so point to the extracted folder + return rules.extract_sample.output.ome_dir.format(**wildcards) else: - print(f"unsupported input: {dataset_path}") + print(f"unsupported input: {sample_path}") def get_metadata_json(wildcards): """returns path to metadata, extracted from local or gcs""" - dataset_path = Path(get_dataset_path(wildcards)) + sample_path = Path(get_sample_path(wildcards)) - if is_remote_gcs(dataset_path): + if is_remote_gcs(sample_path): return rules.blaze_to_metadata_gcs.output.metadata_json.format(**wildcards) else: return rules.blaze_to_metadata.output.metadata_json.format(**wildcards) # import -def cmd_extract_dataset(wildcards, input, output): +def cmd_extract_sample(wildcards, input, output): cmds = [] # supports tar, tar.gz, tgz, or folder name - dataset_path = Path(input.dataset_path) - suffix = dataset_path.suffix - if dataset_path.is_dir(): + sample_path = Path(input.sample_path) + suffix = sample_path.suffix + if sample_path.is_dir(): # we have a directory print("input directory not copied/extracted by this rule") - elif tarfile.is_tarfile(dataset_path): + elif tarfile.is_tarfile(sample_path): # we have a tar file # check if gzipped: cmds.append(f"mkdir -p {output}") @@ -211,41 +210,41 @@ def cmd_extract_dataset(wildcards, input, output): cmds.append(f"tar -xf {input} -C {output}") else: - print(f"unsupported input: {dataset_path}") + print(f"unsupported input: {sample_path}") return " && ".join(cmds) -def get_dataset_path_remote(wildcards): - path = get_dataset_path(wildcards) +def get_sample_path_remote(wildcards): + path = get_sample_path(wildcards) if is_remote(path): return storage(path) else: return path -def get_dataset_path_gs(wildcards): - path = Path(get_dataset_path(wildcards)).path +def get_sample_path_gs(wildcards): + path = Path(get_sample_path(wildcards)).path return f"gs://{path}" -def get_dataset_path(wildcards): - df = datasets.query( +def get_sample_path(wildcards): + df = samples.query( f"subject=='{wildcards.subject}' and sample=='{wildcards.sample}' and acq=='{wildcards.acq}'" ) - return df.dataset_path.to_list()[0] + return df.sample_path.to_list()[0] def get_stains_by_row(i): # Select columns that match the pattern 'stain_' - stain_columns = datasets.filter(like="stain_").columns + stain_columns = samples.filter(like="stain_").columns # Select values for a given row - return datasets.loc[i, stain_columns].dropna().tolist() + return samples.loc[i, stain_columns].dropna().tolist() def get_stains(wildcards): - df = datasets.query( + df = samples.query( f"subject=='{wildcards.subject}' and sample=='{wildcards.sample}' and acq=='{wildcards.acq}'" ) diff --git a/workflow/rules/flatfield_corr.smk b/workflow/rules/flatfield_corr.smk index 95bcde7..5bdb302 100644 --- a/workflow/rules/flatfield_corr.smk +++ b/workflow/rules/flatfield_corr.smk @@ -8,7 +8,7 @@ rule fit_basic_flatfield_corr: datatype="micr", sample="{sample}", acq="{acq}", - desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw", + desc="rawfromgcs" if sample_is_remote(wildcards) else "raw", suffix="SPIM.zarr", ).format(**wildcards), params: @@ -70,7 +70,7 @@ rule apply_basic_flatfield_corr: datatype="micr", sample="{sample}", acq="{acq}", - desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw", + desc="rawfromgcs" if sample_is_remote(wildcards) else "raw", suffix="SPIM.zarr", ).format(**wildcards), model_dirs=lambda wildcards: expand( diff --git a/workflow/rules/import.smk b/workflow/rules/import.smk index 1c6902a..d3bb230 100644 --- a/workflow/rules/import.smk +++ b/workflow/rules/import.smk @@ -1,8 +1,8 @@ -rule extract_dataset: +rule extract_sample: input: - dataset_path=get_dataset_path_remote, + sample_path=get_sample_path_remote, params: - cmd=cmd_extract_dataset, + cmd=cmd_extract_sample, output: ome_dir=temp( directory( @@ -23,7 +23,7 @@ rule extract_dataset: bids( root="logs", subject="{subject}", - datatype="extract_dataset", + datatype="extract_sample", sample="{sample}", acq="{acq}", desc="raw", @@ -37,7 +37,7 @@ rule blaze_to_metadata_gcs: input: creds=os.path.expanduser(config["remote_creds"]), params: - dataset_path=get_dataset_path_gs, + sample_path=get_sample_path_gs, in_tif_pattern=lambda wildcards: config["import_blaze"]["raw_tif_pattern"], storage_provider_settings=workflow.storage_provider_settings, output: @@ -78,7 +78,7 @@ rule blaze_to_metadata_gcs: rule blaze_to_metadata: input: - ome_dir=get_input_dataset, + ome_dir=get_input_sample, output: metadata_json=temp( bids( @@ -144,7 +144,7 @@ rule copy_blaze_metadata: rule prestitched_to_metadata: input: - ome_dir=get_input_dataset, + ome_dir=get_input_sample, params: physical_size_x_um=config["import_prestitched"]["physical_size_x_um"], physical_size_y_um=config["import_prestitched"]["physical_size_y_um"], @@ -189,7 +189,7 @@ rule tif_to_zarr: output shape is (tiles,channels,z,y,x), with the 2d images as the chunks""" input: - ome_dir=get_input_dataset, + ome_dir=get_input_sample, metadata_json=rules.copy_blaze_metadata.output.metadata_json, params: intensity_rescaling=config["import_blaze"]["intensity_rescaling"], @@ -244,7 +244,7 @@ rule tif_to_zarr_gcs: metadata_json=rules.copy_blaze_metadata.output.metadata_json, creds=os.path.expanduser(config["remote_creds"]), params: - dataset_path=get_dataset_path_gs, + sample_path=get_sample_path_gs, in_tif_pattern=lambda wildcards: config["import_blaze"]["raw_tif_pattern"], intensity_rescaling=config["import_blaze"]["intensity_rescaling"], storage_provider_settings=workflow.storage_provider_settings, diff --git a/workflow/rules/ome_zarr.smk b/workflow/rules/ome_zarr.smk index d3a859b..26606ac 100644 --- a/workflow/rules/ome_zarr.smk +++ b/workflow/rules/ome_zarr.smk @@ -52,7 +52,7 @@ rule zarr_to_ome_zarr: rule tif_stacks_to_ome_zarr: input: **get_storage_creds(), - tif_dir=get_input_dataset, + tif_dir=get_input_sample, metadata_json=rules.prestitched_to_metadata.output.metadata_json, params: in_tif_glob=lambda wildcards, input: os.path.join( diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index fc32d7b..36815f5 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -20,7 +20,7 @@ rule generate_flatfield_qc: datatype="micr", sample="{sample}", acq="{acq}", - desc="rawfromgcs" if dataset_is_remote(wildcards) else "raw", + desc="rawfromgcs" if sample_is_remote(wildcards) else "raw", suffix="SPIM.zarr", ).format(**wildcards), corr=bids( @@ -190,7 +190,7 @@ rule generate_aggregate_qc: report_html=config["report"]["resources"]["report_html"], subj_htmls=get_all_subj_html, params: - datasets=datasets, + samples=samples, output: total_html=remote_file(Path(root) / "qc" / "qc_report.html"), log: diff --git a/workflow/scripts/blaze_to_metadata_gcs.py b/workflow/scripts/blaze_to_metadata_gcs.py index b652170..d812edb 100644 --- a/workflow/scripts/blaze_to_metadata_gcs.py +++ b/workflow/scripts/blaze_to_metadata_gcs.py @@ -9,13 +9,13 @@ import gcsfs from lib.cloud_io import get_fsspec -dataset_uri = snakemake.params.dataset_path +sample_uri = snakemake.params.sample_path gcsfs_opts={'project': snakemake.params.storage_provider_settings['gcs'].get_settings().project, 'token': snakemake.input.creds} fs = gcsfs.GCSFileSystem(**gcsfs_opts) -tifs = fs.glob(f"{dataset_uri}/*.tif") +tifs = fs.glob(f"{sample_uri}/*.tif") #check first tif file to see if it is zstack or not: if 'xyz-Table Z' in Path(tifs[0]).name: diff --git a/workflow/scripts/create_samples_tsv.py b/workflow/scripts/create_samples_tsv.py index 28b9a53..5da5c2f 100644 --- a/workflow/scripts/create_samples_tsv.py +++ b/workflow/scripts/create_samples_tsv.py @@ -1,6 +1,6 @@ import pandas as pd -df = snakemake.params.datasets_df +df = snakemake.params.samples_df df['participant_id'] = 'sub-' + df['subject'] df['sample_id'] = 'sample-' + df['sample'] diff --git a/workflow/scripts/generate_aggregate_qc.py b/workflow/scripts/generate_aggregate_qc.py index d35189b..f021aaf 100644 --- a/workflow/scripts/generate_aggregate_qc.py +++ b/workflow/scripts/generate_aggregate_qc.py @@ -1,7 +1,7 @@ from jinja2 import Environment, FileSystemLoader from pathlib import Path -datasets = snakemake.params.datasets +samples = snakemake.params.samples total_html = snakemake.output.total_html subj_htmls = snakemake.input.subj_htmls @@ -13,9 +13,9 @@ output = template.render() for i,subj_html in enumerate(subj_htmls): - subject=datasets.loc[i,'subject'] - sample=datasets.loc[i,'sample'] - acq=datasets.loc[i,'acq'] + subject=samples.loc[i,'subject'] + sample=samples.loc[i,'sample'] + acq=samples.loc[i,'acq'] relative_path = Path(subj_html).relative_to(Path(total_html).parent) # Create line to add link to subject into final qc report combining all subjects