Skip to content

Commit

Permalink
use gcloud to transfer input data
Browse files Browse the repository at this point in the history
snakemake gs plugin was failing when retrieving data, and was also
very slow
  • Loading branch information
akhanf committed Aug 17, 2024
1 parent 3c41cec commit 7e2c873
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 2 deletions.
8 changes: 8 additions & 0 deletions workflow/lib/cloud_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ def is_remote(uri_string):
else:
return False

def is_remote_gcs(uri_string):
    """Return True if the URI uses the 'gcs' (Google Cloud Storage) protocol.

    Mirrors is_remote() above, but checks specifically for GCS so callers
    can choose to stage data in with the gcloud CLI instead of the
    snakemake storage plugin.

    Parameters
    ----------
    uri_string : str or path-like
        URI to inspect; parsed with UPath (imported in this module as Path).

    Returns
    -------
    bool
    """
    # Direct boolean return instead of an if/else that returns True/False.
    return Path(uri_string).protocol == 'gcs'


def get_fsspec(uri_string,storage_provider_settings=None,creds=None):
uri = Path(uri_string)
if uri.protocol == 'gcs':
Expand Down
10 changes: 9 additions & 1 deletion workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import tarfile
from snakebids import bids as _bids
from upath import UPath as Path
from lib.cloud_io import is_remote
from lib.cloud_io import is_remote, is_remote_gcs


def bids(root, *args, **kwargs):
Expand Down Expand Up @@ -134,6 +134,9 @@ def get_input_dataset(wildcards):
dataset_path = Path(get_dataset_path(wildcards))
suffix = dataset_path.suffix

if is_remote_gcs(dataset_path):
return rules.cp_from_gcs.output.ome_dir.format(**wildcards)

if dataset_path.is_dir():
print('is a dir')
return get_dataset_path_remote(wildcards)
Expand Down Expand Up @@ -179,6 +182,11 @@ def get_dataset_path_remote(wildcards):
else:
return path

def get_dataset_path_gs(wildcards):
    """Return the dataset path for these wildcards as a gs:// URI.

    Strips the UPath protocol prefix and re-adds the explicit gs://
    scheme expected by the gcloud CLI.
    """
    bucket_path = Path(get_dataset_path(wildcards)).path
    return "gs://" + bucket_path


def get_dataset_path(wildcards):
df = datasets.query(
f"subject=='{wildcards.subject}' and sample=='{wildcards.sample}' and acq=='{wildcards.acq}'"
Expand Down
36 changes: 35 additions & 1 deletion workflow/rules/import.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

rule extract_dataset:
input:
dataset_path=get_dataset_path_remote,
Expand Down Expand Up @@ -33,6 +32,41 @@ rule extract_dataset:
shell:
"{params.cmd}"

# Stage an input dataset in from Google Cloud Storage using the gcloud CLI.
# Per the commit message, the snakemake gs storage plugin was failing (and
# slow) when retrieving data, so the raw directory is copied to a local
# temporary work directory instead.
rule cp_from_gcs:
    params:
        # gs:// URI of the source dataset (built by get_dataset_path_gs)
        dataset_path=get_dataset_path_gs,
    output:
        # Temporary local copy of the raw SPIM data; deleted once all
        # downstream consumers have finished (temp()).
        ome_dir=temp(
            directory(
                bids(
                    root=work,
                    subject="{subject}",
                    datatype="micr",
                    sample="{sample}",
                    acq="{acq}",
                    desc="rawfromgcs",
                    suffix="SPIM",
                )
            )
        ),
    # Reserve cores for the transfer — presumably because gcloud storage cp
    # parallelizes; TODO confirm gcloud actually honors this core count.
    threads: 32
    group:
        "preproc"
    log:
        bids(
            root="logs",
            subject="{subject}",
            datatype="cp_from_gcs",
            sample="{sample}",
            acq="{acq}",
            desc="raw",
            suffix="log.txt",
        ),
    # Run on the host (no container) so the local gcloud CLI and its
    # credentials are available.
    container: None
    shell:
        "mkdir -p {output} && gcloud storage cp --recursive {params.dataset_path}/* {output}"



rule blaze_to_metadata:
input:
Expand Down

0 comments on commit 7e2c873

Please sign in to comment.