Updates Snakefile to be able to run all preprocessing
With this commit, the Snakefile can now run the (BIDS) preprocessing
correctly. This adds a couple of helper files (cluster.json, config.yml)
to support that, updates the README to better explain snakemake, and
updates .gitignore to ignore the .snakemake hidden directory.

requirements.txt also updated to include pyPyrTools
billbrod committed Jan 16, 2018
1 parent 2a08b9d commit cd6bb47
Showing 6 changed files with 97 additions and 10 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -71,3 +71,6 @@ target/

# exclude data from source control by default
/data/

# ignore snakemake hidden directory
.snakemake
34 changes: 32 additions & 2 deletions README.md
@@ -13,9 +13,15 @@ Matlab:

- [GLMdenoise](https://github.com/kendrickkay/GLMdenoise/)

Python: see `requirements.txt` file
Python:

Other: [FreeSurfer](http://freesurfer.net/)
- see `requirements.txt` file
- [WinawerLab's MRI_tools](https://github.com/WinawerLab/MRI_tools)
  (which requires [FSL](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/))

Other:

- [FreeSurfer](http://freesurfer.net/)

# Overview of analysis

@@ -72,6 +78,30 @@ arguments to set the paths yourself.
it came off the scanner into the BIDS structure. As such, you will not
need it.

# Snakemake

A Snakefile is included with this project to enable the use
of [snakemake](http://snakemake.readthedocs.io/en/latest/). This is
highly recommended, since it will allow you to easily rerun the
analyses exactly as I have performed them, and it enables easy use on a
cluster. To use it,
[install snakemake](http://snakemake.readthedocs.io/en/latest/getting_started/installation.html) and,
if using on a cluster, set up the appropriate
[Snakemake profile](https://github.com/Snakemake-Profiles/doc) (see the
[snakemake docs](http://snakemake.readthedocs.io/en/latest/executable.html#profiles) for
more info on profiles). Then simply type `snakemake {target}` to
re-run the analyses.

For example, if running on NYU's HPC cluster, set up the SLURM profile
and use the following command: `snakemake --profile slurm --jobs {n}
--cluster-config cluster.json {target}`, where `{n}` is the number of
jobs you allow `snakemake` to simultaneously submit to the cluster and
`cluster.json` is an included configuration file with some reasonable
values for the cluster (feel free to change these as needed).

Project Organization
------------

52 changes: 44 additions & 8 deletions Snakefile
@@ -1,6 +1,22 @@
import os
# the directory that contains the data (in BIDS format)
DATA_DIR="/scratch/wfb229/spatial_frequency_preferences"
configfile:
    "config.yml"
if not os.path.isdir(config["DATA_DIR"]):
    raise Exception("Cannot find the dataset at %s" % config["DATA_DIR"])
if os.system("module list") == 0:
    # then we're on the cluster
    shell.prefix("module purge; module load anaconda2/4.3.1; module load fsl/5.0.10; "
                 "module load freesurfer/6.0.0; module load matlab/2017a; ")

SUBJECTS = ['sub-wlsubj001', 'sub-wlsubj042', 'sub-wlsubj045']
SESSIONS = {'sub-wlsubj001': ['ses-pilot01'], 'sub-wlsubj042': ['ses-pilot00', 'ses-pilot01'],
            'sub-wlsubj045': ['ses-pilot01']}

rule all:
    input:
        [os.path.join(config["DATA_DIR"], "derivatives", "preprocessed", "{subject}", "{session}").format(subject=sub, session=ses) for sub in SUBJECTS for ses in SESSIONS[sub]]
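The list comprehension in `rule all` can be previewed outside of Snakemake. A minimal sketch, assuming the `SUBJECTS`/`SESSIONS` definitions above and using the `DATA_DIR` value from `config.yml` as a stand-in for `config["DATA_DIR"]`:

```python
import os

# stand-in for config["DATA_DIR"] (set in config.yml)
DATA_DIR = "/scratch/wfb229/spatial_frequency_preferences"
SUBJECTS = ['sub-wlsubj001', 'sub-wlsubj042', 'sub-wlsubj045']
SESSIONS = {'sub-wlsubj001': ['ses-pilot01'],
            'sub-wlsubj042': ['ses-pilot00', 'ses-pilot01'],
            'sub-wlsubj045': ['ses-pilot01']}

# one preprocessed output directory per (subject, session) pair
targets = [os.path.join(DATA_DIR, "derivatives", "preprocessed", sub, ses)
           for sub in SUBJECTS for ses in SESSIONS[sub]]
```

With the three subjects and their sessions listed above, this yields four output directories, which is exactly the set of final targets `snakemake` will try to build.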


rule stimuli:
output:
@@ -13,8 +29,7 @@ rule stimuli_idx:
output:
["data/stimuli/{subject}_run%02d_idx.npy" % i for i in range(12)]
params:
seeds_dict = {'wl_subj001': 1, 'wl_subj042': 2, 'wl_subj045': 3}
seed = lambda wildcards: SUBJECTS_SEEDS.get(wildcards.subject)
seed = lambda wildcards: {'wl_subj001': 1, 'wl_subj042': 2, 'wl_subj045': 3}.get(wildcards.subject)
shell:
"python sfp/stimuli.py {wildcards.subject} -i -s {params.seed}"

@@ -23,12 +38,33 @@ rule stimuli_idx:
# both on and off cluster, but now I'm thinking maybe I make the things that should be run on the
# cluster always call `module` so they fail if run locally. that would work for me at any rate.

# this has to be run on the cluster, otherwise it will fail
rule preprocess:
input:
data_dir = os.path.join(DATA_DIR, "{subject}", "{session}"),
freesurfer_dir = os.path.join(DATA_DIR, "derivatives", "freesurfer"),
os.path.join(DATA_DIR, "derivatives", "freesurfer", "{subject}"),
os.path.join(config["DATA_DIR"], "derivatives", "freesurfer", "{subject}"),
data_dir = os.path.join(config["DATA_DIR"], "{subject}", "{session}"),
freesurfer_dir = os.path.join(config["DATA_DIR"], "derivatives", "freesurfer"),
output:
output_dir = os.path.join(DATA_DIR, "derivatives", "preprocessed", "{subject}", "{session}"),
output_dir = os.path.join(config["DATA_DIR"], "derivatives", "preprocessed", "{subject}", "{session}"),
    params:
        # NEXT: get this working and then re-run preprocessing using it.
        sbref = 1,
        epis = lambda wildcards:
            {('sub-wlsubj001', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9],
             ('sub-wlsubj042', 'ses-pilot00'): [1, 2, 3, 4, 5, 6, 7, 8],
             ('sub-wlsubj042', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
             ('sub-wlsubj045', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]}.get((wildcards.subject, wildcards.session)),
        distortPE = 'PA',
        distortrevPE = 'AP',
        plugin = "Linear",
        working_dir = "/scratch/wfb229",
        PEdim = 'y'
    benchmark:
        os.path.join(config["DATA_DIR"], "code", "preprocessed", "{subject}_{session}_benchmark.txt")
    log:
        os.path.join(config["DATA_DIR"], "code", "preprocessed", "{subject}_{session}.log")
    shell:
        "export SUBJECTS_DIR={input.freesurfer_dir};"
        "python ~/MRI_tools/preprocessing/prisma_preproc.py -subject {wildcards.subject} -datadir "
        "{input.data_dir} -outdir {output.output_dir} -epis {params.epis} -sbref {params.sbref} "
        "-distortPE {params.distortPE} -distortrevPE {params.distortrevPE} -working_dir "
        "{params.working_dir} -PEdim {params.PEdim} -plugin {params.plugin} -dir_structure bids"
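The `epis` parameter above is a function of the wildcards, evaluated once per job. A small sketch of that lookup, plus how a list-valued param renders when substituted into the shell command (Snakemake joins list entries with spaces):

```python
# mirrors the dictionary in the `epis` lambda above
EPIS = {('sub-wlsubj001', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9],
        ('sub-wlsubj042', 'ses-pilot00'): [1, 2, 3, 4, 5, 6, 7, 8],
        ('sub-wlsubj042', 'ses-pilot01'): list(range(1, 13)),
        ('sub-wlsubj045', 'ses-pilot01'): list(range(1, 13))}


def epis_param(subject, session):
    # what `params.epis = lambda wildcards: ...` evaluates to for one job
    return EPIS.get((subject, session))


# a list-valued param is substituted into {params.epis} as space-separated
# values (the form passed to the -epis flag in the shell command above)
epis = epis_param('sub-wlsubj042', 'ses-pilot00')
epis_str = " ".join(str(i) for i in epis)
```

This is a sketch of the substitution, not Snakemake internals; the point is that each (subject, session) pair gets its own run list without duplicating the rule.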
16 changes: 16 additions & 0 deletions cluster.json
@@ -0,0 +1,16 @@
{
    "__default__":
    {
        "nodes": 1,
        "tasks_per_node": 1,
        "cpus_per_task": 1,
        "mem": "48GB",
        "time": "12:00:00"
    },

    "preprocess":
    {
        "job_name": "preproc",
        "mem": "10GB"
    }
}
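This cluster config follows Snakemake's `--cluster-config` convention: the `__default__` entry applies to every rule, and per-rule entries override individual keys. A sketch of that merge in plain Python (the `rule_resources` helper is illustrative, not part of Snakemake's API):

```python
# mirror of cluster.json above
CLUSTER_CONFIG = {
    "__default__": {"nodes": 1, "tasks_per_node": 1, "cpus_per_task": 1,
                    "mem": "48GB", "time": "12:00:00"},
    "preprocess": {"job_name": "preproc", "mem": "10GB"},
}


def rule_resources(rule):
    # start from the defaults, then apply any rule-specific overrides
    # (here, preprocess lowers mem to 10GB but keeps the 12h time limit)
    resources = dict(CLUSTER_CONFIG["__default__"])
    resources.update(CLUSTER_CONFIG.get(rule, {}))
    return resources
```

So the preprocess jobs request 10GB rather than the 48GB default, while every other key falls through from `__default__`.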
1 change: 1 addition & 0 deletions config.yml
@@ -0,0 +1 @@
DATA_DIR: "/scratch/wfb229/spatial_frequency_preferences"
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ matplotlib
seaborn
numpy>=1.13.0
scipy
pyPyrTools
