From cd6bb473634a6f950b458f7d16861162d791c786 Mon Sep 17 00:00:00 2001 From: "William F. Broderick" Date: Mon, 15 Jan 2018 22:13:46 -0500 Subject: [PATCH] Updates Snakefile to be able to run all preprocessing With this commit, the Snakefile can now run the (BIDS) preprocessing correctly. This adds a couple helper files (cluster.json, config.yml) to help with this, as well as updating the README and the .gitignore to better explain snakemake and ignore .snakemake, respectively. requirements also updated to include pyPyrTools --- .gitignore | 3 +++ README.md | 34 +++++++++++++++++++++++++++++-- Snakefile | 52 ++++++++++++++++++++++++++++++++++++++++-------- cluster.json | 16 +++++++++++++++ config.yml | 1 + requirements.txt | 1 + 6 files changed, 97 insertions(+), 10 deletions(-) create mode 100644 cluster.json create mode 100644 config.yml diff --git a/.gitignore b/.gitignore index 1867d7a..896ea04 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,6 @@ target/ # exclude data from source control by default /data/ + +# ignore snakemake hidden directory +.snakemake \ No newline at end of file diff --git a/README.md b/README.md index b57642d..eace966 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,15 @@ Matlab: - [GLMdenoise](https://github.com/kendrickkay/GLMdenoise/) -Python: see `requirements.txt` file +Python: -Other: [FreeSurfer](http://freesurfer.net/) + - see `requirements.txt` file + - [WinawerLab's MRI_tools](https://github.com/WinawerLab/MRI_tools) + (which requires [FSL](https://fsl.fmrib.ox.ac.uk/fsl/fslwiki/)) + +Other: + + - [FreeSurfer](http://freesurfer.net/) # Overview of analysis @@ -72,6 +78,30 @@ arguments to set the paths yourself. it came off the scanner into the BIDS structure. As such, you will not need it. +# Snakemake + +A Snakefile is included with this project to enable the use +of [snakemake](http://snakemake.readthedocs.io/en/latest/).
This is +highly recommended, since it will allow you to easily rerun the +analyses exactly as I have performed them and it enables easy use on a +cluster. To +use, +[install snakemake](http://snakemake.readthedocs.io/en/latest/getting_started/installation.html) and, +if using on a cluster, set up your +appropriate +[Snakemake profile](https://github.com/Snakemake-Profiles/doc) (see +the +[snakemake docs](http://snakemake.readthedocs.io/en/latest/executable.html#profiles) for +more info on profiles). Then simply type `snakemake {target}` to +re-run the analyses. + +For example, if running on NYU's HPC cluster, set up the SLURM profile +and use the following command: `snakemake --profile slurm --jobs {n} +--cluster-config cluster.json {target}`, where `{n}` is the number of +jobs you allow `snakemake` to simultaneously submit to the cluster and +`cluster.json` is an included configuration file with some reasonable +values for the cluster (feel free to change these as needed). + Project Organization ------------ diff --git a/Snakefile b/Snakefile index 32e1796..a531ea9 100644 --- a/Snakefile +++ b/Snakefile @@ -1,6 +1,22 @@ import os # the directory that contains the data (in BIDS format) -DATA_DIR="/scratch/wfb229/spatial_frequency_preferences" +configfile: +    "config.yml" +if not os.path.isdir(config["DATA_DIR"]): +    raise Exception("Cannot find the dataset at %s" % config["DATA_DIR"]) +if os.system("module list") == 0: +    # then we're on the cluster +    shell.prefix("module purge; module load anaconda2/4.3.1; module load fsl/5.0.10; " +                 "module load freesurfer/6.0.0; module load matlab/2017a; ") + +SUBJECTS = ['sub-wlsubj001', 'sub-wlsubj042', 'sub-wlsubj045'] +SESSIONS = {'sub-wlsubj001': ['ses-pilot01'], 'sub-wlsubj042': ['ses-pilot00', 'ses-pilot01'], +            'sub-wlsubj045': ['ses-pilot01']} + +rule all: +    input: +        [os.path.join(config["DATA_DIR"], "derivatives", "preprocessed", "{subject}", "{session}").format(subject=sub, session=ses) for sub in SUBJECTS for ses in
SESSIONS[sub]] + rule stimuli: output: @@ -13,8 +29,7 @@ rule stimuli_idx: output: ["data/stimuli/{subject}_run%02d_idx.npy" % i for i in range(12)] params: - seeds_dict = {'wl_subj001': 1, 'wl_subj042': 2, 'wl_subj045': 3} - seed = lambda wildcards: SUBJECTS_SEEDS.get(wildcards.subject) + seed = lambda wildcards: {'wl_subj001': 1, 'wl_subj042': 2, 'wl_subj045': 3}.get(wildcards.subject) shell: "python sfp/stimuli.py {wildcards.subject} -i -s {params.seed}" @@ -23,12 +38,33 @@ rule stimuli_idx: # both on and off cluster, but now I'm thinking maybe I make the things that should be run on the # cluster always call `module` so they fail if run locally. that would work for me at any rate. +# this has to be run on the cluster, otherwise it will fail rule preprocess: input: - data_dir = os.path.join(DATA_DIR, "{subject}", "{session}"), - freesurfer_dir = os.path.join(DATA_DIR, "derivatives", "freesurfer"), - os.path.join(DATA_DIR, "derivatives", "freesurfer", "{subject}"), + os.path.join(config["DATA_DIR"], "derivatives", "freesurfer", "{subject}"), + data_dir = os.path.join(config["DATA_DIR"], "{subject}", "{session}"), + freesurfer_dir = os.path.join(config["DATA_DIR"], "derivatives", "freesurfer"), output: - output_dir = os.path.join(DATA_DIR, "derivatives", "preprocessed", "{subject}", "{session}"), + output_dir = os.path.join(config["DATA_DIR"], "derivatives", "preprocessed", "{subject}", "{session}"), params: - # NEXT: get this working and then re-run preprocessing using it. 
+ sbref = 1, + epis = lambda wildcards: + {('sub-wlsubj001', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9], + ('sub-wlsubj042', 'ses-pilot00'): [1, 2, 3, 4, 5, 6, 7, 8], + ('sub-wlsubj042', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + ('sub-wlsubj045', 'ses-pilot01'): [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]}.get((wildcards.subject, wildcards.session)), + distortPE = 'PA', + distortrevPE = 'AP', + plugin = "Linear", + working_dir = "/scratch/wfb229", + PEdim = 'y' + benchmark: + os.path.join(config["DATA_DIR"], "code", "preprocessed", "{subject}_{session}_benchmark.txt") + log: + os.path.join(config["DATA_DIR"], "code", "preprocessed", "{subject}_{session}.log") + shell: + "export SUBJECTS_DIR={input.freesurfer_dir};" + "python ~/MRI_tools/preprocessing/prisma_preproc.py -subject {wildcards.subject} -datadir " + "{input.data_dir} -outdir {output.output_dir} -epis {params.epis} -sbref {params.sbref} " + "-distortPE {params.distortPE} -distortrevPE {params.distortrevPE} -working_dir " + "{params.working_dir} -PEdim {params.PEdim} -plugin {params.plugin} -dir_structure bids" diff --git a/cluster.json b/cluster.json new file mode 100644 index 0000000..d59a957 --- /dev/null +++ b/cluster.json @@ -0,0 +1,16 @@ +{ + "__default__": + { + "nodes": 1, + "tasks_per_node": 1, + "cpus_per_task": 1, + "mem": "48GB", + "time": "12:00:00" + }, + + "preprocess": + { + "job_name": "preproc", + "mem": "10GB" + } +} diff --git a/config.yml b/config.yml new file mode 100644 index 0000000..2016c8e --- /dev/null +++ b/config.yml @@ -0,0 +1 @@ +DATA_DIR: "/scratch/wfb229/spatial_frequency_preferences" diff --git a/requirements.txt b/requirements.txt index 2dc6962..9993a6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ matplotlib seaborn numpy>=1.13.0 scipy +pyPyrTools