Adding files from Trinity on Sedna

Adding the scripts and snakefile used when testing Trinity on Sedna.
noaa-nwfsc · Jun 20, 2024 · db06a6b · db06a6b
1 parent 73be658
commit db06a6b
Show file tree

Hide file tree

Showing 5 changed files with 211 additions and 0 deletions.
diff --git a/Sedna/Trinity/Snakefile b/Sedna/Trinity/Snakefile
@@ -0,0 +1,36 @@
+import glob
+import re
+
+files_R1 = glob.glob('raw/*.R1.fq.gz')
+files_R2 = glob.glob('raw/*.R2.fq.gz')
+files_R1_str = ','.join(files_R1)
+files_R2_str = ','.join(files_R2)
+
+adapter_file = "/share/nwfsc/ggoetz/202310-terra_gcp-trinity/TruSeq3-PE-2.fa"
+trim_string = ("ILLUMINACLIP:" + adapter_file +
+    ":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50")
+
+# Assemble all raw read pairs with Trinity. Trinity is asked to run
+# Trimmomatic itself (--trimmomatic) using the settings in trim_string.
+rule trinity:
+    # Declare the reads as inputs so Snakemake tracks them as dependencies
+    # of the assembly. The shell command still uses the comma-separated
+    # strings because {input} would be expanded space-separated.
+    input:
+        left=files_R1,
+        right=files_R2
+    conda:
+        "trinity-2.15.1"
+    output:
+        "trinity/trinity.all.Trinity.fasta",
+        "trinity/trinity.all.Trinity.fasta.gene_trans_map"
+    threads: 20
+    log:
+        "logs/trinity.log"
+    benchmark:
+        "benchmarks/trinity.txt"
+    shell:
+        """
+        Trinity \
+            --seqType fq \
+            --trimmomatic \
+            --quality_trimming_params "{trim_string}" \
+            --CPU {threads} \
+            --output trinity/trinity.all \
+            --max_memory 500G \
+            --left {files_R1_str} \
+            --right {files_R2_str} \
+            &> {log}
+        """
diff --git a/Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh b/Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#SBATCH --job-name=FASTQC
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with 
+source ~/.bashrc
+
+# Load the FastQC environment
+mamba activate fastqc-0.12.1
+
+# Setup base variables
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/raw
+OUT=${BASE}/fastqc/raw
+
+# Create output folder if it doesn't exist
+if [ ! -d ${OUT} ]; then
+    mkdir -p ${OUT}
+fi
+
+cd ${BASE}
+
+# Running FastQC
+fastqc \
+    -t 20 \
+    -o ${OUT} \
+    ${IN}/*.gz \
+    &> ${OUT}/fastqc.log
+
+# Switch to MultiQC environment
+mamba activate multiqc-1.17
+
+cd ${OUT}
+
+# Run MultiQC
+multiqc .
diff --git a/Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh b/Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#SBATCH --job-name=FASTQCTrimmed
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with 
+source ~/.bashrc
+
+# Load the FastQC environment
+mamba activate fastqc-0.12.1
+
+# Setup base variables
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/trimmed
+OUT=${BASE}/fastqc/trimmed
+
+# Create output folder if it doesn't exist
+if [ ! -d ${OUT} ]; then
+    mkdir -p ${OUT}
+fi
+
+cd ${BASE}
+
+# Running FastQC
+fastqc \
+    -t 20 \
+    -o ${OUT} \
+    ${IN}/*.trimmed.paired.*.fq.gz \
+    &> ${OUT}/fastqc.log
+
+# Switch to MultiQC environment
+mamba activate multiqc-1.17
+
+cd ${OUT}
+
+# Run MultiQC
+multiqc .
diff --git a/Sedna/Trinity/run.trimmomatic.v1.sh b/Sedna/Trinity/run.trimmomatic.v1.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+#SBATCH --job-name=TrimmomaticNose
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with 
+source ~/.bashrc
+
+# Load Trimmomatic environment
+mamba activate trimmomatic-0.39
+
+# Setup
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/raw
+OUT=${BASE}/trimmed
+
+# Trimmomatic Adapter Folder
+ADAPTER_FOLDER=/opt/bioinformatics/mambaforge/envs/trimmomatic-0.39/share/trimmomatic/adapters
+
+# Create output folder if it doesn't exist
+if [ ! -d ${OUT} ]; then
+    mkdir -p ${OUT}
+fi
+
+cd ${BASE}
+
+# Loop through the samples
+SAMPLES=$(ls ${IN}/*R1*.gz | \
+    awk -F "/" '{print $NF}' | \
+    awk -F "." '{print $1}')
+
+for SAMPLE in ${SAMPLES}
+do
+    # Run Trimmomatic
+    trimmomatic \
+        PE \
+        -threads 20 \
+        ${IN}/${SAMPLE}.R1.fq.gz \
+        ${IN}/${SAMPLE}.R2.fq.gz \
+        ${OUT}/${SAMPLE}.trimmed.paired.R1.fq.gz \
+        ${OUT}/${SAMPLE}.trimmed.unpaired.R1.fq.gz \
+        ${OUT}/${SAMPLE}.trimmed.paired.R2.fq.gz \
+        ${OUT}/${SAMPLE}.trimmed.unpaired.R2.fq.gz \
+        ILLUMINACLIP:${ADAPTER_FOLDER}/TruSeq3-PE-2.fa:2:30:10 \
+        LEADING:3 \
+        TRAILING:3 \
+        SLIDINGWINDOW:4:20 \
+        MINLEN:50 \
+        &> ${OUT}/${SAMPLE}.trim.log
+done
diff --git a/Sedna/Trinity/run.trinity.all.v1.sh b/Sedna/Trinity/run.trinity.all.v1.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+#SBATCH --job-name=Trinity
+#SBATCH -c 24
+#SBATCH -t 0
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+
+# Using a himem node since we probably need more than 200 GB
+#SBATCH -p himem
+
+# so we can use conda with the scheduler
+source ~/.bashrc
+
+# load the trinity environment
+mamba activate trinity-2.15.1
+
+VER=1
+
+# Setup
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/trimmed
+OUT=${BASE}/trinity/trinity.all.v${VER}
+
+if [ ! -d ${OUT} ]; then
+    mkdir -p ${OUT}
+fi
+
+# Get a list of the files, only using the paired trimmed ones
+FILES_R1=$(ls ${IN}/*.paired.R1.*.gz | tr '\n' ',' | sed -e 's/,$//')
+FILES_R2=$(ls ${IN}/*.paired.R2.*.gz | tr '\n' ',' | sed -e 's/,$//')
+
+# Running Trinity, setting a really high memory limit since
+# we don't know how much it is going to use.
+Trinity \
+    --seqType fq \
+    --CPU 24 \
+    --output ${OUT} \
+    --max_memory 1200G \
+    --left ${FILES_R1} \
+    --right ${FILES_R2} \
+    &> ${OUT}/trinity.log