Skip to content

Commit

Permalink
Adding files from Trinity on Sedna
Browse files Browse the repository at this point in the history
Adding the scripts and snakefile used when testing Trinity on Sedna.
  • Loading branch information
ggoetznoaa authored Jun 20, 2024
1 parent 73be658 commit db06a6b
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 0 deletions.
36 changes: 36 additions & 0 deletions Sedna/Trinity/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import glob
import re

# Collect the paired-end read files that are handed to Trinity.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so both
# lists are sorted: the i-th R1 file must line up with the i-th R2 file in the
# comma-separated lists passed to Trinity's --left / --right options.
files_R1 = sorted(glob.glob('raw/*.R1.fq.gz'))
files_R2 = sorted(glob.glob('raw/*.R2.fq.gz'))
files_R1_str = ','.join(files_R1)
files_R2_str = ','.join(files_R2)

# Trimmomatic settings forwarded to Trinity through --quality_trimming_params.
adapter_file = "/share/nwfsc/ggoetz/202310-terra_gcp-trinity/TruSeq3-PE-2.fa"
trim_string = (
    "ILLUMINACLIP:" + adapter_file
    + ":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50"
)

# Assemble all paired-end reads found in raw/ with Trinity.
# Trinity is invoked with --trimmomatic, so it performs the quality trimming
# itself using the settings in trim_string.
rule trinity:
    # Declaring the reads as inputs lets Snakemake verify that they exist and
    # re-run the assembly when they change.
    input:
        left=files_R1,
        right=files_R2
    output:
        "trinity/trinity.all.Trinity.fasta",
        "trinity/trinity.all.Trinity.fasta.gene_trans_map"
    # Trinity expects comma-separated file lists, so the pre-joined strings are
    # passed as params instead of the space-joined {input}.
    params:
        left=files_R1_str,
        right=files_R2_str,
        trim=trim_string
    conda:
        "trinity-2.15.1"
    threads: 20
    log:
        "logs/trinity.log"
    benchmark:
        "benchmarks/trinity.txt"
    shell:
        """
        Trinity \
            --seqType fq \
            --trimmomatic \
            --quality_trimming_params "{params.trim}" \
            --CPU {threads} \
            --output trinity/trinity.all \
            --max_memory 500G \
            --left {params.left} \
            --right {params.right} \
            &> {log}
        """
40 changes: 40 additions & 0 deletions Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Slurm batch job: run FastQC on every gzipped file in ${BASE}/raw, then
# summarise the FastQC reports with MultiQC inside ${BASE}/fastqc/raw.

#SBATCH --job-name=FASTQC
#SBATCH -c 20
#SBATCH -t 0
# NOTE(review): the next directive looks mangled (presumably it was a
# --mail-user=<address> option) -- confirm before submitting.
#SBATCH [email protected]
#SBATCH --mail-type=ALL

# Fix for using conda with the scheduler: load the shell initialisation so
# that `mamba activate` works inside the batch job.
source ~/.bashrc

# Load the FastQC environment
mamba activate fastqc-0.12.1

# Setup base variables
BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
IN="${BASE}/raw"
OUT="${BASE}/fastqc/raw"

# Use the CPU count granted by Slurm (-c above) so the two cannot drift apart;
# fall back to 20 when the script is run outside of Slurm.
THREADS="${SLURM_CPUS_PER_TASK:-20}"

# Create output folder if it doesn't exist (mkdir -p is a no-op otherwise)
mkdir -p "${OUT}" || exit 1

cd "${BASE}" || exit 1

# Running FastQC. Stop on failure so that MultiQC is not run on an incomplete
# set of reports and the job is reported as failed.
if ! fastqc \
    -t "${THREADS}" \
    -o "${OUT}" \
    "${IN}"/*.gz \
    &> "${OUT}/fastqc.log"
then
    echo "FastQC failed; see ${OUT}/fastqc.log" >&2
    exit 1
fi

# Switch to MultiQC environment
mamba activate multiqc-1.17

cd "${OUT}" || exit 1

# Run MultiQC
multiqc .
40 changes: 40 additions & 0 deletions Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# Slurm batch job: run FastQC on the paired, trimmed reads in ${BASE}/trimmed,
# then summarise the FastQC reports with MultiQC inside ${BASE}/fastqc/trimmed.

#SBATCH --job-name=FASTQCTrimmed
#SBATCH -c 20
#SBATCH -t 0
# NOTE(review): the next directive looks mangled (presumably it was a
# --mail-user=<address> option) -- confirm before submitting.
#SBATCH [email protected]
#SBATCH --mail-type=ALL

# Fix for using conda with the scheduler: load the shell initialisation so
# that `mamba activate` works inside the batch job.
source ~/.bashrc

# Load the FastQC environment
mamba activate fastqc-0.12.1

# Setup base variables
BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
IN="${BASE}/trimmed"
OUT="${BASE}/fastqc/trimmed"

# Use the CPU count granted by Slurm (-c above) so the two cannot drift apart;
# fall back to 20 when the script is run outside of Slurm.
THREADS="${SLURM_CPUS_PER_TASK:-20}"

# Create output folder if it doesn't exist (mkdir -p is a no-op otherwise)
mkdir -p "${OUT}" || exit 1

cd "${BASE}" || exit 1

# Running FastQC on the paired reads only (unpaired files are skipped by the
# pattern). Stop on failure so that MultiQC is not run on an incomplete set
# of reports and the job is reported as failed.
if ! fastqc \
    -t "${THREADS}" \
    -o "${OUT}" \
    "${IN}"/*.trimmed.paired.*.fq.gz \
    &> "${OUT}/fastqc.log"
then
    echo "FastQC failed; see ${OUT}/fastqc.log" >&2
    exit 1
fi

# Switch to MultiQC environment
mamba activate multiqc-1.17

cd "${OUT}" || exit 1

# Run MultiQC
multiqc .
53 changes: 53 additions & 0 deletions Sedna/Trinity/run.trimmomatic.v1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash
#
# Slurm batch job: run Trimmomatic in paired-end mode on every
# <sample>.R1.fq.gz / <sample>.R2.fq.gz pair in ${BASE}/raw and write the
# paired/unpaired trimmed reads plus one log per sample to ${BASE}/trimmed.

#SBATCH --job-name=TrimmomaticNose
#SBATCH -c 20
#SBATCH -t 0
# NOTE(review): the next directive looks mangled (presumably it was a
# --mail-user=<address> option) -- confirm before submitting.
#SBATCH [email protected]
#SBATCH --mail-type=ALL

# Fix for using conda with the scheduler: load the shell initialisation so
# that `mamba activate` works inside the batch job.
source ~/.bashrc

# Load Trimmomatic environment
mamba activate trimmomatic-0.39

# Setup
BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
IN="${BASE}/raw"
OUT="${BASE}/trimmed"

# Trimmomatic Adapter Folder
ADAPTER_FOLDER=/opt/bioinformatics/mambaforge/envs/trimmomatic-0.39/share/trimmomatic/adapters

# Use the CPU count granted by Slurm (-c above) so the two cannot drift apart;
# fall back to 20 when the script is run outside of Slurm.
THREADS="${SLURM_CPUS_PER_TASK:-20}"

# Create output folder if it doesn't exist (mkdir -p is a no-op otherwise)
mkdir -p "${OUT}" || exit 1

cd "${BASE}" || exit 1

# Collect the forward-read files with a glob instead of parsing `ls`.
# nullglob makes the array empty (rather than holding the literal pattern)
# when nothing matches.
shopt -s nullglob
R1_FILES=("${IN}"/*.R1.fq.gz)
shopt -u nullglob

if [ "${#R1_FILES[@]}" -eq 0 ]; then
    echo "No *.R1.fq.gz files found in ${IN}" >&2
    exit 1
fi

# Remember whether any sample failed so the job's exit status reflects it,
# while still processing the remaining samples.
STATUS=0

# Loop through the samples
for R1 in "${R1_FILES[@]}"
do
    # Strip only the known suffix so sample IDs that contain dots stay intact.
    SAMPLE=$(basename "${R1}" .R1.fq.gz)
    R2="${IN}/${SAMPLE}.R2.fq.gz"

    if [ ! -f "${R2}" ]; then
        echo "Missing mate file ${R2}; skipping ${SAMPLE}" >&2
        STATUS=1
        continue
    fi

    # Run Trimmomatic
    if ! trimmomatic \
        PE \
        -threads "${THREADS}" \
        "${R1}" \
        "${R2}" \
        "${OUT}/${SAMPLE}.trimmed.paired.R1.fq.gz" \
        "${OUT}/${SAMPLE}.trimmed.unpaired.R1.fq.gz" \
        "${OUT}/${SAMPLE}.trimmed.paired.R2.fq.gz" \
        "${OUT}/${SAMPLE}.trimmed.unpaired.R2.fq.gz" \
        "ILLUMINACLIP:${ADAPTER_FOLDER}/TruSeq3-PE-2.fa:2:30:10" \
        LEADING:3 \
        TRAILING:3 \
        SLIDINGWINDOW:4:20 \
        MINLEN:50 \
        &> "${OUT}/${SAMPLE}.trim.log"
    then
        echo "Trimmomatic failed for ${SAMPLE}; see ${OUT}/${SAMPLE}.trim.log" >&2
        STATUS=1
    fi
done

exit "${STATUS}"
42 changes: 42 additions & 0 deletions Sedna/Trinity/run.trinity.all.v1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Slurm batch job: run a single Trinity assembly over all paired, trimmed
# reads in ${BASE}/trimmed, writing to ${BASE}/trinity/trinity.all.v${VER}.

#SBATCH --job-name=Trinity
#SBATCH -c 24
#SBATCH -t 0
# NOTE(review): the next directive looks mangled (presumably it was a
# --mail-user=<address> option) -- confirm before submitting.
#SBATCH [email protected]
#SBATCH --mail-type=ALL

# Using a himem node since we probably need more than 200 GB
#SBATCH -p himem

# so we can use conda with the scheduler
source ~/.bashrc

# load the trinity environment
mamba activate trinity-2.15.1

VER=1

# Setup
BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
IN="${BASE}/trimmed"
OUT="${BASE}/trinity/trinity.all.v${VER}"

# Use the CPU count granted by Slurm (-c above) so the two cannot drift apart;
# fall back to 24 when the script is run outside of Slurm.
THREADS="${SLURM_CPUS_PER_TASK:-24}"

# Create output folder if it doesn't exist (it also holds trinity.log)
mkdir -p "${OUT}" || exit 1

# Get a list of the files, only using the paired trimmed ones.
# Globs expand in sorted order, so R1 and R2 entries line up by sample.
# nullglob makes the arrays empty when nothing matches.
shopt -s nullglob
R1_FILES=("${IN}"/*.paired.R1.*.gz)
R2_FILES=("${IN}"/*.paired.R2.*.gz)
shopt -u nullglob

# Fail fast instead of starting the assembly with missing or unbalanced input.
if [ "${#R1_FILES[@]}" -eq 0 ] || [ "${#R1_FILES[@]}" -ne "${#R2_FILES[@]}" ]; then
    echo "Expected matching paired R1/R2 files in ${IN}," \
         "found ${#R1_FILES[@]} R1 and ${#R2_FILES[@]} R2" >&2
    exit 1
fi

# Trinity takes comma-separated lists; join the arrays in a subshell so the
# IFS change does not leak into the rest of the script.
FILES_R1=$(IFS=,; echo "${R1_FILES[*]}")
FILES_R2=$(IFS=,; echo "${R2_FILES[*]}")

# Running Trinity, setting a really high memory limit since
# we don't know how much it is going to use.
Trinity \
    --seqType fq \
    --CPU "${THREADS}" \
    --output "${OUT}" \
    --max_memory 1200G \
    --left "${FILES_R1}" \
    --right "${FILES_R2}" \
    &> "${OUT}/trinity.log"

0 comments on commit db06a6b

Please sign in to comment.