From db06a6b963caaf26ff0899e74eeab6ba93ceb1d6 Mon Sep 17 00:00:00 2001
From: ggoetznoaa <118486586+ggoetznoaa@users.noreply.github.com>
Date: Thu, 20 Jun 2024 08:56:55 -0700
Subject: [PATCH] Adding files from Trinity on Sedna

Adding the scripts and snakefile used when testing Trinity on Sedna.
---
 Sedna/Trinity/Snakefile                       | 36 +++++++++++++
 Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh    | 40 ++++++++++++++
 .../Trinity/run.fastqc_multiqc.trimmed.v1.sh  | 40 ++++++++++++++
 Sedna/Trinity/run.trimmomatic.v1.sh           | 53 +++++++++++++++++++
 Sedna/Trinity/run.trinity.all.v1.sh           | 42 +++++++++++++++
 5 files changed, 211 insertions(+)
 create mode 100644 Sedna/Trinity/Snakefile
 create mode 100644 Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh
 create mode 100644 Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh
 create mode 100644 Sedna/Trinity/run.trimmomatic.v1.sh
 create mode 100644 Sedna/Trinity/run.trinity.all.v1.sh

diff --git a/Sedna/Trinity/Snakefile b/Sedna/Trinity/Snakefile
new file mode 100644
index 0000000..635db73
--- /dev/null
+++ b/Sedna/Trinity/Snakefile
@@ -0,0 +1,36 @@
+import glob
+import re
+# Sort both lists: glob order is arbitrary and the i-th R1 file must be the mate of the i-th R2 file
+files_R1 = sorted(glob.glob('raw/*.R1.fq.gz'))
+files_R2 = sorted(glob.glob('raw/*.R2.fq.gz'))
+files_R1_str = ','.join(files_R1)
+files_R2_str = ','.join(files_R2)
+# Trimmomatic settings handed to Trinity through --quality_trimming_params
+adapter_file = "/share/nwfsc/ggoetz/202310-terra_gcp-trinity/TruSeq3-PE-2.fa"
+trim_string = ("ILLUMINACLIP:" + adapter_file
+    + ":2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:50")
+# Trim the raw reads (--trimmomatic) and assemble all samples together in a single Trinity run
+rule trinity:
+    conda:
+        "trinity-2.15.1"
+    output:
+        "trinity/trinity.all.Trinity.fasta",
+        "trinity/trinity.all.Trinity.fasta.gene_trans_map"
+    threads: 20
+    log:
+        "logs/trinity.log"
+    benchmark:
+        "benchmarks/trinity.txt"
+    shell:
+        """
+        Trinity \
+            --seqType fq \
+            --trimmomatic \
+            --quality_trimming_params "{trim_string}" \
+            --CPU {threads} \
+            --output trinity/trinity.all \
+            --max_memory 500G \
+            --left {files_R1_str} \
+            --right {files_R2_str} \
+            &> {log}
+        """
diff --git a/Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh b/Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh
new file mode 100644
index 0000000..a50a4df
--- /dev/null
+++ b/Sedna/Trinity/run.fastqc_multiqc.raw.v1.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#SBATCH --job-name=FASTQC
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH --mail-user=giles.goetz@noaa.gov
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with the scheduler (batch jobs do not load ~/.bashrc on their own)
+source ~/.bashrc
+
+# Load the FastQC environment
+mamba activate fastqc-0.12.1
+
+# Setup base variables: raw reads in, FastQC/MultiQC reports out
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/raw
+OUT=${BASE}/fastqc/raw
+
+# Create output folder if it doesn't exist
+if [ ! -d "${OUT}" ]; then
+    mkdir -p "${OUT}"
+fi
+
+cd "${BASE}" || exit 1
+
+# Running FastQC on every raw file, one thread per CPU granted by SLURM (20 if unset)
+fastqc \
+    -t "${SLURM_CPUS_PER_TASK:-20}" \
+    -o "${OUT}" \
+    "${IN}"/*.gz \
+    &> "${OUT}/fastqc.log"
+
+# Switch to MultiQC environment
+mamba activate multiqc-1.17
+
+cd "${OUT}" || exit 1
+
+# Run MultiQC over the FastQC reports in the current folder
+multiqc .
diff --git a/Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh b/Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh
new file mode 100644
index 0000000..ed7a4d9
--- /dev/null
+++ b/Sedna/Trinity/run.fastqc_multiqc.trimmed.v1.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+#SBATCH --job-name=FASTQCTrimmed
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH --mail-user=giles.goetz@noaa.gov
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with the scheduler (batch jobs do not load ~/.bashrc on their own)
+source ~/.bashrc
+
+# Load the FastQC environment
+mamba activate fastqc-0.12.1
+
+# Setup base variables: trimmed reads in, FastQC/MultiQC reports out
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/trimmed
+OUT=${BASE}/fastqc/trimmed
+
+# Create output folder if it doesn't exist
+if [ ! -d "${OUT}" ]; then
+    mkdir -p "${OUT}"
+fi
+
+cd "${BASE}" || exit 1
+
+# Running FastQC on the paired trimmed reads only, one thread per CPU granted by SLURM (20 if unset)
+fastqc \
+    -t "${SLURM_CPUS_PER_TASK:-20}" \
+    -o "${OUT}" \
+    "${IN}"/*.trimmed.paired.*.fq.gz \
+    &> "${OUT}/fastqc.log"
+
+# Switch to MultiQC environment
+mamba activate multiqc-1.17
+
+cd "${OUT}" || exit 1
+
+# Run MultiQC over the FastQC reports in the current folder
+multiqc .
diff --git a/Sedna/Trinity/run.trimmomatic.v1.sh b/Sedna/Trinity/run.trimmomatic.v1.sh
new file mode 100644
index 0000000..f6ac45d
--- /dev/null
+++ b/Sedna/Trinity/run.trimmomatic.v1.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+#SBATCH --job-name=TrimmomaticNose
+#SBATCH -c 20
+#SBATCH -t 0
+#SBATCH --mail-user=giles.goetz@noaa.gov
+#SBATCH --mail-type=ALL
+
+# Fix for using conda with the scheduler (batch jobs do not load ~/.bashrc on their own)
+source ~/.bashrc
+
+# Load Trimmomatic environment
+mamba activate trimmomatic-0.39
+
+# Setup: raw reads in, trimmed reads and per-sample logs out
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/raw
+OUT=${BASE}/trimmed
+
+# Trimmomatic Adapter Folder
+ADAPTER_FOLDER=/opt/bioinformatics/mambaforge/envs/trimmomatic-0.39/share/trimmomatic/adapters
+
+# Create output folder if it doesn't exist
+if [ ! -d "${OUT}" ]; then
+    mkdir -p "${OUT}"
+fi
+
+cd "${BASE}" || exit 1
+
+# Sample names: strip the folder and the exact ".R1.fq.gz" suffix so names containing dots survive
+SAMPLES=$(ls "${IN}"/*.R1.fq.gz | \
+    awk -F "/" '{print $NF}' | \
+    sed -e 's/\.R1\.fq\.gz$//')
+
+for SAMPLE in ${SAMPLES}
+do
+    # Run Trimmomatic on one read pair; outputs are paired/unpaired files for R1 and R2
+    trimmomatic \
+        PE \
+        -threads "${SLURM_CPUS_PER_TASK:-20}" \
+        "${IN}/${SAMPLE}.R1.fq.gz" \
+        "${IN}/${SAMPLE}.R2.fq.gz" \
+        "${OUT}/${SAMPLE}.trimmed.paired.R1.fq.gz" \
+        "${OUT}/${SAMPLE}.trimmed.unpaired.R1.fq.gz" \
+        "${OUT}/${SAMPLE}.trimmed.paired.R2.fq.gz" \
+        "${OUT}/${SAMPLE}.trimmed.unpaired.R2.fq.gz" \
+        "ILLUMINACLIP:${ADAPTER_FOLDER}/TruSeq3-PE-2.fa:2:30:10" \
+        LEADING:3 \
+        TRAILING:3 \
+        SLIDINGWINDOW:4:20 \
+        MINLEN:50 \
+        &> "${OUT}/${SAMPLE}.trim.log"
+done
diff --git a/Sedna/Trinity/run.trinity.all.v1.sh b/Sedna/Trinity/run.trinity.all.v1.sh
new file mode 100644
index 0000000..5650785
--- /dev/null
+++ b/Sedna/Trinity/run.trinity.all.v1.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+#SBATCH --job-name=Trinity
+#SBATCH -c 24
+#SBATCH -t 0
+#SBATCH --mail-user=giles.goetz@noaa.gov
+#SBATCH --mail-type=ALL
+
+# Using a himem node since we probably need more than 200 GB
+#SBATCH -p himem
+
+# so we can use conda with the scheduler
+source ~/.bashrc
+
+# load the trinity environment
+mamba activate trinity-2.15.1
+
+VER=1
+
+# Setup: trimmed reads in, versioned Trinity output folder out
+BASE=/share/nwfsc/ggoetz/202310-terra_gcp-trinity
+IN=${BASE}/trimmed
+OUT=${BASE}/trinity/trinity.all.v${VER}
+
+if [ ! -d "${OUT}" ]; then
+    mkdir -p "${OUT}"
+fi
+
+# Comma-separated lists of the paired trimmed files; ls sorts, so R1 and R2 stay in the same order
+FILES_R1=$(ls "${IN}"/*.paired.R1.*.gz | tr '\n' ',' | sed -e 's/,$//')
+FILES_R2=$(ls "${IN}"/*.paired.R2.*.gz | tr '\n' ',' | sed -e 's/,$//')
+
+# Running Trinity, setting a really high memory limit since
+# we don't know how much it is going to use. CPU count follows the SLURM allocation (24 if unset).
+Trinity \
+    --seqType fq \
+    --CPU "${SLURM_CPUS_PER_TASK:-24}" \
+    --output "${OUT}" \
+    --max_memory 1200G \
+    --left "${FILES_R1}" \
+    --right "${FILES_R2}" \
+    &> "${OUT}/trinity.log"