-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutwaste-pipeline-master.sh
241 lines (208 loc) · 11.2 KB
/
utwaste-pipeline-master.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#! /bin/sh
#SBATCH --partition highmem
#SBATCH --mem 100G
#SBATCH --cpus-per-task 12
#SBATCH --output pipeline.out
#SBATCH --error pipeline.err
# options above specified for submitting jobs to a SLURM job manager
# user should define these paths:
# 1. qc-trimmed and -filtered reads
QCREADS_PATH="qc_reads"
# 2. fam.tab, included in database downloaded with amrfinder -u
# Symlink the NCBIfam family table into the working directory for the
# fam.tab.cleanup.py step later in the pipeline.
# NOTE(review): -i makes ln interactive; under sbatch there is no tty to
# answer the prompt if fam.tab already exists here -- confirm this is
# intended for batch runs.
ln -svi /srv/databases/proteins/amrfinder/data/2022-12-19.1/fam.tab
# DB is a label embedded in output filenames (reassigned later for the
# mobileOG and VFDB searches); DBVERSION selects the amrfinder database
# directory and should match the hard-coded path in the ln command above.
DB="NCBIfam-2022-12-19.1"
DBVERSION="2022-12-19.1"
# 3. gff file generated by prodigal for each assembly
GFF_PATH="gff_files"
# 4. amrfinder results generated for each assembly
AMR_PATH="amrfinder_results"
# 5. dereplicated bam files and their index files representing reads mapped back to their assembly
BAM_PATH="bam_files"
# 6. scripts required for the workflow
SCRIPTS_PATH="scripts"
# qc reads with bbduk and Brazelton lab seq-qc https://github.com/Brazelton-Lab/seq-qc
# Each sample is a read pair: ${sample}_R1_001.fastq.gz / ${sample}_R2_001.fastq.gz
for FORWARD in *_R1_001.fastq.gz
do
  # BUGFIX: strip the suffix that actually matches the glob. The original
  # stripped ".forward.fastq.gz", which never matched, so $sample kept the
  # whole filename and every derived path below was wrong.
  sample=$(basename "$FORWARD" _R1_001.fastq.gz)
  # adapter trimming; writes interleaved output
  # BUGFIX: in2 must be the reverse reads -- the original passed the R1 file
  # for both in1 and in2.
  bbduk.sh -Xmx10g ziplevel=9 threads=16 qin=33 ref=/srv/databases/contaminants/truseq_adapters.fa in1=${sample}_R1_001.fastq.gz in2=${sample}_R2_001.fastq.gz out=${sample}.interleaved.atrim.fq.gz stats=${sample}.adapter_stats.txt ftm=5 ktrim=r k=23 mink=9 rcomp=t hdist=2 tbo tpe minlength=0 2>${sample}.adapters.log
  # PhiX decontamination; matching reads are diverted to outm
  bbduk.sh -Xmx10g threads=16 qin=33 interleaved=t ref=/srv/databases/contaminants/phix174.fa.gz in=${sample}.interleaved.atrim.fq.gz out=${sample}.interleaved.atrim.decontam.fq.gz outm=${sample}.phix.fq.gz k=31 hdist=1 mcf=0.9 stats=${sample}.phix_stats.txt 2>${sample}.phix.log
  # quality trimming: sliding window of 8 bases at mean Q28, min length 100
  qscore=28
  window=8
  qtrim --threads 12 --interleaved --qual-offset 33 --min-len 100 --sliding-window ${window}:${qscore} -o ${sample}.interleaved.atrim.decontam.qtrim.fq.gz -s ${sample}.singles.atrim.decontam.qtrim.fq.gz ${sample}.interleaved.atrim.decontam.fq.gz 2>${sample}.qtrim.log
  mv ${sample}.interleaved.atrim.decontam.qtrim.fq.gz "$QCREADS_PATH"/
  mv ${sample}.singles.atrim.decontam.qtrim.fq.gz "$QCREADS_PATH"/
done
# delete intermediate files
rm *.atrim.fq.gz
rm *.atrim.decontam.fq.gz
# assembly with Megahit: one assembly per qc'd interleaved read file
for READS12 in "$QCREADS_PATH"/*.interleaved.atrim.decontam.qtrim.fq.gz; do
  ASS=$(basename "$READS12" .interleaved.atrim.decontam.qtrim.fq.gz)
  echo "writing assembly to" "$ASS"_megahit
  # no singles
  megahit -t 12 -m 0.9 --12 "$READS12" --out-prefix "$ASS" --out-dir "$ASS"_megahit --min-contig-len 400 --k-min 27 --k-max 151 --k-step 4
done
# BUGFIX: this loop was never closed in the original; without the 'done'
# above, bash parses the rest of the script inside the loop and aborts with
# "unexpected end of file".
# prodigal and cleanup
mkdir -p "$GFF_PATH"   # -p: do not fail when re-running in an existing workspace
for ASS in *_megahit/*.contigs.fa; do
  BASE=$(basename "$ASS" .contigs.fa)
  # give contigs simple, unique names; keep a map back to the original names
  anvi-script-reformat-fasta "$ASS" -o "$BASE".renamed.fa -l 0 --simplify-names --report-file "$ASS".contig_names_map.tsv
  # gene calling in metagenome mode: gff coordinates, proteins (.faa), genes (.ffn)
  prodigal -q -p meta -f gff -i "$BASE".renamed.fa -o "$BASE".gff -a "$BASE".faa -d "$BASE".ffn
  # keep only the ID part of each fasta header and strip '-', '*', '_', '.'
  # characters from the protein sequences themselves
  awk '/^>/{split($0, a, " "); print a[1]; next} {gsub(/[-\*_\.]/,""); print}' "$BASE".faa > "$BASE".faa.clean
  sed -i 's/\x0//g' "$BASE".faa.clean   # drop stray NUL bytes
  mv "$BASE".faa.clean "$BASE".faa
  # prefix each gene ID in gff column 9 with its contig name (column 1) so
  # gene IDs are unique across contigs
  awk -F'\t' '/^#/{print $0; next} {sub(/ID\=[^_]*_/, "ID="$1"_", $9); print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9}' "$BASE".gff > "$BASE".gff.clean
  mv "$BASE".gff.clean "$BASE".gff
  mv "$BASE".faa "$GFF_PATH"/"$BASE".faa
  mv "$BASE".gff "$GFF_PATH"/"$BASE".gff
  mv "$BASE".ffn "$GFF_PATH"/"$BASE".ffn
done
# amrfinder: screen each assembly's proteins against the NCBIfam AMR database
mkdir -p "$AMR_PATH"   # -p: tolerate re-runs
for ASS in "$GFF_PATH"/*.faa; do
  BASE=$(basename "$ASS" .faa)
  amrfinder --protein "$ASS" --database /srv/databases/proteins/amrfinder/data/"$DBVERSION"/ -o "$AMR_PATH"/"$BASE".amrfinder.tsv --threads 12 > "$AMR_PATH"/amrfinder-log-"$BASE".txt
done
# cleanup fam.tab with python pandas
# (fam.tab is the file symlinked into the working directory above)
python "$SCRIPTS_PATH"/fam.tab.cleanup.py
# output is fam.tsv
# generate the json file with Brazelton lab reldb https://github.com/Brazelton-Lab/seq-annot/blob/master/seq_annot/reldb.py
# Builds a relational-database JSON of the listed NCBIfam fields, prefixing
# entries with 'Dbxref:NCBIfam:' and tagging each with the $DB label.
# NOTE(review): --filter 'Dbxref:-' presumably drops rows whose Dbxref is
# '-' -- confirm against the reldb documentation.
reldb --in-csv --filter 'Dbxref:-' --fields node_id,parent_node_id,gene_symbol,hmm_id,type,subtype,class,subclass,family_name --prepend 'Dbxref:NCBIfam:' --extend "database:'$DB'" --out AMR-"$DB".json fam.tsv
# Remove entries with "NA" for HMM id (column 17) from all .tsv files
# (comment fixed: these are tab-separated .amrfinder.tsv files, not .csv)
# This removes "PARTIALP" hits from downstream analyses
for ASS in "$AMR_PATH"/*.amrfinder.tsv; do
  # quote expansions (SC2086) and rewrite each table in place via a temp file
  awk -F '\t' '$17 != "NA"' "$ASS" > "$ASS".noNA
  mv "$ASS".noNA "$ASS"
done
# convert amrfinder results to blast-formatted file and run annotate_features
# amr2b6.py https://github.com/Brazelton-Lab/lab_scripts/blob/master/amr2b6.py
# annotate_features https://github.com/Brazelton-Lab/seq-annot
for AMR in "$AMR_PATH"/*.amrfinder.tsv; do
# reformat AMRFinder hits as BLAST tabular (b6), using the AMR-$DB.json
# relational DB built above for the Dbxref lookups
amr2b6.py --split -f Dbxref --mapping AMR-"$DB".json --out "$AMR".b6 $AMR
ASS=$(basename $AMR .amrfinder.tsv)
# transfer the b6 hits onto this assembly's prodigal gff, attaching the listed
# AMR fields to each CDS; --conflict quality presumably resolves multiple hits
# per CDS by hit quality -- see seq-annot docs
annotate_features --clear --format b6 --type CDS --conflict quality --fields "gene_symbol,hmm_id,node_id,parent_node_id,class,subclass,type,subtype" --mapping AMR-"$DB".json --out "$GFF_PATH"/"$ASS".amrfinder.gff --hits "$AMR_PATH"/"$ASS".amrfinder.tsv.b6 "$GFF_PATH"/"$ASS".gff
done
# map reads back to their assembly
# FIX: use the configured $BAM_PATH (the original hard-coded 'bam_files');
# -p tolerates re-runs
mkdir -p "$BAM_PATH"
for FASTA in *.renamed.fa
do
  ASS=$(basename "$FASTA" .renamed.fa)
  bowtie2-build "$ASS".renamed.fa "$ASS"
  # locate this sample's qc'd interleaved reads
  # NOTE(review): assumes exactly one match; multiple matches would yield a
  # newline-joined path and break bowtie2 -- confirm sample names are unique
  READS=$(find ./ -name "$ASS*interleaved.atrim.decontam.qtrim.fq.gz" -print)
  bowtie2 --threads 12 -x "$ASS" --interleaved "$READS" --no-unal -S "$ASS"_mapped_to_"$ASS".sam 2>> "$ASS"_mapped_to_"$ASS".mapping.log
  # drop unmapped reads (-F 4) and convert to bam
  samtools view -F 4 -bS "$ASS"_mapped_to_"$ASS".sam > "$ASS"_mapped_to_"$ASS"_RAW.bam
  anvi-init-bam "$ASS"_mapped_to_"$ASS"_RAW.bam -o "$ASS"_mapped_to_"$ASS".bam
  rm "$ASS"_mapped_to_"$ASS".sam "$ASS"_mapped_to_"$ASS"_RAW.bam
  # FIX: move only this sample's bam (and its index); the original
  # 'mv *.bam*' would also sweep up any unrelated bam files in the cwd
  mv "$ASS"_mapped_to_"$ASS".bam* "$BAM_PATH"/
done
# derep: remove duplicate reads from each mapping, then index the result
for MAPPING in "$BAM_PATH"/*.bam; do
  NAME=$(basename "$MAPPING" .bam)
  DEREP="$BAM_PATH"/"$NAME".derep.bam
  # REMOVE_DUPLICATES=true drops duplicates instead of just flagging them;
  # JDK (de)compression avoids native-library issues on the cluster
  picard -Xmx80g MarkDuplicates COMPRESSION_LEVEL=9 USE_JDK_INFLATER=true USE_JDK_DEFLATER=true REMOVE_DUPLICATES=true INPUT="$BAM_PATH"/"$NAME".bam OUTPUT="$DEREP" METRICS_FILE="$BAM_PATH"/"$NAME".replicates.txt 2> "$BAM_PATH"/"$NAME".derep.log
  samtools index "$DEREP"
done
# count_features with gene_symbol
# specific for self-mappings: each sample's reads are counted against the
# AMR-annotated gff of its own assembly
for GFF in "$GFF_PATH"/*.amrfinder.gff; do
  ASS=$(basename "$GFF" .amrfinder.gff)
  SELF="$BAM_PATH"/"$ASS"_mapped_to_"$ASS"
  # name-sort the dereplicated bam to match --sorting name below
  samtools sort -n -o "$SELF".derep.sorted-n.bam "$SELF".derep.bam
  count_features --attr gene_symbol --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".abunds "$SELF".derep.sorted-n.bam "$GFF" 2> "$ASS"."$ASS".count.log
done
# merge counts from multiple samples into one table
# compare_features https://github.com/Brazelton-Lab/seq-annot
# FIX: pass the glob directly; the original built the file list with a
# 'L+=' string accumulator, which is a bashism under #!/bin/sh and adds
# nothing over plain glob expansion (same files, same order).
compare_features *.abunds.tpm.csv > merged.abunds.tpm.tsv
# map ARG types and classes to merged table
python "$SCRIPTS_PATH"/map-ARG-types.py
# repeat above steps starting with count_features at level of node_id instead of gene_symbol
# count_features https://github.com/Brazelton-Lab/seq-annot
for assembly in "$GFF_PATH"/*.amrfinder.gff; do
  ASS=$(basename "$assembly" .amrfinder.gff)
  # name-sort the dereplicated self-mapping to match --sorting name below
  samtools sort -n -o "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.sorted-n.bam "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.bam
  count_features --attr node_id --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".abunds.node "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.sorted-n.bam "$GFF_PATH"/"$ASS".amrfinder.gff 2> "$ASS"."$ASS".count.log
done
# FIX: glob expansion replaces the bash-only 'L+=' accumulator (same files,
# same order, POSIX-sh compatible)
compare_features *.abunds.node.tpm.csv > merged.abunds.node.tpm.tsv
# move individual sample count files into a directory
mkdir -p AMRfinder_count_files   # -p: tolerate re-runs
mv *.abunds.*.csv AMRfinder_count_files/
mv *.count.log AMRfinder_count_files/
# map ARG types and classes to merged table
python "$SCRIPTS_PATH"/map-ARG-nodes.py
# output is "merged.abunds.node.tpm.csv"
# search mobileOG database
# mobileOG-db_beatrix-1.6.All.faa = all sequences
# mobileOG-db_beatrix-1.6.faa = core sequences
# website readme uses the All dataset
# NOTE: DB is reused here as the diamond database path (it previously held
# the NCBIfam label); M is the mobileOG metadata table
DB=/srv/databases/proteins/mobileOG/mobileOG-db_beatrix-1.6.dmnd
M=/srv/databases/proteins/mobileOG/mobileOG-db-beatrix-1.6-All.csv
for FILE in "$GFF_PATH"/*.faa; do
ASS=$(basename $FILE .faa)
# diamond blastp with custom tabular output (columns listed after 'outfmt 6'):
# up to 15 targets per query, e-value <= 1e-20, >=90% query cover and identity
diamond blastp --threads 12 --tmpdir ./ -q $FILE --db $DB --outfmt 6 stitle qtitle pident bitscore slen evalue qlen sstart send qstart qend -k 15 -o "$ASS".mobileOG.tsv -e 1e-20 --query-cover 90 --id 90
# mobileOG-db's parser script; presumably joins the diamond hits with the
# metadata table in $M -- see the mobileOG-db documentation
python "$SCRIPTS_PATH"/mobileOGs-pl-kyanite.py --o $ASS --i $ASS.mobileOG.tsv -m $M
done
# output is $ASS.mobileOG.Alignment.Out.csv
# merge mobileOG result files
python "$SCRIPTS_PATH"/mobileOG-merge-tables.py
# search VFDB (virulence factors)
# VFDB_setA_pro.fas = core dataset: representative genes from experimentally
#   verified VFs only
# VFDB_setB_pro.fas = full dataset: all genes for known and predicted VFs
DB=/srv/databases/proteins/VFDB/VFDB_setB_pro.dmnd
for PROTEINS in "$GFF_PATH"/*.faa; do
  ASS=$(basename "$PROTEINS" .faa)
  # same diamond settings as the mobileOG search: custom tabular columns,
  # up to 15 targets, e-value <= 1e-20, >=90% query cover and identity
  diamond blastp --threads 12 --tmpdir ./ -q "$PROTEINS" --db "$DB" --outfmt 6 stitle qtitle pident bitscore slen evalue qlen sstart send qstart qend -k 15 -o "$ASS".VFDB.tsv -e 1e-20 --query-cover 90 --id 90
done
# merge VFDB result files
python "$SCRIPTS_PATH"/VFDB-merge-tables.py
# merged AMRfinder result files (the final count table generated above cannot
# be used because the contig names have been lost at that point; here we need
# the original .amrfinder.tsv.b6 results for each assembly)
python "$SCRIPTS_PATH"/AMR-merge-tables.py
# map the AMRfinder types and classes again
# this is a slightly different version of the similar python script above
python "$SCRIPTS_PATH"/map-ARG-nodes-contigs.py
# merge AMRfinder, mobileOG, and VFDB results on contig name
python "$SCRIPTS_PATH"/merge-all-results.py
# cleanup (-p: tolerate re-runs in an existing workspace)
mkdir -p mobileOG_results
mv *.mobileOG.tsv mobileOG_results/
mv *.mobileOG.Alignment.Out.csv mobileOG_results/
mv *.summary.csv mobileOG_results/
mkdir -p VFDB_results
mv *.VFDB.tsv VFDB_results/
# get coverage of mobileOG results
for MGE in mobileOG_results/*.mobileOG.Alignment.Out.csv; do
  ASS=$(basename "$MGE" .mobileOG.Alignment.Out.csv)
  # convert to tab-delimited
  csv2tab < "$MGE" > "$MGE".tsv
  # chop the header row
  tail -n +2 "$MGE".tsv > "$MGE".tsv2
  # chop off unnecessary columns that cause trouble with annotate_features
  # FIX: dropped the stray 'srun' prefix -- no other command in the pipeline
  # launches a SLURM job step, and it fails outside a SLURM allocation
  cut -f1-19 "$MGE".tsv2 > "$MGE".tsv2.cut
  # transfer the mobileOG hits (with the listed columns) onto the prodigal gff
  annotate_features --type CDS --conflict quality --hits "$MGE".tsv2.cut --specifiers "id,saccver,qaccver,pident,bitscore,length,evalue,qlength,sstart,send,qstart,qend,mobileOG_ID,Gene_Name,Best_Hit_Accession_ID,Major_mobileOG_Category,Minor_mobileOG_Category,Source_Database,Evidence_Type" --out "$GFF_PATH"/"$ASS".mobileOG.gff "$GFF_PATH"/"$ASS".gff
done
# specific for self-mappings only: count reads over mobileOG-annotated features
for MGE_GFF in "$GFF_PATH"/*.mobileOG.gff; do
  ASS=$(basename "$MGE_GFF" .mobileOG.gff)
  SELF="$BAM_PATH"/"$ASS"_mapped_to_"$ASS"
  # name-sort the dereplicated bam to match --sorting name below
  samtools sort -n -o "$SELF".derep.sorted-n.bam "$SELF".derep.bam
  count_features --attr Alias --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".mobileOG.abunds "$SELF".derep.sorted-n.bam "$MGE_GFF" 2> "$ASS"."$ASS".count.log
done
# merge counts from multiple samples into one table
# FIX: glob expansion replaces the bash-only 'L+=' accumulator, and the
# pattern is narrowed to *.mobileOG.abunds.tpm.csv -- the original loose
# '*.abunds.tpm.csv' only worked because the AMR count tables happened to be
# moved away earlier; stray AMR files can no longer be merged in by mistake.
compare_features *.mobileOG.abunds.tpm.csv > merged.mobileOG.abunds.tpm.tsv
# map mobileOG categories to merged table
# requires mobileOG.final.csv file generated above
python "$SCRIPTS_PATH"/map-mobileOG-categories.py
# output is merged.abunds.mobileOG.cats.tpm.csv
# move individual sample count files into a directory
mkdir -p mobileOG_count_files   # -p: tolerate re-runs
mv *.abunds.*.csv mobileOG_count_files/
mv *.count.log mobileOG_count_files/