-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutwaste-pipeline-master.sh
241 lines (208 loc) · 11.2 KB
/
utwaste-pipeline-master.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#! /bin/sh
#SBATCH --partition highmem
#SBATCH --mem 100G
#SBATCH --cpus-per-task 12
#SBATCH --output pipeline.out
#SBATCH --error pipeline.err
# options above specified for submitting jobs to a SLURM job manager
# user should define these paths:
# 1. qc-trimmed and -filtered reads
QCREADS_PATH="qc_reads"
# 2. fam.tab, included in database downloaded with amrfinder -u
# Symlink the NCBIfam family table into the working directory for the
# fam.tab.cleanup.py step later in the pipeline.
# NOTE(review): -i makes ln interactive; under sbatch there is no tty to
# answer the prompt if fam.tab already exists here -- confirm this is
# intended for batch runs.
ln -svi /srv/databases/proteins/amrfinder/data/2022-12-19.1/fam.tab
# DB is a label embedded in output filenames (reassigned later for the
# mobileOG and VFDB searches); DBVERSION selects the amrfinder database
# directory and should match the hard-coded path in the ln command above.
DB="NCBIfam-2022-12-19.1"
DBVERSION="2022-12-19.1"
# 3. gff file generated by prodigal for each assembly
GFF_PATH="gff_files"
# 4. amrfinder results generated for each assembly
AMR_PATH="amrfinder_results"
# 5. dereplicated bam files and their index files representing reads mapped back to their assembly
BAM_PATH="bam_files"
# 6. scripts required for the workflow
SCRIPTS_PATH="scripts"
# qc reads with bbduk and Brazelton lab seq-qc https://github.com/Brazelton-Lab/seq-qc
# Each sample is a read pair: ${sample}_R1_001.fastq.gz / ${sample}_R2_001.fastq.gz
for FORWARD in *_R1_001.fastq.gz
do
  # BUGFIX: strip the suffix that actually matches the glob. The original
  # stripped ".forward.fastq.gz", which never matched, so $sample kept the
  # whole filename and every derived path below was wrong.
  sample=$(basename "$FORWARD" _R1_001.fastq.gz)
  # adapter trimming; writes interleaved output
  # BUGFIX: in2 must be the reverse reads -- the original passed the R1 file
  # for both in1 and in2.
  bbduk.sh -Xmx10g ziplevel=9 threads=16 qin=33 ref=/srv/databases/contaminants/truseq_adapters.fa in1=${sample}_R1_001.fastq.gz in2=${sample}_R2_001.fastq.gz out=${sample}.interleaved.atrim.fq.gz stats=${sample}.adapter_stats.txt ftm=5 ktrim=r k=23 mink=9 rcomp=t hdist=2 tbo tpe minlength=0 2>${sample}.adapters.log
  # PhiX decontamination; matching reads are diverted to outm
  bbduk.sh -Xmx10g threads=16 qin=33 interleaved=t ref=/srv/databases/contaminants/phix174.fa.gz in=${sample}.interleaved.atrim.fq.gz out=${sample}.interleaved.atrim.decontam.fq.gz outm=${sample}.phix.fq.gz k=31 hdist=1 mcf=0.9 stats=${sample}.phix_stats.txt 2>${sample}.phix.log
  # quality trimming: sliding window of 8 bases at mean Q28, min length 100
  qscore=28
  window=8
  qtrim --threads 12 --interleaved --qual-offset 33 --min-len 100 --sliding-window ${window}:${qscore} -o ${sample}.interleaved.atrim.decontam.qtrim.fq.gz -s ${sample}.singles.atrim.decontam.qtrim.fq.gz ${sample}.interleaved.atrim.decontam.fq.gz 2>${sample}.qtrim.log
  mv ${sample}.interleaved.atrim.decontam.qtrim.fq.gz "$QCREADS_PATH"/
  mv ${sample}.singles.atrim.decontam.qtrim.fq.gz "$QCREADS_PATH"/
done
# delete intermediate files
rm *.atrim.fq.gz
rm *.atrim.decontam.fq.gz
# assembly with Megahit: one assembly per qc'd interleaved read file
for READS12 in "$QCREADS_PATH"/*.interleaved.atrim.decontam.qtrim.fq.gz; do
  ASS=$(basename "$READS12" .interleaved.atrim.decontam.qtrim.fq.gz)
  echo "writing assembly to" "$ASS"_megahit
  # no singles
  megahit -t 12 -m 0.9 --12 "$READS12" --out-prefix "$ASS" --out-dir "$ASS"_megahit --min-contig-len 400 --k-min 27 --k-max 151 --k-step 4
done
# BUGFIX: this loop was never closed in the original; without the 'done'
# above, bash parses the rest of the script inside the loop and aborts with
# "unexpected end of file".
# prodigal and cleanup
mkdir -p "$GFF_PATH"   # -p: do not fail when re-running in an existing workspace
for ASS in *_megahit/*.contigs.fa; do
  BASE=$(basename "$ASS" .contigs.fa)
  # give contigs simple, unique names; keep a map back to the original names
  anvi-script-reformat-fasta "$ASS" -o "$BASE".renamed.fa -l 0 --simplify-names --report-file "$ASS".contig_names_map.tsv
  # gene calling in metagenome mode: gff coordinates, proteins (.faa), genes (.ffn)
  prodigal -q -p meta -f gff -i "$BASE".renamed.fa -o "$BASE".gff -a "$BASE".faa -d "$BASE".ffn
  # keep only the ID part of each fasta header and strip '-', '*', '_', '.'
  # characters from the protein sequences themselves
  awk '/^>/{split($0, a, " "); print a[1]; next} {gsub(/[-\*_\.]/,""); print}' "$BASE".faa > "$BASE".faa.clean
  sed -i 's/\x0//g' "$BASE".faa.clean   # drop stray NUL bytes
  mv "$BASE".faa.clean "$BASE".faa
  # prefix each gene ID in gff column 9 with its contig name (column 1) so
  # gene IDs are unique across contigs
  awk -F'\t' '/^#/{print $0; next} {sub(/ID\=[^_]*_/, "ID="$1"_", $9); print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9}' "$BASE".gff > "$BASE".gff.clean
  mv "$BASE".gff.clean "$BASE".gff
  mv "$BASE".faa "$GFF_PATH"/"$BASE".faa
  mv "$BASE".gff "$GFF_PATH"/"$BASE".gff
  mv "$BASE".ffn "$GFF_PATH"/"$BASE".ffn
done
# amrfinder: screen each assembly's proteins against the NCBIfam AMR database
mkdir -p "$AMR_PATH"   # -p: tolerate re-runs
for ASS in "$GFF_PATH"/*.faa; do
  BASE=$(basename "$ASS" .faa)
  amrfinder --protein "$ASS" --database /srv/databases/proteins/amrfinder/data/"$DBVERSION"/ -o "$AMR_PATH"/"$BASE".amrfinder.tsv --threads 12 > "$AMR_PATH"/amrfinder-log-"$BASE".txt
done
# cleanup fam.tab with python pandas
# (fam.tab is the file symlinked into the working directory above)
python "$SCRIPTS_PATH"/fam.tab.cleanup.py
# output is fam.tsv
# generate the json file with Brazelton lab reldb https://github.com/Brazelton-Lab/seq-annot/blob/master/seq_annot/reldb.py
# Builds a relational-database JSON of the listed NCBIfam fields, prefixing
# entries with 'Dbxref:NCBIfam:' and tagging each with the $DB label.
# NOTE(review): --filter 'Dbxref:-' presumably drops rows whose Dbxref is
# '-' -- confirm against the reldb documentation.
reldb --in-csv --filter 'Dbxref:-' --fields node_id,parent_node_id,gene_symbol,hmm_id,type,subtype,class,subclass,family_name --prepend 'Dbxref:NCBIfam:' --extend "database:'$DB'" --out AMR-"$DB".json fam.tsv
# Remove entries with "NA" for HMM id (column 17) from all .tsv files
# (comment fixed: these are tab-separated .amrfinder.tsv files, not .csv)
# This removes "PARTIALP" hits from downstream analyses
for ASS in "$AMR_PATH"/*.amrfinder.tsv; do
  # quote expansions (SC2086) and rewrite each table in place via a temp file
  awk -F '\t' '$17 != "NA"' "$ASS" > "$ASS".noNA
  mv "$ASS".noNA "$ASS"
done
# convert amrfinder results to blast-formatted file and run annotate_features
# amr2b6.py https://github.com/Brazelton-Lab/lab_scripts/blob/master/amr2b6.py
# annotate_features https://github.com/Brazelton-Lab/seq-annot
for AMR in "$AMR_PATH"/*.amrfinder.tsv; do
# reformat AMRFinder hits as BLAST tabular (b6), using the AMR-$DB.json
# relational DB built above for the Dbxref lookups
amr2b6.py --split -f Dbxref --mapping AMR-"$DB".json --out "$AMR".b6 $AMR
ASS=$(basename $AMR .amrfinder.tsv)
# transfer the b6 hits onto this assembly's prodigal gff, attaching the listed
# AMR fields to each CDS; --conflict quality presumably resolves multiple hits
# per CDS by hit quality -- see seq-annot docs
annotate_features --clear --format b6 --type CDS --conflict quality --fields "gene_symbol,hmm_id,node_id,parent_node_id,class,subclass,type,subtype" --mapping AMR-"$DB".json --out "$GFF_PATH"/"$ASS".amrfinder.gff --hits "$AMR_PATH"/"$ASS".amrfinder.tsv.b6 "$GFF_PATH"/"$ASS".gff
done
# map reads back to their assembly
# FIX: use the configured $BAM_PATH (the original hard-coded 'bam_files');
# -p tolerates re-runs
mkdir -p "$BAM_PATH"
for FASTA in *.renamed.fa
do
  ASS=$(basename "$FASTA" .renamed.fa)
  bowtie2-build "$ASS".renamed.fa "$ASS"
  # locate this sample's qc'd interleaved reads
  # NOTE(review): assumes exactly one match; multiple matches would yield a
  # newline-joined path and break bowtie2 -- confirm sample names are unique
  READS=$(find ./ -name "$ASS*interleaved.atrim.decontam.qtrim.fq.gz" -print)
  bowtie2 --threads 12 -x "$ASS" --interleaved "$READS" --no-unal -S "$ASS"_mapped_to_"$ASS".sam 2>> "$ASS"_mapped_to_"$ASS".mapping.log
  # drop unmapped reads (-F 4) and convert to bam
  samtools view -F 4 -bS "$ASS"_mapped_to_"$ASS".sam > "$ASS"_mapped_to_"$ASS"_RAW.bam
  anvi-init-bam "$ASS"_mapped_to_"$ASS"_RAW.bam -o "$ASS"_mapped_to_"$ASS".bam
  rm "$ASS"_mapped_to_"$ASS".sam "$ASS"_mapped_to_"$ASS"_RAW.bam
  # FIX: move only this sample's bam (and its index); the original
  # 'mv *.bam*' would also sweep up any unrelated bam files in the cwd
  mv "$ASS"_mapped_to_"$ASS".bam* "$BAM_PATH"/
done
# derep: remove duplicate reads from each mapping, then index the result
for MAPPING in "$BAM_PATH"/*.bam; do
  NAME=$(basename "$MAPPING" .bam)
  DEREP="$BAM_PATH"/"$NAME".derep.bam
  # REMOVE_DUPLICATES=true drops duplicates instead of just flagging them;
  # JDK (de)compression avoids native-library issues on the cluster
  picard -Xmx80g MarkDuplicates COMPRESSION_LEVEL=9 USE_JDK_INFLATER=true USE_JDK_DEFLATER=true REMOVE_DUPLICATES=true INPUT="$BAM_PATH"/"$NAME".bam OUTPUT="$DEREP" METRICS_FILE="$BAM_PATH"/"$NAME".replicates.txt 2> "$BAM_PATH"/"$NAME".derep.log
  samtools index "$DEREP"
done
# count_features with gene_symbol
# specific for self-mappings: each sample's reads are counted against the
# AMR-annotated gff of its own assembly
for GFF in "$GFF_PATH"/*.amrfinder.gff; do
  ASS=$(basename "$GFF" .amrfinder.gff)
  SELF="$BAM_PATH"/"$ASS"_mapped_to_"$ASS"
  # name-sort the dereplicated bam to match --sorting name below
  samtools sort -n -o "$SELF".derep.sorted-n.bam "$SELF".derep.bam
  count_features --attr gene_symbol --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".abunds "$SELF".derep.sorted-n.bam "$GFF" 2> "$ASS"."$ASS".count.log
done
# merge counts from multiple samples into one table
# compare_features https://github.com/Brazelton-Lab/seq-annot
# FIX: pass the glob directly; the original built the file list with a
# 'L+=' string accumulator, which is a bashism under #!/bin/sh and adds
# nothing over plain glob expansion (same files, same order).
compare_features *.abunds.tpm.csv > merged.abunds.tpm.tsv
# map ARG types and classes to merged table
python "$SCRIPTS_PATH"/map-ARG-types.py
# repeat above steps starting with count_features at level of node_id instead of gene_symbol
# count_features https://github.com/Brazelton-Lab/seq-annot
for assembly in "$GFF_PATH"/*.amrfinder.gff; do
  ASS=$(basename "$assembly" .amrfinder.gff)
  # name-sort the dereplicated self-mapping to match --sorting name below
  samtools sort -n -o "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.sorted-n.bam "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.bam
  count_features --attr node_id --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".abunds.node "$BAM_PATH"/"$ASS"_mapped_to_"$ASS".derep.sorted-n.bam "$GFF_PATH"/"$ASS".amrfinder.gff 2> "$ASS"."$ASS".count.log
done
# FIX: glob expansion replaces the bash-only 'L+=' accumulator (same files,
# same order, POSIX-sh compatible)
compare_features *.abunds.node.tpm.csv > merged.abunds.node.tpm.tsv
# move individual sample count files into a directory
mkdir -p AMRfinder_count_files   # -p: tolerate re-runs
mv *.abunds.*.csv AMRfinder_count_files/
mv *.count.log AMRfinder_count_files/
# map ARG types and classes to merged table
python "$SCRIPTS_PATH"/map-ARG-nodes.py
# output is "merged.abunds.node.tpm.csv"
# search mobileOG database
# mobileOG-db_beatrix-1.6.All.faa = all sequences
# mobileOG-db_beatrix-1.6.faa = core sequences
# website readme uses the All dataset
# NOTE: DB is reused here as the diamond database path (it previously held
# the NCBIfam label); M is the mobileOG metadata table
DB=/srv/databases/proteins/mobileOG/mobileOG-db_beatrix-1.6.dmnd
M=/srv/databases/proteins/mobileOG/mobileOG-db-beatrix-1.6-All.csv
for FILE in "$GFF_PATH"/*.faa; do
ASS=$(basename $FILE .faa)
# diamond blastp with custom tabular output (columns listed after 'outfmt 6'):
# up to 15 targets per query, e-value <= 1e-20, >=90% query cover and identity
diamond blastp --threads 12 --tmpdir ./ -q $FILE --db $DB --outfmt 6 stitle qtitle pident bitscore slen evalue qlen sstart send qstart qend -k 15 -o "$ASS".mobileOG.tsv -e 1e-20 --query-cover 90 --id 90
# mobileOG-db's parser script; presumably joins the diamond hits with the
# metadata table in $M -- see the mobileOG-db documentation
python "$SCRIPTS_PATH"/mobileOGs-pl-kyanite.py --o $ASS --i $ASS.mobileOG.tsv -m $M
done
# output is $ASS.mobileOG.Alignment.Out.csv
# merge mobileOG result files
python "$SCRIPTS_PATH"/mobileOG-merge-tables.py
# search VFDB (virulence factors)
# VFDB_setA_pro.fas = core dataset: representative genes from experimentally
#   verified VFs only
# VFDB_setB_pro.fas = full dataset: all genes for known and predicted VFs
DB=/srv/databases/proteins/VFDB/VFDB_setB_pro.dmnd
for PROTEINS in "$GFF_PATH"/*.faa; do
  ASS=$(basename "$PROTEINS" .faa)
  # same diamond settings as the mobileOG search: custom tabular columns,
  # up to 15 targets, e-value <= 1e-20, >=90% query cover and identity
  diamond blastp --threads 12 --tmpdir ./ -q "$PROTEINS" --db "$DB" --outfmt 6 stitle qtitle pident bitscore slen evalue qlen sstart send qstart qend -k 15 -o "$ASS".VFDB.tsv -e 1e-20 --query-cover 90 --id 90
done
# merge VFDB result files
python "$SCRIPTS_PATH"/VFDB-merge-tables.py
# merged AMRfinder result files (the final count table generated above cannot
# be used because the contig names have been lost at that point; here we need
# the original .amrfinder.tsv.b6 results for each assembly)
python "$SCRIPTS_PATH"/AMR-merge-tables.py
# map the AMRfinder types and classes again
# this is a slightly different version of the similar python script above
python "$SCRIPTS_PATH"/map-ARG-nodes-contigs.py
# merge AMRfinder, mobileOG, and VFDB results on contig name
python "$SCRIPTS_PATH"/merge-all-results.py
# cleanup (-p: tolerate re-runs in an existing workspace)
mkdir -p mobileOG_results
mv *.mobileOG.tsv mobileOG_results/
mv *.mobileOG.Alignment.Out.csv mobileOG_results/
mv *.summary.csv mobileOG_results/
mkdir -p VFDB_results
mv *.VFDB.tsv VFDB_results/
# get coverage of mobileOG results
for MGE in mobileOG_results/*.mobileOG.Alignment.Out.csv; do
  ASS=$(basename "$MGE" .mobileOG.Alignment.Out.csv)
  # convert to tab-delimited
  csv2tab < "$MGE" > "$MGE".tsv
  # chop the header row
  tail -n +2 "$MGE".tsv > "$MGE".tsv2
  # chop off unnecessary columns that cause trouble with annotate_features
  # FIX: dropped the stray 'srun' prefix -- no other command in the pipeline
  # launches a SLURM job step, and it fails outside a SLURM allocation
  cut -f1-19 "$MGE".tsv2 > "$MGE".tsv2.cut
  # transfer the mobileOG hits (with the listed columns) onto the prodigal gff
  annotate_features --type CDS --conflict quality --hits "$MGE".tsv2.cut --specifiers "id,saccver,qaccver,pident,bitscore,length,evalue,qlength,sstart,send,qstart,qend,mobileOG_ID,Gene_Name,Best_Hit_Accession_ID,Major_mobileOG_Category,Minor_mobileOG_Category,Source_Database,Evidence_Type" --out "$GFF_PATH"/"$ASS".mobileOG.gff "$GFF_PATH"/"$ASS".gff
done
# specific for self-mappings only: count reads over mobileOG-annotated features
for MGE_GFF in "$GFF_PATH"/*.mobileOG.gff; do
  ASS=$(basename "$MGE_GFF" .mobileOG.gff)
  SELF="$BAM_PATH"/"$ASS"_mapped_to_"$ASS"
  # name-sort the dereplicated bam to match --sorting name below
  samtools sort -n -o "$SELF".derep.sorted-n.bam "$SELF".derep.bam
  count_features --attr Alias --sorting name --units fpkm,tpm,counts --out "$ASS"."$ASS".mobileOG.abunds "$SELF".derep.sorted-n.bam "$MGE_GFF" 2> "$ASS"."$ASS".count.log
done
# merge counts from multiple samples into one table
# FIX: glob expansion replaces the bash-only 'L+=' accumulator, and the
# pattern is narrowed to *.mobileOG.abunds.tpm.csv -- the original loose
# '*.abunds.tpm.csv' only worked because the AMR count tables happened to be
# moved away earlier; stray AMR files can no longer be merged in by mistake.
compare_features *.mobileOG.abunds.tpm.csv > merged.mobileOG.abunds.tpm.tsv
# map mobileOG categories to merged table
# requires mobileOG.final.csv file generated above
python "$SCRIPTS_PATH"/map-mobileOG-categories.py
# output is merged.abunds.mobileOG.cats.tpm.csv
# move individual sample count files into a directory
mkdir -p mobileOG_count_files   # -p: tolerate re-runs
mv *.abunds.*.csv mobileOG_count_files/
mv *.count.log mobileOG_count_files/