-
Notifications
You must be signed in to change notification settings - Fork 9
/
HybPhyloMaker2a2_readmappingParallel_summary.sh
231 lines (214 loc) · 10.4 KB
/
HybPhyloMaker2a2_readmappingParallel_summary.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=1:0:0
#PBS -l select=1:ncpus=1:mem=4gb:scratch_local=8gb
#PBS -j oe
#PBS -N HybPhyloMaker2a2_readmappingParallel_summary
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -q sThC.q
#$ -l mres=4G,h_data=4G,h_vmem=4G
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker2_readmappingParallel_summary
#$ -o HybPhyloMaker2_readmappingParallel_summary.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * https://github.com/tomas-fer/HybPhyloMaker *
# * Script 02a2 - Read mapping in parallel summary *
# * v.1.8.0a *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2022 *
# * [email protected] *
# ********************************************************************************
#Complete path and set configuration for selected location
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker2a2 is running on MetaCentrum...\n"
#settings for MetaCentrum
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Move to scratch
cd $SCRATCHDIR
#Add necessary modules
module add datamash-1.3 #for data summary
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker2a2 is running on Hydra...\n"
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir02a2
cd workdir02a2
#Add necessary modules
module load datamash #not yet on Hydra
else
echo -e "\nHybPhyloMaker2a2 is running locally...\n"
echo -e "This summary is only for cluster environment. Exiting...\n"
exit 3
fi
#Write log
logname=HPM2a2
echo -e "HybPhyloMaker2a2: summary of read mapping in parallel" > ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "run on MetaCentrum: $PBS_O_HOST" >> ${logname}.log
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "run on Hydra: $HOSTNAME" >> ${logname}.log
else
echo -e "local run: "`hostname`"/"`whoami` >> ${logname}.log
fi
echo -e "\nBegin:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
echo -e "\nSettings" >> ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
printf "%-25s %s\n" `echo -e "\nServer:\t$server"` >> ${logname}.log
fi
for set in data cp mapping probes cpDNACDS mappingmethod conscall mincov majthres plurality nrns; do
printf "%-25s %s\n" `echo -e "${set}:\t" ${!set}` >> ${logname}.log
done
#Setting for the case when working with cpDNA
if [[ $cp =~ "yes" ]]; then
echo -en "Working with cpDNA"
type="cp"
cp $source/$cpDNACDS .
probes=$cpDNACDS
else
echo -en "Working with exons"
type="exons"
cp $source/$probes .
fi
echo -n
#Copy list of samples
cp $path/10rawreads/SamplesFileNames.txt .
#Copy all fasta files
cp $path/$type/21mapped_${mappingmethod}/*.fasta .
#Copy all mapping summaries
cp $path/$type/21mapped_${mappingmethod}/mapping_summary*.txt .
#Copy all per target coverages
mkdir coverage
cp $path/$type/21mapped_${mappingmethod}/coverage/*_perTarget.txt coverage/
#Rename the probe file and move it (just for the case it has the suffix *.fasta which would make problems in the next step)
mv $probes coverage/probes.fa
#Creating a summary table
echo -e "\nCreating summary tables..."
#Produce tab-separated table
#Write headers (number of reads, nr paired reads, nr forward unpaired reads, nr reverse unpaired reads, nr mapped reads, % of mapped reads)
echo -e "Sample no.\tGenus\tSpecies\tTotal nr. reads\tNr. paired reads\tNr. forward unpaired reads\tNr. reverse unpaired reads\tNr. mapped reads\tPercentage of mapped reads" > tmpmap
cat mapping_summary*.txt >> tmpmap
mv tmpmap mapping_summary.txt
cp mapping_summary.txt $path/$type/21mapped_${mappingmethod}/
#Combine all fasta files into one
echo -e "\nCombining consensus FASTA files..."
cat *.fasta > consensus.fasta
#Remove line breaks from fasta file
awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' consensus.fasta > tmp && mv tmp consensus.fasta
mkdir -p $path/$type/30consensus
if [[ $cp =~ "yes" ]]; then
cp consensus.fasta consensus_cpDNA.fasta
cp consensus_cpDNA.fasta $path/$type/30consensus
else
cp consensus.fasta $path/$type/30consensus
fi
#Calculate ambiguous bases (number and percentage) if ConsensusFixer was used
if [[ $conscall =~ "consensusfixer" ]]; then
echo -e "\nCalculating ambiguous bases for ConsensusFixer results..."
#print length of sequences (i.e., print line length if the first character on line is not '>')
awk '{ if (substr($1,1,1) !~ /^>/ ) print length($0) }' consensus.fasta > totallength.txt
#Remove '?'s (introduced when N in reference), only in sequences (i.e., not on lines starting with '>')
sed -i '/>/!s/\?//g' consensus.fasta
#print length of sequences without '?'
awk '{ if (substr($1,1,1) !~ /^>/ ) print length($0) }' consensus.fasta > totallengthNoQ.txt
#Remove 'N's (introduced when too low coverage), only in sequences (i.e., not on lines starting with '>')
sed -i '/>/!s/N//g' consensus.fasta
#Replace newline with ' ' if line starts with '>' (i.e., merge headers with data into single line separated by space)
cat consensus.fasta | sed '/^>/{N; s/\n/ /;}' > consensus.modif.fasta
#Cut first part until space, i.e. header, and remove '>'
cat consensus.modif.fasta | cut -f1 -d" " | sed 's/>//' > headers.txt
#Calculate sequence length for each line
cat consensus.modif.fasta | cut -f2 -d" " | awk '{ print length}' > lengths.txt
#Cut only part after the first space, i.e., only sequence, modify to big letters only, change all ambiguities (RYSWKMBDHV) to 'x', replace all other characters then 'x' by nothing
cat consensus.modif.fasta | cut -f2 -d" " | tr [a-z] [A-Z] | sed 's/[RYSWKMBDHV]/x/g' | sed 's/[^x]//g' > x.txt
#Combine lengths.txt (sequence lengths) and x.txt (number of ambiguities) and print number and percentage of 'x's (i.e., ambiguities) in each sequence
paste totallength.txt totallengthNoQ.txt lengths.txt x.txt | awk '{ print $1 "\t" $2 "\t" $3 "\t" length($4) "\t" (length($4)*100)/$3 }' > ambigpercentage.txt
#Calculate number and percentage of each ambiguous base
for base in R Y S W K M B D H V; do
echo -e "Nr. $base\tPerc $base" > ${base}_percentage.txt
cat consensus.modif.fasta | cut -f2 -d" " | sed "s/$base/x/g" | sed 's/[^x]//g' > ${base}x.txt
paste lengths.txt ${base}x.txt | awk '{ print length($2) "\t" (length($2)*100)/$1 }' >> ${base}_percentage.txt
done
sed '1 i sample' headers.txt > headers2.txt
paste headers2.txt *_percentage.txt > ambigbaseperc.txt
paste headers.txt ambigpercentage.txt > ambigperc.txt
#Calculate mean of all values
echo -e "Sample\tTotalLength\tLengthWithout?\tCalledBases\tNrAmbig\tPercAmbig" > first.txt
echo -e "MEAN\t$(awk '{ sum += $2; sum2 += $3; sum3 += $4; sum4 += $5; sum5 += $6; n++ } END { if (n > 0) print sum / n "\t" sum2 / n "\t" sum3 / n "\t" sum4 / n "\t" sum5 / n}' ambigperc.txt)" > mean.txt
cat first.txt ambigperc.txt mean.txt > tmp && mv tmp ambigperc.txt
rm consensus.modif.fasta headers.txt headers2.txt ambigpercentage.txt mean.txt *_percentage.txt lengths.txt *x.txt totallength.txt totallengthNoQ.txt first.txt
#extract text after last '/' in $data (whole $data in no '/')
data1=$(echo $data | rev | cut -d"/" -f1 | rev)
cp ambigperc.txt $path/$type/30consensus/${data1}_ambigperc.txt
cp ambigbaseperc.txt $path/$type/30consensus/${data1}_ambigbaseperc.txt
fi
# Make summary table for coverage
cd coverage
echo -e "\nCreating summary tables for coverage..."
ls *perTarget.txt | cut -d"." -f1 | sed 's/_perTarget//' > ListOfCoverageFiles.txt
cat `ls *perTarget.txt | head -n 1` | cut -f 2,3,4 > infocolumn.txt
echo -e "taxon\n5\n10\n20\n30\n50\n100" >> firstcolumn.txt
for i in $(cat ListOfCoverageFiles.txt)
do
# Print header (species name)
echo "$i" >> ${i}_exon_meancoverage.txt
# Extract 7th column containing mean coverage per exon
cat ${i}_perTarget.txt | cut -f7 | sed '1d' >> ${i}_exon_meancoverage.txt
# Write samples name to file with
echo $i > ${i}_nrexons.txt
for j in 5 10 20 30 50 100
do
cat ${i}_perTarget.txt | cut -f7 | awk -v val=$j ' NR>1 {if ($1>val) print $1;}' | wc -l >> ${i}_nrexons.txt
done
done
# Combine information from all samples together
paste infocolumn.txt *_exon_meancoverage.txt > exon_meancoverageALL.txt
paste firstcolumn.txt *_nrexons.txt > numberexonsALL.txt
# Copy summary table to home
cp exon_meancoverageALL.txt $path/$type/21mapped_${mappingmethod}/coverage
cp numberexonsALL.txt $path/$type/21mapped_${mappingmethod}/coverage
# Calculate mean values per locus (a mean from per-exon means)
# Delete first three rows (i.e., leaving only mean values)
cut -f4- exon_meancoverageALL.txt | datamash transpose > coverage.txt
# Prepare header
echo locus | tr "\n" "\t" > header.txt #print 'locus' and replaces EOL by TAB, i.e., next line prints on the same line
grep ">" probes.fa | cut -d'_' -f2 | datamash transpose >> header.txt
echo -e "exon\t" | perl -0pe 's/\n\Z//' >> header.txt #print 'exonTAB' and removes very last character which is EOL, i.e., next line prints on the same line
grep ">" probes.fa | cut -d'_' -f4 | datamash transpose >> header.txt
cat header.txt coverage.txt > exon_meancoverageALLmodif.txt
rm header.txt coverage.txt
# Calculate mean per locus from per-exon values
lines=$(wc -l < "exon_meancoverageALLmodif.txt")
datamash -W transpose < "exon_meancoverageALLmodif.txt" | datamash -H groupby 1 mean 3-"$lines" | datamash transpose > locus_meancoverageALL.txt
#modify output
sed -i.bak 's/GroupBy(//' locus_meancoverageALL.txt
sed -i.bak2 's/mean(//' locus_meancoverageALL.txt
sed -i.bak3 's/)//' locus_meancoverageALL.txt
rm *bak*
cp exon_meancoverageALLmodif.txt $path/$type/21mapped_${mappingmethod}/coverage
cp locus_meancoverageALL.txt $path/$type/21mapped_${mappingmethod}/coverage
cd ..
#Copy log to home
echo -e "\nEnd:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
cp ${logname}.log $path/$type/21mapped_${mappingmethod}/
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r workdir02a2
fi
echo -e "\nScript HybPhyloMaker2a2 finished...\n"