-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathstampa.sh
135 lines (105 loc) · 3.71 KB
/
stampa.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/bin/bash -
#
# This shell script splits a multiple fasta file in individual fasta
# files, each file is then pairwised aligned with a reference dataset
# using vsearch. The best identity (in terms of percentage of
# identity) is kept.
## Usage
function usage () {
echo "Usage:"
echo "$(basename ${0}) /path/to/target/file.fasta reference_dataset"
echo " (reference_dataset can be SSU_V4, SSU_V9)"
}
## Variables
PROJECT="/path/to/target"
SCRIPT_PATH="${PROJECT}/src"
SCRIPT_NAME="stampa.sh"
VSEARCH_SCRIPT="stampa_vsearch.sh"
MERGE_SCRIPT="stampa_merge.sh"
VSEARCH="${PROJECT}/bin/vsearch/bin/vsearch"
PATH_TO_V4="data/references"
SSU_V4="V4_reference_sequences_20150216.fasta"
PATH_TO_V9="data/references"
SSU_V9="V9_reference_sequences_20130802.fasta"
OUTPUT_PREFIX="fasta."
INPUT_FILE=$(readlink -f "${1}") # Works if $1 is a symbolic link
PATH_TO_FILE=$(readlink -f $(dirname "${1}"))
TARGET="${2}"
INPUT_FILE_BASENAME=${INPUT_FILE##*/}
INPUT_FILE_BASENAME=${INPUT_FILE_BASENAME%.*}
STAMPA_FOLDER="stampa_${INPUT_FILE_BASENAME}"
NULL="/dev/null"
## Check arguments
if [[ -z "${INPUT_FILE}" ]] ; then
echo -e "You must specify a fasta file.\n" 1>&2
usage
exit 1
fi
if [[ -z "${TARGET}" ]] ; then
echo -e "You must specify a target database (SSU_V4 or SSU_V9).\n" 1>&2
usage
exit 1
fi
## Select database
case "${TARGET}" in
"SSU_V9")
DATABASE="${PROJECT}/${PATH_TO_V4}/${SSU_V9}"
THRESHOLD="10000"
;;
"SSU_V4")
DATABASE="${PROJECT}/${PATH_TO_V4}/${SSU_V4}"
THRESHOLD="10000"
;;
*)
echo -e "You must specify a target database (SSU_V4 or SSU_V9).\n" 1>&2
usage
exit 1
;;
esac
## Verify the uniqueness of reference sequence names
duplicates=$(grep "^>" "${DATABASE}" | cut -d " " -f 1 | sort -d | uniq -d)
if [[ "${duplicates}" ]] ; then
echo -e "WARNING!\nThe reference database contains duplicated accession numbers\n${duplicates}\n" 1>&2
fi
## Verify the abundance annotations (expect "_")
if [[ $(head "${INPUT_FILE}" | grep ";size=") ]] ; then
echo "Please note that the fasta file contains abundance annotations in usearch's style (;size=)." 1>&2
fi
## Compute the number of jobs
AMPLICON_NUMBER=$(grep -c "^>" "${INPUT_FILE}")
if (( AMPLICON_NUMBER % THRESHOLD == 0 )) ; then
MAX=$((2 * AMPLICON_NUMBER / THRESHOLD))
else
MAX=$((2 * AMPLICON_NUMBER / THRESHOLD + 1 ))
fi
## The upper limit to the number of amplicons is 10,000 * THRESHOLD / 2
if [[ ${MAX} -gt 10000 ]] ; then
echo -e "Too many amplicons!\nChange the threshold value or further split your dataset." 1>&2
exit 1
fi
## Go where the work is.
cd "${PATH_TO_FILE}/"
## Remove old analysis and create a work folder
if [[ -d "${STAMPA_FOLDER}" ]] ; then
echo "Removing old stampa analysis." 1>&2
rm -rf "${STAMPA_FOLDER}/"
fi
mkdir "${STAMPA_FOLDER}"
cd "${STAMPA_FOLDER}/"
## Split the input fasta file into chuncks (convert vsearch abundance-style to swarm style)
split --numeric-suffixes \
--lines="${THRESHOLD}" \
--suffix-length=5 \
<(sed -e '/^>/ s/;size=/_/' -e '/^>/ s/;$//' "${INPUT_FILE}") "${OUTPUT_PREFIX}"
## Where are we?
PWD=$(pwd)
## Launch job array (upper limit is 10,000 jobs)
# -N: separates the job report information from the job output.
LSB_JOB_REPORT_MAIL=N bsub -N -J "${STAMPA_FOLDER}[1-${MAX}]" \
-o "${NULL}" -e "${NULL}" \
-q short -W 60 -n 16 \
-R "span[hosts=1] select[model==XEON_E5_2670] rusage[mem=1024]" \
bash "${SCRIPT_PATH}/${VSEARCH_SCRIPT}" "${DATABASE}" "${PWD}"
## Hold a job as long as the above job is not done, then launch the merging script
bsub -w "done(\"${STAMPA_FOLDER}\")" -q normal -R "rusage[mem=1024]" bash "${SCRIPT_PATH}/${MERGE_SCRIPT}" "${PWD}"
exit 0