-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_colabfold_array_singularity.sh
executable file
·231 lines (188 loc) · 5.53 KB
/
run_colabfold_array_singularity.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/bin/env bash
# Need to select an A40 GPU node even for the non-gpu search
# phase since currently only these have the RAM and local
# copy of the database
#$ -adds l_hard gpu 1
#$ -adds l_hard cuda.0.name 'NVIDIA A40'
#$ -mods l_hard h_vmem 350G
#$ -mods l_hard mem_free 350G
#$ -pe smp 32
#$ -j y
#$ -N colabfold
#$ -o colabfold_logs/$JOB_NAME.o$JOB_ID
#$ -cwd

# Abort on the first unhandled command failure.
set -e

# if modifying threads, also update '-pe smp' request above to match...
THREADS=32
# Directory searched for the singularity image (*sif).
INSTALL_DIR="/cluster/sw/colabfold/current"
# Database directory, bind-mounted to /mnt/db for the search phase.
DB_PATH="/opt/colabfold/current"

# Exported so they are inherited by the processes started from this script.
export TF_CPP_MIN_LOG_LEVEL=2
export TINI_SUBREAPER=1

SCRIPT_PATH=$(dirname -- "${BASH_SOURCE[0]}")

# Locate the image with a glob rather than by parsing 'ls' output, and insist
# on exactly one match: IMAGE is used as a single path everywhere below, so a
# multi-line value would silently produce a broken singularity command line.
IMAGE_CANDIDATES=("${INSTALL_DIR}"/*sif)
if [[ ! -e "${IMAGE_CANDIDATES[0]}" ]]; then
  # An unmatched glob is left as the literal pattern, which does not exist.
  echo "ERROR: no singularity image (*sif) found in ${INSTALL_DIR}" >&2
  exit 1
fi
if (( ${#IMAGE_CANDIDATES[@]} > 1 )); then
  echo "ERROR: expected exactly one singularity image in ${INSTALL_DIR}, found ${#IMAGE_CANDIDATES[@]}" >&2
  exit 1
fi
IMAGE=${IMAGE_CANDIDATES[0]}

# based on the image being named 'colabfold_batch.?.?.?.sif'...
# (if the name does not match, VERSION is left as the full image path)
VERSION=$(printf '%s\n' "${IMAGE}" | sed -r 's/.*colabfold_batch\.([0-9.]+)\.sif$/\1/')
# Print the option summary for this script to stdout, then terminate the
# script with exit status 1.
usage() {
  # One argument per output line; empty arguments produce the blank lines.
  printf '%s\n' \
    "Usage: $0 -i /path/to/fasta/file [-c 'colabfold arguments'] [-m 'mmseq arguments'] [-h] [-u] [-s]" \
    "" \
    "-i: path to input fasta or a3m file" \
    "-m: Arguments to pass to mmseqs search phase (must be surrounded with quotes)" \
    "-c: Arguments to pass to colabfold phase (must be surrounded with quotes)" \
    "-h: Show Help" \
    "-s: Show MMseqs search options" \
    "-u: Show colabfold usage options" \
    ""
  exit 1
}
# Show the colabfold_batch help text by running 'colabfold_batch -h' inside
# the container image, then terminate the script with exit status 1.
#
# Globals:
#   IMAGE (read): path to the singularity image
colabfold_usage() {
  # Quoted so an image path containing spaces or glob characters is still
  # passed to singularity as a single argument.
  singularity run "${IMAGE}" colabfold_batch -h
  exit 1
}
# Show the colabfold_search help text by running 'colabfold_search -h' inside
# the container image, then terminate the script with exit status 1.
#
# Globals:
#   IMAGE (read): path to the singularity image
mmseqs_search_usage() {
  # Quoted so an image path containing spaces or glob characters is still
  # passed to singularity as a single argument.
  singularity run "${IMAGE}" colabfold_search -h
  exit 1
}
######################################################################
#
# create_wrapper
#
# Generates a shell script for qsubbing each alphafold job once search
# phase is completed.
#
# Globals:
#   TMPDIR (read): directory the wrapper is written to (/tmp if unset)
#   IMAGE  (read): path to the singularity image
#
# Required parameters:
#   $1:  index of the query; used to name the wrapper script
#   $2:  name of the a3m file inside colabfold_output/
#   $3+: arguments to pass to colabfold_batch (optional)
#
# Returns:
#   path to wrapper script (printed on stdout)
#
######################################################################
create_wrapper() {
  local count=$1
  local colabfold_input=$2
  # Everything after the first two parameters is a colabfold_batch argument.
  local -a colabfold_args=("${@:3}")
  # Inserted verbatim into the generated command line, so the wrapper's shell
  # re-parses it exactly as the user typed it.
  local args_text="${colabfold_args[*]}"
  # Prediction directory is named after the a3m file without its suffix.
  local comparison=${colabfold_input%.a3m}
  local script="${TMPDIR:-/tmp}/colabfold_${count}.sh"

  # Values fixed at generation time are shell-quoted before being embedded.
  local q_input q_comparison q_image q_args
  printf -v q_input '%q' "${colabfold_input}"
  printf -v q_comparison '%q' "${comparison}"
  printf -v q_image '%q' "${IMAGE}"
  printf -v q_args '%q' "${args_text}"

  # SGE directives have ## rather than #$ so we can qsub the current script without these
  # being interpreted, then correct them with sed before submitting the wrapper...
  #
  # HOSTNAME, CUDA_VISIBLE_DEVICES, TMPDIR, JOB_NAME and JOB_ID are escaped so
  # that they are expanded when the wrapper job runs, not while it is being
  # generated; otherwise the wrapper would log this job's host and try to use
  # this job's scratch path.
  cat <<EOF > "${script}"
#!/bin/env bash
## -adds l_hard gpu 1
## -adds l_hard cuda.0.name 'NVIDIA A40'
## -N colabfold
## -j y
## -o colabfold_logs/\$JOB_NAME.o\$JOB_ID
## -cwd
echo "Hostname: \$HOSTNAME"
echo "GPU: \$CUDA_VISIBLE_DEVICES"
echo INPUT_FILE=${q_input}
echo COLABFOLD_ARGS=${q_args}
mkdir -p colabfold_predictions/${q_comparison}
cp -v colabfold_output/${q_input} "\$TMPDIR"
singularity exec --nv -B "\$TMPDIR:/mnt/output" ${q_image} \\
  colabfold_batch ${args_text} "\$TMPDIR"/${q_input} "\$TMPDIR"
cp -v "\$TMPDIR"/* colabfold_predictions/${q_comparison}
EOF

  # Turn the placeholder directives into real ones (line start only).
  sed -i 's/^## /#$ /' "${script}"

  # stdout must carry the path only: callers capture it with $(...).
  printf '%s\n' "${script}"
}
# Parse the command line. -i/-c/-m store their argument in INPUT,
# COLABFOLD_ARGS and MMSEQS_ARGS; -u/-s/-h hand over to the matching help
# function, each of which exits the script. Any other option falls through
# the default arm and is ignored here.
while getopts "i:c:m:ush" opt; do
  case "$opt" in
    i) INPUT=$OPTARG ;;
    c) COLABFOLD_ARGS=$OPTARG ;;
    m) MMSEQS_ARGS=$OPTARG ;;
    u) colabfold_usage ;;
    s) mmseqs_search_usage ;;
    h) usage ;;
    *) ;;
  esac
done
# An input file is mandatory: without one, print the usage text (usage exits).
if [[ -z "$INPUT" ]]; then
  usage
fi

# The input path must exist before any work is started.
if [[ ! -e "$INPUT" ]]; then
  echo "Specified input file (${INPUT}) not found..."
  exit 1
fi

# Refuse to run when JOB_ID is empty or REQUEST is QRLOGIN.
# NOTE(review): presumably JOB_ID is set by the scheduler for batch jobs and
# REQUEST=QRLOGIN marks an interactive qrlogin session - confirm for this site.
if [[ -z "${JOB_ID}" || "${REQUEST}" == "QRLOGIN" ]]; then
  echo "This script must be submitted as a batch job to the scheduler"
  # $* joins the original arguments into the single string being printed.
  echo "i.e. qsub $0 $*"
  exit 1
fi
# Split the quoted option strings into arrays, one word per element.
# -r stops read from treating backslashes as escape characters, so arguments
# containing a backslash reach the tools unchanged.
read -r -a COLABFOLD_ARGS_LIST <<< "$COLABFOLD_ARGS"
read -r -a MMSEQS_ARGS_LIST <<< "$MMSEQS_ARGS"

# Warn (without aborting) when GPU relaxation was requested.
for arg in "${COLABFOLD_ARGS_LIST[@]}"; do
  if [[ "$arg" == "--use-gpu-relax" ]]; then
    echo
    echo "WARNING: Running amber relaxation on GPUs is unreliable and may fail."
    echo "Should this occur, rerun without --use-gpu-relax"
    echo
  fi
done
# Split the input path; the directory is bind-mounted into the container and
# the file name is passed to colabfold_search relative to that mount.
INPUT_DIR=$(dirname -- "$INPUT")
INPUT_FILE=$(basename -- "$INPUT")
SUFFIX="${INPUT_FILE##*.}"

# Record the run configuration in the job log.
echo "Hostname: $HOSTNAME"
echo "GPU: $CUDA_VISIBLE_DEVICES"
echo "INPUT_DIR=${INPUT_DIR}"
echo "INPUT_FILE=${INPUT_FILE}"
echo "SUFFIX=${SUFFIX}"
echo "DB_PATH=${DB_PATH}"
echo "VERSION=${VERSION}"
echo "MMSEQS_ARGS=${MMSEQS_ARGS}"
echo "COLABFOLD_ARGS=${COLABFOLD_ARGS}"

mkdir -p colabfold_output

if [[ "$SUFFIX" != 'fa' ]] && [[ "$SUFFIX" != 'fasta' ]]; then
  echo "ERROR: Input file for batch queries should be a fasta file, with a '.fa' or '.fasta' suffix"
  exit 1
else
  # grep -c exits non-zero when nothing matches, which under 'set -e' would
  # end the job without any message; tolerate it and report the problem.
  SEQ_COUNT=$(grep -c '>' -- "${INPUT}" || true)
  if [[ -z "${SEQ_COUNT}" || "${SEQ_COUNT}" -eq 0 ]]; then
    echo "ERROR: No sequences found in ${INPUT}"
    exit 1
  fi
  if [[ ${SEQ_COUNT} -gt 25 ]]; then
    echo "Input file must be a fasta file containing up to 25 sequences."
    echo "The provided file contains ${SEQ_COUNT} sequences"
    exit 1
  fi

  # First whitespace-separated word of every header line, one per sequence
  mapfile -t SEQ_IDS < <(grep '>' -- "${INPUT}" | sed 's/>//' | awk '{print $1}')

  # Create a mapping of sequence index to ID
  ID_MAP=()
  for SEQ_ID in "${SEQ_IDS[@]}"; do
    # Extract final '|' separated field from seq id in case of uniprot format headers
    SEQ_ID="${SEQ_ID##*|}"
    if [[ -z "${SEQ_ID}" ]]; then
      # An empty ID would later rename the alignment to '.a3m'.
      echo "ERROR: sequence ${#ID_MAP[@]} in ${INPUT} has an empty identifier"
      exit 1
    fi
    ID_MAP+=("${SEQ_ID}")
  done

  echo "SEQUENCE_IDS:"
  for key in "${!ID_MAP[@]}"; do
    echo "$key => ${ID_MAP[$key]}"
  done
  echo

  singularity exec -B "${INPUT_DIR}:/mnt/input" -B colabfold_output:/mnt/output -B "${DB_PATH}/:/mnt/db" \
    "${IMAGE}" colabfold_search --threads "${THREADS}" "${MMSEQS_ARGS_LIST[@]}" \
    "/mnt/input/${INPUT_FILE}" /mnt/db /mnt/output/

  # a3m files are created with a 0-indexed count, which isn't particularly helpful,
  # so rename each of these to the SEQ_ID.a3m, which is then used by the alphafold
  # output filenaming.
  #
  # Walk the known indices instead of listing every a3m in the directory:
  # files left behind by an earlier run (already renamed to <SEQ_ID>.a3m) have
  # no index entry and would otherwise be moved to '.a3m' and resubmitted.
  SUBMITTED=0
  for (( index = 0; index < ${#ID_MAP[@]}; index++ )); do
    A3M="colabfold_output/${index}.a3m"
    if [[ ! -e "${A3M}" ]]; then
      echo "WARNING: expected search output ${A3M} not found, skipping" >&2
      continue
    fi
    echo "index=${index}"
    mv -v -- "${A3M}" "colabfold_output/${ID_MAP[index]}.a3m"
    script=$(create_wrapper "${index}" "${ID_MAP[index]}.a3m" "${COLABFOLD_ARGS_LIST[@]}")
    qsub "${script}"
    SUBMITTED=$(( SUBMITTED + 1 ))
  done

  if [[ ${SUBMITTED} -eq 0 ]]; then
    echo "ERROR: search phase produced no indexed a3m files in colabfold_output" >&2
    exit 1
  fi
fi