forked from ISUgenomics/common_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtractSeq.sh
executable file
·80 lines (61 loc) · 1.9 KB
/
ExtractSeq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/bin/bash
# This is a bash script that extracts the sequences for all orthologous groups (OGs).
# It takes a list of OG IDs as input and, for each group, saves the sequences of all
# members of that group (from all organisms) to a FASTA file named after the OG.
# Note that after the script is executed, there will be 'n' output files, where
# n is the total number of OGs in the input list.
# Arun Seetharam <[email protected]>
scriptName="${0##*/}"
outdir=$(pwd)
function printUsage() {
cat <<EOF
Synopsis
$scriptName [-h | --help] [-o dir_name] input_ids_list database
Description
Extracts sequences for all ortholog groups supplied as list. For each ID in the list
a file containing FASTA sequences will be generated, which belong to that OG.
Note: this script requires standalone cdbfasta program.
input_ids_list
Input list should contain orthologous group IDs one per line
These IDs should be generated by "orthomclMclToGroups" command
sequence_file
Absolute path for the sequence file should be specified. This file is generally
named as 'goodProteins.fasta'
-o directory_name
directory name to save the output files. By default all files will be saved in
the current directory
-h, --help
Brings up this help page
Author
Arun Seetharam, Genome Informatics Facility, Iowa State University.
EOF
}
if [ $# -lt 1 ] ; then
printUsage
exit 1
fi
while getopts ':o:' option; do
case "$option" in
o) outdir=$OPTARG
shift
;;
h) printUsage
exit
;;
help) printUsage
exit
;;
esac
done
module load cdbfasta
mkdir -p $outdir
shift $(( $# - 2 ))
file=${1}
pathdbname=${2}
cdbfasta ${pathdbname}
sed -i 's/://g' ${file}
while IFS=$' ' read -r -a myArray
do
echo "${myArray[@]:1}" | cdbyank ${pathdbname}.cidx >> ${outdir}/${myArray[0]}.fa;
done <${file}