-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlengther.sh
169 lines (123 loc) · 3.32 KB
/
lengther.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
## SETTING HELP FUNCTION ##
helpFunction()
{
echo ""
echo "How to run script: $0 -f path/to/gene_table.file -p path/to/clusters_directory/ -n num -m path/to/pangenome_matrix.file"
echo -e "\t -p: folder where the sequence clusters are located."
echo -e "\t -f: Path to table with genenames/homologues"
echo -e "\t -n: Number of top expresed sequences to measure"
echo -e "\t -m: Path to pangenome matrix with cluster names added to genenames"
exit 1 # Exit script after printing help if no parameter is indicated
}
while getopts "p:f:n:m:" opt
do
case "$opt" in
p ) path="$OPTARG" ;;
f ) table="$OPTARG" ;;
n ) num="$OPTARG" ;;
m ) pangemat="$OPTARG" ;;
? ) helpFunction ;; # Print helpFunction in case parameter is non-existent
esac
done
if [ -z "$path" ] || [ -z "$table" ] || [ -z "$num" ] || [ -z "$pangemat" ];
then
echo "Some or all of the parameters are empty";
helpFunction
fi
NC='\033[0m'
YELLOW='\033[1;33m'
RED='\033[1;31m'
max=$(tail -n +2 $table | wc -l)
if [[ $num -gt $max ]];
then
echo -e "${RED}Number of genes requested excedes the number of sequences in file ${NC}"
echo -e "Max value =${YELLOW} $max ${NC}"
exit 1
fi
names=()
sed -n "2,${num}p" $table | cut -f 4 > filetemp.txt
readarray -t names < filetemp.txt
rm filetemp.txt
clusters=()
c=0
for name in ${names[@]}
do
gene="${name%\"}"
gene="${gene#\"}"
greping=$(sed -n "s/${gene}\b/${gene}\b/p" $pangemat | cut -f 2 | sed -e 's/^"//' -e 's/"$//' >> filetemp2.txt)
opt[c]=${gene}
((c++))
done
readarray -t clusters < filetemp2.txt
rm filetemp2.txt
c=0
b=0
echo "selected genes: ${opt[@]}"
sequence=()
number=()
for cluster in ${clusters[@]}
do
file=$(find $path | grep ${cluster})
echo -e "${YELLOW} length of gene ${opt[c]} in cluster $cluster: ${NC}"
length=$(sed -n "/${opt[c]}/{n;p;}" $file | awk "{print length}")
echo "$length"
line=(${length[@]})
line=(${length[0]})
if [[ $line -gt 250 ]];
then
echo -e "${RED} ${opt[c]}'s sequence length is >250!! ${NC}"
sequence[${#sequence[@]}]=" Gene ${opt[c]}:length=${line} "
fi
values=(${length[@]})
a=1
e=0
p=0
for (( i= 0; i< ${#values[@]}; ++i ))
do
numbers[${#numbers[@]}]=${values[p]}
if [[ $a -gt 1 && $a -lt 3 ]];
then
echo -e "${RED} Gene ${opt[c]} has more than one sequence!! ${NC}"
duplicate[${#duplicate[@]}]=${opt[c]}
fi
((p++))
((a++))
((e++))
done
if [[ $e -gt 1 ]];
then
count[b]=$e
((b++))
fi
((c++))
done
echo -e "${YELLOW}Duplicated genes: ${duplicate[@]} ${NC}"
echo "${count[@]}"
### printf "%s\n" "${duplicate[@]}" > dup_genes.txt ###
rm -f dup_genes.txt
touch dup_genes.txt
b=0
for dup in ${duplicate[@]};
do
printf "%s\n" "$dup is duplicated ${count[b]} times" >> dup_genes.txt
#sed -e "s/$/repeats ${count[b]} times/" -i dup_genes.txt
((b++))
done
echo ""
echo -e "${YELLOW} list of proteins with sequences greater than 250: ${NC}"
echo "${sequence[@]}"
echo ""
echo "all lengths: ${numbers[@]}"
for (( i = 0; i < ${#numbers[@]}; ++i ));
do
sum=`expr $sum + ${numbers[i]}`
### también sirve sum=$((sum + numbers[i])) ###
done
echo ""
echo -e "${YELLOW} total aminoacids = $sum ${NC}"
echo ""
echo "FIN"
#head -n 50 pangenome_matrix_edited.tab | cut -d "\t" -f 4
#sed -n '/${opt[c]}/{n;p;}'
#grep "T4E_6892" trichinella_intersection/79192_T4D_11040.1.faa -A 2 | awk "{print length}"