-
Notifications
You must be signed in to change notification settings - Fork 0
/
get-uORFs_fromGFFs.pl
231 lines (224 loc) · 11.4 KB
/
get-uORFs_fromGFFs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/perl
# Programmer: Elton Vasconcelos (21/Oct/2013)
# Script that catches uORFs from ATGs located upstream of the canonical ATG and downstream the SL site for each protein-coding gene in a chromosome
# It takes as input1 a gff file generated by Gowthaman (Tb927-TritrypV5p0.11ChrOnly.pma.5utr.noadjust.gff) after our manual ATG curation.
# The input1 is a gff containing the annotation of the 5'UTR of each Tb927 protein-coding gene.
# The input2 is also a gff, but now cointaining the CDS features only.
# Usage: perl NT_ORFs2fasta.pl [input-file1.gff] [input-file2.gff] [Tb927-WG.fasta] >outfile.fasta
#################################################################################################
########### Example of FILE1 (Tb927-TritrypV5p0.11ChrOnly.pma.5utr.noadjust.gff)#################
#Tb927_10_v5 SBRI 5UTR 2571651 2571705 . - . ID=NCDS.Tb927.10.10451;length=55;has5UTRdefined=YES
#Tb927_10_v5 SBRI 5UTR 3257540 3257725 . - . ID=NCDS.Tb927.10.13341;length=186;has5UTRdefined=YES
#Tb927_10_v5 SBRI 5UTR 3967616 3967639 . + . ID=NCDS.Tb927.10.16121;length=24;has5UTRdefined=YES
#Tb927_10_v5 SBRI 5UTR 524642 524687 . + . ID=NCDS.Tb927.10.1991;length=46;has5UTRdefined=YES
##############################################
########################################################
######## Working with the Tb927-WG.fasta file ##########
########################################################
open (FILE3, "$ARGV[2]") or die ("Can't open file $ARGV[2]\n");
my $line3 = <FILE3>;
chomp($line3);
my $seq;
my $chr;
my %hash;
while ($line3 ne "") {
if ($line3 =~ m/^>/) {
$chr = $line3;
$chr =~ s/^>//;
$chr =~ s/ .*//g; #catching only the chr ID (i.e. Tb927_01_v4)
$line3 = <FILE3>;
chomp($line3);
until ($line3 =~ m/^>/ || $line3 eq "") {
$seq .= $line3; #putting the whole chromosome sequence in one single line
$line3 = <FILE3>;
chomp($line3);
}
$hash{$chr} = $seq; #Creating a hash containing each chromosome in the file
$seq = "";
}
else { #Just in the case the FASTA file is not starting with a headline (^>.+)
$line3 = <FILE3>;
chomp($line3);
}
}
########################################################
######## Working with the CDSs gff file ################
########################################################
open (FILE2, "$ARGV[1]") or die ("Can't open file $ARGV[1]\n");
my $line2 = <FILE2>;
chomp($line2);
my @array2;
my %hash2; #Hash to store the CDS end coordinates for each gene
my $geneID2;
while ($line2 ne "") {
@array2 = split(/\t/, $line2);
if ($line2 =~ m/\tCDS\t/ && $line2 =~ m/\t\+\t/) { #gene on the top strand
$geneID2 = $array2[8]; #Column 9 "Attributes where we have the gene ID"
$geneID2 =~ s/\-mRNA.*$//g;
$geneID2 =~ s/^ID.*Tb/Tb/g;
$hash2{$geneID2} = $array2[4]; #Column 5 "CDS end coord" on the plus strand. Storing it in %hash2
}
elsif ($line2 =~ m/\tCDS\t/ && $line2 =~ m/\t\-\t/) { #gene on the bottom strand
$geneID2 = $array2[8]; #Column 9 "Attributes where we have the gene ID"
$geneID2 =~ s/\-mRNA.*$//g;
$geneID2 =~ s/^ID.*Tb/Tb/g;
$hash2{$geneID2} = $array2[3]; #Column 4 "CDS end coord" on the minus strand. Storing it in %hash2
}
$line2 = <FILE2>;
chomp($line2);
}
##############################################################################
################## Working with the 5'UTRs gff file ##########################
##############################################################################
open (FILE1, "$ARGV[0]") or die ("Can't open file $ARGV[0]\n");
my $line1 = <FILE1>;
chomp($line1);
my @array;
my $chr_target; my $geneID;
my $SL;
my $atg_coord; my $fiveUTR; my $fiveUTR_rev; my $CDS_end;
while ($line1 ne "") {
@array = split(/\t/, $line1);
$chr_target = $array[0]; #Column 1 "Chromosome ID"
$geneID = $array[8]; #Column 9 "Attributes where we have the gene ID"
$geneID =~ s/\;.*//g;
$geneID =~ s/^ID.*Tb/Tb/g;
$CDS_end = $hash2{$geneID};
if ($line1 =~ m/\t\+\t/) { #gff is indicating the feature is on top strand (column "+")
$SL = $array[3]; #Column 4 "5UTR start coord"
$atg_coord = $array[4] + 1; #Column 5 "5UTR end coord", actually it is the end of 5UTR (one nt upstream of the ATG)
if ($line1 =~ m/YES$/) { #if the gene has a defined 5UTR (as it is assigned in the end of each line from the gff input file)
#$five_UTR = substr($hash{$chr_target}, $SL - 1, $atg_coord - $SL); #taking the sequence from the SL site to the given atg_coord (5'UTR)
$fiveUTR = substr($hash{$chr_target}, $SL - 1, $CDS_end - $SL); #taking the sequence from the SL site to the CDS end
&CatchORF();
}
else {
print(">$array[8]\t***sbriSLsite is downstream of the canonical ATG***\n");
}
}
elsif ($line1 =~ m/\t\-\t/) { #your gff is indicating the feature is on bottom strand (column "-")
$SL = $array[4]; #Column 5 "5UTR start coord"
$atg_coord = $array[3] - 1; #Column 4 "5UTR end coord", actually it is the end of 5UTR (one nt upstream of the ATG)
if ($line1 =~ m/YES$/) { #if the gene has a defined 5UTR (as it is assigned in the end of each line from the gff input file)
#$five_UTR_rev = substr($hash{$chr_target}, $atg_coord, $SL - $atg_coord); #taking the sequence from the given atg_coord to the SL site (minus strand)
$fiveUTR_rev = substr($hash{$chr_target}, $CDS_end - 1, $SL - $CDS_end + 1); #taking the chromosome sequence until the $SL coord
$fiveUTR = reverse($fiveUTR_rev);
$fiveUTR =~ tr/ATGCNatgcn/TACGNtacgn/;
&CatchORF();
}
else {
print(">$array[8]\t***sbriSLsite is downstream of the canonical ATG***\n");
}
}
$line1 = <FILE1>;
chomp($line1);
}
#############################################
################# Sub-routines ##############
#############################################
sub CatchORF { #Sub-routine to Catch uORFs
if ($fiveUTR ne "") { #Verify whether the sequence is null
my $fiveUTR2 = $fiveUTR; $fiveUTR2 =~ s/^.//g; #frame 2, relative to the string on $fiveUTR
my $fiveUTR3 = $fiveUTR; $fiveUTR3 =~ s/^..//g; #frame 3, relative to the string on $fiveUTR
my @frames = ($fiveUTR, $fiveUTR2, $fiveUTR3);
my @codons; my $m; my $n; my $c = 1;
my $posI; my $posI_temp; my $posF;
my $uorf; my $uStart_coord; my $uStop_coord;
my $rest; my $frame_abs; #frame based on the absolute coordinate. Go to the line 169 and downward
for($j=0;$j<@frames;$j++) {
@codons = ($frames[$j] =~ m/.{3}/g); #spliting the sequence into 3-characters groups (each element of @codons is a codon)
$m = 0;
$n = 0;
for ($i=0;$i<@codons;$i++) {
$posI_temp = ($i * 3) + 1; #Store the initial position of any codon (the first nt of the codon)
if ($line1 =~ m/\t\+\t/) { #Top strand
if ($codons[$i] =~ m/ATG/ig && $n == 0 && $SL + $posI_temp < $atg_coord - 2) { #Search for the match with START on a case insensitive manner, catching only ATG upstream the canonical one
$posI = ($i * 3) + 1; #Store the initial position of the uATG (the first nt of the START codon)
$n = 1;
}
if ($codons[$i] =~ m/T[GA]A|TAG/ig && $m == 0 && $posI =~ m/\d+/) { #Search for the match with STOP on a case insensitive manner
$posF = ($i * 3) + 3; #Store the final position of the STOP (the third nt of the STOP codon)
$m = 1;
}
if ($m == 1) {
last;
}
}
elsif ($line1 =~ m/\t\-\t/) { #Bottom strand
if ($codons[$i] =~ m/ATG/ig && $n == 0 && $SL - $posI_temp > $atg_coord + 2) { #Search for the match with START on a case insensitive manner, catching only ATG upstream the canonical one
$posI = ($i * 3) + 1; #Store the initial position (the first nt of the START codon)
$n = 1;
}
if ($codons[$i] =~ m/T[GA]A|TAG/ig && $m == 0 && $posI =~ m/\d+/) { #Search for the match with STOP on a case insensitive manner
$posF = ($i * 3) + 3; #Store the final position (the third nt of the STOP codon)
$m = 1;
}
if ($m == 1) {
last;
}
}
}
if ($m == 1 && $posI < $posF) {
$uorf = substr($frames[$j], $posI - 1, $posF - $posI + 1); #Catching the uORF sequence
###### The following lines are to catch the absolute coordinates of the uORF within the chromosome
if ($line1 =~ m/\t\+\t/) { ### On the Top strand
if ($j == 0) {
$uStart_coord = $SL + $posI - 1;
$uStop_coord = $SL + $posF - 1;
}
elsif ($j == 1) {
$uStart_coord = $SL + $posI;
$uStop_coord = $SL + $posF;
}
elsif ($j == 2) {
$uStart_coord = $SL + $posI + 1;
$uStop_coord = $SL + $posF + 1;
}
$rest = $uStart_coord % 3; # Taking the rest of the division
if ($rest == 1) {
$frame_abs = "+1";
}
elsif ($rest == 2) {
$frame_abs = "+2";
}
elsif ($rest == 0) {
$frame_abs = "+3";
}
}
if ($line1 =~ m/\t\-\t/) { ### On the Bottom strand
if ($j == 0) {
$uStart_coord = $SL - $posI + 1;
$uStop_coord = $SL - $posF + 1;
}
elsif ($j == 1) {
$uStart_coord = $SL - $posI;
$uStop_coord = $SL - $posF;
}
elsif ($j == 2) {
$uStart_coord = $SL - $posI - 1;
$uStop_coord = $SL - $posF - 1;
}
$rest = (length($hash{$chr_target}) - $uStart_coord + 1) % 3; # Taking the rest of the division
if ($rest == 1) {
$frame_abs = "-1";
}
elsif ($rest == 2) {
$frame_abs = "-2";
}
elsif ($rest == 0) {
$frame_abs = "-3";
}
}
$bp = length($uorf);
print(">$geneID-uORF_$c-frame_$frame_abs $bp bp \($uStart_coord..$uStop_coord\) \($array[6]\)\n$uorf\n");
$c++;
}
$posI = "";
$posF = "";
}
if ($uorf eq "") {
print(">$geneID\t***No uORFs found***\n");
}
$uorf = "";
}
}