-
Notifications
You must be signed in to change notification settings - Fork 0
/
gbGeneFeat2fasta.pl
107 lines (102 loc) · 3.75 KB
/
gbGeneFeat2fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl
# Programmer: Elton Vasconcelos (01/Nov/2018)
# Script that generates a fasta file for all gene features within a genbank file
# Usage: perl gbGeneFeat2fasta.pl [infile.gb] [infile.fasta] >gene_features.fasta
#################################################################################################
#IMPORTANT NOTE: New version of gb files doesn't have the actual whole sequences. NCBI is providing a separate fasta file for the whole sequences of each gb LOCUS entry. Usually the files have the same name. For instance, see the file names' structure in https://ftp.ncbi.nih.gov/refseq/release/bacteria/. Therefore, this script takes both *.gbff and *.fna files as inputs.
if (@ARGV != 2) {
die ("**Error**\nThe cmd line must contain 2 input files as arguments:\n\$perl gbGeneFeat2fasta.pl [infile.gb] [infile.fna] >gene_features.fasta\nRead script's initial commented lines for a better explanation\n");
}
########################################################
############## Working on infile.fna ###################
########################################################
open (FILE2, "$ARGV[1]") or die ("Can't open file $ARGV[1]\n");
my $line2 = <FILE2>;
chomp($line2);
my $seq;
my $ctg; #genomic contig
my %hash;
while ($line2 ne "") {
if ($line2 =~ m/^>\S+/) {
$ctg = $&; #catching the contig ID
$ctg =~ s/^>//;
$line2 = <FILE2>;
chomp($line2);
until ($line2 =~ m/^>/ || $line2 eq "") {
$seq .= $line2; #putting the whole contig sequence in one single line
$line2 = <FILE2>;
chomp($line2);
}
$hash{$ctg} = $seq; #Creating a hash containing each chromosome in the file
$seq = "";
}
else { #Just in case the FASTA file is not starting with a headline (^>.+)
$line2 = <FILE2>;
chomp($line2);
}
}
########################################################
############# Working on infile.gbff ###################
########################################################
open (FILE1, "$ARGV[0]") or die ("Can't open file $ARGV[0]\n");
my ($version, $org, $gene_seq, $coord, $start, $end, $dist, $locus_tag, $product);
while(<FILE1>) {
chomp($_);
if ($_ =~ m/^VERSION\s+.*/) {
$version = $&; #should be the same as genomic contig ID in the fasta input
$version =~ s/^VERSION\s+//g;
}
if ($_ =~ m/^\s+\/organism\=.*/) {
$org = $&;
$org =~ s/^\s+\/organism\=//g; #storing organism's name
$org =~ s/\"//g;
$org =~ s/ +/_/g;
}
if ($_ =~ m/^ {5}gene\s+.*/) {
if ($_ !~ m/complement/) { #gene on sense orientation
$coord = $_;
$coord =~ s/\s+gene\s+//g;
$coord =~ s/[<>]//g;
$start = $coord;
$start =~ s/\.\.\d+//g;
$end = $coord;
$end =~ s/\d+\.\.//g;
$dist = $end - $start + 1;
$gene_seq = substr($hash{$version}, $start - 1, $dist);
}
else { #gene on anti-sense orientation
$coord = $_;
$coord =~ s/\s+gene\s+complement\(//g;
$coord =~ s/\)//g;
$coord =~ s/[<>]//g;
$start = $coord;
$start =~ s/\.\.\d+//g;
$end = $coord;
$end =~ s/\d+\.\.//g;
$dist = $end - $start + 1;
$gene_seq = reverse(substr($hash{$version}, $start - 1, $dist));
$gene_seq =~ tr/ATGCNatgcn/TACGNtacgn/;
}
}
if ($_ =~ m/^\s+\/locus_tag\=.*/) {
$locus_tag = $&;
$locus_tag =~ s/^\s+\/locus_tag\=//g; #storing locus_tag ID
$locus_tag =~ s/\"//g;
}
if ($_ =~ m/^\s+\/product\=.*/) { #last important description of a gene feature
$product = $&;
$product =~ s/^\s+\/product\=//g; #storing organism's name
$product =~ s/\"//g;
$product =~ s/ +/_/g;
print(">$locus_tag $org $product $dist bp\n$gene_seq\n");
$coord = "";
$dist = "";
$gene_seq = "";
$locus_tag = "";
$product = "";
}
if ($_ =~ m/^\/\//) {
$version = "";
$org = "";
}
}