-
Notifications
You must be signed in to change notification settings - Fork 10
/
maskedSEQ2bed.pl
128 lines (95 loc) · 2.64 KB
/
maskedSEQ2bed.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/perl -w
#
# maskedSEQ2bed.pl -- sequence related analysis
#
# Author: Nowind
# Created: 2012-02-21
# Updated: 2014-06-04
# Version: 1.0.1
#
# Change logs:
# Version 1.0.0 12/05/24: The initial version.
# Version 1.0.1 14/06/04: Add some comments.
use strict;
use Data::Dumper;
##################### Main ####################
my $CMDLINE = "perl $0 @ARGV";
my $VERSION = '1.0.1';
my $HEADER = "# $CMDLINE\n# Version: $VERSION\n";
unless( @ARGV == 1 ) {
print <<EOF;
$0 -- sequence related analysis
Usage:
perl $0 <fasta file>
EOF
exit(0);
}
$|++;
print STDERR "# $0 v$VERSION\n# " . (scalar localtime()) . "\n";
print STDERR ">> Read in $ARGV[0]...";
my %SEQs = ();
parseFasta(\%SEQs, $ARGV[0]);
print STDERR "done!\n";
#print "$HEADER# " . (scalar localtime()) . "\n";
## BED -- BED type, a general purpose format for representing
## genomic interval data, useful for masks and other interval
## outputs. Please note that the bed format is 0-based (most
## other formats are 1-based).
print STDERR ">> Start analysis sequences...";
for my $id (sort keys %SEQs)
{
my $seq = $SEQs{$id};
#my @nts = split //, $seq;
#
#for my $nt (@nts)
#{
#
#}
## the matching process is really slow and a new solution is urged
while ( $seq =~ m/(N|[a-z])+/g )
{
my $end = (pos $seq); ## BED ends are one-based
my $len = length $&;
my $start = $end - $len; ## BED starts are zero-based
print STDOUT "$id\t$start\t$end\n";
}
}
print STDERR "done!\n";
print STDERR "# " . (scalar localtime()) . "\n";
####################### Sub ###########################
#**************************************************
# Save all the sequences in the fasta
# file into a hash or an array
#**************************************************
sub parseFasta
{
my ($rf_seq, $in, $desc) = @_;
my $match_str = '^(.*?)\s+';
if ($desc) {
if ($desc eq 'full') {
$match_str = '^(.*?)\n'
}
elsif ($desc eq 'gbk') {
$match_str = 'gb\|(.*?)\.';
}
}
open (F, "< $in") or die $!;
my $fas = do { local $/; <F> };
close F;
my @fas = split /\>/, $fas;
shift @fas;
for my $str (@fas)
{
$str =~ /$match_str/;
my $id = $1;
$str =~ s/.*\n//;
#$str =~ s/\>//;
$str =~ s/\s+//g;
if (ref($rf_seq) eq 'HASH') {
$rf_seq->{$id} = $str;
}
elsif (ref($rf_seq) eq 'ARRAY') {
push @{$rf_seq}, $str;
}
}
}