-
Notifications
You must be signed in to change notification settings - Fork 20
/
mergeKmers.pl
executable file
·151 lines (131 loc) · 3.32 KB
/
mergeKmers.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env perl
my $usage="
mergeKmers.pl :
Generates counts table from multiple kmer fasta files produced by Jellyfish.
To go from fastq to kmer counts (for one file):
fastq_to_fasta -i mydata1.fastq -o mydata1.fasta
jellyfish count -m [kmer length] -s 100M -t [number of threads] -C mydata1.fasta -o mydata1.jf
jellyfish dump mydata1.jf > mydata1_kmers.fa
arg1: list of kmer fasta files
minDP=[integer] minimum total sequencing depth to consider a uniqie kmer. Default 5.
minInd=[integer] minimum number of individuals the kmer must be seen in. Default 2.
maxInd=[integer] maximum number of individuals the kmer must be seen in. Default:
all individuals. Set this to twice per-pop sample
size for kmer sharing analysis.
prints to STDOUT
Example:
mergeKmers.pl kmerfiles minDP=5 minInd=5 > kmers.txt
";
#sub rcom {
# my $rcs=scalar reverse ("$_[0]");
# $rcs=lc $rcs;
# $rcs=~s/a/T/g;
# $rcs=~s/t/A/g;
# $rcs=~s/g/C/g;
# $rcs=~s/c/G/g;
## $rcs=~s/![atgcATGC]/N/g;
# return $rcs;
#}
my $mincount=5;
my $minind=2;
if ("@ARGV"=~/minDP=(\d+)/) { $mincount=$1;}
if ("@ARGV"=~/minInd=(\d+)/) { $minind=$1;}
my $flist=shift or die $usage;
open FL, $flist or die "cannot open file list $flist\n";
my @ins;
while (<FL>){
chomp;
push @ins, $_;
}
my $maxind=-1;
if ("@ARGV"=~/maxInd=(\d+)/) { $maxind=$1;}
my $tag;
my $tot;
my $ind;
my $counts;
my $now;
my %dstring={};
my @allins=sort @ins;
$reff=shift(@ins);
open INP, $reff or die "cannot open file $reff\n";
$now=localtime;
warn "reading input files... $now\n";
warn "$reff\n";
while (<INP>) {
chomp;
if ($_=~/^>(\d+)/) {
$counts=$1;
}
else {
$tag=$_;
@{$dstring{$tag}}=($counts,$reff,$counts);
}
}
foreach $gmap (@ins) {
warn "$gmap\n";
$frst=1;
$now=localtime;
open INP, $gmap or die "cannot open file $gmap\n";
while (<INP>) {
chomp;
if ($_=~/^>(\d+)/) {
$counts=$1;
}
else {
$tag=$_;
# if (!$dstring{$tag}) {
# my $rc=rcom($tag);
# if ($dstring{$rc}){
# $tag=$rc;
# }
# }
if (!$dstring{$tag}) { @{$dstring{$tag}}=($counts,$gmap,$counts);}
else {
$tot=$counts+${$dstring{$tag}}[0];
$ind=${$dstring{$tag}}[1].",".$gmap;
$counts=${$dstring{$tag}}[2].",".$counts;
@{$dstring{$tag}}=($tot,$ind,$counts);
}
}
}
}
$now=localtime;
warn "processing: $now\n";
sub bycount {
${$dstring{$b}}[0] <=> ${$dstring{$a}}[0]
}
my @seqs=keys %dstring;
@seqs= sort bycount @seqs;
my $globalcount=0;
my $passedcount=0;
my $indexx=0;
print "id\tkmer\tcount\t",join("\t",@allins),"\n";
foreach $seq (@seqs) {
next if ($seq=~/HASH/ || $seq!~/[ATGC]+/);
last if (${$dstring{$seq}}[0]<$mincount);
$globalcount+=${$dstring{$seq}}[0];
$indexx++;
my @cts=split(",",pop @{$dstring{$seq}});
my @inds=split(",", pop @{$dstring{$seq}});
if ($#inds+1<$minind) { next; }
elsif ($#inds+1>$maxind and $maxind>0) { next;}
my @ctall=();
my $skip=0;
for ($i=0;$ind=$allins[$i];$i++) {
if ($inds[$i-$skip] eq $ind) {
push @ctall, $cts[$i-$skip];
}
else {
push @ctall, 0;
$skip++;
}
}
$passedcount++;
push @{$dstring{$seq}}, @ctall;
print "k$indexx\t$seq\t",join("\t",@{$dstring{$seq}}),"\n";
undef $dstring{$seq};
}
warn "\nconsidering kmers seen at least $mincount times:$globalcount\n";
warn "passing minDP:$mincount minInd:$minind maxInd:$maxind filters: $passedcount\n";
$now=localtime;
warn "done $now\n\n";