forked from z0on/2bRAD_denovo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
alleleCounts.pl
executable file
·129 lines (112 loc) · 2.81 KB
/
alleleCounts.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/perl
my $usage = "
alleleCounts.pl:
Uses vcf file to count alleles for each locus across populations.
Can filter counted genotypes by genotype quality.
Only counts bialelic loci.
Arguments:
vcf=[file name] : input vcf v.4 file
pops=[file name] : tab-delimited table: sample name <tab> population affiliation
minGQ=[integer] : minimum phred-scaled genotype quality to count alleles. Default 0.
prints to STDOUT:
locus number
chromosome
position
ref allele
alt allele
population
ref allele count
alt allele count
number of missing alleles
Mikhail V. Matz, matz/@utexas.edu
";
my $mingq=0;
my $vcf;
my $pops;
if ("@ARGV"=~/vcf=(\S+)/) { $vcf=$1; } else { die $usage;}
if ("@ARGV"=~/pops=(\S+)/) { $pops=$1; } else { die $usage;}
if ("@ARGV"=~/minGQ=(\d+)/) { $mingq=$1; }
open VCF, "$vcf" or die "cannot open input vcf file $vcf\n";
open POPS, "$pops" or die "cannot open input populations file $pops\n";
my %inds2pops={};
my %pops2inds={};
while (<POPS>) {
chop;
(my $ind,my $pop)=split('\t',$_);
$inds2pops{$ind}=$pop;
push @{$pops2inds{$pop}}, $ind;
}
my @Pops = keys %pops2inds ;
my $GQpos=-1;
my %num2pops={};
my %pops2num={};
my $totals=0;
print "CHROM\tPOS\tREF\tALT\tPOP\tREFC\tALTC\tMISSC\n";
while (<VCF>) {
if ($_=~/^#CHROM/) {
chop;
my @lin=split("\t",$_);
my @start=splice(@lin,0,9);
for(my $i=0;$lin[$i];$i++) {
if ($inds2pops{$lin[$i]}) {
$num2pops{$i}=$inds2pops{$lin[$i]};
push @{$pops2num{$inds2pops{$lin[$i]}}}, $i;
}
else { warn "\t no pop affiliation found for $lin[$i]\n";}
}
next;
}
if ($_=~/^#/) { next;}
chop;
my @lin=split("\t",$_);
my @start=splice(@lin,0,9);
if ($start[4]=~/\,/) { next;} # multi-allelic SNP
if ($GQpos==-1 && $mingq>0) {
#warn "format: $start[8]\n";
if ($start[8]=~/GQ/) {
my @info=split('\:',$start[8]);
my $i;
for ($i=0;$info[$i];$i++){
if ($info[$i] eq "GQ") { last;}
}
$GQpos=$i;
#warn "GQpos:$GQpos\n";
}
else {
warn "\tno GQ field found\n";
$mingq=0;
}
}
my %popRef={};
my %popAlt={};
my %popMiss={};
$totals++;
for($i=0;$lin[$i];$i++){
my $pop = $num2pops{$i};
#warn "$i:pop $pop; geno $lin[$i]\n";
my @geno=split("\:",$lin[$i]);
if (($mingq>0 && $geno[$GQpos]<$mingq) || $geno[0] eq "./." ) {
$popMiss{$pop}+=2;
}
else {
if ($geno[0] eq "0/0" || $geno[0] eq "0|0" ) {
$popRef{$pop}+=2;
}
elsif ($geno[0] eq "1/1" || $geno[0] eq "1|1" ) {
$popAlt{$pop}+=2;
}
elsif ($geno[0] eq "0/1" || $geno[0] eq "0|1" ) {
$popRef{$pop}++;
$popAlt{$pop}++;
}
}
}
foreach my $p (@Pops) {
next if ($p=~/HASH/);
if (!$popRef{$p}){ $popRef{$p}=0;}
if (!$popAlt{$p}){ $popAlt{$p}=0;}
if (!$popMiss{$p}){ $popMiss{$p}=0;}
print "$totals\t$start[0]\t$start[1]\t$start[3]\t$start[4]\t$p\t$popRef{$p}\t$popAlt{$p}\t$popMiss{$p}\n";
}
}
warn "total $totals loci\n";