-
Notifications
You must be signed in to change notification settings - Fork 5
/
SNPtable2eigenstrat.pl
127 lines (109 loc) · 2.71 KB
/
SNPtable2eigenstrat.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/perl
#This only outputs biallelic sites and puts the major allele as reference.
use warnings;
use strict;
use lib '/home/owens/bin'; #For GObox server
my %t;
$t{"N"} = "NN";
$t{"A"} = "AA";
$t{"T"} = "TT";
$t{"G"} = "GG";
$t{"C"} = "CC";
$t{"W"} = "TA";
$t{"R"} = "AG";
$t{"M"} = "AC";
$t{"S"} = "CG";
$t{"K"} = "TG";
$t{"Y"} = "CT";
my %samples;
my @Good_samples;
my %TotalSites;
my %pop;
my %Genotype;
my %samplepop;
my %poplist;
my $in = $ARGV[0];
my $pop = $ARGV[1];
my $out = $ARGV[2];
require "countbadcolumns.pl";
my ($iupac_coding, $badcolumns) = count_bad_columns($in);
$. = 0;
open POP, $pop;
while (<POP>){
chomp;
my @a = split (/\t/,$_);
$pop{$a[0]}=$a[1];
$poplist{$a[1]}++;
}
close POP;
open (GENOFILE, "> $out.eigenstratgeno") or die "Could not open a file\n";
open (SNPFILE, "> $out.snp") or die "Could not open a file\n";
open (INDFILE, "> $out.ind") or die "Could not open a file\n";
open IN, $in;
while (<IN>){
chomp;
my @a = split (/\t/,$_);
if ($. == 1){
foreach my $i ($badcolumns..$#a){ #Get sample names for each column
if ($pop{$a[$i]}){
$samplepop{$i} = $pop{$a[$i]};
print INDFILE "$a[$i]\tU\t$pop{$a[$i]}\n";
}
}
}else{
next if /^\s*$/;
my $snpname = $a[0]."-"."$a[1]";
my $snpchrom = $a[0];
my $snppos = $a[1];
my %BC;
my %BS;
my %total_alleles;
foreach my $i ($badcolumns..$#a){
if ($samplepop{$i}){
$BC{"total"}{"total"}++;
if ($iupac_coding eq "TRUE"){
$a[$i] = $t{$a[$i]};
}
unless (($a[$i] eq "NN")or($a[$i] eq "XX")){
my @bases = split(//, $a[$i]);
$total_alleles{$bases[0]}++;
$total_alleles{$bases[1]}++;
$BC{"total"}{$bases[0]}++;
$BC{"total"}{$bases[1]}++;
$BC{$samplepop{$i}}{$bases[0]}++;
$BC{$samplepop{$i}}{$bases[1]}++;
$BC{"total"}{"Calls"}++;
$BC{$samplepop{$i}}{"Calls"}++;
if($bases[0] ne $bases[1]){
$BC{"total"}{"Het"}++;
$BC{$samplepop{$i}}{"Het"}++;
}
}
}
}
if (keys %total_alleles == 2){
#Sort bases so p is the major allele and q is the minor allele
my @bases = sort { $total_alleles{$a} <=> $total_alleles{$b} } keys %total_alleles ;
#Major allele
my $b1 = $bases[1];
#Minor allele
my $b2 = $bases[0];
print SNPFILE "$snpname\t$snpchrom\t0.0\t$snppos\n";
foreach my $i ($badcolumns..$#a){
if ($samplepop{$i}){
my @bases = split(//, $a[$i]);
if ($bases[0] eq "N"){
print GENOFILE "9";
}elsif (($bases[0] eq $b1) and ($bases[1] eq $b1)){
print GENOFILE "2";
}elsif (($bases[0] ne $b1) and ($bases[1] ne $b1)){
print GENOFILE "0";
}else {
print GENOFILE "1";
}
}
}
print GENOFILE "\n";
}
}
}