forked from eead-csic-compbio/get_homologues
-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_pancore_matrices.pl
executable file
·122 lines (101 loc) · 3.08 KB
/
add_pancore_matrices.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env perl
# 2012-8 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)
# This script can be used to add pan/core data from .tab files generated by get_homologues.pl -c ,
# for instance to combine CDS and -a 'rRNA,tRNA' results
$|=1;
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use FindBin '$Bin';
use lib "$Bin/lib";
use lib "$Bin/lib/bioperl-1.5.2_102/";
use phyTools;
my ($INP_tabfile1,$INP_tabfile2,$INP_out_tabfile,%opts) = ('','','');
getopts('hi:j:o:', \%opts);
if(($opts{'h'})||(scalar(keys(%opts))==0))
{
print "\n[options]: \n";
print "-h \t this message\n";
print "-i \t first input .tab data file (required, generated by get_homologues.pl)\n";
print "-j \t second input .tab data file (required, generated by get_homologues.pl)\n";
print "-o \t name of output .tab file (required)\n";
exit(-1);
}
if(defined($opts{'i'}))
{
$INP_tabfile1 = $opts{'i'};
}
else{ die "\n# $0 : need -i parameter, exit\n"; }
if(defined($opts{'j'}))
{
$INP_tabfile2 = $opts{'j'};
}
else{ die "\n# $0 : need -j parameter, exit\n"; }
if(defined($opts{'o'}))
{
$INP_out_tabfile = $opts{'o'};
}
else{ die "\n# $0 : need -o parameter, exit\n"; }
printf("\n# %s -i %s -j %s -o %s\n",$0,$INP_tabfile1,$INP_tabfile2,$INP_out_tabfile);
#################################### MAIN PROGRAM ################################################
my ($total1,$total2,@table1,@table2) = (-1,-1); # first line contains headers
my ($genomes1,$genomes2) = (0,0);
# 1) read input file i
open(DATAI,$INP_tabfile1) || die "# add_pancore_matrices : cannot read $INP_tabfile1\n";
while(<DATAI>)
{
#g1 g2 ...
#2 4 ...
chomp($_);
my @data = split(/\t/,$_);
if(!$genomes1){ $genomes1 = scalar(@data) }
push(@table1,\@data);
$total1++;
}
close(DATAI);
# 1) read input file j
open(DATAJ,$INP_tabfile2) || die "# add_pancore_matrices : cannot read $INP_tabfile2\n";
while(<DATAJ>)
{
chomp($_);
my @data = split(/\t/,$_);
if(!$genomes2){ $genomes2 = scalar(@data) }
push(@table2,\@data);
$total2++;
}
close(DATAJ);
# 3) check dimensions
if($total1 != $total2)
{
die "# $0 : number of samples in $INP_tabfile1 and $INP_tabfile2 does not match, please check input\n";
}
else
{
print "# number of samples = $total1\n";
}
if($genomes1 != $genomes2)
{
die "# $0 : number of genomes in $INP_tabfile1 and $INP_tabfile2 does not match, please check input\n";
}
else
{
print "# number of genomes = $genomes1\n";
}
# 4) add matrices
open(OUT,">$INP_out_tabfile\n") || die "# $0 : cannot create $INP_out_tabfile\n";
foreach my $i (0 .. $total1)
{
if($i == 0)
{
foreach my $j (0 .. $genomes1-1){ print OUT "$table1[$i][$j]\t"; } print OUT "\n";
}
else
{
foreach my $j (0 .. $genomes1-1){ printf OUT ("%d\t",$table1[$i][$j]+$table2[$i][$j]); } print OUT "\n";
}
}
close(OUT);
print "# ouput matrix file = $INP_out_tabfile\n";