add_pancore_matrices.pl

#!/usr/bin/env perl

# 2012-8 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

# This script can be used to add pan/core data from .tab files generated by get_homologues.pl -c ,
# for instance to combine CDS and -a 'rRNA,tRNA' results

$|=1;

use strict;
use warnings;
use File::Basename;
use Getopt::Std;
use FindBin '$Bin';
use lib "$Bin/lib";
use lib "$Bin/lib/bioperl-1.5.2_102/";
use phyTools;

my ($INP_tabfile1,$INP_tabfile2,$INP_out_tabfile,%opts) = ('','','');

getopts('hi:j:o:', \%opts);

if(($opts{'h'})||(scalar(keys(%opts))==0))
{
  print   "\n[options]: \n";
  print   "-h \t this message\n";
  print   "-i \t first input .tab data file                           (required, generated by get_homologues.pl)\n";
  print   "-j \t second input .tab data file                          (required, generated by get_homologues.pl)\n";
  print   "-o \t name of output .tab file                             (required)\n";
  exit(-1);
}

if(defined($opts{'i'}))
{
  $INP_tabfile1 = $opts{'i'};
}
else{ die "\n# $0 : need -i parameter, exit\n"; }

if(defined($opts{'j'}))
{
  $INP_tabfile2 = $opts{'j'};
}
else{ die "\n# $0 : need -j parameter, exit\n"; }

if(defined($opts{'o'}))
{
  $INP_out_tabfile = $opts{'o'};
}
else{ die "\n# $0 : need -o parameter, exit\n"; }

printf("\n# %s -i %s -j %s -o %s\n",$0,$INP_tabfile1,$INP_tabfile2,$INP_out_tabfile);

#################################### MAIN PROGRAM  ################################################

my ($total1,$total2,@table1,@table2) = (-1,-1); # first line contains headers
my ($genomes1,$genomes2) = (0,0);

# 1) read input file i
open(DATAI,$INP_tabfile1) || die "# add_pancore_matrices : cannot read $INP_tabfile1\n";
while(<DATAI>)
{

  #g1	g2	...
  #2	4	...
  chomp($_);

  my @data = split(/\t/,$_);
  if(!$genomes1){ $genomes1 = scalar(@data) }
  push(@table1,\@data);
  $total1++;
}
close(DATAI);

# 1) read input file j
open(DATAJ,$INP_tabfile2) || die "# add_pancore_matrices : cannot read $INP_tabfile2\n";
while(<DATAJ>)
{
  chomp($_);
  my @data = split(/\t/,$_);
  if(!$genomes2){ $genomes2 = scalar(@data) }
  push(@table2,\@data);
  $total2++;
}
close(DATAJ);

# 3) check dimensions
if($total1 != $total2)
{
  die "# $0 : number of samples in $INP_tabfile1 and $INP_tabfile2 does not match, please check input\n";
}
else
{
  print "# number of samples = $total1\n";
}

if($genomes1 != $genomes2)
{
  die "# $0 : number of genomes in $INP_tabfile1 and $INP_tabfile2 does not match, please check input\n";
}
else
{
  print "# number of genomes = $genomes1\n";
}

# 4) add matrices
open(OUT,">$INP_out_tabfile\n") || die "# $0 : cannot create $INP_out_tabfile\n";
foreach my $i (0 .. $total1)
{
  if($i == 0)
  {
    foreach my $j (0 .. $genomes1-1){ print OUT "$table1[$i][$j]\t"; } print OUT "\n";
  }
  else
  {
    foreach my $j (0 .. $genomes1-1){ printf OUT ("%d\t",$table1[$i][$j]+$table2[$i][$j]); } print OUT "\n";
  }
}
close(OUT);

print "# ouput matrix file = $INP_out_tabfile\n";