parse_pangenome_matrix.pl now checks that taxon names in -A/-B lists match those in pangenome_matrix.tab
eead-csic-compbio · Mar 23, 2016 · fef10c9
1 parent fe5038f
commit fef10c9
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 8 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -106,3 +106,4 @@
 11032016: created pfam_enrich.pl to calculate Pfam-domain enrichment of get_homologues[-est] clusters
 11032016: created sub parse_Pfam_freqs in marfil_homology.pm
 11032016: manuals updated
+23032016: parse_pangenome_matrix.pl now checks that taxon names in -A/-B lists match those in pangenome_matrix.tab
diff --git a/parse_pangenome_matrix.pl b/parse_pangenome_matrix.pl
@@ -131,7 +131,7 @@
 my (%cluster_names,%pangemat,$col,$cluster_dir);
 my (%included_input_filesA,%included_input_filesB);
 my ($n_of_clusters,$n_of_includedA,$n_of_includedB) = (0,0,0);
-my ($outfile_root,$outpanfileA,$outexpanfileA);
+my ($outfile_root,$outpanfileA,$outexpanfileA,$taxon);
 my ($shell_input,$shell_output_png,$shell_output_pdf,$shell_circle_png,$shell_circle_pdf,$shell_estimates);
 my ($cloudlistfile,$shelllistfile,$softcorelistfile,$corelistfile);
 my (@pansetA,@pansetB,@expA,@expB,@shell);
@@ -208,11 +208,16 @@
   while(<INCL>)
   {
     next if(/^#/ || /^$/);
-    $included_input_filesB{(split)[0]} = 1;
+    $taxon = (split)[0];
+    $included_input_filesB{$taxon} = 1;
+    if(!$pangemat{$taxon})
+    {
+      die "# cannot match $taxon in $INP_matrix (included in $INP_includeB)\n";
+    }
   }
   close(INCL);
   $n_of_includedB = scalar(keys(%included_input_filesB));
-  print "# taxa included in group B = $n_of_includedB\n\n";
+  print "# taxa included in group B = $n_of_includedB\n\n";  
 }
 elsif($needAB)
 {
@@ -221,7 +226,12 @@
   while(<INCL>)
   {
     next if(/^#/ || /^$/);
-    $included_input_filesA{(split)[0]} = 1;
+    $taxon = (split)[0];
+    $included_input_filesA{$taxon} = 1;
+    if(!$pangemat{$taxon})
+    {
+      die "# cannot match $taxon in $INP_matrix (included in $INP_includeA)\n";
+    }
   }
   close(INCL);
   $n_of_includedA = scalar(keys(%included_input_filesA));
@@ -232,21 +242,28 @@
   while(<INCL>)
   {
     next if(/^#/ || /^$/);
-    $included_input_filesB{(split)[0]} = 1;
+    $taxon = (split)[0];
+    $included_input_filesB{$taxon} = 1;
+    if(!$pangemat{$taxon})
+    {
+      die "# cannot match $taxon in $INP_matrix (included in $INP_includeB)\n";
+    } 
   }
   close(INCL);
   $n_of_includedB = scalar(keys(%included_input_filesB));
   print "# taxa included in group B = $n_of_includedB\n\n";
 }
 
+
+
 ## 3) perform requested operations
 if($INP_absentB)
 {
   print "\n# finding genes which are absent in B ...\n";
   foreach $col (1 .. $n_of_clusters)
   {
     my ($presentA,$absentA,$absentB,$presentB) = (0,0,0,0);
-    foreach my $taxon (keys(%pangemat))
+    foreach $taxon (keys(%pangemat))
     {
       if($pangemat{$taxon}[$col])
       {
@@ -287,7 +304,7 @@
   foreach $col (1 .. $n_of_clusters)
   {
     my ($presentA,$absentA,$absentB,$presentB) = (0,0,0,0);
-    foreach my $taxon (keys(%pangemat))
+    foreach $taxon (keys(%pangemat))
     {
       if($pangemat{$taxon}[$col])
       {
@@ -340,7 +357,7 @@
   {
     my ($presentA,$presentB,@sizeA,@sizeB) = (0,0);
     my ($minA,$maxA,$minB,$maxB);
-    foreach my $taxon (keys(%pangemat))
+    foreach $taxon (keys(%pangemat))
     {
       if($pangemat{$taxon}[$col])
       {