updated phyTools::extract_[intergenic|CDS]_from_genbank as GIs will b…

…e deprecated in Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers
eead-csic-compbio · Jun 23, 2016 · e525725 · e525725
1 parent 972fca0
commit e525725
Show file tree

Hide file tree

Showing 6 changed files with 35 additions and 20 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -138,3 +138,5 @@
 20052016: make_nr_pangenome_matrix.pl can now take all references matching nr clusters (see var $ONLYBESTREFHIT)
 27052016: improved the description of $MIN_PERSEQID_HOM_EST and $MIN_COVERAGE_HOM_EST and added a FAQ
 27052016: improved the description of $MIN_PERSEQID_HOM and $MIN_COVERAGE_HOM and added a FAQ
+23062016: updated phyTools extract_intergenic_from_genbank & extract_CDSs_from_genbank as GIs will be deprecated in Sept2016 (https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers)
+23062016: updated manuals
diff --git a/lib/marfil_homology.pm b/lib/marfil_homology.pm
@@ -3219,7 +3219,7 @@ sub find_OMCL_clusters
 }
 
 # returns: adds all orthologies found by orth all against all comparisons
-# uses globals. %graph,%weight,$bpofile,$last_graph_item
+# uses globals: %graph,%weight,$bpofile,$last_graph_item
 # modified from original orthomcl.pl, last change in May2012
 # jan2015 added $ref_full_sequence_taxa
 sub findAllOrthologiesORTHMCL

diff --git a/lib/phyTools.pm b/lib/phyTools.pm
@@ -620,14 +620,15 @@ sub add_labels2newick_tree
   return join(";\n",split(/;/,$fully_labelled_tree));
 }
 
+# Updated Jun2016
 sub extract_intergenic_from_genbank
 {
-
-# takes a genbank input file and creates a FNA file containing all intergenic sequences found
-# inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
-# use $min_intergenic_size = 0 to skip minimal length tests
-# use $max_intergenic_size = 0 to skip maximum length test
-# use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
+  # takes a genbank input file and creates a FNA file containing all intergenic sequences found
+  # inspired by http://bioperl.org/pipermail/bioperl-l/2006-March/021065.html
+  # use $min_intergenic_size = 0 to skip minimal length tests
+  # use $max_intergenic_size = 0 to skip maximum length test
+  # use $length_flanking_ORFs > 0 if you want to cut oligonucleotides from both flanking ORFs to be used as PCR anchors
+  
   my ($infile,$out_intergenic_file,$min_intergenic_size,$max_intergenic_size,$length_flanking_ORFs) = @_;
   if(!defined($length_flanking_ORFs) || $length_flanking_ORFs < 0){ $length_flanking_ORFs = 0 }
   my ($n_of_intergenic,$gi,$start,$end,$length,$strand,$genename,$taxon) = (0);
@@ -643,17 +644,24 @@ sub extract_intergenic_from_genbank
     $taxon = '';
     for my $f ($seq->get_SeqFeatures)
     {
-      if($f->primary_tag =~ /CDS|rRNA|tRNA/) # campos de 'genes'
+      if($f->primary_tag =~ /CDS|rRNA|tRNA/) # ~genes
       {
+        #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
         $gi = $genename = ''; # compatible con subrutina extract_CDS_from_genbank
-        if($f->has_tag('db_xref'))
+        #if($f->has_tag('db_xref'))
+        #{
+        #  my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
+        #  if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
+        #}
+
+        if($f->has_tag('protein_id'))
         {
-          my $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
-          if($crossrefs =~ /(GI\:\d+)/){ $gi = $1 }
-        }
-        elsif($f->has_tag('locus_tag'))
+          $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
+        } 
+
+        if($f->has_tag('locus_tag') && $gi eq '')
         {
-          if($gi eq '' && $f->has_tag('locus_tag')){ $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag')); }
+          $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
         }
 
         if($f->has_tag('gene'))
@@ -834,7 +842,7 @@ sub extract_features_from_genbank
         }
         elsif($f->has_tag('db_xref') && $gi eq '')
         {
-          $gi = "ID:".join(',',sort $f->each_tag_value('locus_tag'));
+          $gi = "ID:".join(',',sort $f->each_tag_value('db_xref'));
         }
 
         if($f->has_tag('gene'))
@@ -880,9 +888,9 @@ sub extract_features_from_genbank
   return \%already_seen;
 }
 
+# Updated Jun2016
 sub extract_CDSs_from_genbank
 {
-
  # takes a genbank input file and creates two FASTA files containing all CDSs in
  # aminoacid and dna sequences, respectively
  # returns number of CDSs found
@@ -920,12 +928,12 @@ sub extract_CDSs_from_genbank
       }
       elsif($f->primary_tag() =~ /CDS/)
       {
+        #GI deprecated on Sept2016 https://www.ncbi.nlm.nih.gov/news/03-02-2016-phase-out-of-GI-numbers/
         $gene=$gi=$crossrefs=$genelength=$protsequence=$CDSseq=$rev=''; 
         $CDScoords = $f->spliced_seq();
 
         if($f->location->isa('Bio::Location::SplitLocationI'))
         {
-          # Bruno Jul2014
           #LOCUS       NC_002505            2961149 bp    DNA     circular BCT 24-MAY-2010
           #ACCESSION   NC_002505
           #VERSION     NC_002505.1  GI:1564003
@@ -950,13 +958,18 @@ sub extract_CDSs_from_genbank
         {
           $CDSseq = $CDScoords->{'seq'};
         }
-
+        
         if($f->has_tag('db_xref'))
         {
           $crossrefs = join(',',sort $f->each_tag_value('db_xref'));
-          if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
+          #if($crossrefs =~ /(GI\:\d+)/){ $gi = $1; $crossrefs =~ s/$gi// }
           next if($crossrefs =~ /PSEUDO:/); # no sabemos si es universal, funciona con Bradyrizobium_ORS278.gb
         }
+
+        if($f->has_tag('protein_id'))
+        {
+          $gi = "ID:".join(',',sort $f->each_tag_value('protein_id')); #print "$gi\n";
+        }        
         if($f->has_tag('translation'))
         {
           $protsequence = join('',sort $f->each_tag_value('translation'));

diff --git a/manual_get_homologues-est.pdf b/manual_get_homologues-est.pdf
diff --git a/manual_get_homologues.pdf b/manual_get_homologues.pdf
diff --git a/plot_pancore_matrix.pl b/plot_pancore_matrix.pl
@@ -687,7 +687,7 @@ sub plot_pan_genome
 	}
   
 	png(file="$PNGfile");
-	plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20);
+	plot(pan\$genomes,pan\$genes,xaxt='n',xlab='genomes (g)',ylab='pan genome size (genes)',pch=20); #,ylim=c(25000,35000));
 	axis(side=1,at=xaxis_labels);	
 	if(converged == TRUE) 
         {