From 060a07b8ed59c7017646370d945552d8b486184a Mon Sep 17 00:00:00 2001
From: qiyunzhu
Date: Tue, 22 Aug 2017 17:01:14 -0700
Subject: [PATCH 1/9] replaced tab with 4 spaces, removed reliance on outdated
NCBI GI
---
ChangeLog | 10 +
HGTector.pl | 220 +--
scripts/analyzer.pl | 2645 +++++++++++++++++----------------
scripts/bbh.pl | 309 ++--
scripts/orthologer.pl | 353 +++--
scripts/reporter.pl | 1333 +++++++++--------
scripts/searcher.pl | 3278 ++++++++++++++++++++---------------------
scripts/treer.pl | 847 ++++++-----
8 files changed, 4471 insertions(+), 4524 deletions(-)
mode change 100755 => 100644 HGTector.pl
mode change 100755 => 100644 scripts/analyzer.pl
mode change 100755 => 100644 scripts/bbh.pl
mode change 100755 => 100644 scripts/orthologer.pl
mode change 100755 => 100644 scripts/reporter.pl
mode change 100755 => 100644 scripts/searcher.pl
mode change 100755 => 100644 scripts/treer.pl
diff --git a/ChangeLog b/ChangeLog
index 8b569cc..f390080 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
ChangeLog of HGTector
+== Version 0.2.2 (8/22/2017) ==
+
+New features:
+- Adopt the new NCBI format (accession instead of GI as sequence identifier)
+
+Bug fixes:
+- Reformatted all Perl scripts using Perltidy.
+- Replaced 'http' in NCBI URLs with 'https'.
+
+
== Incremental update (1/28/2017) ==
Bug fixes:
- Fixed a bug in databaser.py which incorrectly deals the case with subsampling off.
diff --git a/HGTector.pl b/HGTector.pl
old mode 100755
new mode 100644
index a2ec136..f2dfceb
--- a/HGTector.pl
+++ b/HGTector.pl
@@ -1,6 +1,6 @@
#!/usr/bin/perl
-# HGTector (version 0.2.1): Genome-wide detection of horizontal gene transfer based on sequence similarity search result distribution statistics
+# HGTector (version 0.2.2): Genome-wide detection of horizontal gene transfer based on sequence similarity search result distribution statistics
# Copyright (C) 2013-2017, Qiyun Zhu, Katharina Dittmar. All rights reserved.
# Licensed under BSD 2-clause license.
@@ -11,9 +11,9 @@
$| = 1;
print "
-┌---------------------┐
-| HGTector v0.2.1 |
-└---------------------┘
++---------------------+
+| HGTector v0.2.2 |
++---------------------+
";
my $usage = "
@@ -35,116 +35,118 @@
my $scripts = abs_path($0); $scripts =~ s/HGTector\.pl$/scripts/;
my $wkDir = abs_path($0); $wkDir =~ s/HGTector\.pl$/sample/;
-if (not @ARGV or $ARGV[0] eq "-h" or $ARGV[0] eq "--help"){
- if (-d $db){
- print $usage;
- }else{
- print "\nWelcome! This seems to be the first time you run HGTector on this device.\n";
- print "I will guide you through the basics of the program.\n";
- print "\nWould you like to test on a small sample dataset (yes/NO)? ";
- my $s = ; chomp $s;
- if ($s =~ /^yes$/i or $s =~ /^y$/i){
- print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\nProceed (yes) or opt out (NO): ";
- $s = ; chomp $s;
- if ($s =~ /^yes$/i or $s =~ /^y$/i){
- pipeline;
- }
- }
- print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
- print "\n";
- print "Now I will assist you to create necessary databases for future analyses.\nProceed (YES/no)? ";
- $s = ; chomp $s;
- if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i){
- print "HGTector needs three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for effective analyses. Please read about the details of choice of databases in the GUI.\n";
- print "Since release 0.2.0, a Python script databaser.py is implemented to automate the database construction process.\n";
- die "Error: databaser.py is not found in the scripts/ subdirectory.\n" unless -e "$scripts/databaser.py";
- print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
- $s = ; chomp $s; $s =~ s/\/$//;
- $db = $s if $s;
- mkdir $db;
- chdir $db;
- print "\nBy default, I will download NCBI RefSeq proteomic data of microbial organisms (archaea, bacteria, fungi and protozoa), keep one proteome per species, plus all NCBI-defined representative proteomes. The data will be merged into a master multi-Fasta file, and compiled into a BLAST database.\n";
- print "As of 2015-10-25, the database contained around 30 million non-redundant protein sequences from more than 8000 organisms. Depending on the Internet connection quality, it may take up to 12 hr to download and compile.\n";
- my ($format, $range, $represent, $subsample, $out) = ("blast", "microbe", "auto", "1", "stdb");
- print "Accept (YES) or change options (no): ";
- $s = ; chomp $s;
- if ($s =~ /^no$/i or $s =~ /^n$/i){
- print "\nNCBI RefSeq genomes are categorized as: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral. Type one or more desired categories separated by comma, or press Enter to accept the default (microbe): ";
- $s = ; chomp $s; $s =~ s/\s+//g;
- $range = $s if $s;
- print "\nI will keep one representative organism per species by default, in order to save computing time. Otherwise, enter the number of organisms you want to keep, or type '0' to keep all organisms (no subsampling): ";
- $s = ; chomp $s;
- if ($s eq "0"){ $subsample = "0"; }
- elsif ($s and $s =~ /^\d+$/){ $subsample = $s; }
- print "\nAll NCBI-defined representative organisms will be kept. Press Enter to accept, type 'no' to opt out, or enter a file name that contains a list of TaxID you wish to keep: ";
- $s = ; chomp $s;
- $represent = $s if $s;
- print "\nAfter downloading the proteomic data, do you want to build a BLAST database? Note: you need makeblastdb from the NCBI-BLAST+ package to enable this function. You may also build database manually using tools of choice later. (YES/no) ";
- $s = ; chomp $s;
- $format = "none" if ($s =~ /^no$/i or $s =~ /^n$/i);
- print "\nEnter a stem file name for your database (default: 'stdb'): ";
- $s = ; chomp $s;
- $out = $s if $s;
- }
- my $i = system "python $scripts/databaser.py -format=$format -range=$range -represent=$represent -subsample=$subsample -out=$out";
- chdir $cwd;
- print "\nThere appear to be some problems in database building.\n" unless -s "$db/$out.faa";
- print "\nNow that databases are created. Your may link HGTector to them in future analyses, by setting the following parameters in config.txt:\n";
- if ($format eq "blast"){ print "protdb=$db/blast/$out\n"; }
- else{ print "# You still need to build a searchable database based on $db/$out.faa"; }
- print "taxdump=$db/taxdump\n";
- print "prot2taxid=$db/gi2taxid.txt\n";
- print "\n";
- }
- print "You may start to use HGTector to analyze your data.\n";
- print $usage;
- }
-}else{
- $wkDir = $ARGV[0];
- pipeline;
+if (not @ARGV or $ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
+ if (-d $db) {
+ print $usage;
+ } else {
+ print "\nWelcome! This seems to be the first time you run HGTector on this device.\n";
+ print "I will guide you through the basics of the program.\n";
+ print "\nWould you like to test on a small sample dataset (yes/NO)? ";
+ my $s = ; chomp $s;
+ if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+ print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\nProceed (yes) or opt out (NO): ";
+ $s = ; chomp $s;
+ if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+ pipeline;
+ }
+ }
+ print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
+ print "\n";
+ print "Now I will assist you to create necessary databases for future analyses.\nProceed (YES/no)? ";
+ $s = ; chomp $s;
+ if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i) {
+ print "HGTector needs three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for effective analyses. Please read about the details of choice of databases in the GUI.\n";
+ print "Since release 0.2.0, a Python script databaser.py is implemented to automate the database construction process.\n";
+ die "Error: databaser.py is not found in the scripts/ subdirectory.\n" unless -e "$scripts/databaser.py";
+ print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
+ $s = ; chomp $s; $s =~ s/\/$//;
+ $db = $s if $s;
+ mkdir $db;
+ chdir $db;
+ print "\nBy default, I will download NCBI RefSeq proteomic data of microbial organisms (archaea, bacteria, fungi and protozoa), keep one proteome per species, plus all NCBI-defined representative proteomes. The data will be merged into a master multi-Fasta file, and compiled into a BLAST database.\n";
+ print "As of 2015-10-25, the database contained around 30 million non-redundant protein sequences from more than 8000 organisms. Depending on the Internet connection quality, it may take up to 12 hr to download and compile.\n";
+ my ($format, $range, $represent, $subsample, $out) = ("blast", "microbe", "auto", "1", "stdb");
+ print "Accept (YES) or change options (no): ";
+ $s = ; chomp $s;
+ if ($s =~ /^no$/i or $s =~ /^n$/i) {
+ print "\nNCBI RefSeq genomes are categorized as: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral. Type one or more desired categories separated by comma, or press Enter to accept the default (microbe): ";
+ $s = ; chomp $s; $s =~ s/\s+//g;
+ $range = $s if $s;
+ print "\nI will keep one representative organism per species by default, in order to save computing time. Otherwise, enter the number of organisms you want to keep, or type '0' to keep all organisms (no subsampling): ";
+ $s = ; chomp $s;
+ if ($s eq "0") { $subsample = "0"; }
+ elsif ($s and $s =~ /^\d+$/) { $subsample = $s; }
+ print "\nAll NCBI-defined representative organisms will be kept. Press Enter to accept, type 'no' to opt out, or enter a file name that contains a list of TaxID you wish to keep: ";
+ $s = ; chomp $s;
+ $represent = $s if $s;
+ print "\nAfter downloading the proteomic data, do you want to build a BLAST database? Note: you need makeblastdb from the NCBI-BLAST+ package to enable this function. You may also build database manually using tools of choice later. (YES/no) ";
+ $s = ; chomp $s;
+ $format = "none" if ($s =~ /^no$/i or $s =~ /^n$/i);
+ print "\nEnter a stem file name for your database (default: 'stdb'): ";
+ $s = ; chomp $s;
+ $out = $s if $s;
+ }
+ my $i = system "python $scripts/databaser.py -format=$format -range=$range -represent=$represent -subsample=$subsample -out=$out";
+ chdir $cwd;
+ print "\nThere appear to be some problems in database building.\n" unless -s "$db/$out.faa";
+ print "\nNow that databases are created. Your may link HGTector to them in future analyses, by setting the following parameters in config.txt:\n";
+ if ($format eq "blast") {
+ print "protdb=$db/blast/$out\n";
+ } else {
+ print "# You still need to build a searchable database based on $db/$out.faa";
+ }
+ print "taxdump=$db/taxdump\n";
+ print "prot2taxid=$db/gi2taxid.txt\n";
+ print "\n";
+ }
+ print "You may start to use HGTector to analyze your data.\n";
+ print $usage;
+ }
+} else {
+ $wkDir = $ARGV[0];
+ pipeline;
}
exit 0;
sub pipeline {
- print "\nValidating task...\n";
- die "Error: Invalid working directory $wkDir.\n" unless -d $wkDir;
- my $interactive = 1;
- if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt" or die "Error: Invalid configuration file $wkDir/config.txt.\n";
- while (){
- s/#.*$//; s/\s+$//; s/^\s+//; next unless $_;
- $interactive = 0 if /^interactive=0$/;
- }
- }else{
- print "Warning: Configuration file $wkDir/config.txt is not found. HGTector will use default settings.\n";
- if ($interactive){
- print "Press Enter to proceed, or Ctrl+C to exit:";
- my $s = ;
- }
- }
- die "Error: Input folder $wkDir/input is not found. Please prepare input data before running HGTector.\n" unless -d "$wkDir/input";
- print "Done.\n\n";
+ print "\nValidating task...\n";
+ die "Error: Invalid working directory $wkDir.\n" unless -d $wkDir;
+ my $interactive = 1;
+ if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt" or die "Error: Invalid configuration file $wkDir/config.txt.\n";
+ while () {
+ s/#.*$//; s/\s+$//; s/^\s+//; next unless $_;
+ $interactive = 0 if /^interactive=0$/;
+ }
+ } else {
+ print "Warning: Configuration file $wkDir/config.txt is not found. HGTector will use default settings.\n";
+ if ($interactive) {
+ print "Press Enter to proceed, or Ctrl+C to exit:";
+ my $s = ;
+ }
+ }
+ die "Error: Input folder $wkDir/input is not found. Please prepare input data before running HGTector.\n" unless -d "$wkDir/input";
+ print "Done.\n\n";
- print "Step 1: Searcher - batch protein sequence homology search.\n";
- die "Error: searcher.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/searcher.pl";
- my $i = system "$^X $scripts/searcher.pl $wkDir";
- die "Error: Execution of searcher.pl failed. HGTector exists.\n" if $i;
- die "Error: No search result detected. HGTector exists.\n" unless -d "$wkDir/search";
- print "\n";
+ print "Step 1: Searcher - batch protein sequence homology search.\n";
+ die "Error: searcher.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/searcher.pl";
+ my $i = system "$^X $scripts/searcher.pl $wkDir";
+ die "Error: Execution of searcher.pl failed. HGTector exists.\n" if $i;
+ die "Error: No search result detected. HGTector exists.\n" unless -d "$wkDir/search";
+ print "\n";
- print "Step 2: Analyzer - predict HGT based on hit distribution statistics.\n";
- die "Error: analyzer.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/analyzer.pl";
- $i = system "$^X $scripts/analyzer.pl $wkDir";
- die "Execution of analyzer.pl failed.\n" if $i;
- die "Error: No analysis result detected. HGTector exists.\n" unless -d "$wkDir/result";
- print "\n";
+ print "Step 2: Analyzer - predict HGT based on hit distribution statistics.\n";
+ die "Error: analyzer.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/analyzer.pl";
+ $i = system "$^X $scripts/analyzer.pl $wkDir";
+ die "Execution of analyzer.pl failed.\n" if $i;
+ die "Error: No analysis result detected. HGTector exists.\n" unless -d "$wkDir/result";
+ print "\n";
- print "Step 3: Reporter - generate report for prediction results.\n";
- die "Error: reporter.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/reporter.pl";
- $i = system "$^X $scripts/reporter.pl $wkDir";
- die "Execution of analyzer.pl failed.\n" if $i;
- print "\n";
-
- print "All steps completed.\n";
-}
+ print "Step 3: Reporter - generate report for prediction results.\n";
+ die "Error: reporter.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/reporter.pl";
+ $i = system "$^X $scripts/reporter.pl $wkDir";
+ die "Execution of analyzer.pl failed.\n" if $i;
+ print "\n";
+ print "All steps completed.\n";
+}
diff --git a/scripts/analyzer.pl b/scripts/analyzer.pl
old mode 100755
new mode 100644
index 24f8e8b..8e41bfc
--- a/scripts/analyzer.pl
+++ b/scripts/analyzer.pl
@@ -50,86 +50,86 @@
## global variables ##
-my @sets; # proteins sets to analyze
-my %lv = (); # taxonomic grouping scenario
-my %selves = (); # taxonomic information of self group
-my %lvList = (); # list of taxids of each level
+my @sets; # proteins sets to analyze
+my %lv = (); # taxonomic grouping scenario
+my %selves = (); # taxonomic information of self group
+my %lvList = (); # list of taxids of each level
-my %taxadb = (); # taxa.db
-my %ranksdb = (); # ranks.db
-my %selfinfo = (); # self.info
+my %taxadb = (); # taxa.db
+my %ranksdb = (); # ranks.db
+my %selfinfo = (); # self.info
-my %proteins = (); # accn -> name (identified by COG)
+my %proteins = (); # accn -> name (identified by COG)
-my @files; # search report files for each protein set
+my @files; # search report files for each protein set
## the master table storing everything. It's an array of hashes. Each row is a record.
my %results = ();
## the phyletic pattern of the whole genome, aka "fingerprint"
-my %fpN = (); # number of hits per protein
+my %fpN = (); # number of hits per protein
## program parameters ##
-my $wkDir = $ARGV[0]; # working directory
-my $interactive = 1; # interactive or automatic mode
+my $wkDir = $ARGV[0]; # working directory
+my $interactive = 1; # interactive or automatic mode
-my $minHits = 0; # minimal number of hits a valid search result should contain
-my $maxHits = 0; # maximal number of hits to retain for one protein, 0 means infinite
-my $minSize = 0; # minimal size (aa) of a valid protein (0 means infinite)
-my $evalue = 1e-5; # E-value cutoff
-my $identity = 0; # percent identity cutoff
-my $coverage = 0; # query coverage cutoff
+my $minHits = 0; # minimal number of hits a valid search result should contain
+my $maxHits = 0; # maximal number of hits to retain for one protein, 0 means infinite
+my $minSize = 0; # minimal size (aa) of a valid protein (0 means infinite)
+my $evalue = 1e-5; # E-value cutoff
+my $identity = 0; # percent identity cutoff
+my $coverage = 0; # query coverage cutoff
# algorithm
-my $selfRank = 0; # taxonomic rank(s) on which the program analyzes
-my $normalize = 1; # use relative bit score (bit score of subject / bit score of query)
-my $unite = 1; # hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
+my $selfRank = 0; # taxonomic rank(s) on which the program analyzes
+my $normalize = 1; # use relative bit score (bit score of subject / bit score of query)
+my $unite = 1; # hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
-my $useDistance = 0; # use phylogenetic distance instead of bit scores
-my $useWeight = 1; # use weight (sum of scores) instead of number of hits
+my $useDistance = 0; # use phylogenetic distance instead of bit scores
+my $useWeight = 1; # use weight (sum of scores) instead of number of hits
# fingerprints
-my $outRaw = 1; # output raw number/weight data
-my $outFp = 1; # output fingerprint
-my $graphFp = 0; # graph fingerprint (requires R)
+my $outRaw = 1; # output raw number/weight data
+my $outFp = 1; # output fingerprint
+my $graphFp = 0; # graph fingerprint (requires R)
-my $plotRef = ""; # file name of reference set of genes
+my $plotRef = ""; # file name of reference set of genes
-my $boxPlot = 1; # box plot
-my $histogram = 1; # histogram
-my $densityPlot = 1; # density plot
-my $scatterPlot = 1; # scatter plot
-my $plot3D = 0; # 3-way scatter plot
+my $boxPlot = 1; # box plot
+my $histogram = 1; # histogram
+my $densityPlot = 1; # density plot
+my $scatterPlot = 1; # scatter plot
+my $plot3D = 0; # 3-way scatter plot
# cutoffs
-my $howCO = 4; # how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
- # cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
- # 5: hierarchical clustering)
-my $globalCO = 0.25; # arbitrary global cutoff (%)
-my ($selfCO, $closeCO, $distalCO) = (0, 0, 0); # user-defined cutoffs for individual groups
+my $howCO = 4; # how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
+ # cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
+ # 5: hierarchical clustering)
+my $globalCO = 0.25; # arbitrary global cutoff (%)
+my ($selfCO, $closeCO, $distalCO) = (0, 0, 0); # user-defined cutoffs for individual groups
-my $exOutlier = 0; # exclude outliers, hits distant from other hits of the same group
+my $exOutlier = 0; # exclude outliers, hits distant from other hits of the same group
-my $nBin = 20; # number of bins in histogram
-my $plotHist = 0; # plot histogram on screen
+my $nBin = 20; # number of bins in histogram
+my $plotHist = 0; # plot histogram on screen
-my $toolKDE = 0; # computational tool for kernel density estimation
-my $bwF = 1; # bandwidth selection factor
-my $plotKDE = 0; # plot density function on screen
-my $toolExtrema = 0; # computational tool for identifying local extrema of density function (0: Perl code, 1: R package "pastecs")
-my $whichPeak = 0; # definition of "typical" and "atypical" regions
-my $modKCO = 1; # location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
-my $qKCO = 0.5; # horizontal/vertical quantile from 1st pit toward 1st peak
+my $toolKDE = 0; # computational tool for kernel density estimation
+my $bwF = 1; # bandwidth selection factor
+my $plotKDE = 0; # plot density function on screen
+my $toolExtrema = 0; # computational tool for identifying local extrema of density function (0: Perl code, 1: R package "pastecs")
+my $whichPeak = 0; # definition of "typical" and "atypical" regions
+my $modKCO = 1; # location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
+my $qKCO = 0.5; # horizontal/vertical quantile from 1st pit toward 1st peak
-my $dipTest = 0; # perform non-unimodality test (Hartigan's dip test) and report p-value
-my $dipSig = 0; # use global cutoff if dip test's result is not significant
+my $dipTest = 0; # perform non-unimodality test (Hartigan's dip test) and report p-value
+my $dipSig = 0; # use global cutoff if dip test's result is not significant
-my $selfLow = 0; # HGT-derived genes must have low self weight (an optional criterion)
+my $selfLow = 0; # HGT-derived genes must have low self weight (an optional criterion)
-my $BBH = 0; # use conventional best match method instead
-my $loss = 0; # also report gene loss events
-my $POE = 0; # also report POE
+my $BBH = 0; # use conventional best match method instead
+my $loss = 0; # also report gene loss events
+my $POE = 0; # also report POE
my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
@@ -139,109 +139,109 @@
my @inSets = ();
my @exSets = ();
-my $R; # Statistics::R instance
+my $R; # Statistics::R instance
## read configurations ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- $interactive = $1 if /^interactive=([01])$/;
- $minHits = $1 if /^minHits=(\d+)$/;
- $maxHits = $1 if /^maxHits=(\d+)$/;
- $minSize = $1 if /^minSize=(\d+)$/;
- $evalue = $1 if /^evalue=(.+)$/;
- $identity = $1 if /^identity=(.+)$/;
- $identity = $1 if /^percIdent=(.+)$/; # backward compatibility
- $coverage = $1 if /^coverage=(.+)$/;
-
- $selfRank = $1 if /^selfRank=(.+)$/;
- $normalize = $1 if /^normalize=(.+)$/;
- $unite = $1 if /^unite=([01])$/;
- $useDistance = $1 if /^useDistance=([01])$/;
- $useWeight = $1 if /^useWeight=([01])$/;
-
- $howCO = $1 if /^howCO=(\d)$/;
- $globalCO = $1 if /^globalCO=(.+)$/;
- $selfCO = $1 if /^selfCO=(.+)$/;
- $closeCO = $1 if /^closeCO=(.+)$/;
- $distalCO = $1 if /^distalCO=(.+)$/;
-
- $exOutlier = $1 if /^exOutlier=([0123])$/;
-
- $nBin = $1 if /^nBin=(\d+)$/;
- $plotHist = $1 if /^plotHist=([01])$/;
-
- $toolKDE = $1 if /^toolKDE=([01])$/;
- $bwF = $1 if /^bwF=(.+)$/;
- $plotKDE = $1 if /^plotKDE=([01])$/;
- $toolExtrema = $1 if /^toolExtrema=([01])$/;
- $whichPeak = $1 if /^whichPeak=([0123])$/;
- $modKCO = $1 if /^modKCO=([0123])$/;
- $qKCO = $1 if /^qKCO=(.+)$/;
-
- $dipTest = $1 if /^dipTest=([01])$/;
- $dipSig = $1 if /^dipSig=(.+)$/;
-
- $selfLow = $1 if /^selfLow=([01])$/;
-
- $outRaw = $1 if /^outRaw=([01])$/;
- $outFp = $1 if /^outFp=([01])$/;
- $graphFp = $1 if /^graphFp=([01])$/;
- $plotRef = $1 if /^plotRef=(.+)$/;
-
- $boxPlot = $1 if /^boxPlot=([01])$/;
- $histogram = $1 if /^histogram=([01])$/;
- $densityPlot = $1 if /^densityPlot=([01])$/;
- $scatterPlot = $1 if /^scatterPlot=([01])$/;
- $plot3D = $1 if /^plot3D=([01])$/;
-
- $BBH = $1 if /^BBH=([012])$/;
- $loss = $1 if /^loss=([01])$/;
- $POE = $1 if /^POE=([01])$/;
-
- @ranks = split (/\s*,\s*/, $1) if /^ranks=(.+)$/;
-
- @selfGroup = split (/\s*,\s*/, $1) if /^selfGroup=(.+)$/;
- @closeGroup = split (/\s*,\s*/, $1) if /^closeGroup=(.+)$/;
- @excludeGroup = split (/\s*,\s*/, $1) if /^excludeGroup=(.+)$/;
- @inSets = split (/\s*,\s*/, $1) if /^inSets=(.+)$/;
- @exSets = split (/\s*,\s*/, $1) if /^exSets=(.+)$/;
- }
- close IN;
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ $interactive = $1 if /^interactive=([01])$/;
+ $minHits = $1 if /^minHits=(\d+)$/;
+ $maxHits = $1 if /^maxHits=(\d+)$/;
+ $minSize = $1 if /^minSize=(\d+)$/;
+ $evalue = $1 if /^evalue=(.+)$/;
+ $identity = $1 if /^identity=(.+)$/;
+ $identity = $1 if /^percIdent=(.+)$/; # backward compatibility
+ $coverage = $1 if /^coverage=(.+)$/;
+
+ $selfRank = $1 if /^selfRank=(.+)$/;
+ $normalize = $1 if /^normalize=(.+)$/;
+ $unite = $1 if /^unite=([01])$/;
+ $useDistance = $1 if /^useDistance=([01])$/;
+ $useWeight = $1 if /^useWeight=([01])$/;
+
+ $howCO = $1 if /^howCO=(\d)$/;
+ $globalCO = $1 if /^globalCO=(.+)$/;
+ $selfCO = $1 if /^selfCO=(.+)$/;
+ $closeCO = $1 if /^closeCO=(.+)$/;
+ $distalCO = $1 if /^distalCO=(.+)$/;
+
+ $exOutlier = $1 if /^exOutlier=([0123])$/;
+
+ $nBin = $1 if /^nBin=(\d+)$/;
+ $plotHist = $1 if /^plotHist=([01])$/;
+
+ $toolKDE = $1 if /^toolKDE=([01])$/;
+ $bwF = $1 if /^bwF=(.+)$/;
+ $plotKDE = $1 if /^plotKDE=([01])$/;
+ $toolExtrema = $1 if /^toolExtrema=([01])$/;
+ $whichPeak = $1 if /^whichPeak=([0123])$/;
+ $modKCO = $1 if /^modKCO=([0123])$/;
+ $qKCO = $1 if /^qKCO=(.+)$/;
+
+ $dipTest = $1 if /^dipTest=([01])$/;
+ $dipSig = $1 if /^dipSig=(.+)$/;
+
+ $selfLow = $1 if /^selfLow=([01])$/;
+
+ $outRaw = $1 if /^outRaw=([01])$/;
+ $outFp = $1 if /^outFp=([01])$/;
+ $graphFp = $1 if /^graphFp=([01])$/;
+ $plotRef = $1 if /^plotRef=(.+)$/;
+
+ $boxPlot = $1 if /^boxPlot=([01])$/;
+ $histogram = $1 if /^histogram=([01])$/;
+ $densityPlot = $1 if /^densityPlot=([01])$/;
+ $scatterPlot = $1 if /^scatterPlot=([01])$/;
+ $plot3D = $1 if /^plot3D=([01])$/;
+
+ $BBH = $1 if /^BBH=([012])$/;
+ $loss = $1 if /^loss=([01])$/;
+ $POE = $1 if /^POE=([01])$/;
+
+ @ranks = split (/\s*,\s*/, $1) if /^ranks=(.+)$/;
+
+ @selfGroup = split (/\s*,\s*/, $1) if /^selfGroup=(.+)$/;
+ @closeGroup = split (/\s*,\s*/, $1) if /^closeGroup=(.+)$/;
+ @excludeGroup = split (/\s*,\s*/, $1) if /^excludeGroup=(.+)$/;
+ @inSets = split (/\s*,\s*/, $1) if /^inSets=(.+)$/;
+ @exSets = split (/\s*,\s*/, $1) if /^exSets=(.+)$/;
+ }
+ close IN;
}
-if ($identity and $identity < 1){ $identity *= 100; }
-if ($coverage and $coverage < 1){ $coverage *= 100; }
+if ($identity and $identity < 1) { $identity *= 100; }
+if ($coverage and $coverage < 1) { $coverage *= 100; }
## check previous result ##
-if (-d "$wkDir/result/detail" and -d "$wkDir/result/statistics"){
- print "Warning: Prediction result from a previous analysis is detected.\n";
- if ($interactive){ print "Press Enter to overwrite, or Ctrl+C to exit:\n"; $s = ; }
- else{ print "To be overwritten.\n"; }
+if (-d "$wkDir/result/detail" and -d "$wkDir/result/statistics") {
+ print "Warning: Prediction result from a previous analysis is detected.\n";
+ if ($interactive) { print "Press Enter to overwrite, or Ctrl+C to exit:\n"; $s = ; }
+ else { print "To be overwritten.\n"; }
}
## verify configurations ##
-if ($globalCO){
- $globalCO = $globalCO / 100 if ($globalCO =~ s/%$//);
- die "Error: Global cutoff must be between 0 and 1.\n" if ($globalCO <= 0 or $globalCO >=1);
+if ($globalCO) {
+ $globalCO = $globalCO / 100 if ($globalCO =~ s/%$//);
+ die "Error: Global cutoff must be between 0 and 1.\n" if ($globalCO <= 0 or $globalCO >=1);
}
# conditionally use Statistics::R for Perl-R communication;
-if ($graphFp or ($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
- eval{ require Statistics::R; Statistics::R->import() };
- die "Error: Perl module Statistics::R is not available.\n" if ($@);
- $R = Statistics::R->new();
+if ($graphFp or ($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+ eval { require Statistics::R; Statistics::R->import() };
+ die "Error: Perl module Statistics::R is not available.\n" if ($@);
+ $R = Statistics::R->new();
}
## initiate global variables ##
-if ($unite){
- @b = (); @a = ('0','1','2');
- $fpN{'0'}{$_}{'data'} = [@b] for (@a);
- # $fpS{'0'}{$_}{'data'} = [@b] for (@a);
+if ($unite) {
+ @b = (); @a = ('0','1','2');
+ $fpN{'0'}{$_}{'data'} = [@b] for (@a);
+ # $fpS{'0'}{$_}{'data'} = [@b] for (@a);
}
@@ -249,136 +249,136 @@
print "Reading taxonomic information...";
open IN, "<$wkDir/taxonomy/taxa.db";
-while (){
- next if /^#/;
- @a = split /\t/;
- next unless $#a;
- $a[$#a] =~ s/\s+$//;
- %h = ('name',$a[1],'rank',$a[2]);
- $i = 3; $h{$_} = $a[$i++] for (@ranks);
- $taxadb{$a[0]} = {%h};
+while () {
+ next if /^#/;
+ @a = split /\t/;
+ next unless $#a;
+ $a[$#a] =~ s/\s+$//;
+ %h = ('name',$a[1],'rank',$a[2]);
+ $i = 3; $h{$_} = $a[$i++] for (@ranks);
+ $taxadb{$a[0]} = {%h};
}
close IN;
open IN, "<$wkDir/taxonomy/ranks.db";
-while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split /\t/;
- $ranksdb{$a[0]} = $a[1];
+while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split /\t/;
+ $ranksdb{$a[0]} = $a[1];
}
close IN;
open IN, "<$wkDir/taxonomy/self.info";
-while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split /\t/;
- next if (scalar(@a) < 3);
- %h = ('taxid',$a[1],'name',$a[2]);
- $selfinfo{$a[0]} = {%h};
+while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split /\t/;
+ next if (scalar(@a) < 3);
+ %h = ('taxid',$a[1],'name',$a[2]);
+ $selfinfo{$a[0]} = {%h};
}
close IN;
print " done.\n";
print "Analyzing taxonomic information...";
-for ($i=0; $i<=$#ranks; $i++){
- $j = 1; # if all names are the same
- $s = 0; # last name of this rank
- foreach my $key (keys %selfinfo){
- $t = $taxadb{$selfinfo{$key}{'taxid'}}{$ranks[$i]};
- $s = $t unless $s;
- if ($t != $s){ $j = 0; last; }
- }
- last if $j;
+for ($i=0; $i<=$#ranks; $i++) {
+ $j = 1; # if all names are the same
+ $s = 0; # last name of this rank
+ foreach my $key (keys %selfinfo) {
+ $t = $taxadb{$selfinfo{$key}{'taxid'}}{$ranks[$i]};
+ $s = $t unless $s;
+ if ($t != $s) { $j = 0; last; }
+ }
+ last if $j;
}
print " done.\n";
$selves{'rank'} = $ranks[$i];
$selves{'rank_id'} = $i;
$selves{'rank_taxid'} = $s;
-if (exists $ranksdb{$s}){ $selves{'rank_name'} = $ranksdb{$s}; }
-elsif (exists $taxadb{$s}){ $selves{'rank_name'} = $taxadb{$s}{'name'}; }
-else{ die "Unknown TaxID $s.\n"; }
+if (exists $ranksdb{$s}) { $selves{'rank_name'} = $ranksdb{$s}; }
+elsif (exists $taxadb{$s}) { $selves{'rank_name'} = $taxadb{$s}{'name'}; }
+else { die "Unknown TaxID $s.\n"; }
print " All input genomes belong to $selves{'rank'} $selves{'rank_name'} (TaxID: $selves{'rank_taxid'}).\n";
-if (@selfGroup){ # user-defined self group
- $lv{'rank'} = "(user-defined self)"; $lv{'id'} = -1; $lv{'taxid'} = join (",", @selfGroup);
- @a = ();
- foreach (@selfGroup){
- if (exists $ranksdb{$_}){ push @a, $ranksdb{$_}; }
- elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
- else { push @a, "unknown"; }
- }
- $lv{'name'} = join (",", @a);
-}else{ # lowest rank that contains all input organisms
- if ($selfRank =~ /\d/){ # relative level number
- $lv{'rank'} = $ranks[$selves{'rank_id'} + $selfRank];
- $lv{'id'} = $selves{'rank_id'} + $selfRank;
- }else{ # rank name
- $lv{'rank'} = $selfRank;
- for ($j=0; $j<=$#ranks; $j++){
- if ($ranks[$j] eq $selfRank){ $lv{'id'} = $j; last; }
- }
- }
- $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
- $lv{'taxid'} = $taxadb{$s}{$lv{'rank'}};
- if (exists $ranksdb{$lv{'taxid'}}){ $lv{'name'} = $ranksdb{$lv{'taxid'}}; }
- elsif (exists $taxadb{$lv{'taxid'}}){ $lv{'name'} = $taxadb{$lv{'taxid'}}{'name'}; }
- else{ die "Unknown TaxID $lv{'taxid'}.\n"; }
+if (@selfGroup) { # user-defined self group
+ $lv{'rank'} = "(user-defined self)"; $lv{'id'} = -1; $lv{'taxid'} = join (",", @selfGroup);
+ @a = ();
+ foreach (@selfGroup) {
+ if (exists $ranksdb{$_}) { push @a, $ranksdb{$_}; }
+ elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
+ else { push @a, "unknown"; }
+ }
+ $lv{'name'} = join (",", @a);
+} else { # lowest rank that contains all input organisms
+ if ($selfRank =~ /\d/) { # relative level number
+ $lv{'rank'} = $ranks[$selves{'rank_id'} + $selfRank];
+ $lv{'id'} = $selves{'rank_id'} + $selfRank;
+ } else { # rank name
+ $lv{'rank'} = $selfRank;
+ for ($j=0; $j<=$#ranks; $j++) {
+ if ($ranks[$j] eq $selfRank) { $lv{'id'} = $j; last; }
+ }
+ }
+ $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
+ $lv{'taxid'} = $taxadb{$s}{$lv{'rank'}};
+ if (exists $ranksdb{$lv{'taxid'}}) { $lv{'name'} = $ranksdb{$lv{'taxid'}}; }
+ elsif (exists $taxadb{$lv{'taxid'}}) { $lv{'name'} = $taxadb{$lv{'taxid'}}{'name'}; }
+ else { die "Unknown TaxID $lv{'taxid'}.\n"; }
}
%h = ();
-foreach my $key (keys %taxadb){
- next unless $taxadb{$key}{'rank'};
- foreach (split (/,/, $lv{'taxid'})){
- $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
- }
+foreach my $key (keys %taxadb) {
+ next unless $taxadb{$key}{'rank'};
+ foreach (split (/,/, $lv{'taxid'})) {
+ $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
+ }
}
$lvList{$lv{'rank'}} = {%h};
-if (@closeGroup){ # user-defined close group
- $lv{'prank'} = "(user-defined close)"; $lv{'ptaxid'} = join (",", @closeGroup);
- @a = ();
- foreach (@closeGroup){
- if (exists $ranksdb{$_}){ push @a, $ranksdb{$_}; }
- elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
- else { push @a, "unknown"; }
- }
- $lv{'pname'} = join (",", @a);
-}else{ # lowest parent rank that has adequate members for statistical analysis
- if ($lv{'id'} >= $#ranks){ die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
- print " Choose one of the following parental taxonomic ranks as the close group:\n";
- my @pranks = ();
- $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
- for ($i=$lv{'id'}+1; $i<=$#ranks; $i++){
- next unless exists $taxadb{$s}{$ranks[$i]};
- $t = $taxadb{$s}{$ranks[$i]};
- %h = ('rank' => $ranks[$i], 'taxid' => $t, 'name' => '', 'n' => 0, 'm' => 0); # m is the number of members - number of self members
- if (exists $ranksdb{$t}){ $h{'name'} = $ranksdb{$t}; }
- elsif (exists $taxadb{$t}){ $h{'name'} = $taxadb{$t}{'name'}; }
- else{ next; }
- foreach my $key (keys %taxadb){
- if ($key eq $t or $taxadb{$key}{'rank'} =~ /\/$t$/ or $taxadb{$key}{'rank'} =~ /\/$t\//){
- $h{'n'} ++;
- $h{'m'} ++ unless exists $lvList{$lv{'rank'}}{$key};
- }
- }
- push @pranks, {%h};
- print " ".$h{'rank'}." ".$h{'name'}." (TaxID: ".$h{'taxid'}.") (".$h{'n'}." members).\n";
- }
- unless (scalar @pranks){ die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
- for ($i=0; $i<=$#pranks; $i++){
- if ($pranks[$i]{'m'} >= 10 or $i == $#pranks){ ######## new rule: close group should have >=10 members
- print " The program intelligently chose $pranks[$i]{'rank'} $pranks[$i]{'name'}.\n";
- if ($interactive){ print "Press Enter to accept, or Ctrl+C to exit:\n"; $s = ; }
- $lv{'prank'} = $pranks[$i]{'rank'};
- $lv{'ptaxid'} = $pranks[$i]{'taxid'};
- $lv{'pname'} = $pranks[$i]{'name'};
- last;
- }
- }
+if (@closeGroup) { # user-defined close group
+ $lv{'prank'} = "(user-defined close)"; $lv{'ptaxid'} = join (",", @closeGroup);
+ @a = ();
+ foreach (@closeGroup) {
+ if (exists $ranksdb{$_}) { push @a, $ranksdb{$_}; }
+ elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
+ else { push @a, "unknown"; }
+ }
+ $lv{'pname'} = join (",", @a);
+} else { # lowest parent rank that has adequate members for statistical analysis
+ if ($lv{'id'} >= $#ranks) { die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
+ print " Choose one of the following parental taxonomic ranks as the close group:\n";
+ my @pranks = ();
+ $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
+ for ($i=$lv{'id'}+1; $i<=$#ranks; $i++) {
+ next unless exists $taxadb{$s}{$ranks[$i]};
+ $t = $taxadb{$s}{$ranks[$i]};
+ %h = ('rank' => $ranks[$i], 'taxid' => $t, 'name' => '', 'n' => 0, 'm' => 0); # m is the number of members - number of self members
+ if (exists $ranksdb{$t}) { $h{'name'} = $ranksdb{$t}; }
+ elsif (exists $taxadb{$t}) { $h{'name'} = $taxadb{$t}{'name'}; }
+ else { next; }
+ foreach my $key (keys %taxadb) {
+ if ($key eq $t or $taxadb{$key}{'rank'} =~ /\/$t$/ or $taxadb{$key}{'rank'} =~ /\/$t\//) {
+ $h{'n'} ++;
+ $h{'m'} ++ unless exists $lvList{$lv{'rank'}}{$key};
+ }
+ }
+ push @pranks, {%h};
+ print " ".$h{'rank'}." ".$h{'name'}." (TaxID: ".$h{'taxid'}.") (".$h{'n'}." members).\n";
+ }
+ unless (scalar @pranks) { die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
+ for ($i=0; $i<=$#pranks; $i++) {
+ if ($pranks[$i]{'m'} >= 10 or $i == $#pranks) { ######## new rule: close group should have >=10 members
+ print " The program intelligently chose $pranks[$i]{'rank'} $pranks[$i]{'name'}.\n";
+ if ($interactive) { print "Press Enter to accept, or Ctrl+C to exit:\n"; $s = ; }
+ $lv{'prank'} = $pranks[$i]{'rank'};
+ $lv{'ptaxid'} = $pranks[$i]{'taxid'};
+ $lv{'pname'} = $pranks[$i]{'name'};
+ last;
+ }
+ }
}
%h = ();
-foreach my $key (keys %taxadb){
- next unless $taxadb{$key}{'rank'};
- foreach (split (/,/, $lv{'ptaxid'})){
- $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
- }
+foreach my $key (keys %taxadb) {
+ next unless $taxadb{$key}{'rank'};
+ foreach (split (/,/, $lv{'ptaxid'})) {
+ $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
+ }
}
$lvList{$lv{'prank'}} = {%h};
@@ -386,25 +386,25 @@
print " Self: $lv{'rank'} $lv{'name'} (TaxID: $lv{'taxid'}) (".(keys %{$lvList{$lv{'rank'}}})." members),\n";
print " Close: $lv{'prank'} $lv{'pname'} (TaxID: $lv{'ptaxid'}) (".(keys %{$lvList{$lv{'prank'}}})." members),\n";
print " Distal: all other organisms.\n";
-if ($interactive){
- print "Press Enter to continue, or Ctrl+C to exit:";
- $s = ;
+if ($interactive) {
+ print "Press Enter to continue, or Ctrl+C to exit:";
+ $s = ;
}
## Read reference proteins ##
my %refs = ();
-if ($plotRef){
- if (-e $plotRef){ open IN, "<$plotRef"; }
- elsif (-e "$wkDir/$plotRef"){ open IN, "<$wkDir/$plotRef"; }
- else{ die "Reference file $plotRef does not exist.\n"; }
- while (){
- s/\s+$//; next unless $_;
- @a = split (/\t/);
- $a[0] =~ s/\.\d+$//;
- $refs{$a[0]} = [0,0,0,0]; # exist, self, close, distal
- }
- close IN;
+if ($plotRef) {
+ if (-e $plotRef) { open IN, "<$plotRef"; }
+ elsif (-e "$wkDir/$plotRef") { open IN, "<$wkDir/$plotRef"; }
+ else { die "Reference file $plotRef does not exist.\n"; }
+ while () {
+ s/\s+$//; next unless $_;
+ @a = split (/\t/);
+ $a[0] =~ s/\.\d+$//;
+ $refs{$a[0]} = [0,0,0,0]; # exist, self, close, distal
+ }
+ close IN;
}
@@ -414,9 +414,9 @@
opendir (DIR, "$wkDir/search");
@a = readdir(DIR);
close DIR;
-foreach (@a){
- next if (/^\./);
- push @sets, $_ if -d "$wkDir/search/$_";
+foreach (@a) {
+ next if (/^\./);
+ push @sets, $_ if -d "$wkDir/search/$_";
}
print "done. ";
die "No genome detected.\n" unless @sets;
@@ -428,248 +428,248 @@
print "Analyzing search results...\n";
print "0-------------25-------------50------------75------------100%\n";
-foreach my $set (@sets){
- if (@inSets){ $i = 0; foreach (@inSets){ if ($set eq $_){ $i = 1; last; } } next unless $i; }
- if (@exSets){ $i = 0; foreach (@exSets){ if ($set eq $_){ $i = 1; last; } } next if $i; }
- opendir (DIR, "$wkDir/search/$set");
- @files = grep(/\.txt$/,readdir(DIR));
- close DIR;
- print "No protein found in $set\n" and next unless @files;
-
- ## varibles to show a progress bar
- my $iProtein = 0;
- my $iProgress = 0;
- my $nProtein = $#files+1;
- print "$set has $nProtein proteins. Analyzing...\n";
-
- unless ($unite){
- @a = ('0','1','2'); @b = ();
- $fpN{$set}{$_}{'data'} = [@b] for (@a);
- }
-
- ## information of self
- my %self = ();
-
- foreach my $file (@files){
- $iProtein ++;
-
- my %result = (); # a record to store everything about this search
- my %scores = (); # scores of each hit by category, as a buffer for computing the statistics above
- my @hits; # parameters of the hits. one hit contains:
- # accn, organism, group, taxid, genus, score
- $file =~ /(.+)\.[^.]+$/;
- $result{'query'} = $1;
-
- my $nHits = 0; # total number of hits. just for convenience
- my $nScore = 0; # total score
- my $lastHit = ""; # store the last hit in the organism table, for the identification of duplicated taxa
- my $selfScore = 0;
-
- # read hit table #
-
- open IN, "<$wkDir/search/$set/$file" or next;
- my $reading = 0;
- my ($hasCoverage, $hasDistance) = (0, 0);
- while () {
- s/\s+$//;
- if (/^BEGIN QUERY/){ $reading = "query"; next; }
- if (/^BEGIN ORGANISM/){ $reading = "organism"; next; }
- if (/^BEGIN DATA/){ $reading = "data"; next; }
- if (/^END;/){ $reading = 0; next; }
- if ($reading eq "query"){ # read query (self)
- $result{'accn'} = $1 if /^\tName=(.+);$/;
- $result{'gi'} = $1 if /^\tGI=(\d+);$/;
- $result{'length'} = $1 if /^\tLength=(.+);$/;
- $result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
- $result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
- if (/^\tAccession=(.+);$/){ $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
- }
- if ($reading eq "organism"){ # read organisms
- next if /^;/;
- if (/^\[/){
- $hasCoverage = 1 if /Coverage/;
- $hasDistance = 1 if /Distance/;
- next;
- }
- @a = split (/\t/);
- if ($#a < 5){
- print "\nIncomplete hit record $a[0] in $file of $set.\n";
- if ($interactive){
- print "Press Enter to continue, or Ctrl+C to exit:";
- $s = ;
- }
- next;
- }
-
- # filter out low-quality hits
- next if ($a[$#a] eq "x");
- next if ($evalue and $a[4] ne "*" and $a[4] > $evalue);
- next if ($identity and $a[5] ne "*" and $a[5] < $identity);
- next if ($coverage and $hasCoverage and $a[6] and $a[6] ne "*" and $a[6] < $coverage);
-
- # filter out unwanted taxonomy groups
- if (@excludeGroup){
- my $isExclude = 0;
- foreach (@excludeGroup){
- if ($a[2] == $_ or $taxadb{$a[2]}{'rank'} =~ /\/$_$/ or $taxadb{$a[2]}{'rank'} =~ /\/$_\//){
- $isExclude = 1;
- last;
- }
- }
- next if $isExclude;
- }
-
- my %hit = ();
- $hit{'accns'} = $a[0];
- $hit{'organism'} = $a[1];
- $hit{'taxid'} = $a[2];
- $hit{'score'} = $a[3];
- $hit{'evalue'} = $a[4];
- $hit{'identity'} = $a[5];
- $hit{'coverage'} = $a[6] if $hasCoverage;
-
- # use phylogenetic distance instead of bit score
- if ($useDistance){
- if ($#a >= 6){ $hit{'score'} = 1 - $a[6+$hasCoverage]; }
- elsif (!@hits){ $hit{'score'} = 0; }
- else { $hit{'score'} = $hits[$#hits]{'score'}; }
- }
-
- @a = split(/\//, $a[0]);
- $hit{'accn'} = $a[0];
- push @hits, {%hit};
- }
- last if ($maxHits and $#hits >= $maxHits-1);
- }
- close IN;
-
- # skip if there is no hit
- next unless @hits;
-
- # next if ($minSize and ($result{'length'} < $minSize));
- unless (exists $result{'query'} and exists $result{'length'}){
- print "\nIncomplete search result: $set/$file.\n" ;
- if ($interactive){
- print "Press Enter to continue, or Ctrl+C to exit:";
- $s = ;
- }
- }
- $result{'product'} = '' unless exists $result{'product'};
-
- ## Intepret hit table ##
-
- # total number of hits
- $result{'n'} = $#hits+1;
-
- # sort hits by bit score or phylogenetic distance
- @hits = sort {$b->{'score'} <=> $a->{'score'}} @hits;
-
- # identify self (query) information
- my $isQuery = 0;
- for ($i=0; $i<=$#hits; $i++){
- @a = split(/\//, $hits[$i]{'accns'});
- foreach (@a){
- if ($result{'accn'} eq $_){
- $result{'id'} = $i;
- $result{'taxid'} = $hits[$i]{'taxid'};
- $result{'score'} = $hits[$i]{'score'};
- $result{'organism'} = $hits[$i]{'organism'};
- $isQuery = 1;
- last;
- }
- }
- last if exists $result{'id'};
- }
- unless (exists $result{'id'}){
- $result{'id'} = 0;
- $result{'taxid'} = $hits[0]{'taxid'};
- $result{'score'} = $hits[0]{'score'};
- $result{'organism'} = $hits[0]{'organism'};
- }
- next unless $result{'score'};
-
- # Use absolute or relative bit scores
-
- if ($normalize and not $useDistance){
- for ($i=0; $i<=$#hits; $i++){
- $hits[$i]{'score'} = sprintf("%.3f", $hits[$i]{'score'}/$result{'score'});
- }
- }
-
- # initialize values of prediction results
- $result{'in'} = ""; # whether incoming HGT or origination took place within the group
- $result{'loss'} = ""; # gene loss event
- $result{'origin'} = ""; # gene origination event
- $result{'income'} = ""; # incoming HGT event
- $result{'outcome'} = ""; # outcoming HGT event
-
- # Summarize numbers and scores #
- ## 0 - self group, 1 - close groups, 2 - distal gs
- ## N - number, S - scores
- ## hit1 - first close hit, hit 2 - first distal hit
-
- my ($topCloseScore, $topDistalScore) = (0, 0);
- $result{'N0'} = 0; $result{'N1'} = 0; $result{'N2'} = 0;
- @a = (); $result{'S0'} = [@a]; $result{'S1'} = [@a]; $result{'S2'} = [@a];
- for ($i=0; $i<=$#hits; $i++){
- if (exists $lvList{$lv{'rank'}}{$hits[$i]{'taxid'}}){
- next unless $isQuery;
- if ($useWeight){ $result{'N0'} += $hits[$i]{'score'}; }
- else { $result{'N0'} ++; }
- push @{$result{'S0'}}, $hits[$i]{'score'};
- }elsif (exists $lvList{$lv{'prank'}}{$hits[$i]{'taxid'}}){
- if ($useWeight){ $result{'N1'} += $hits[$i]{'score'}; }
- else { $result{'N1'} ++; }
- push @{$result{'S1'}}, $hits[$i]{'score'};
- $result{'hit1'} = $hits[$i]{'taxid'} unless exists $result{'hit1'};
- $topCloseScore = $hits[$i]{'score'} unless $topCloseScore;
- $result{'BBH'} = "" unless exists $result{'BBH'};
- }else{
- if ($useWeight){ $result{'N2'} += $hits[$i]{'score'}; }
- else { $result{'N2'} ++; }
- push @{$result{'S2'}}, $hits[$i]{'score'};
- unless (exists $result{'hit2'}){
- $result{'hit2'} = $hits[$i]{'taxid'};
- $result{'hit2accn'} = $hits[$i]{'accn'};
- }
- $topDistalScore = $hits[$i]{'score'} unless $topDistalScore;
- $result{'BBH'} = 1 unless exists $result{'BBH'};
- }
- }
- # more stringent BBH method:
- # $result{'BBH'} = 1 if ($topDistalScore > $topCloseScore);
-
- $result{'N0'} = sprintf("%.3f", $result{'N0'});
- $result{'N1'} = sprintf("%.3f", $result{'N1'});
- $result{'N2'} = sprintf("%.3f", $result{'N2'});
-
- if (exists $refs{$result{'accn'}}){
- $refs{$result{'accn'}} = [1,$result{'N0'},$result{'N1'},$result{'N2'}];
- }
-
- # proteins without non-self hits are considered as de novo originated and not considered for statistics
- $result{'origin'} = 1 if $result{'N1'}+$result{'N2'} < 0.000001;
-
- # proteins with hits lower than minimum cutoff are considered as de novo originated and not considered for statistics
- if ($minHits and ($result{'n'} < $minHits)){ $result{'origin'} = 1; $result{'BBH'} = ""; }
-
- # Put this record into the master record
- push @{$results{$set}}, {%result};
-
- # Record fingerprint #
- if ($unite){ $s = '0'; }
- else { $s = $set; }
-
- if ($result{'N0'} and not $result{'origin'}){ # skip in case distance is used and no distance is available
- push @{$fpN{$s}{'0'}{'data'}}, $result{'N0'};
- push @{$fpN{$s}{'1'}{'data'}}, $result{'N1'};
- push @{$fpN{$s}{'2'}{'data'}}, $result{'N2'};
- }
-
- # Show progress
- print "." and $iProgress++ if ($iProtein/$nProtein >= $iProgress/60);
- }
- print "\n";
+foreach my $set (@sets) {
+ if (@inSets) { $i = 0; foreach (@inSets) { if ($set eq $_) { $i = 1; last; } } next unless $i; }
+ if (@exSets) { $i = 0; foreach (@exSets) { if ($set eq $_) { $i = 1; last; } } next if $i; }
+ opendir (DIR, "$wkDir/search/$set");
+ @files = grep(/\.txt$/,readdir(DIR));
+ close DIR;
+ print "No protein found in $set\n" and next unless @files;
+
+ ## varibles to show a progress bar
+ my $iProtein = 0;
+ my $iProgress = 0;
+ my $nProtein = $#files+1;
+ print "$set has $nProtein proteins. Analyzing...\n";
+
+ unless ($unite) {
+ @a = ('0','1','2'); @b = ();
+ $fpN{$set}{$_}{'data'} = [@b] for (@a);
+ }
+
+ ## information of self
+ my %self = ();
+
+ foreach my $file (@files) {
+ $iProtein ++;
+
+ my %result = (); # a record to store everything about this search
+ my %scores = (); # scores of each hit by category, as a buffer for computing the statistics above
+ my @hits; # parameters of the hits. one hit contains:
+ # accn, organism, group, taxid, genus, score
+ $file =~ /(.+)\.[^.]+$/;
+ $result{'query'} = $1;
+
+ my $nHits = 0; # total number of hits. just for convenience
+ my $nScore = 0; # total score
+ my $lastHit = ""; # store the last hit in the organism table, for the identification of duplicated taxa
+ my $selfScore = 0;
+
+ # read hit table #
+
+ open IN, "<$wkDir/search/$set/$file" or next;
+ my $reading = 0;
+ my ($hasCoverage, $hasDistance) = (0, 0);
+ while () {
+ s/\s+$//;
+ if (/^BEGIN QUERY/) { $reading = "query"; next; }
+ if (/^BEGIN ORGANISM/) { $reading = "organism"; next; }
+ if (/^BEGIN DATA/) { $reading = "data"; next; }
+ if (/^END;/) { $reading = 0; next; }
+ if ($reading eq "query") { # read query (self)
+ $result{'accn'} = $1 if /^\tName=(.+);$/;
+ $result{'gi'} = $1 if /^\tGI=(\d+);$/;
+ $result{'length'} = $1 if /^\tLength=(.+);$/;
+ $result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
+ $result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
+ if (/^\tAccession=(.+);$/) { $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
+ }
+ if ($reading eq "organism") { # read organisms
+ next if /^;/;
+ if (/^\[/) {
+ $hasCoverage = 1 if /Coverage/;
+ $hasDistance = 1 if /Distance/;
+ next;
+ }
+ @a = split (/\t/);
+ if ($#a < 5) {
+ print "\nIncomplete hit record $a[0] in $file of $set.\n";
+ if ($interactive) {
+ print "Press Enter to continue, or Ctrl+C to exit:";
+ $s = ;
+ }
+ next;
+ }
+
+ # filter out low-quality hits
+ next if ($a[$#a] eq "x");
+ next if ($evalue and $a[4] ne "*" and $a[4] > $evalue);
+ next if ($identity and $a[5] ne "*" and $a[5] < $identity);
+ next if ($coverage and $hasCoverage and $a[6] and $a[6] ne "*" and $a[6] < $coverage);
+
+ # filter out unwanted taxonomy groups
+ if (@excludeGroup) {
+ my $isExclude = 0;
+ foreach (@excludeGroup) {
+ if ($a[2] == $_ or $taxadb{$a[2]}{'rank'} =~ /\/$_$/ or $taxadb{$a[2]}{'rank'} =~ /\/$_\//) {
+ $isExclude = 1;
+ last;
+ }
+ }
+ next if $isExclude;
+ }
+
+ my %hit = ();
+ $hit{'accns'} = $a[0];
+ $hit{'organism'} = $a[1];
+ $hit{'taxid'} = $a[2];
+ $hit{'score'} = $a[3];
+ $hit{'evalue'} = $a[4];
+ $hit{'identity'} = $a[5];
+ $hit{'coverage'} = $a[6] if $hasCoverage;
+
+ # use phylogenetic distance instead of bit score
+ if ($useDistance) {
+ if ($#a >= 6) { $hit{'score'} = 1 - $a[6+$hasCoverage]; }
+ elsif (!@hits) { $hit{'score'} = 0; }
+ else { $hit{'score'} = $hits[$#hits]{'score'}; }
+ }
+
+ @a = split(/\//, $a[0]);
+ $hit{'accn'} = $a[0];
+ push @hits, {%hit};
+ }
+ last if ($maxHits and $#hits >= $maxHits-1);
+ }
+ close IN;
+
+ # skip if there is no hit
+ next unless @hits;
+
+ # next if ($minSize and ($result{'length'} < $minSize));
+ unless (exists $result{'query'} and exists $result{'length'}) {
+ print "\nIncomplete search result: $set/$file.\n" ;
+ if ($interactive) {
+ print "Press Enter to continue, or Ctrl+C to exit:";
+ $s = ;
+ }
+ }
+ $result{'product'} = '' unless exists $result{'product'};
+
+ ## Intepret hit table ##
+
+ # total number of hits
+ $result{'n'} = $#hits+1;
+
+ # sort hits by bit score or phylogenetic distance
+ @hits = sort {$b->{'score'} <=> $a->{'score'}} @hits;
+
+ # identify self (query) information
+ my $isQuery = 0;
+ for ($i=0; $i<=$#hits; $i++) {
+ @a = split(/\//, $hits[$i]{'accns'});
+ foreach (@a) {
+ if ($result{'accn'} eq $_) {
+ $result{'id'} = $i;
+ $result{'taxid'} = $hits[$i]{'taxid'};
+ $result{'score'} = $hits[$i]{'score'};
+ $result{'organism'} = $hits[$i]{'organism'};
+ $isQuery = 1;
+ last;
+ }
+ }
+ last if exists $result{'id'};
+ }
+ unless (exists $result{'id'}) {
+ $result{'id'} = 0;
+ $result{'taxid'} = $hits[0]{'taxid'};
+ $result{'score'} = $hits[0]{'score'};
+ $result{'organism'} = $hits[0]{'organism'};
+ }
+ next unless $result{'score'};
+
+ # Use absolute or relative bit scores
+
+ if ($normalize and not $useDistance) {
+ for ($i=0; $i<=$#hits; $i++) {
+ $hits[$i]{'score'} = sprintf("%.3f", $hits[$i]{'score'}/$result{'score'});
+ }
+ }
+
+ # initialize values of prediction results
+ $result{'in'} = ""; # whether incoming HGT or origination took place within the group
+ $result{'loss'} = ""; # gene loss event
+ $result{'origin'} = ""; # gene origination event
+ $result{'income'} = ""; # incoming HGT event
+ $result{'outcome'} = ""; # outcoming HGT event
+
+ # Summarize numbers and scores #
+ ## 0 - self group, 1 - close groups, 2 - distal gs
+ ## N - number, S - scores
+ ## hit1 - first close hit, hit 2 - first distal hit
+
+ my ($topCloseScore, $topDistalScore) = (0, 0);
+ $result{'N0'} = 0; $result{'N1'} = 0; $result{'N2'} = 0;
+ @a = (); $result{'S0'} = [@a]; $result{'S1'} = [@a]; $result{'S2'} = [@a];
+ for ($i=0; $i<=$#hits; $i++) {
+ if (exists $lvList{$lv{'rank'}}{$hits[$i]{'taxid'}}) {
+ next unless $isQuery;
+ if ($useWeight) { $result{'N0'} += $hits[$i]{'score'}; }
+ else { $result{'N0'} ++; }
+ push @{$result{'S0'}}, $hits[$i]{'score'};
+ } elsif (exists $lvList{$lv{'prank'}}{$hits[$i]{'taxid'}}) {
+ if ($useWeight) { $result{'N1'} += $hits[$i]{'score'}; }
+ else { $result{'N1'} ++; }
+ push @{$result{'S1'}}, $hits[$i]{'score'};
+ $result{'hit1'} = $hits[$i]{'taxid'} unless exists $result{'hit1'};
+ $topCloseScore = $hits[$i]{'score'} unless $topCloseScore;
+ $result{'BBH'} = "" unless exists $result{'BBH'};
+ } else {
+ if ($useWeight) { $result{'N2'} += $hits[$i]{'score'}; }
+ else { $result{'N2'} ++; }
+ push @{$result{'S2'}}, $hits[$i]{'score'};
+ unless (exists $result{'hit2'}) {
+ $result{'hit2'} = $hits[$i]{'taxid'};
+ $result{'hit2accn'} = $hits[$i]{'accn'};
+ }
+ $topDistalScore = $hits[$i]{'score'} unless $topDistalScore;
+ $result{'BBH'} = 1 unless exists $result{'BBH'};
+ }
+ }
+ # more stringent BBH method:
+ # $result{'BBH'} = 1 if ($topDistalScore > $topCloseScore);
+
+ $result{'N0'} = sprintf("%.3f", $result{'N0'});
+ $result{'N1'} = sprintf("%.3f", $result{'N1'});
+ $result{'N2'} = sprintf("%.3f", $result{'N2'});
+
+ if (exists $refs{$result{'accn'}}) {
+ $refs{$result{'accn'}} = [1,$result{'N0'},$result{'N1'},$result{'N2'}];
+ }
+
+ # proteins without non-self hits are considered as de novo originated and not considered for statistics
+ $result{'origin'} = 1 if $result{'N1'}+$result{'N2'} < 0.000001;
+
+ # proteins with hits lower than minimum cutoff are considered as de novo originated and not considered for statistics
+ if ($minHits and ($result{'n'} < $minHits)) { $result{'origin'} = 1; $result{'BBH'} = ""; }
+
+ # Put this record into the master record
+ push @{$results{$set}}, {%result};
+
+ # Record fingerprint #
+ if ($unite) { $s = '0'; }
+ else { $s = $set; }
+
+ if ($result{'N0'} and not $result{'origin'}) { # skip in case distance is used and no distance is available
+ push @{$fpN{$s}{'0'}{'data'}}, $result{'N0'};
+ push @{$fpN{$s}{'1'}{'data'}}, $result{'N1'};
+ push @{$fpN{$s}{'2'}{'data'}}, $result{'N2'};
+ }
+
+ # Show progress
+ print "." and $iProgress++ if ($iProtein/$nProtein >= $iProgress/60);
+ }
+ print "\n";
}
print " done.\n";
@@ -680,622 +680,622 @@
## output raw data for further statistical analysis ##
-if ($outRaw){
- open OUT, ">$wkDir/result/statistics/rawdata.txt";
- print OUT "Query\tSet\tLength\tHits\tSelf\tClose\tDistal\n";
- foreach my $set (sort keys %results){
- $n = @{$results{$set}};
- for ($i=0; $i<$n; $i++){
- my %res = %{$results{$set}[$i]};
- next if $res{'origin'};
- next unless $res{'N0'};
- print OUT "$res{'accn'}\t$set\t$res{'length'}\t$res{'n'}\t$res{'N0'}\t$res{'N1'}\t$res{'N2'}\n";
- }
- }
- close OUT;
- print "Raw data are saved in result/statistics/rawdata.txt.\n";
- print "You may conduct further analyses on these data.\n";
- if ($interactive){
- print "Press Enter to continue, or Ctrl+C to exit:";
- $s = ;
- }
+if ($outRaw) {
+ open OUT, ">$wkDir/result/statistics/rawdata.txt";
+ print OUT "Query\tSet\tLength\tHits\tSelf\tClose\tDistal\n";
+ foreach my $set (sort keys %results) {
+ $n = @{$results{$set}};
+ for ($i=0; $i<$n; $i++) {
+ my %res = %{$results{$set}[$i]};
+ next if $res{'origin'};
+ next unless $res{'N0'};
+ print OUT "$res{'accn'}\t$set\t$res{'length'}\t$res{'n'}\t$res{'N0'}\t$res{'N1'}\t$res{'N2'}\n";
+ }
+ }
+ close OUT;
+ print "Raw data are saved in result/statistics/rawdata.txt.\n";
+ print "You may conduct further analyses on these data.\n";
+ if ($interactive) {
+ print "Press Enter to continue, or Ctrl+C to exit:";
+ $s = ;
+ }
}
## graph fingerprints with R ##
-if ($graphFp){
- print "\nGraphing fingerprints with R...";
- $R->startR;
- print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
- if ($plot3D){
- $R->send("library('rgl')");
- print "\n You chose to display interactive 3D scatter plots. They will be displayed sequentially. Use mouse to rotate plots. Press Enter in the terminal to move to the next plot.\n";
- }
- foreach my $set (sort keys %fpN){
- my $fpre = "$wkDir/result/statistics/".("$set." x ($set ne "0")); # prefix of filename
- my $tpost = " of $set" x ($set ne "0"); # postfix of title
- my @gcode = ('Self', 'Close', 'Distal');
- for (0..2){ # send data to R
- @b = @{$fpN{$set}{$_}{'data'}};
- $_ = sprintf("%.3f", $_) for (@b);
- $R->send("x$_<-c(".join (",", @b).")");
- @b = sort{$a<=>$b}@b;
- $R->send("lim$_<-".$b[$#b]); # find proper xlim
- if ($exOutlier){
- @c = boxplot(@b) if ($exOutlier == 1);
- @c = adjusted_boxplot(@b) if ($exOutlier == 2);
- @c = modified_z(@b) if ($exOutlier == 3);
- if ($b[$#b] > $c[1]){
- for ($i=$#b; $i>=0; $i--){
- if ($b[$i] <= $c[1]){
- $R->send("lim$_<-".$b[$i]);
- last;
- }
- }
- }
- }
- }
- my @xr = ([], [], []); # reference positives
- if (keys %refs){
- foreach my $key (keys %refs){
- if ($refs{$key}[0]){
- push (@{$xr[$_]}, $refs{$key}[$_+1]) for (0..2);
- }
- }
- for (0..2){
- @b = @{$xr[$_]};
- $_ = sprintf("%.3f", $_) for (@b);
- $R->send("xr$_<-c(".join (",", @b).")");
- }
- }
- if ($boxPlot){
- $R->send("pdf('$fpre"."box.pdf',useDingbats=F)");
- $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
- $R->send("boxplot(x0,x1,x2,names=c('self','close','distal'),xlab='Group',ylab='Weight',main='')");
- $R->send("dev.off()");
- }
- if ($selfLow){ @b = (0, 1, 2); }
- else{ @b = (1, 2); }
- if ($histogram){
- for (@b){
- $R->send("pdf('$fpre"."hist.".lc($gcode[$_]).".pdf',useDingbats=F)");
- $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
- $R->send("hist(x$_, breaks=$nBin,freq=F,col='lightgrey',xlab='$gcode[$_] weight',ylab='Probability density',main=''".",xlim=range(0:lim$_)" x ($exOutlier and $exOutlier > 0).")");
- $R->send("dev.off()");
- }
- }
- if ($densityPlot){
- for (@b){
- $R->send("pdf('$fpre"."density.".lc($gcode[$_]).".pdf',useDingbats=F)");
- if ($plotRef){ $R->send("par(mar=c(4,4,1,3)+0.1,mgp=c(2.5,0.75,0))"); }
- else{ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))"); }
- $R->send("d<-density(x$_".(",bw=bw.nrd0(x$_)*$bwF" x ($bwF and $bwF != 1)).")");
- if ($exOutlier){ $R->send("lim<-range(min(d\$x):lim$_)"); }
- else{ $R->send("lim<-range(d\$x[is.finite(d\$x)])"); }
- $R->send("plot(d,lwd=2,xlim=lim,xlab='Weight',ylab='Probability density',main='')");
- if (scalar @{$xr[0]}){
- $R->send("par(new=TRUE)");
- $R->send("plot(density(xr$_".(",bw=bw.nrd0(ab)*$bwF" x ($bwF and $bwF != 1))."),xlim=lim,xaxt='n',yaxt='n',xlab='',ylab='',main='',col=2)");
- $R->send("axis(4,col=2,col.ticks=2)");
- $R->send("mtext('Probability density of true positives',side=4,line=-2,col=2)");
- $R->send("rug(xr$_,ticksize=0.04,lwd=1,col=rgb(1,0,0,0.25))");
- }
- $R->send("dev.off()");
- }
- }
- if ($scatterPlot){
- if ($selfLow){
- for (@b){
- ($i, $j) = ($_, $_+1);
- $j = 0 if ($j == 3);
- $R->send("pdf('$fpre"."scatter.".lc($gcode[$i])."-".lc($gcode[$j]).".pdf',useDingbats=F)");
- $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
- $R->send("plot(x$i,x$j,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim$i),ylim=range(0:lim$j),xlab='$gcode[$i] weight',ylab='$gcode[$j] weight')");
- if (scalar @{$xr[0]}){
- $R->send("points(xr$i,xr$j,pch=16,col=rgb(1,0,0,0.5))");
- $R->send("legend('topright','true positive',pch=16,col='red')");
- }
- $R->send("dev.off()");
- }
- }else{
- $R->send("pdf(\"$fpre"."scatter.pdf\",useDingbats=F)");
- $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
- $R->send("plot(x1,x2,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim1),ylim=range(0:lim2),xlab='Close weight',ylab='Distal weight',main='')");
- if (scalar @{$xr[0]}){
- $R->send("points(xr1,xr2,pch=16,col=rgb(1,0,0,0.5))");
- $R->send("legend('topright','true positive',pch=16,col='red')");
- }
- $R->send("dev.off()");
- }
- }
- if ($plot3D){ # This function is not functioning properly
- # if ($^O=~/Win/){ $R->send("windows()"); }elsif ($^O=~/Mac/){ $R->send("quartz()"); }else{ $R->send("x11()"); }
- $R->send("plot3d(x0,x1,x2,xlab='Self weight',ylab='Close weight',zlab='Distal weight')");
- print "Displaying 3D plot$tpost. Press Enter to move on.";
- $s = ;
- }
- }
- $R->stopR();
- print " done.\n";
- print "Graphs are saved in result/statistics/.\n";
- if ($interactive){
- print "You may take a look at the graphs before proceeding.\n";
- print "Press Enter to continue, or Ctrl+C to exit:";
- $s = ;
- }
+if ($graphFp) {
+ print "\nGraphing fingerprints with R...";
+ $R->startR;
+ print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
+ if ($plot3D) {
+ $R->send("library('rgl')");
+ print "\n You chose to display interactive 3D scatter plots. They will be displayed sequentially. Use mouse to rotate plots. Press Enter in the terminal to move to the next plot.\n";
+ }
+ foreach my $set (sort keys %fpN) {
+ my $fpre = "$wkDir/result/statistics/".("$set." x ($set ne "0")); # prefix of filename
+ my $tpost = " of $set" x ($set ne "0"); # postfix of title
+ my @gcode = ('Self', 'Close', 'Distal');
+ for (0..2) { # send data to R
+ @b = @{$fpN{$set}{$_}{'data'}};
+ $_ = sprintf("%.3f", $_) for (@b);
+ $R->send("x$_<-c(".join (",", @b).")");
+ @b = sort{$a<=>$b}@b;
+ $R->send("lim$_<-".$b[$#b]); # find proper xlim
+ if ($exOutlier) {
+ @c = boxplot(@b) if ($exOutlier == 1);
+ @c = adjusted_boxplot(@b) if ($exOutlier == 2);
+ @c = modified_z(@b) if ($exOutlier == 3);
+ if ($b[$#b] > $c[1]) {
+ for ($i=$#b; $i>=0; $i--) {
+ if ($b[$i] <= $c[1]) {
+ $R->send("lim$_<-".$b[$i]);
+ last;
+ }
+ }
+ }
+ }
+ }
+ my @xr = ([], [], []); # reference positives
+ if (keys %refs) {
+ foreach my $key (keys %refs) {
+ if ($refs{$key}[0]) {
+ push (@{$xr[$_]}, $refs{$key}[$_+1]) for (0..2);
+ }
+ }
+ for (0..2) {
+ @b = @{$xr[$_]};
+ $_ = sprintf("%.3f", $_) for (@b);
+ $R->send("xr$_<-c(".join (",", @b).")");
+ }
+ }
+ if ($boxPlot) {
+ $R->send("pdf('$fpre"."box.pdf',useDingbats=F)");
+ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+ $R->send("boxplot(x0,x1,x2,names=c('self','close','distal'),xlab='Group',ylab='Weight',main='')");
+ $R->send("dev.off()");
+ }
+ if ($selfLow) { @b = (0, 1, 2); }
+ else { @b = (1, 2); }
+ if ($histogram) {
+ for (@b) {
+ $R->send("pdf('$fpre"."hist.".lc($gcode[$_]).".pdf',useDingbats=F)");
+ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+ $R->send("hist(x$_, breaks=$nBin,freq=F,col='lightgrey',xlab='$gcode[$_] weight',ylab='Probability density',main=''".",xlim=range(0:lim$_)" x ($exOutlier and $exOutlier > 0).")");
+ $R->send("dev.off()");
+ }
+ }
+ if ($densityPlot) {
+ for (@b) {
+ $R->send("pdf('$fpre"."density.".lc($gcode[$_]).".pdf',useDingbats=F)");
+ if ($plotRef) { $R->send("par(mar=c(4,4,1,3)+0.1,mgp=c(2.5,0.75,0))"); }
+ else { $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))"); }
+ $R->send("d<-density(x$_".(",bw=bw.nrd0(x$_)*$bwF" x ($bwF and $bwF != 1)).")");
+ if ($exOutlier) { $R->send("lim<-range(min(d\$x):lim$_)"); }
+ else { $R->send("lim<-range(d\$x[is.finite(d\$x)])"); }
+ $R->send("plot(d,lwd=2,xlim=lim,xlab='Weight',ylab='Probability density',main='')");
+ if (scalar @{$xr[0]}) {
+ $R->send("par(new=TRUE)");
+ $R->send("plot(density(xr$_".(",bw=bw.nrd0(ab)*$bwF" x ($bwF and $bwF != 1))."),xlim=lim,xaxt='n',yaxt='n',xlab='',ylab='',main='',col=2)");
+ $R->send("axis(4,col=2,col.ticks=2)");
+ $R->send("mtext('Probability density of true positives',side=4,line=-2,col=2)");
+ $R->send("rug(xr$_,ticksize=0.04,lwd=1,col=rgb(1,0,0,0.25))");
+ }
+ $R->send("dev.off()");
+ }
+ }
+ if ($scatterPlot) {
+ if ($selfLow) {
+ for (@b) {
+ ($i, $j) = ($_, $_+1);
+ $j = 0 if ($j == 3);
+ $R->send("pdf('$fpre"."scatter.".lc($gcode[$i])."-".lc($gcode[$j]).".pdf',useDingbats=F)");
+ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+ $R->send("plot(x$i,x$j,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim$i),ylim=range(0:lim$j),xlab='$gcode[$i] weight',ylab='$gcode[$j] weight')");
+ if (scalar @{$xr[0]}) {
+ $R->send("points(xr$i,xr$j,pch=16,col=rgb(1,0,0,0.5))");
+ $R->send("legend('topright','true positive',pch=16,col='red')");
+ }
+ $R->send("dev.off()");
+ }
+ } else {
+ $R->send("pdf(\"$fpre"."scatter.pdf\",useDingbats=F)");
+ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+ $R->send("plot(x1,x2,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim1),ylim=range(0:lim2),xlab='Close weight',ylab='Distal weight',main='')");
+ if (scalar @{$xr[0]}) {
+ $R->send("points(xr1,xr2,pch=16,col=rgb(1,0,0,0.5))");
+ $R->send("legend('topright','true positive',pch=16,col='red')");
+ }
+ $R->send("dev.off()");
+ }
+ }
+ if ($plot3D) { # This function is not functioning properly
+ # if ($^O=~/Win/) { $R->send("windows()"); } elsif ($^O=~/Mac/) { $R->send("quartz()"); } else { $R->send("x11()"); }
+ $R->send("plot3d(x0,x1,x2,xlab='Self weight',ylab='Close weight',zlab='Distal weight')");
+ print "Displaying 3D plot$tpost. Press Enter to move on.";
+ $s = ;
+ }
+ }
+ $R->stopR();
+ print " done.\n";
+ print "Graphs are saved in result/statistics/.\n";
+ if ($interactive) {
+ print "You may take a look at the graphs before proceeding.\n";
+ print "Press Enter to continue, or Ctrl+C to exit:";
+ $s = ;
+ }
}
## Compute the statistics for the whole genome(s), i.e., phyletic pattern, or "fingerprint" ##
print "\nComputing statistics...";
-if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
- $R->startR;
- print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
- $R->send("library(pastecs)") if ($howCO == 4 and $toolExtrema);
- $R->send("library(diptest)") if $dipTest;
+if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+ $R->startR;
+ print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
+ $R->send("library(pastecs)") if ($howCO == 4 and $toolExtrema);
+ $R->send("library(diptest)") if $dipTest;
}
-foreach my $set (keys %fpN){
- if ($set eq "0"){ print "\n All protein sets:\n"; }else{ print "\n Protein set $set:\n"; }
- foreach my $key ('0','1','2'){
- next unless @{$fpN{$set}{$key}{'data'}};
-
- if ($key eq "0"){ print " Self group:\n"; }elsif ($key eq "1"){ print " Close group:\n"; }elsif ($key eq "2"){ print " Distal group:\n"; }
-
- @a = sort {$a<=>$b} @{$fpN{$set}{$key}{'data'}}; # sort low to high
-
- my $global_cutoff;
- my $computed_cutoff;
- my $use_global = 0;
- my $half_way = median(@a); ########## if computed cutoff > median, use global cutoff
-
- # compute basic statistical parameters
-
- $fpN{$set}{$key}{'n'} = @a;
- $s = 0; $s += $_ for @a;
- $fpN{$set}{$key}{'mean'} = $s/$fpN{$set}{$key}{'n'};
- $s = 0; $s += ($fpN{$set}{$key}{'mean'} - $_)**2 for @a;
- $fpN{$set}{$key}{'stdev'} = sqrt($s/($fpN{$set}{$key}{'n'}-1));
- $fpN{$set}{$key}{'min'} = $a[0];
- $fpN{$set}{$key}{'max'} = $a[$#a];
- $fpN{$set}{$key}{'median'} = median(@a);
- $fpN{$set}{$key}{'mad'} = mad(@a);
- ($fpN{$set}{$key}{'q1'}, $fpN{$set}{$key}{'q3'}) = quantiles(@a);
-
- if ($key eq '0' and not $selfLow){
- $fpN{$set}{$key}{'cutoff'} = 0;
- print " Skipped.\n";
- next;
- }
-
- # determine cutoff using global cutoff
-
- $i = $fpN{$set}{$key}{'n'}*$globalCO;
- if (int($i) == $i){
- $global_cutoff = ($a[$i-1]+$a[$i])/2;
- }else{
- if ($i-int($i) <= int($i)-$i+1){
- $global_cutoff = $a[int($i)];
- }else{
- $global_cutoff = $a[int($i)-1];
- }
- }
- print " Global cutoff ($globalCO) = $global_cutoff.\n";
-
- # override individual cutoff with global cutoff
-
- $use_global = 1 if ((($key eq '0') and ($selfCO eq 'G')) or (($key eq '1') and ($closeCO eq 'G')) or (($key eq '2') and ($distalCO eq 'G')));
-
- # exclude outliers
-
- if ($exOutlier){
- @c = boxplot(@a) if ($exOutlier == 1);
- @c = adjusted_boxplot(@a) if ($exOutlier == 2);
- @c = modified_z(@a) if ($exOutlier == 3);
- if ($a[$#a] > $c[1]){
- for ($i=$#a; $i>=0; $i--){
- if ($a[$i] <= $c[1]){
- @a = @a[0..$i];
- last;
- }
- }
- }
- }
-
- # perform Hartigan's dip test to assess non-unimodality
-
- if ($dipTest){
- print " Performing Hartigan's dip test...";
- $R->send("x<-c(".join (",", @a).")");
- $R->send("dip.test(x)");
- $s = $R->read;
- open OUT, ">$key.out";
- print OUT join(' ', @a);
- close OUT;
-
- if ($s =~ /D = (\S+), p-value [<=] (\S+)\n/){
- print " done.\n";
- print " D = $1, p-value = $2\n";
- if ($dipSig){
- if ($2 >= $dipSig){
- print " The weight distribution is NOT significantly non-unimodal.\n";
- if ($howCO >= 3){
- if ($interactive){
- print " Proceed with statistical analysis anyway (yes) or use global cutoff ($globalCO) instead (NO)? ";
- while (){
- chomp;
- unless ($_){ $use_global = 1; last; }
- if (/^y$/i or /^yes$/i){ last; }
- if (/^n$/i or /^no$/i){ $use_global = 1; last; }
- }
- }else{ $use_global = 1; }
- }
- }else{ print " The weight distribution is significantly non-unimodal.\n"; }
- }else{ print " The weight distribution is ". ("NOT " x ($2 >= 0.05)). "significantly non-unimodal.\n"; }
- }else{ print " failed.\n"; }
- }
-
- # determine cutoff using histogram
-
- print " Generating histogram..." if ($howCO == 3 and not $use_global);
-
- my @freqs = (0)x$nBin;
- my $interval = ($a[$#a]-$a[0])/$nBin;
- my $cid = 0;
- for ($j=1; $j<$nBin; $j++){
- my $high_bound = $interval*$j;
- for ($i=$cid; $i<=$#a; $i++){
- if ($a[$i] < $high_bound){
- $freqs[$j-1] ++;
- }else{
- $cid = $i;
- last;
- }
- }
- }
- $freqs[$nBin-1] = $#a-$cid+1;
- my $local_min = 0; # index of the lowest bar from left
- for ($i=1; $i<$nBin-1; $i++){
- if ($freqs[$i]<$freqs[$i-1] and $freqs[$i]<=$freqs[$i+1]){
- $computed_cutoff = $interval*$i;
- $local_min = $i;
- last;
- }
- }
-
- print " done.\n" if ($howCO == 3 and not $use_global);
-
- # draw histogram
-
- if ($plotHist){
- print " Histogram:";
- @c = sort {$b<=>$a} @freqs;
- $s = 50/$c[0];
- my @widths = (0)x$nBin;
- my @labels = (0)x$nBin;
- for ($i=0; $i<$nBin; $i++){
- $widths[$i] = int($freqs[$i]*$s);
- $labels[$i] = sprintf("%.2f", $interval*$i)."-".sprintf("%.2f", $interval*($i+1));
- }
- @c = sort {length($b)<=>length($a)} @labels;
- $s = length($c[0]);
- print "\n";
- for ($i=0; $i<$nBin; $i++){
- print " "x($s-length($labels[$i]))."$labels[$i] ".("*"x$widths[$i])."$freqs[$i]".(" (local minimum)"x($local_min and $local_min==$i))."\n";
- }
- }
-
- # determine cutoff using kernel density estimation (KDE)
-
- if ($howCO == 4 and not $use_global){
-
- # perform kernel density estimation (KDE)
-
- print " Performing kernel density estimation...";
- my (@dx, @dy); # x- and y-coornidates of density function
- my $KDEstats = "";
-
- # perform KDE using basic R command "density"
-
- if ($toolKDE){
- $R->send("x<-c(".join (",", @a).")");
- if ($bwF and $bwF != 1){
- $R->send("bwx<-bw.nrd0(x)*$bwF");
- $R->send("d<-density(x,bw=bwx)");
- }else{
- $R->send("d<-density(x)");
- }
- @dx = @{$R->get('d$x')};
- @dy = @{$R->get('d$y')};
- }
-
- # perform KDE using self-written Perl code
-
- else{
- my $n = scalar @a;
- my $mean; $mean += $_ for @a; $mean = $mean/$n;
- my $stdev; $stdev += ($mean-$_)**2 for @a; $stdev = sqrt($stdev/($n-1));
- my @Q = quantiles(@a); my $iqr = $Q[1]-$Q[0];
- my $bw;
- if ($stdev == 0 and $iqr == 0){ $bw = 1; }
- elsif ($stdev == 0){ $bw = $iqr/1.34; }
- elsif ($iqr == 0){ $bw = $stdev; }
- elsif ($stdev <= $iqr/1.34){ $bw = $stdev; }
- else{ $bw = $iqr/1.34; }
- $bw = 0.9*$bw*$n**(-1/5); # select bandwidth by Silverman's ¡°rule of thumb¡± (1986)
- $bw = $bw*$bwF if ($bwF and $bwF != 1);
- my ($min, $max) = ($a[0]-3*$bw, $a[$#a]+3*$bw); # cut = 3
- $KDEstats = " N = $n, bandwidth = ".sprintf("%.3f", $bw).".\n";
- for (my $x=$min; $x<=$max; $x+=($max-$min)/511) { # 512 points
- my $e = 0; $e += exp(-(($x-$_)/$bw)**2/2)/sqrt(2*3.1415926536) for @a; # Gaussian kernel
- push @dx, $x;
- push @dy, 1/$n/$bw*$e;
- }
- }
-
- print " done.\n";
- print $KDEstats;
-
- # plot density function on screen
-
- if ($plotKDE){
- print " Density function:\n";
- my $k_x = int(@dx/100);
- $k_x = int(@dx/$plotKDE) if ($plotKDE > 1);
- my $k_y = 0;
- foreach (@dy){
- $k_y = $_ if ($_ > $k_y);
- }
- $k_y = 64/$k_y;
- for ($i=0; $i<=$#dy; $i+=$k_x){
- print " "x(7-length(sprintf("%.2f", $dx[$i]))).sprintf("%.2f", $dx[$i]);
- print " "x (int($dy[$i]*$k_y)+1)."*\n";
- }
- }
-
- my $peak1st;
- my $failed;
- my ($peak_i, $peak_x, $peak_y);
- my ($pit_i, $pit_x, $pit_y);
-
- # identify local extrema using R package "pastecs"
-
- if ($toolExtrema){
- if ($toolKDE < 2){
- $R->send("d<-data.frame(x=c(".join (",", @dx)."),y=c(".join (",", @dy)."))");
- }
- $R->send("tp<-turnpoints(ts(d\$y))");
- if ($R->get('tp$firstispeak') eq "TRUE"){
- if ($R->get('tp$nturns') == 1){ $failed = 1; }
- else{ $peak1st = 1; }
- }else{
- $peak1st = 0;
- }
- unless ($failed){
- my @tpx = @{$R->get("d\$x[tp\$tppos]")};
- my @tpy = @{$R->get("d\$y[tp\$tppos]")};
- if ($peak1st){
- ($peak_x, $peak_y) = ($tpx[0], $tpy[0]);
- ($pit_x, $pit_y) = ($tpx[1], $tpy[1]);
- }else{
- ($peak_x, $peak_y) = ($dx[0], $dy[0]);
- ($pit_x, $pit_y) = ($tpx[0], $tpy[0]);
- }
- }
- }
-
- # identify local extrema using self-written Perl code
-
- else{
- for ($i=0; $i<$#dy; $i++){
- if ($dy[$i] < $dy[$i+1]){ $peak1st = 1; last; }
- elsif ($dy[$i] > $dy[$i+1]){ $peak1st = 0; last; }
- else{ next; }
- }
- if ($peak1st){
- for ($i=1; $i<$#dy; $i++){
- if (($dy[$i-1] <= $dy[$i]) and ($dy[$i] > $dy[$i+1])){
- ($peak_i, $peak_x, $peak_y) = ($i, $dx[$i], $dy[$i]);
- }
- if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])){
- ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
- }
- last if ($peak_i and $pit_i);
- }
- $failed = 1 unless $pit_i;
- }else{
- ($peak_i, $peak_x, $peak_y) = (0, $dx[0], $dy[0]);
- for (my $i=1; $i<$#dx; $i++){
- if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])){
- ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
- }
- last if $pit_i;
- }
- }
- }
-
- if ($failed){
- print " The weight population cannot be clustered using kernel density estimation. This is typically caused by an even distribution of all weights.\n";
- $use_global = 1;
- }else{
-
- # locate cutoff point
-
- if ($modKCO == 0){ # pit
- $computed_cutoff = $pit_x;
- }elsif ($modKCO == 1){ # horizontal midpoint
- $computed_cutoff = ($pit_x+$peak_x)/2;
- }elsif ($modKCO == 2){ # horizontal quantile
- $computed_cutoff = $pit_x-($pit_x-$peak_x)*$qKCO;
- }else{ # vertical quantile
- my $vCO = 0;
- $vCO = $pit_y+($peak_y-$pit_y)*$qKCO;
- for (my $i=$peak_i; $i<$pit_i; $i++){
- if (($dy[$i] >= $vCO) and ($vCO > $dy[$i+1])){
- $computed_cutoff = $dx[$i]+($dx[$i+1]-$dx[$i])*($dy[$i]-$vCO)/($dy[$i]-$dy[$i+1]);
- last;
- }
- }
- }
- }
- }
-
- # determine cutoff using hierarchical clustering
-
- if ($howCO == 5 and not $use_global){
- print " Performing hierarchical clustering...";
- $R->send("x<-c(".join (",", @a).")");
- $R->send("d<-dist(x,method=\"euclidean\")");
- $R->send("fit<-hclust(d,method=\"ward\")");
- print " done.\n";
- my $nCluster = 1;
- while ($nCluster++){
- $R->send("c<-cutree(fit,k=$nCluster)");
- my $clusters = $R->get('c'); # tip: return vector from R
- @c = (()) x $nCluster; # data of clusters
- for ($i=0; $i<=@{$clusters}-1; $i++){
- foreach (1..$nCluster){
- if (@{$clusters}[$i] == $_){
- push @{$c[$_-1]}, $a[$i];
- last;
- }
- }
- }
- for ($i=1; $i<=$#c; $i++){
- @{$c[$i]} = sort {$a <=> $b} @{$c[$i]};
- }
- @b = sort {$a->[0] <=> $b->[0]} @c;
- print " With $nCluster clusters, cutoff is $b[1][0]. Accept? (YES/no) ";
- my $user_okay = 1;
- while (){
- chomp; last unless $_;
- last if (/^y$/i or /^yes$/i);
- if (/^n$/i or /^no$/i){ $user_okay = 0; last; }
- }
- last if $user_okay;
- }
- $computed_cutoff = $b[1][0];
- }
-
- # report cutoff to user
-
- if ($howCO == 0 or ($howCO >= 3 and $use_global)){ # user-defined global cutoff
- $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
- print " Cutoff is $global_cutoff (determined by global cutoff $globalCO)\n";
- }
- if ($howCO == 1 and $unite){ # user-defined individual cutoff
- if ($key eq '0'){ $s = $selfCO; }
- elsif ($key eq '1'){ $s = $closeCO; }
- elsif ($key eq '2'){ $s = $distalCO; }
- if ($s){
- $fpN{$set}{$key}{'cutoff'} = $s;
- print " Cutoff is $s (user-defined).\n";
- }else{
- $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
- print " User-defined cutoff is not available. Use global cutoff $global_cutoff instead.\n";
- }
- }
- if ($howCO >= 3 and not $use_global){ # computed cutoff
- $s = 0; if ($howCO == 3){ $s = "histogram"; }
- elsif ($howCO == 4){ $s = "kernel density estimation"; }
- elsif ($howCO == 5){ $s = "clustering analysis"; }
- if ($computed_cutoff){
- if ($computed_cutoff <= $half_way){
- $fpN{$set}{$key}{'cutoff'} = $computed_cutoff;
- print " Cutoff is ".sprintf("%.3f", $computed_cutoff)." (determined by $s).\n";
- }else{
- $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
- print " ". ucfirst($s) ." identified a cutoff ".sprintf("%.3f", $computed_cutoff)." which is too large. Use global cutoff $global_cutoff instead.\n";
- }
- }else{
- $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
- print " ". ucfirst($s) ." failed to identify a cutoff. Use global cutoff $global_cutoff instead.\n";
- }
- }
- # $fpN{$set}{$key}{'cutoff'} = 0.00001 unless $fpN{$set}{$key}{'cutoff'};
-
- # ask user to enter cutoff
-
- my $user_enter;
- if ($interactive and $fpN{$set}{$key}{'cutoff'}){
- print " Accept? (YES/no) ";
- while (){
- chomp;
- last unless $_;
- last if (/^y$/i or /^yes$/i);
- if (/^n$/i or /^no$/i){ $user_enter = 1; last; }
- }
- }
- if ($user_enter or $howCO == 2){
- print " Enter user-specified cutoff: ";
- while (){
- chomp;
- last if /^\d+\.?\d*$/;
- print " Invalid cutoff value. Re-enter: ";
- }
- $fpN{$set}{$key}{'cutoff'} = $_;
- }
- }
+foreach my $set (keys %fpN) {
+ if ($set eq "0") { print "\n All protein sets:\n"; } else { print "\n Protein set $set:\n"; }
+ foreach my $key ('0','1','2') {
+ next unless @{$fpN{$set}{$key}{'data'}};
+
+ if ($key eq "0") { print " Self group:\n"; } elsif ($key eq "1") { print " Close group:\n"; } elsif ($key eq "2") { print " Distal group:\n"; }
+
+ @a = sort {$a<=>$b} @{$fpN{$set}{$key}{'data'}}; # sort low to high
+
+ my $global_cutoff;
+ my $computed_cutoff;
+ my $use_global = 0;
+ my $half_way = median(@a); ########## if computed cutoff > median, use global cutoff
+
+ # compute basic statistical parameters
+
+ $fpN{$set}{$key}{'n'} = @a;
+ $s = 0; $s += $_ for @a;
+ $fpN{$set}{$key}{'mean'} = $s/$fpN{$set}{$key}{'n'};
+ $s = 0; $s += ($fpN{$set}{$key}{'mean'} - $_)**2 for @a;
+ $fpN{$set}{$key}{'stdev'} = sqrt($s/($fpN{$set}{$key}{'n'}-1));
+ $fpN{$set}{$key}{'min'} = $a[0];
+ $fpN{$set}{$key}{'max'} = $a[$#a];
+ $fpN{$set}{$key}{'median'} = median(@a);
+ $fpN{$set}{$key}{'mad'} = mad(@a);
+ ($fpN{$set}{$key}{'q1'}, $fpN{$set}{$key}{'q3'}) = quantiles(@a);
+
+ if ($key eq '0' and not $selfLow) {
+ $fpN{$set}{$key}{'cutoff'} = 0;
+ print " Skipped.\n";
+ next;
+ }
+
+ # determine cutoff using global cutoff
+
+ $i = $fpN{$set}{$key}{'n'}*$globalCO;
+ if (int($i) == $i) {
+ $global_cutoff = ($a[$i-1]+$a[$i])/2;
+ } else {
+ if ($i-int($i) <= int($i)-$i+1) {
+ $global_cutoff = $a[int($i)];
+ } else {
+ $global_cutoff = $a[int($i)-1];
+ }
+ }
+ print " Global cutoff ($globalCO) = $global_cutoff.\n";
+
+ # override individual cutoff with global cutoff
+
+ $use_global = 1 if ((($key eq '0') and ($selfCO eq 'G')) or (($key eq '1') and ($closeCO eq 'G')) or (($key eq '2') and ($distalCO eq 'G')));
+
+ # exclude outliers
+
+ if ($exOutlier) {
+ @c = boxplot(@a) if ($exOutlier == 1);
+ @c = adjusted_boxplot(@a) if ($exOutlier == 2);
+ @c = modified_z(@a) if ($exOutlier == 3);
+ if ($a[$#a] > $c[1]) {
+ for ($i=$#a; $i>=0; $i--) {
+ if ($a[$i] <= $c[1]) {
+ @a = @a[0..$i];
+ last;
+ }
+ }
+ }
+ }
+
+ # perform Hartigan's dip test to assess non-unimodality
+
+ if ($dipTest) {
+ print " Performing Hartigan's dip test...";
+ $R->send("x<-c(".join (",", @a).")");
+ $R->send("dip.test(x)");
+ $s = $R->read;
+ open OUT, ">$key.out";
+ print OUT join(' ', @a);
+ close OUT;
+
+ if ($s =~ /D = (\S+), p-value [<=] (\S+)\n/) {
+ print " done.\n";
+ print " D = $1, p-value = $2\n";
+ if ($dipSig) {
+ if ($2 >= $dipSig) {
+ print " The weight distribution is NOT significantly non-unimodal.\n";
+ if ($howCO >= 3) {
+ if ($interactive) {
+ print " Proceed with statistical analysis anyway (yes) or use global cutoff ($globalCO) instead (NO)? ";
+ while () {
+ chomp;
+ unless ($_) { $use_global = 1; last; }
+ if (/^y$/i or /^yes$/i) { last; }
+ if (/^n$/i or /^no$/i) { $use_global = 1; last; }
+ }
+ } else { $use_global = 1; }
+ }
+ } else { print " The weight distribution is significantly non-unimodal.\n"; }
+ } else { print " The weight distribution is ". ("NOT " x ($2 >= 0.05)). "significantly non-unimodal.\n"; }
+ } else { print " failed.\n"; }
+ }
+
+ # determine cutoff using histogram
+
+ print " Generating histogram..." if ($howCO == 3 and not $use_global);
+
+ my @freqs = (0)x$nBin;
+ my $interval = ($a[$#a]-$a[0])/$nBin;
+ my $cid = 0;
+ for ($j=1; $j<$nBin; $j++) {
+ my $high_bound = $interval*$j;
+ for ($i=$cid; $i<=$#a; $i++) {
+ if ($a[$i] < $high_bound) {
+ $freqs[$j-1] ++;
+ } else {
+ $cid = $i;
+ last;
+ }
+ }
+ }
+ $freqs[$nBin-1] = $#a-$cid+1;
+ my $local_min = 0; # index of the lowest bar from left
+ for ($i=1; $i<$nBin-1; $i++) {
+ if ($freqs[$i]<$freqs[$i-1] and $freqs[$i]<=$freqs[$i+1]) {
+ $computed_cutoff = $interval*$i;
+ $local_min = $i;
+ last;
+ }
+ }
+
+ print " done.\n" if ($howCO == 3 and not $use_global);
+
+ # draw histogram
+
+ if ($plotHist) {
+ print " Histogram:";
+ @c = sort {$b<=>$a} @freqs;
+ $s = 50/$c[0];
+ my @widths = (0)x$nBin;
+ my @labels = (0)x$nBin;
+ for ($i=0; $i<$nBin; $i++) {
+ $widths[$i] = int($freqs[$i]*$s);
+ $labels[$i] = sprintf("%.2f", $interval*$i)."-".sprintf("%.2f", $interval*($i+1));
+ }
+ @c = sort {length($b)<=>length($a)} @labels;
+ $s = length($c[0]);
+ print "\n";
+ for ($i=0; $i<$nBin; $i++) {
+ print " "x($s-length($labels[$i]))."$labels[$i] ".("*"x$widths[$i])."$freqs[$i]".(" (local minimum)"x($local_min and $local_min==$i))."\n";
+ }
+ }
+
+ # determine cutoff using kernel density estimation (KDE)
+
+ if ($howCO == 4 and not $use_global) {
+
+ # perform kernel density estimation (KDE)
+
+ print " Performing kernel density estimation...";
+ my (@dx, @dy); # x- and y-coornidates of density function
+ my $KDEstats = "";
+
+ # perform KDE using basic R command "density"
+
+ if ($toolKDE) {
+ $R->send("x<-c(".join (",", @a).")");
+ if ($bwF and $bwF != 1) {
+ $R->send("bwx<-bw.nrd0(x)*$bwF");
+ $R->send("d<-density(x,bw=bwx)");
+ } else {
+ $R->send("d<-density(x)");
+ }
+ @dx = @{$R->get('d$x')};
+ @dy = @{$R->get('d$y')};
+ }
+
+ # perform KDE using self-written Perl code
+
+ else {
+ my $n = scalar @a;
+ my $mean; $mean += $_ for @a; $mean = $mean/$n;
+ my $stdev; $stdev += ($mean-$_)**2 for @a; $stdev = sqrt($stdev/($n-1));
+ my @Q = quantiles(@a); my $iqr = $Q[1]-$Q[0];
+ my $bw;
+ if ($stdev == 0 and $iqr == 0) { $bw = 1; }
+ elsif ($stdev == 0) { $bw = $iqr/1.34; }
+ elsif ($iqr == 0) { $bw = $stdev; }
+ elsif ($stdev <= $iqr/1.34) { $bw = $stdev; }
+ else { $bw = $iqr/1.34; }
+ $bw = 0.9*$bw*$n**(-1/5); # select bandwidth by Silverman's ¡°rule of thumb¡± (1986)
+ $bw = $bw*$bwF if ($bwF and $bwF != 1);
+ my ($min, $max) = ($a[0]-3*$bw, $a[$#a]+3*$bw); # cut = 3
+ $KDEstats = " N = $n, bandwidth = ".sprintf("%.3f", $bw).".\n";
+ for (my $x=$min; $x<=$max; $x+=($max-$min)/511) { # 512 points
+ my $e = 0; $e += exp(-(($x-$_)/$bw)**2/2)/sqrt(2*3.1415926536) for @a; # Gaussian kernel
+ push @dx, $x;
+ push @dy, 1/$n/$bw*$e;
+ }
+ }
+
+ print " done.\n";
+ print $KDEstats;
+
+ # plot density function on screen
+
+ if ($plotKDE) {
+ print " Density function:\n";
+ my $k_x = int(@dx/100);
+ $k_x = int(@dx/$plotKDE) if ($plotKDE > 1);
+ my $k_y = 0;
+ foreach (@dy) {
+ $k_y = $_ if ($_ > $k_y);
+ }
+ $k_y = 64/$k_y;
+ for ($i=0; $i<=$#dy; $i+=$k_x) {
+ print " "x(7-length(sprintf("%.2f", $dx[$i]))).sprintf("%.2f", $dx[$i]);
+ print " "x (int($dy[$i]*$k_y)+1)."*\n";
+ }
+ }
+
+ my $peak1st;
+ my $failed;
+ my ($peak_i, $peak_x, $peak_y);
+ my ($pit_i, $pit_x, $pit_y);
+
+ # identify local extrema using R package "pastecs"
+
+ if ($toolExtrema) {
+ if ($toolKDE < 2) {
+ $R->send("d<-data.frame(x=c(".join (",", @dx)."),y=c(".join (",", @dy)."))");
+ }
+ $R->send("tp<-turnpoints(ts(d\$y))");
+ if ($R->get('tp$firstispeak') eq "TRUE") {
+ if ($R->get('tp$nturns') == 1) { $failed = 1; }
+ else { $peak1st = 1; }
+ } else {
+ $peak1st = 0;
+ }
+ unless ($failed) {
+ my @tpx = @{$R->get("d\$x[tp\$tppos]")};
+ my @tpy = @{$R->get("d\$y[tp\$tppos]")};
+ if ($peak1st) {
+ ($peak_x, $peak_y) = ($tpx[0], $tpy[0]);
+ ($pit_x, $pit_y) = ($tpx[1], $tpy[1]);
+ } else {
+ ($peak_x, $peak_y) = ($dx[0], $dy[0]);
+ ($pit_x, $pit_y) = ($tpx[0], $tpy[0]);
+ }
+ }
+ }
+
+ # identify local extrema using self-written Perl code
+
+ else {
+ for ($i=0; $i<$#dy; $i++) {
+ if ($dy[$i] < $dy[$i+1]) { $peak1st = 1; last; }
+ elsif ($dy[$i] > $dy[$i+1]) { $peak1st = 0; last; }
+ else { next; }
+ }
+ if ($peak1st) {
+ for ($i=1; $i<$#dy; $i++) {
+ if (($dy[$i-1] <= $dy[$i]) and ($dy[$i] > $dy[$i+1])) {
+ ($peak_i, $peak_x, $peak_y) = ($i, $dx[$i], $dy[$i]);
+ }
+ if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])) {
+ ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
+ }
+ last if ($peak_i and $pit_i);
+ }
+ $failed = 1 unless $pit_i;
+ } else {
+ ($peak_i, $peak_x, $peak_y) = (0, $dx[0], $dy[0]);
+ for (my $i=1; $i<$#dx; $i++) {
+ if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])) {
+ ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
+ }
+ last if $pit_i;
+ }
+ }
+ }
+
+ if ($failed) {
+ print " The weight population cannot be clustered using kernel density estimation. This is typically caused by an even distribution of all weights.\n";
+ $use_global = 1;
+ } else {
+
+ # locate cutoff point
+
+ if ($modKCO == 0) { # pit
+ $computed_cutoff = $pit_x;
+ } elsif ($modKCO == 1) { # horizontal midpoint
+ $computed_cutoff = ($pit_x+$peak_x)/2;
+ } elsif ($modKCO == 2) { # horizontal quantile
+ $computed_cutoff = $pit_x-($pit_x-$peak_x)*$qKCO;
+ } else { # vertical quantile
+ my $vCO = 0;
+ $vCO = $pit_y+($peak_y-$pit_y)*$qKCO;
+ for (my $i=$peak_i; $i<$pit_i; $i++) {
+ if (($dy[$i] >= $vCO) and ($vCO > $dy[$i+1])) {
+ $computed_cutoff = $dx[$i]+($dx[$i+1]-$dx[$i])*($dy[$i]-$vCO)/($dy[$i]-$dy[$i+1]);
+ last;
+ }
+ }
+ }
+ }
+ }
+
+ # determine cutoff using hierarchical clustering
+
+ if ($howCO == 5 and not $use_global) {
+ print " Performing hierarchical clustering...";
+ $R->send("x<-c(".join (",", @a).")");
+ $R->send("d<-dist(x,method=\"euclidean\")");
+ $R->send("fit<-hclust(d,method=\"ward\")");
+ print " done.\n";
+ my $nCluster = 1;
+ while ($nCluster++) {
+ $R->send("c<-cutree(fit,k=$nCluster)");
+ my $clusters = $R->get('c'); # tip: return vector from R
+ @c = (()) x $nCluster; # data of clusters
+ for ($i=0; $i<=@{$clusters}-1; $i++) {
+ foreach (1..$nCluster) {
+ if (@{$clusters}[$i] == $_) {
+ push @{$c[$_-1]}, $a[$i];
+ last;
+ }
+ }
+ }
+ for ($i=1; $i<=$#c; $i++) {
+ @{$c[$i]} = sort {$a <=> $b} @{$c[$i]};
+ }
+ @b = sort {$a->[0] <=> $b->[0]} @c;
+ print " With $nCluster clusters, cutoff is $b[1][0]. Accept? (YES/no) ";
+ my $user_okay = 1;
+ while () {
+ chomp; last unless $_;
+ last if (/^y$/i or /^yes$/i);
+ if (/^n$/i or /^no$/i) { $user_okay = 0; last; }
+ }
+ last if $user_okay;
+ }
+ $computed_cutoff = $b[1][0];
+ }
+
+ # report cutoff to user
+
+ if ($howCO == 0 or ($howCO >= 3 and $use_global)) { # user-defined global cutoff
+ $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+ print " Cutoff is $global_cutoff (determined by global cutoff $globalCO)\n";
+ }
+ if ($howCO == 1 and $unite) { # user-defined individual cutoff
+ if ($key eq '0') { $s = $selfCO; }
+ elsif ($key eq '1') { $s = $closeCO; }
+ elsif ($key eq '2') { $s = $distalCO; }
+ if ($s) {
+ $fpN{$set}{$key}{'cutoff'} = $s;
+ print " Cutoff is $s (user-defined).\n";
+ } else {
+ $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+ print " User-defined cutoff is not available. Use global cutoff $global_cutoff instead.\n";
+ }
+ }
+ if ($howCO >= 3 and not $use_global) { # computed cutoff
+ $s = 0; if ($howCO == 3) { $s = "histogram"; }
+ elsif ($howCO == 4) { $s = "kernel density estimation"; }
+ elsif ($howCO == 5) { $s = "clustering analysis"; }
+ if ($computed_cutoff) {
+ if ($computed_cutoff <= $half_way) {
+ $fpN{$set}{$key}{'cutoff'} = $computed_cutoff;
+ print " Cutoff is ".sprintf("%.3f", $computed_cutoff)." (determined by $s).\n";
+ } else {
+ $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+ print " ". ucfirst($s) ." identified a cutoff ".sprintf("%.3f", $computed_cutoff)." which is too large. Use global cutoff $global_cutoff instead.\n";
+ }
+ } else {
+ $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+ print " ". ucfirst($s) ." failed to identify a cutoff. Use global cutoff $global_cutoff instead.\n";
+ }
+ }
+ # $fpN{$set}{$key}{'cutoff'} = 0.00001 unless $fpN{$set}{$key}{'cutoff'};
+
+ # ask user to enter cutoff
+
+ my $user_enter;
+ if ($interactive and $fpN{$set}{$key}{'cutoff'}) {
+ print " Accept? (YES/no) ";
+ while () {
+ chomp;
+ last unless $_;
+ last if (/^y$/i or /^yes$/i);
+ if (/^n$/i or /^no$/i) { $user_enter = 1; last; }
+ }
+ }
+ if ($user_enter or $howCO == 2) {
+ print " Enter user-specified cutoff: ";
+ while () {
+ chomp;
+ last if /^\d+\.?\d*$/;
+ print " Invalid cutoff value. Re-enter: ";
+ }
+ $fpN{$set}{$key}{'cutoff'} = $_;
+ }
+ }
}
-if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
- $R->stopR;
+if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+ $R->stopR;
}
print " done.\n";
## output fingerprint ##
-if ($outFp){
- open OUT, ">$wkDir/result/statistics/fingerprint.txt";
- print OUT "#NEXUS\nBEGIN STATISTICS;\n";
- print OUT "\tGroup\tNumber\tMean\tSD\tMin\tMax\tMedian\tMAD\tQ1\tQ3\tCutoff\n";
- foreach my $set (sort keys %fpN){
- foreach ('0','1','2'){
- %h = %{$fpN{$set}{$_}};
- if ($_ eq '0'){ $s = "self"; }elsif ($_ eq '1'){ $s = "close"; }elsif ($_ eq '2'){ $s = "distal"; }
- if ($set eq '0'){ print OUT "all"; }else{ print OUT $set; }
- print OUT "\t$s\t$h{'n'}\t".sprintf("%.2f", $h{'mean'})."\t".sprintf("%.2f", $h{'stdev'})."\t".sprintf("%.2f", $h{'min'})."\t".sprintf("%.2f", $h{'max'})."\t".sprintf("%.2f", $h{'median'})."\t".sprintf("%.2f", $h{'mad'})."\t".sprintf("%.2f", $h{'q1'})."\t".sprintf("%.2f", $h{'q3'})."\t".sprintf("%.2f", $h{'cutoff'})."\n";
- }
- }
- print OUT "END;\n";
- close OUT;
- print "Result is saved in result/statistics/fingerprint.txt.\n";
+if ($outFp) {
+ open OUT, ">$wkDir/result/statistics/fingerprint.txt";
+ print OUT "#NEXUS\nBEGIN STATISTICS;\n";
+ print OUT "\tGroup\tNumber\tMean\tSD\tMin\tMax\tMedian\tMAD\tQ1\tQ3\tCutoff\n";
+ foreach my $set (sort keys %fpN) {
+ foreach ('0','1','2') {
+ %h = %{$fpN{$set}{$_}};
+ if ($_ eq '0') { $s = "self"; } elsif ($_ eq '1') { $s = "close"; } elsif ($_ eq '2') { $s = "distal"; }
+ if ($set eq '0') { print OUT "all"; } else { print OUT $set; }
+ print OUT "\t$s\t$h{'n'}\t".sprintf("%.2f", $h{'mean'})."\t".sprintf("%.2f", $h{'stdev'})."\t".sprintf("%.2f", $h{'min'})."\t".sprintf("%.2f", $h{'max'})."\t".sprintf("%.2f", $h{'median'})."\t".sprintf("%.2f", $h{'mad'})."\t".sprintf("%.2f", $h{'q1'})."\t".sprintf("%.2f", $h{'q3'})."\t".sprintf("%.2f", $h{'cutoff'})."\n";
+ }
+ }
+ print OUT "END;\n";
+ close OUT;
+ print "Result is saved in result/statistics/fingerprint.txt.\n";
}
-if ($interactive){
- print "Press Enter to proceed with prediction, or Ctrl+C to exit:";
- $s = ;
+if ($interactive) {
+ print "Press Enter to proceed with prediction, or Ctrl+C to exit:";
+ $s = ;
}
## conduct bidirectional best hit (BBH) search ##
-if ($BBH == 2){
- print "Conducting bidirectional best hit (BBH) search...\n";
- unless (-e "$wkDir/result/bbh_input.txt"){
- open OUT, ">$wkDir/result/bbh_input.txt";
- foreach my $set (keys %results){
- for ($i=0; $i<@{$results{$set}}; $i++){
- my %res = %{$results{$set}[$i]};
- if (exists $res{'BBH'} and $res{'BBH'}){
- # set id query_accn query_taxid subject_accn subject_taxid
- print OUT $set."\t".$i."\t".$res{'accn'}."\t".$selfinfo{$set}{'taxid'}."\t".$res{'hit2accn'}."\t".$res{'hit2'}."\n";
- }
- }
- }
- }
- $s = $0; $s =~ s/analyzer\.pl$/bbh.pl/;
- system "$^X $s $wkDir";
- unlink "$wkDir/result/bbh_input.txt";
- open IN, "<$wkDir/result/bbh.txt";
- while (){
- s/\s+$//;
- @a = split (/\t/);
- $results{$a[0]}[$a[1]]{'BBH'} = "" if ($a[$#a] ne '1');
- }
- close IN;
- print " done.\n";
+if ($BBH == 2) {
+ print "Conducting bidirectional best hit (BBH) search...\n";
+ unless (-e "$wkDir/result/bbh_input.txt") {
+ open OUT, ">$wkDir/result/bbh_input.txt";
+ foreach my $set (keys %results) {
+ for ($i=0; $i<@{$results{$set}}; $i++) {
+ my %res = %{$results{$set}[$i]};
+ if (exists $res{'BBH'} and $res{'BBH'}) {
+ # set id query_accn query_taxid subject_accn subject_taxid
+ print OUT $set."\t".$i."\t".$res{'accn'}."\t".$selfinfo{$set}{'taxid'}."\t".$res{'hit2accn'}."\t".$res{'hit2'}."\n";
+ }
+ }
+ }
+ }
+ $s = $0; $s =~ s/analyzer\.pl$/bbh.pl/;
+ system "$^X $s $wkDir";
+ unlink "$wkDir/result/bbh_input.txt";
+ open IN, "<$wkDir/result/bbh.txt";
+ while () {
+ s/\s+$//;
+ @a = split (/\t/);
+ $results{$a[0]}[$a[1]]{'BBH'} = "" if ($a[$#a] ne '1');
+ }
+ close IN;
+ print " done.\n";
}
@@ -1306,103 +1306,103 @@
mkdir "$wkDir/result/detail" unless -d "$wkDir/result/detail";
print "Predicting...";
-foreach my $set (keys %results){
- $n = @{$results{$set}};
- for ($i=0; $i<$n; $i++){
- my %res = %{$results{$set}[$i]};
- next if ($minHits && ($res{'n'} < $minHits));
- next if ($minSize && ($res{'length'} < $minSize));
-
- my $fp = $set; $fp = '0' if $unite;
-
- # principle: fewer hits suggests income or loss
- # check if self hits are low (indicating the gene is not prevalent within the group)
- if (($res{'N0'} < $fpN{$fp}{'0'}{'cutoff'}) or ($res{'N0'} == 0)){
- $res{'in'} = 1;
- }
-
- ## predict incoming HGT events ##
- # hits from close sister groups are low, indicating there's no vertical ancestor
- if (($res{'N1'} < $fpN{$fp}{'1'}{'cutoff'}) or ($res{'N1'} == 0)){
- # overall non-self hits are normal, indicating it's not an origination event
- if ($res{'N2'} and $res{'N2'} >= $fpN{$fp}{'2'}{'cutoff'}){
- $res{'income'} = 1;
- $res{'income'} = "" if ($selfLow and (($res{'N0'} > 0) and ($res{'N0'} >= $fpN{$fp}{'0'}{'cutoff'})));
- }
- }
-
- ## predict gene loss events ##
- # hits from close sister groups are normal,
- elsif ($res{'in'}){
- # self hits are low, suggesting the gene is absent in some lineages
- $res{'loss'} = 1;
- }
-
- if (0){ # not available in this version
- ## predict gene origination event ##
- # detection won't work for saturated search result.
- if ($maxHits and ($res{'N0'}+$res{'N1'}+$res{'N2'}) < $maxHits){
- # if every hit is self, then must be origination.
- unless ($res{'N1'}+$res{'N2'}){
- $res{'origin'} = 1;
- }else{
- # if non-self hits are significantly weak.
- $s = 0;
- $s = $res{'S1'}[0] if $res{'N1'};
- $s = $res{'S2'}[0] if ($res{'N2'} and $res{'S2'}[0] > $s);
- }
- }
- }
-
- # predicting gene outcoming events
- # self hits must be normal, excluding any paralogs
- unless ($res{'in'}){
- if ($res{'N1'}+$res{'N2'} and exists($res{'sGenus'}{'q1'})){
- if ($res{'nGenus'}{'max'} > $res{'sGenus'}{'q1'}){ # the best hit from other genera fall within the range of self genus hits
- @a = @{$res{'nGenus'}{'scores'}};
- $s = 0;
- foreach (@a){
- if ($_ < $res{'sGenus'}{'min'}){ last;}
- else{ $s = $_;}
- }
- if ($s > $res{'sGenus'}{'q1'}){
- $res{'genus_outcome'} = $res{'nGenusHit'};
- }
- }
- }
- }
- $results{$set}[$i] = {%res};
- }
-
- ###### Output report ######
-
- open (OUT, ">$wkDir/result/detail/$set.txt");
- @a = ('Query','Length','Product','Hits','Self','Close','Distal','HGT');
- unless ($BBH){
- push (@a, "Loss") if $loss;
- push (@a, "POE") if $POE;
- }
- push @a, "Match";
- print OUT "HGTector result of $set\n".join("\t",@a)."\n";
- for ($i=0; $i<$n; $i++){
- %h = %{$results{$set}[$i]};
- print OUT $h{'query'}."\t".$h{'length'}."\t".$h{'product'}."\t".$h{'n'}."\t";
- print OUT "\n" and next if ($minHits and ($h{'n'} < $minHits)) or ($minSize and ($h{'length'} < $minSize)); #####??????#######
- print OUT sprintf("%.2f", $h{'N0'})."\t".sprintf("%.2f", $h{'N1'})."\t".sprintf("%.2f", $h{'N2'})."\t";
- unless ($BBH){
- print OUT $h{'income'};
- print OUT "\t".$h{'loss'} if $loss;
- print OUT "\t".$h{'origin'} if $POE;
- }else{
- print OUT $h{'BBH'} if exists $h{'BBH'} and $h{'BBH'};
- }
- print OUT "\t";
- if (exists $h{'hit2'} and exists $taxadb{$h{'hit2'}}){
- print OUT $h{'hit2'}." (".$taxadb{$h{'hit2'}}{'name'}.")";
- }
- print OUT "\n";
- }
- close OUT;
+foreach my $set (keys %results) {
+ $n = @{$results{$set}};
+ for ($i=0; $i<$n; $i++) {
+ my %res = %{$results{$set}[$i]};
+ next if ($minHits && ($res{'n'} < $minHits));
+ next if ($minSize && ($res{'length'} < $minSize));
+
+ my $fp = $set; $fp = '0' if $unite;
+
+ # principle: fewer hits suggests income or loss
+ # check if self hits are low (indicating the gene is not prevalent within the group)
+ if (($res{'N0'} < $fpN{$fp}{'0'}{'cutoff'}) or ($res{'N0'} == 0)) {
+ $res{'in'} = 1;
+ }
+
+ ## predict incoming HGT events ##
+ # hits from close sister groups are low, indicating there's no vertical ancestor
+ if (($res{'N1'} < $fpN{$fp}{'1'}{'cutoff'}) or ($res{'N1'} == 0)) {
+ # overall non-self hits are normal, indicating it's not an origination event
+ if ($res{'N2'} and $res{'N2'} >= $fpN{$fp}{'2'}{'cutoff'}) {
+ $res{'income'} = 1;
+ $res{'income'} = "" if ($selfLow and (($res{'N0'} > 0) and ($res{'N0'} >= $fpN{$fp}{'0'}{'cutoff'})));
+ }
+ }
+
+ ## predict gene loss events ##
+ # hits from close sister groups are normal,
+ elsif ($res{'in'}) {
+ # self hits are low, suggesting the gene is absent in some lineages
+ $res{'loss'} = 1;
+ }
+
+ if (0) { # not available in this version
+ ## predict gene origination event ##
+ # detection won't work for saturated search result.
+ if ($maxHits and ($res{'N0'}+$res{'N1'}+$res{'N2'}) < $maxHits) {
+ # if every hit is self, then must be origination.
+ unless ($res{'N1'}+$res{'N2'}) {
+ $res{'origin'} = 1;
+ } else {
+ # if non-self hits are significantly weak.
+ $s = 0;
+ $s = $res{'S1'}[0] if $res{'N1'};
+ $s = $res{'S2'}[0] if ($res{'N2'} and $res{'S2'}[0] > $s);
+ }
+ }
+ }
+
+ # predicting gene outcoming events
+ # self hits must be normal, excluding any paralogs
+ unless ($res{'in'}) {
+ if ($res{'N1'}+$res{'N2'} and exists($res{'sGenus'}{'q1'})) {
+ if ($res{'nGenus'}{'max'} > $res{'sGenus'}{'q1'}) { # the best hit from other genera fall within the range of self genus hits
+ @a = @{$res{'nGenus'}{'scores'}};
+ $s = 0;
+ foreach (@a) {
+ if ($_ < $res{'sGenus'}{'min'}) { last;}
+ else { $s = $_;}
+ }
+ if ($s > $res{'sGenus'}{'q1'}) {
+ $res{'genus_outcome'} = $res{'nGenusHit'};
+ }
+ }
+ }
+ }
+ $results{$set}[$i] = {%res};
+ }
+
+ ###### Output report ######
+
+ open (OUT, ">$wkDir/result/detail/$set.txt");
+ @a = ('Query','Length','Product','Hits','Self','Close','Distal','HGT');
+ unless ($BBH) {
+ push (@a, "Loss") if $loss;
+ push (@a, "POE") if $POE;
+ }
+ push @a, "Match";
+ print OUT "HGTector result of $set\n".join("\t",@a)."\n";
+ for ($i=0; $i<$n; $i++) {
+ %h = %{$results{$set}[$i]};
+ print OUT $h{'query'}."\t".$h{'length'}."\t".$h{'product'}."\t".$h{'n'}."\t";
+ print OUT "\n" and next if ($minHits and ($h{'n'} < $minHits)) or ($minSize and ($h{'length'} < $minSize)); #####??????#######
+ print OUT sprintf("%.2f", $h{'N0'})."\t".sprintf("%.2f", $h{'N1'})."\t".sprintf("%.2f", $h{'N2'})."\t";
+ unless ($BBH) {
+ print OUT $h{'income'};
+ print OUT "\t".$h{'loss'} if $loss;
+ print OUT "\t".$h{'origin'} if $POE;
+ } else {
+ print OUT $h{'BBH'} if exists $h{'BBH'} and $h{'BBH'};
+ }
+ print OUT "\t";
+ if (exists $h{'hit2'} and exists $taxadb{$h{'hit2'}}) {
+ print OUT $h{'hit2'}." (".$taxadb{$h{'hit2'}}{'name'}.")";
+ }
+ print OUT "\n";
+ }
+ close OUT;
}
print " done.\n";
print "Prediction results are saved in result/detail/.\n";
@@ -1414,170 +1414,169 @@
## sub routines ##
##################
-sub median (@){
- my $mid = int(@_/2);
- return $_[0] unless $mid;
- if (@_ % 2) {
- return $_[$mid];
- }else{
- return ($_[$mid-1] + $_[$mid])/2;
- }
+sub median (@) {
+ my $mid = int(@_/2);
+ return $_[0] unless $mid;
+ if (@_ % 2) {
+ return $_[$mid];
+ } else {
+ return ($_[$mid-1] + $_[$mid])/2;
+ }
}
-sub mad (@){
- my @absdev;
- my $median = median(@_);
- foreach my $x(@_){
- push @absdev, abs($x - $median);
- }
- return median(sort{$a<=>$b}@absdev);
+sub mad (@) {
+ my @absdev;
+ my $median = median(@_);
+ foreach my $x(@_) {
+ push @absdev, abs($x - $median);
+ }
+ return median(sort{$a<=>$b}@absdev);
}
-sub quantiles (@){
- my $Q1, my $Q3;
- my $mid = int(@_/2);
- return ($_[0],$_[0]) unless $mid;
- if (@_ % 2) {
- $Q1 = median(@_[0..$mid]);
- $Q3 = median(@_[$mid..$#_]);
- }else{
- $Q1 = median(@_[0..($mid-1)]);
- $Q3 = median(@_[$mid..$#_]);
- }
- return ($Q1,$Q3);
+sub quantiles (@) {
+ my $Q1, my $Q3;
+ my $mid = int(@_/2);
+ return ($_[0],$_[0]) unless $mid;
+ if (@_ % 2) {
+ $Q1 = median(@_[0..$mid]);
+ $Q3 = median(@_[$mid..$#_]);
+ } else {
+ $Q1 = median(@_[0..($mid-1)]);
+ $Q3 = median(@_[$mid..$#_]);
+ }
+ return ($Q1,$Q3);
}
# compute Z-score (Z = (xi-x^)/s)
-sub z_scores(@){
- my $mean = 0;
- $mean += $_ for @_;
- $mean = $mean / @_;
- my $stdev = 0;
- $stdev += ($mean-$_)**2 for @_;
- $stdev = sqrt($stdev/(@_-1));
- my @z = ();
- push (@z, ($_-$mean)/$stdev) for @_;
- return @z;
+sub z_scores(@) {
+ my $mean = 0;
+ $mean += $_ for @_;
+ $mean = $mean / @_;
+ my $stdev = 0;
+ $stdev += ($mean-$_)**2 for @_;
+ $stdev = sqrt($stdev/(@_-1));
+ my @z = ();
+ push (@z, ($_-$mean)/$stdev) for @_;
+ return @z;
}
# Z-score test for outliers (|Z| > 3)
-sub z_test(@){
- my @data = sort{$a<=>$b}@_;
- my @z = z_scores(@data);
- my $lower_fence = $data[0];
- my $upper_fence = $data[$#data];
- for (my $i=0; $i<=$#data; $i++){
- if (abs($z[$i]) <= 3){
- $lower_fence = $data[$i];
- last;
- }
- }
- for (my $i=$#data; $i>=0; $i--){
- if (abs($z[$i]) <= 3){
- $upper_fence = $data[$i];
- last;
- }
- }
- return ($lower_fence, $upper_fence);
+sub z_test(@) {
+ my @data = sort{$a<=>$b}@_;
+ my @z = z_scores(@data);
+ my $lower_fence = $data[0];
+ my $upper_fence = $data[$#data];
+ for (my $i=0; $i<=$#data; $i++) {
+ if (abs($z[$i]) <= 3) {
+ $lower_fence = $data[$i];
+ last;
+ }
+ }
+ for (my $i=$#data; $i>=0; $i--) {
+ if (abs($z[$i]) <= 3) {
+ $upper_fence = $data[$i];
+ last;
+ }
+ }
+ return ($lower_fence, $upper_fence);
}
# modified Z-score test for outliers (|modified_Z| > 3.5) (Iglewicz and Hoaglin, 1993)
-sub modified_z(@){
- my @data = sort{$a<=>$b}@_;
- my $lower_fence = $data[0];
- my $upper_fence = $data[$#data];
- my $median = median(@data);
- my $mad = mad(@data);
- return ($data[0],$data[$#data]) unless $mad;
- for (my $i=0; $i<=$#data; $i++){
- if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5){
- $lower_fence = $data[$i];
- last;
- }
- }
- for (my $i=$#data; $i>=0; $i--){
- if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5){
- $upper_fence = $data[$i];
- last;
- }
- }
- return ($lower_fence, $upper_fence);
+sub modified_z(@) {
+ my @data = sort{$a<=>$b}@_;
+ my $lower_fence = $data[0];
+ my $upper_fence = $data[$#data];
+ my $median = median(@data);
+ my $mad = mad(@data);
+ return ($data[0],$data[$#data]) unless $mad;
+ for (my $i=0; $i<=$#data; $i++) {
+ if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5) {
+ $lower_fence = $data[$i];
+ last;
+ }
+ }
+ for (my $i=$#data; $i>=0; $i--) {
+ if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5) {
+ $upper_fence = $data[$i];
+ last;
+ }
+ }
+ return ($lower_fence, $upper_fence);
}
# boxplot test for outliers
-sub boxplot(@){
- my $lower_fence, my $upper_fence;
- my @data = sort{$a<=>$b}@_;
- my @Q = quantiles(@data);
- my $iqr = $Q[1]-$Q[0];
- my $f = 3*exp(10/@data);
- $lower_fence = $Q[0]-$f*$iqr;
- $upper_fence = $Q[1]+$f*$iqr;
- return ($lower_fence, $upper_fence);
+sub boxplot(@) {
+ my $lower_fence, my $upper_fence;
+ my @data = sort{$a<=>$b}@_;
+ my @Q = quantiles(@data);
+ my $iqr = $Q[1]-$Q[0];
+ my $f = 3*exp(10/@data);
+ $lower_fence = $Q[0]-$f*$iqr;
+ $upper_fence = $Q[1]+$f*$iqr;
+ return ($lower_fence, $upper_fence);
}
# adjusted boxplot test for outliers
-sub adjusted_boxplot(@){
- my $lower_fence, my $upper_fence;
- my @data = sort{$a<=>$b}@_;
- my $median = median(@data);
- my @lower; my @upper;
- foreach (@data){
- push (@lower, $_) if ($_ <= $median);
- push (@upper, $_) if ($_ >= $median);
- }
- my @kernel;
- foreach my $i (@lower){
- foreach my $j (@upper){
- next if ($i == $j);
- push @kernel, (($j-$median)-($median-$i))/($j-$i);
- }
- }
- my $mc = median(sort{$a<=>$b}@kernel);
- my @Q = quantiles(@data);
- my $iqr = $Q[1]-$Q[0];
- my $f = 1.5; # *exp(10/@data);
- if ($mc >= 0){
- $lower_fence = $Q[0]-$f*exp(-3.5*$mc)*$iqr;
- $upper_fence = $Q[1]+$f*exp(4*$mc)*$iqr;
- }else{
- $lower_fence = $Q[0]-$f*exp(-4*$mc)*$iqr;
- $upper_fence = $Q[1]+$f*exp(3.5*$mc)*$iqr;
- }
- return ($lower_fence, $upper_fence);
+sub adjusted_boxplot(@) {
+ my $lower_fence, my $upper_fence;
+ my @data = sort{$a<=>$b}@_;
+ my $median = median(@data);
+ my @lower; my @upper;
+ foreach (@data) {
+ push (@lower, $_) if ($_ <= $median);
+ push (@upper, $_) if ($_ >= $median);
+ }
+ my @kernel;
+ foreach my $i (@lower) {
+ foreach my $j (@upper) {
+ next if ($i == $j);
+ push @kernel, (($j-$median)-($median-$i))/($j-$i);
+ }
+ }
+ my $mc = median(sort{$a<=>$b}@kernel);
+ my @Q = quantiles(@data);
+ my $iqr = $Q[1]-$Q[0];
+ my $f = 1.5; # *exp(10/@data);
+ if ($mc >= 0) {
+ $lower_fence = $Q[0]-$f*exp(-3.5*$mc)*$iqr;
+ $upper_fence = $Q[1]+$f*exp(4*$mc)*$iqr;
+ } else {
+ $lower_fence = $Q[0]-$f*exp(-4*$mc)*$iqr;
+ $upper_fence = $Q[1]+$f*exp(3.5*$mc)*$iqr;
+ }
+ return ($lower_fence, $upper_fence);
}
-sub recurse_deOutlier(@){ # assume data are high to low
- my @data = @_;
- my @fences = adjusted_boxplot @data;
- for ($i=0; $i<=$#data; $i++){
- if ($data[$i] < $fences[0]){
- $i--;
- last;
- }
- }
- if ($i < $#data){
- @data = @data[0..$i];
- @data = recurse_deOutlier @data;
- }
- return @data;
+sub recurse_deOutlier(@) { # assume data are high to low
+ my @data = @_;
+ my @fences = adjusted_boxplot @data;
+ for ($i=0; $i<=$#data; $i++) {
+ if ($data[$i] < $fences[0]) {
+ $i--;
+ last;
+ }
+ }
+ if ($i < $#data) {
+ @data = @data[0..$i];
+ @data = recurse_deOutlier @data;
+ }
+ return @data;
}
-sub recurse_Z(@){
- my @data = @_;
- if (modified_z(@data) > 3.5){
- pop @data;
- @data = recurse_Z(@data);
- }
- return @data;
+sub recurse_Z(@) {
+ my @data = @_;
+ if (modified_z(@data) > 3.5) {
+ pop @data;
+ @data = recurse_Z(@data);
+ }
+ return @data;
}
-
diff --git a/scripts/bbh.pl b/scripts/bbh.pl
old mode 100755
new mode 100644
index facbe79..24a416d
--- a/scripts/bbh.pl
+++ b/scripts/bbh.pl
@@ -25,17 +25,17 @@
## global variables ##
-my @queries = (); # list of query proteins
-my %done = (); # queries that is already done.
+my @queries = (); # list of query proteins
+my %done = (); # queries that is already done.
-my $query; # current query protein (accession number)
-my $taxid; # target organism
-my $subject; # subject protein (accession number)
-my $eqText; # entrez query text
-my $isBBH; # whether a hit is BBH
+my $query; # current query protein (accession number)
+my $taxid; # target organism
+my $subject; # subject protein (accession number)
+my $eqText; # entrez query text
+my $isBBH; # whether a hit is BBH
-my $iRetry = 0; # current number of retries
-my $isError = 0; # if error
+my $iRetry = 0; # current number of retries
+my $isError = 0; # if error
use LWP::UserAgent;
use HTTP::Request::Common qw(POST);
@@ -50,97 +50,97 @@
my $wkDir = $ARGV[0];
-my $httpBlast = 0; # BLAST mode (0: http, 1: local, 2: remote)
-my $maxHits = 0; # maximum number of valid hits to preserve. if 0 then = nHit
+my $httpBlast = 0; # BLAST mode (0: http, 1: local, 2: remote)
+my $maxHits = 0; # maximum number of valid hits to preserve. if 0 then = nHit
-my $retries = 10; # maximum number of retries
-my $nHit = 100; # number of hits to return
-my $evalue = 0.01; # maximum E-value cutoff
+my $retries = 10; # maximum number of retries
+my $nHit = 100; # number of hits to return
+my $evalue = 0.01; # maximum E-value cutoff
my $blastServer = "http://blast.ncbi.nlm.nih.gov/Blast.cgi";
my $protdb = "nr";
my $blastdbcmd = "blastdbcmd";
my $blastp = "blastp";
-my $threads = 1; # Multiple threads for local BLAST program
+my $threads = 1; # Multiple threads for local BLAST program
## read configurations ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- $httpBlast = $1 if /^httpBlast=(\d)$/;
- $nHit = $1 if /^nHit=(\d+)$/;
- $evalue = $1 if /^evalue=(.+)$/;
- $retries = $1 if /^retries=(\d+)$/;
- $maxHits = $1 if /^maxHits=(\d+)$/;
- $dbBlast = $1 if /^dbBlast=(.+)$/;
- $eqText = $1 if /^eqText=(.+)$/;
- $blastServer = $1 if /^blastServer=(.+)$/;
- $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
- $blastp = $1 if /^blastp=(.+)$/;
- $threads = $1 if /^threads=(\d+)$/;
- }
- close IN;
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ $httpBlast = $1 if /^httpBlast=(\d)$/;
+ $nHit = $1 if /^nHit=(\d+)$/;
+ $evalue = $1 if /^evalue=(.+)$/;
+ $retries = $1 if /^retries=(\d+)$/;
+ $maxHits = $1 if /^maxHits=(\d+)$/;
+ $dbBlast = $1 if /^dbBlast=(.+)$/;
+ $eqText = $1 if /^eqText=(.+)$/;
+ $blastServer = $1 if /^blastServer=(.+)$/;
+ $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
+ $blastp = $1 if /^blastp=(.+)$/;
+ $threads = $1 if /^threads=(\d+)$/;
+ }
+ close IN;
}
## read query list ##
open IN, "<$wkDir/result/bbh_input.txt" or die "Error: input file not accessible.\n";
-while(){
- s/\s+$//;
- @a = split (/\t/);
- # '1' is subject and '2' is query
- %h = ('set', $a[0], 'id', $a[1], 'accn1', $a[2], 'taxid1', $a[3], 'accn2', $a[4], 'taxid2', $a[5]);
- push @queries, {%h};
+while() {
+ s/\s+$//;
+ @a = split (/\t/);
+ # '1' is subject and '2' is query
+ %h = ('set', $a[0], 'id', $a[1], 'accn1', $a[2], 'taxid1', $a[3], 'accn2', $a[4], 'taxid2', $a[5]);
+ push @queries, {%h};
}
close IN;
## read previous results ##
-if (-e "$wkDir/result/bbh.txt"){
- open IN, "<$wkDir/result/bbh.txt";
- while (){
- s/\s+$//;
- @a = split (/\t/);
- $done{$a[0]."_".$a[2]} = 1;
- }
- close IN;
- my $nDone = 0;
- for ($i=0; $i<=$#queries; $i++){
- if (exists $done{$queries[$i]{'set'}."_".$queries[$i]{'accn1'}}){
- $queries[$i]{'done'} = 1;
- $nDone ++;
- }
- }
- if ((scalar @queries) <= $nDone){
- print "Reversal BLAST is already completed. Exiting...\n";
- exit 0;
- }else{
- print "".(scalar @queries)." proteins in total, $nDone completed, remaining ".(scalar @queries - $nDone)." to BLAST.\n";
- }
-}else{
- print "".(scalar @queries)." proteins to BLAST.\n";
+if (-e "$wkDir/result/bbh.txt") {
+ open IN, "<$wkDir/result/bbh.txt";
+ while () {
+ s/\s+$//;
+ @a = split (/\t/);
+ $done{$a[0]."_".$a[2]} = 1;
+ }
+ close IN;
+ my $nDone = 0;
+ for ($i=0; $i<=$#queries; $i++) {
+ if (exists $done{$queries[$i]{'set'}."_".$queries[$i]{'accn1'}}) {
+ $queries[$i]{'done'} = 1;
+ $nDone ++;
+ }
+ }
+ if ((scalar @queries) <= $nDone) {
+ print "Reversal BLAST is already completed. Exiting...\n";
+ exit 0;
+ } else {
+ print "".(scalar @queries)." proteins in total, $nDone completed, remaining ".(scalar @queries - $nDone)." to BLAST.\n";
+ }
+} else {
+ print "".(scalar @queries)." proteins to BLAST.\n";
}
## perform batch BLAST ##
-for ($i=0; $i<=$#queries; $i++){
- next if exists $queries[$i]{'done'};
- $isBBH = 0;
- $query = $queries[$i]{'accn2'};
- $query =~ s/\.[\d]+$//;
- $subject = $queries[$i]{'accn1'};
- $subject =~ s/\.[\d]+$//;
- $taxid = $queries[$i]{'taxid1'};
- $eqText = "txid$taxid [ORGN]";
- print "BLASTing $query ...";
- blast;
- open OUT, ">>$wkDir/result/bbh.txt";
- print OUT $queries[$i]{'set'}."\t".$queries[$i]{'id'}."\t".$queries[$i]{'accn1'}."\t".$queries[$i]{'taxid1'}."\t".$queries[$i]{'accn2'}."\t".$queries[$i]{'taxid2'}."\t$isBBH\n"; close OUT;
- print " $isBBH.\n";
+for ($i=0; $i<=$#queries; $i++) {
+ next if exists $queries[$i]{'done'};
+ $isBBH = 0;
+ $query = $queries[$i]{'accn2'};
+ $query =~ s/\.[\d]+$//;
+ $subject = $queries[$i]{'accn1'};
+ $subject =~ s/\.[\d]+$//;
+ $taxid = $queries[$i]{'taxid1'};
+ $eqText = "txid$taxid [ORGN]";
+ print "BLASTing $query ...";
+ blast;
+ open OUT, ">>$wkDir/result/bbh.txt";
+ print OUT $queries[$i]{'set'}."\t".$queries[$i]{'id'}."\t".$queries[$i]{'accn1'}."\t".$queries[$i]{'taxid1'}."\t".$queries[$i]{'accn2'}."\t".$queries[$i]{'taxid2'}."\t$isBBH\n"; close OUT;
+ print " $isBBH.\n";
}
unlink "$wkDir/blast.seq";
print "Reversal BLAST is completed.\n";
@@ -152,89 +152,86 @@
## perform BLAST ##
-sub blast (){
-
- ## BLAST using standalone ncbi-blast+ program ##
-
- if ($httpBlast){
- my @hits;
- # the report contains: accession, gi, length, taxid, sequence, title
- my @out = `$blastdbcmd -db $dbBlast -entry $query -outfmt \"%a %g %l %T %s %t\"`;
- foreach (@out){
- s/\s+$//; @b = split (/\s+/);
- last if ($b[0] eq $query or $b[0] =~/^$query\.\d+$/);
- }
- my $length = $b[2];
- open TMP, ">$wkDir/blast.seq"; print TMP $b[4]; close TMP;
- # the report contains: subject accessions (all), E-value, bit score, aligned part of subject sequence
- @out = `$blastp -query $wkDir/blast.seq -db $dbBlast -entrez_query \"$eqText\" -remote -outfmt \"6 sallacc evalue bitscore\"`;
- @a = split (/\t/, $out[0]);
- my @accns = split (/;/, $a[0]);
- foreach (@accns){
- if ($_ eq $subject){ $isBBH = 1; last; }
- }
- }
-
- ## BLAST via http connection to NCBI server ##
-
- else{
- $isError = 0;
- my $args = "CMD=Put&PROGRAM=blastp&DATABASE=$dbBlast&QUERY=$query&EQ_TEXT=$eqText";
- my $req = new HTTP::Request POST => $blastServer;
- $req->content_type('application/x-www-form-urlencoded');
- $req->content($args);
- my $response = $ua->request($req);
- my $rid;
- if ($response->content =~ /^ RID = (.*$)/m){ $rid = $1; }else{ retry; return; };
- if ($response->content =~ /^ RTOE = (.*$)/m){ sleep $1; }else{ retry; return; };
- while (1){
- sleep 5;
- $args = "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
- $req = new HTTP::Request GET => $args;
- $response = $ua->request($req);
- if ($response->content =~ /\s+Status=WAITING/m){ next; }
- if ($response->content =~ /\s+Status=FAILED/m){ $isError = 1; last; }
- if ($response->content =~ /\s+Status=UNKNOWN/m){ $isError = 1; last; }
- if ($response->content =~ /\s+Status=READY/m){
- if ($response->content =~ /\s+ThereAreHits=yes/m){ last; }
- else{ last; } # no hits;
- }
- $isError = 1;
- last;
- }
- if ($isError){ retry; return; }
- $args = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
- $req = new HTTP::Request GET => $args;
- $response = $ua->request($req);
- if ($response->content !~ /blastp/){ retry; return; }
- my @out = split(/\n/, $response->content);
- my $read = 0;
- foreach (@out){
- if (/hits? found/){ $read = 1; next; }
- next unless $read;
-# print;
- @a = split (/\t/);
- if ($a[1] =~ $subject){ $isBBH = 1; }
- last;
- }
- unless ($read){ retry; return; }
- }
- return;
+sub blast () {
+
+ ## BLAST using standalone ncbi-blast+ program ##
+
+ if ($httpBlast) {
+ my @hits;
+ # the report contains: accession, gi, length, taxid, sequence, title
+ my @out = `$blastdbcmd -db $dbBlast -entry $query -outfmt \"%a %g %l %T %s %t\"`;
+ foreach (@out) {
+ s/\s+$//; @b = split (/\s+/);
+ last if ($b[0] eq $query or $b[0] =~/^$query\.\d+$/);
+ }
+ my $length = $b[2];
+ open TMP, ">$wkDir/blast.seq"; print TMP $b[4]; close TMP;
+ # the report contains: subject accessions (all), E-value, bit score, aligned part of subject sequence
+ @out = `$blastp -query $wkDir/blast.seq -db $dbBlast -entrez_query \"$eqText\" -remote -outfmt \"6 sallacc evalue bitscore\"`;
+ @a = split (/\t/, $out[0]);
+ my @accns = split (/;/, $a[0]);
+ foreach (@accns) {
+ if ($_ eq $subject) { $isBBH = 1; last; }
+ }
+ }
+
+ ## BLAST via http connection to NCBI server ##
+
+ else{
+ $isError = 0;
+ my $args = "CMD=Put&PROGRAM=blastp&DATABASE=$dbBlast&QUERY=$query&EQ_TEXT=$eqText";
+ my $req = new HTTP::Request POST => $blastServer;
+ $req->content_type('application/x-www-form-urlencoded');
+ $req->content($args);
+ my $response = $ua->request($req);
+ my $rid;
+ if ($response->content =~ /^ RID = (.*$)/m) { $rid = $1; } else { retry; return; };
+ if ($response->content =~ /^ RTOE = (.*$)/m) { sleep $1; } else { retry; return; };
+ while (1) {
+ sleep 5;
+ $args = "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+ $req = new HTTP::Request GET => $args;
+ $response = $ua->request($req);
+ if ($response->content =~ /\s+Status=WAITING/m) { next; }
+ if ($response->content =~ /\s+Status=FAILED/m) { $isError = 1; last; }
+ if ($response->content =~ /\s+Status=UNKNOWN/m) { $isError = 1; last; }
+ if ($response->content =~ /\s+Status=READY/m) {
+ if ($response->content =~ /\s+ThereAreHits=yes/m) { last; }
+ else{ last; } # no hits;
+ }
+ $isError = 1;
+ last;
+ }
+ if ($isError) { retry; return; }
+ $args = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+ $req = new HTTP::Request GET => $args;
+ $response = $ua->request($req);
+ if ($response->content !~ /blastp/) { retry; return; }
+ my @out = split(/\n/, $response->content);
+ my $read = 0;
+ foreach (@out) {
+ if (/hits? found/) { $read = 1; next; }
+ next unless $read;
+# print;
+ @a = split (/\t/);
+ if ($a[1] =~ $subject) { $isBBH = 1; }
+ last;
+ }
+ unless ($read) { retry; return; }
+ }
+ return;
}
## retry BLAST ##
-sub retry (){
- if ($iRetry < $retries){ # retry
- print ".";
- $iRetry ++;
- sleep 10;
- blast;
- }else{ # fail
- $iRetry = 0;
- $isBBH = "error";
- }
+sub retry () {
+ if ($iRetry < $retries) { # retry
+ print ".";
+ $iRetry ++;
+ sleep 10;
+ blast;
+ } else { # fail
+ $iRetry = 0;
+ $isBBH = "error";
+ }
}
-
-
-
diff --git a/scripts/orthologer.pl b/scripts/orthologer.pl
old mode 100755
new mode 100644
index bae019a..0f90675
--- a/scripts/orthologer.pl
+++ b/scripts/orthologer.pl
@@ -26,72 +26,72 @@
## Program parameters ##
-my $wkDir = $ARGV[0]; # working directory
-my @sets = (); # list of protein sets
+my $wkDir = $ARGV[0]; # working directory
+my @sets = (); # list of protein sets
-my %taxa = (); # set -> taxid
-my %proteins = (); # all input proteins (accn -> id)
+my %taxa = (); # set -> taxid
+my %proteins = (); # all input proteins (accn -> id)
-my @cogs = (); # all COGs. array of hashes, including:
- # accns -> accn1/accn2/accn3...
- # names -> name1 name2 name3...
- # nProtein -> number of proteins
- # nSet -> number of genomes
- # name -> best name
-my %names = (); # id -> names
-my %accns = (); # id -> accns
-my $cogid = 1; # current id
+my @cogs = (); # all COGs. array of hashes, including:
+ # accns -> accn1/accn2/accn3...
+ # names -> name1 name2 name3...
+ # nProtein -> number of proteins
+ # nSet -> number of genomes
+ # name -> best name
+my %names = (); # id -> names
+my %accns = (); # id -> accns
+my $cogid = 1; # current id
-my $evalue = 1e-5; # maximum E-value threshold
+my $evalue = 1e-5; # maximum E-value threshold
-my $pairRule = 3; # pairing rule
+my $pairRule = 3; # pairing rule
## Read configuration ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- @sets = split (/,/, $1) if /^inSets=(.+)$/;
- $evalue = $1 if /^evalue=(.+)$/;
- $pairRule = $1 if /^pairRule=(\d)$/;
- }
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ @sets = split (/,/, $1) if /^inSets=(.+)$/;
+ $evalue = $1 if /^evalue=(.+)$/;
+ $pairRule = $1 if /^pairRule=(\d)$/;
+ }
}
-unless (@sets){
- opendir (DIR, "$wkDir/search");
- @a = readdir(DIR);
- close DIR;
- foreach (@a){
- next if /^\./;
- next unless -d "$wkDir/search/$_";
- push @sets, $_;
- }
+unless (@sets) {
+ opendir (DIR, "$wkDir/search");
+ @a = readdir(DIR);
+ close DIR;
+ foreach (@a) {
+ next if /^\./;
+ next unless -d "$wkDir/search/$_";
+ push @sets, $_;
+ }
}
## Read input proteins ##
print "Identifying orthologous groups (OGs)...\n";
print " Reading input proteins... ";
-foreach my $set (@sets){
- next unless -d "$wkDir/search/$set";
- opendir (DIR, "$wkDir/search/$set");
- my @blasts = grep(/\.txt$/,readdir(DIR));
- close DIR;
- foreach (@blasts){
- /^(.+)\.txt$/;
- %h = ('set', $set, 'id', "");
- $proteins{$1} = {%h};
- }
+foreach my $set (@sets) {
+ next unless -d "$wkDir/search/$set";
+ opendir (DIR, "$wkDir/search/$set");
+ my @blasts = grep(/\.txt$/,readdir(DIR));
+ close DIR;
+ foreach (@blasts) {
+ /^(.+)\.txt$/;
+ %h = ('set', $set, 'id', "");
+ $proteins{$1} = {%h};
+ }
}
print " done. ".(scalar keys %proteins)." proteins read.\n";
## Read input taxonomy ##
open IN, "<$wkDir/taxonomy/self.info";
-while (){
- @a = split(/\t/);
- $taxa{$a[0]} = $a[2];
+while () {
+ @a = split(/\t/);
+ $taxa{$a[0]} = $a[2];
}
close IN;
@@ -101,99 +101,99 @@
##########################################################
print " Parsing BLAST results...\n ";
-foreach my $set (@sets){
- next unless -d "$wkDir/search/$set";
- opendir (DIR, "$wkDir/search/$set");
- my @blasts = grep(/\.txt$/,readdir(DIR));
- close DIR;
- my $myTaxon = $taxa{$set};
- print "$set ";
- foreach (@blasts){
- /^(.+)\.txt$/;
- my $id; # array id of OGs
- my $myAccn = $1;
- next unless exists $proteins{$myAccn};
- open IN, "<$wkDir/search/$set/$_" or next;
- my $product;
- my $reading = 0;
- @b = (); # all accns occurred in this report
- %h = (); # store used taxids, to rule out duplicates.
- while (){
- s/\s+$//; next unless $_;
- last if $reading and /^;/;
- if (/^\tProduct=(.+);$/){ $product = $1; $product =~ s/\s+$//; }
- if (/BEGIN ORGANISM;/){ $reading = 1; next; }
- next unless $reading;
- @a = split (/\t/);
- next if $a[4] > $evalue;
- if ($pairRule == 2 or $pairRule == 4 or $pairRule == 5){ # must be best hit from another organism
- next if $a[2] eq $myTaxon;
- next if exists $h{$a[2]};
- $h{$a[2]} = 1;
- }
- @a = split (/\//, $a[0]);
- foreach (@a){
- next if $_ eq $myAccn;
- if (exists $proteins{$_}){
- next if $proteins{$_}{'id'}; # if a protein is already assigned an OG ID, then skip
- push (@b, $_);
- }
- }
- }
- close IN;
-
- if ($proteins{$myAccn}{'id'}){ # existed OG
- $id = $proteins{$myAccn}{'id'};
- }else{ # new OG
- %h = ('accns', $myAccn, 'names', $product);
- push @cogs, {%h};
- $id = $#cogs;
- $proteins{$myAccn}{'id'} = $id;
- }
-
- # check reversal Blast results
-
- foreach my $accn (@b){
- next if $proteins{$accn}{'id'};
- $product = "";
- $reading = 0;
- my $found = 0; # the source gene is found in the reverse Blast report
- open IN, "<$wkDir/search/$proteins{$accn}{'set'}/$accn.txt" or next;
- while (){
- s/\s+$//; next unless $_;
- last if $reading and /^;/;
- if (/^\tProduct=(.+);$/){
- $product = $1;
- $product =~ s/\s+$//;
- last if ($pairRule < 3); # single direction
- }
- if (/BEGIN ORGANISM;/){ $reading = 1; next; }
- next unless $reading;
- @a = split (/\t/);
- next if $a[4] > $evalue;
- my $taxon = $a[2];
- @a = split (/\//, $a[0]);
- foreach (@a){
- if ($myAccn eq $_){
- $found = 1;
- last;
- }
- }
- last if $found;
- last if (($pairRule == 4) and ($myTaxon eq $taxon) and !$found);
- }
- close IN;
- if ($pairRule >= 3){ # bidirectional hits
- unless ($found){
- # print "not match! $set $myAccn <> $accn.\n";
- next;
- }
- }
- $proteins{$accn}{'id'} = $id;
- $cogs[$id]{'names'} .= "\t$product" if $product;
- $cogs[$id]{'accns'} .= "/$accn";
- }
- }
+foreach my $set (@sets) {
+ next unless -d "$wkDir/search/$set";
+ opendir (DIR, "$wkDir/search/$set");
+ my @blasts = grep(/\.txt$/,readdir(DIR));
+ close DIR;
+ my $myTaxon = $taxa{$set};
+ print "$set ";
+ foreach (@blasts) {
+ /^(.+)\.txt$/;
+ my $id; # array id of OGs
+ my $myAccn = $1;
+ next unless exists $proteins{$myAccn};
+ open IN, "<$wkDir/search/$set/$_" or next;
+ my $product;
+ my $reading = 0;
+ @b = (); # all accns occurred in this report
+ %h = (); # store used taxids, to rule out duplicates.
+ while () {
+ s/\s+$//; next unless $_;
+ last if $reading and /^;/;
+ if (/^\tProduct=(.+);$/) { $product = $1; $product =~ s/\s+$//; }
+ if (/BEGIN ORGANISM;/) { $reading = 1; next; }
+ next unless $reading;
+ @a = split (/\t/);
+ next if $a[4] > $evalue;
+ if ($pairRule == 2 or $pairRule == 4 or $pairRule == 5) { # must be best hit from another organism
+ next if $a[2] eq $myTaxon;
+ next if exists $h{$a[2]};
+ $h{$a[2]} = 1;
+ }
+ @a = split (/\//, $a[0]);
+ foreach (@a) {
+ next if $_ eq $myAccn;
+ if (exists $proteins{$_}) {
+ next if $proteins{$_}{'id'}; # if a protein is already assigned an OG ID, then skip
+ push (@b, $_);
+ }
+ }
+ }
+ close IN;
+
+ if ($proteins{$myAccn}{'id'}) { # existed OG
+ $id = $proteins{$myAccn}{'id'};
+ } else { # new OG
+ %h = ('accns', $myAccn, 'names', $product);
+ push @cogs, {%h};
+ $id = $#cogs;
+ $proteins{$myAccn}{'id'} = $id;
+ }
+
+ # check reversal Blast results
+
+ foreach my $accn (@b) {
+ next if $proteins{$accn}{'id'};
+ $product = "";
+ $reading = 0;
+ my $found = 0; # the source gene is found in the reverse Blast report
+ open IN, "<$wkDir/search/$proteins{$accn}{'set'}/$accn.txt" or next;
+ while () {
+ s/\s+$//; next unless $_;
+ last if $reading and /^;/;
+ if (/^\tProduct=(.+);$/) {
+ $product = $1;
+ $product =~ s/\s+$//;
+ last if ($pairRule < 3); # single direction
+ }
+ if (/BEGIN ORGANISM;/) { $reading = 1; next; }
+ next unless $reading;
+ @a = split (/\t/);
+ next if $a[4] > $evalue;
+ my $taxon = $a[2];
+ @a = split (/\//, $a[0]);
+ foreach (@a) {
+ if ($myAccn eq $_) {
+ $found = 1;
+ last;
+ }
+ }
+ last if $found;
+ last if (($pairRule == 4) and ($myTaxon eq $taxon) and !$found);
+ }
+ close IN;
+ if ($pairRule >= 3) { # bidirectional hits
+ unless ($found) {
+ # print "not match! $set $myAccn <> $accn.\n";
+ next;
+ }
+ }
+ $proteins{$accn}{'id'} = $id;
+ $cogs[$id]{'names'} .= "\t$product" if $product;
+ $cogs[$id]{'accns'} .= "/$accn";
+ }
+ }
}
print "\n done. ".(scalar @cogs)." OGs identified.\n";
@@ -202,51 +202,51 @@
# rule: the shortest name without "hypothetical"
print " Naming OGs...";
-for ($i=0; $i<=$#cogs; $i++){
- @a = split(/\t/, $cogs[$i]{'names'});
- @a = sort {length($a) <=> length($b)}(@a);
- my @noHypo = ();
- foreach (@a){
- push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
- }
- if (@noHypo){ $cogs[$i]{'name'} = "$noHypo[0]"; }
- else{ $cogs[$i]{'name'} = "$a[0]"; }
+for ($i=0; $i<=$#cogs; $i++) {
+ @a = split(/\t/, $cogs[$i]{'names'});
+ @a = sort {length($a) <=> length($b)}(@a);
+ my @noHypo = ();
+ foreach (@a) {
+ push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
+ }
+ if (@noHypo) { $cogs[$i]{'name'} = "$noHypo[0]"; }
+ else{ $cogs[$i]{'name'} = "$a[0]"; }
}
print " done.\n";
-# my $shortName; my $longName;
-# my @noHypo; # subset of names which does not contain "hypothetical"
-# foreach (@a){
-# push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
-# }
-# if (@noHypo){
-# foreach (@noHypo){
-# if (/^([A-Za-z0-9]+) gene product$/ or /^([A-Za-z0-9]+) protein$/ or /^protein ([A-Za-z0-9]+)$/){
-# $shortName = $1; last;
-# }
-# }
-# @noHypo = sort {length($a) <=> length($b)}(@noHypo);
-# $longName = $noHypo[$#noHypo];
-# $shortName or $shortName = $noHypo[0];
-# }else{
-# @a = sort {length($a) <=> length($b)}(@a);
-# $longName = $a[$#a];
-# $shortName or $shortName = $a[0];
-# }
-# $names{$key} = $shortName;
-# # $names{$key} = "$shortName\t$longName";
+# my $shortName; my $longName;
+# my @noHypo; # subset of names which does not contain "hypothetical"
+# foreach (@a) {
+# push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
+# }
+# if (@noHypo) {
+# foreach (@noHypo) {
+# if (/^([A-Za-z0-9]+) gene product$/ or /^([A-Za-z0-9]+) protein$/ or /^protein ([A-Za-z0-9]+)$/) {
+# $shortName = $1; last;
+# }
+# }
+# @noHypo = sort {length($a) <=> length($b)}(@noHypo);
+# $longName = $noHypo[$#noHypo];
+# $shortName or $shortName = $noHypo[0];
+# } else {
+# @a = sort {length($a) <=> length($b)}(@a);
+# $longName = $a[$#a];
+# $shortName or $shortName = $a[0];
+# }
+# $names{$key} = $shortName;
+# # $names{$key} = "$shortName\t$longName";
## Sort COGs ##
print " Sorting OGs...";
-for ($i=0; $i<=$#cogs; $i++){
- @a = split(/\//, $cogs[$i]{'accns'});
- %h = ();
- $h{$proteins{$_}{'set'}} = 1 for @a;
- $cogs[$i]{'nProtein'} = scalar (@a);
- $cogs[$i]{'nSet'} = scalar (keys %h);
+for ($i=0; $i<=$#cogs; $i++) {
+ @a = split(/\//, $cogs[$i]{'accns'});
+ %h = ();
+ $h{$proteins{$_}{'set'}} = 1 for @a;
+ $cogs[$i]{'nProtein'} = scalar (@a);
+ $cogs[$i]{'nSet'} = scalar (keys %h);
}
print " done.\n";
@@ -256,12 +256,11 @@
## Export COGs ##
open OUT, ">$wkDir/taxonomy/orthology.db";
-for ($i=0; $i<=$#cogs; $i++){
- print OUT ($i+1)."|$cogs[$i]{'name'}\t$cogs[$i]{'accns'}\n";
+for ($i=0; $i<=$#cogs; $i++) {
+ print OUT ($i+1)."|$cogs[$i]{'name'}\t$cogs[$i]{'accns'}\n";
}
close OUT;
print " OG names and members exported as taxonomy/orthology.db.\n";
exit 0;
-
diff --git a/scripts/reporter.pl b/scripts/reporter.pl
old mode 100755
new mode 100644
index 295ab3a..2ce63ec
--- a/scripts/reporter.pl
+++ b/scripts/reporter.pl
@@ -31,228 +31,228 @@
my $wkDir = $ARGV[0];
my $interactive = 1;
-my $deHypo = 0; # ignore hypothetical proteins
+my $deHypo = 0; # ignore hypothetical proteins
-my $byDonor = 1; # summarize HGT events by organism
+my $byDonor = 1; # summarize HGT events by organism
my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
-my $sumRank = "order"; # on this rank
-my $defOnly = 1; # ignore those without this rank defined
+my $sumRank = "order"; # on this rank
+my $defOnly = 1; # ignore those without this rank defined
-my $byFunction = 0; # summarize HGT by function
-my $dirFunction; # directory containing functional annotations
+my $byFunction = 0; # summarize HGT by function
+my $dirFunction; # directory containing functional annotations
-my $byOrthology = 0; # generate report by ortholog
-my $smOrthology; # file containing scheme of gene orthology
-my $nameOGs = 1; # Name COGs if necessary
-my $exORFan = 1; # Exclude ORFans
-my @cogs; # clusters of orthologs (COGs) [ID] -> name short (long), predicted?
-my %cogids = (); # clusters of orthologs (COGs): accn -> ID
-my $isCOG = 0; # whether use COG
-my %ORFans = (); # ORFans set -> (array)
+my $byOrthology = 0; # generate report by ortholog
+my $smOrthology; # file containing scheme of gene orthology
+my $nameOGs = 1; # Name COGs if necessary
+my $exORFan = 1; # Exclude ORFans
+my @cogs; # clusters of orthologs (COGs) [ID] -> name short (long), predicted?
+my %cogids = (); # clusters of orthologs (COGs): accn -> ID
+my $isCOG = 0; # whether use COG
+my %ORFans = (); # ORFans set -> (array)
-my $outText = 1; # generate report in plain text
-my $outHTML = 1; # generate report in web page (HTML)
-my $outExcel = 0; # generate report in Excel spreadsheet
-my $detailExcel = 1; # attach detailed output in Excel workbook
+my $outText = 1; # generate report in plain text
+my $outHTML = 1; # generate report in web page (HTML)
+my $outExcel = 0; # generate report in Excel spreadsheet
+my $detailExcel = 1; # attach detailed output in Excel workbook
## global variables ##
-my $title; # title of the analysis
-my @sets; # protein sets (genomes)
+my $title; # title of the analysis
+my @sets; # protein sets (genomes)
-my %results = (); # master variable of results: $results{$set}{$accn}{$match}
+my %results = (); # master variable of results: $results{$set}{$accn}{$match}
-my %totals = (); # total number of genes per genome
+my %totals = (); # total number of genes per genome
my %organisms = ();
-my %taxadb = (); # taxa.db
-my %ranksdb = (); # ranks.db
+my %taxadb = (); # taxa.db
+my %ranksdb = (); # ranks.db
-my $workbook; # Excel workbook
-my $worksheet; # Excel worksheet
-my $excelRow; # active row number
-my $excelTitle; # Excel title format
-my $excelHeader; # Excel header format
-my ($excelGrey, $excelGreen, $excelYellow, $excelRed); # Excel data formats
+my $workbook; # Excel workbook
+my $worksheet; # Excel worksheet
+my $excelRow; # active row number
+my $excelTitle; # Excel title format
+my $excelHeader; # Excel header format
+my ($excelGrey, $excelGreen, $excelYellow, $excelRed); # Excel data formats
my $iLoss = 0; my $iPOE = 0; my $iMatch = 0;
## Read configuration ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- @sets = split (/,/, $1) if /^inSets=(.+)$/;
- $title = $1 if /^title=(.+)$/;
- $deHypo = $1 if /^deHypo=([012])$/;
- $detailExcel = $1 if /^detailExcel=([01])$/;
-
- $byDonor = $1 if /^byDonor=([01])$/;
- @ranks = split (/,/, $1) if /^ranks=(.+)$/;
- $sumRank = $1 if /^sumRank=(.+)$/;
- $defOnly = $1 if /^definedOnly=([01])$/;
-
- $byOrthology = $1 if /^byOrthology=([01])$/;
- $smOrthology = $1 if /^smOrthology=(.+)$/;
- $nameOGs = $1 if /^nameOGs=([01])$/;
- $exORFan = $1 if /^exORFan=([01])$/;
-
- $byFunction = $1 if /^byFunction=([01])$/;
- $dirFunction = $1 if /^dirFunction=(.+)$/;
-
- $outText = $1 if /^outText=([01])$/;
- $outExcel = $1 if /^outExcel=([01])$/;
- $outHTML = $1 if /^outHTML=([01])$/;
-
- $interactive = $1 if /^interactive=([01])$/;
- }
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ @sets = split (/,/, $1) if /^inSets=(.+)$/;
+ $title = $1 if /^title=(.+)$/;
+ $deHypo = $1 if /^deHypo=([012])$/;
+ $detailExcel = $1 if /^detailExcel=([01])$/;
+
+ $byDonor = $1 if /^byDonor=([01])$/;
+ @ranks = split (/,/, $1) if /^ranks=(.+)$/;
+ $sumRank = $1 if /^sumRank=(.+)$/;
+ $defOnly = $1 if /^definedOnly=([01])$/;
+
+ $byOrthology = $1 if /^byOrthology=([01])$/;
+ $smOrthology = $1 if /^smOrthology=(.+)$/;
+ $nameOGs = $1 if /^nameOGs=([01])$/;
+ $exORFan = $1 if /^exORFan=([01])$/;
+
+ $byFunction = $1 if /^byFunction=([01])$/;
+ $dirFunction = $1 if /^dirFunction=(.+)$/;
+
+ $outText = $1 if /^outText=([01])$/;
+ $outExcel = $1 if /^outExcel=([01])$/;
+ $outHTML = $1 if /^outHTML=([01])$/;
+
+ $interactive = $1 if /^interactive=([01])$/;
+ }
}
## check previous result ##
-if (-e "$wkDir/result/summary.txt" or -d "$wkDir/result/HGT"){
- print "Warning: Summarized report from a previous analysis is detected.\n";
- if ($interactive){ print "Press Enter to overwrite, or press Ctrl+C to exit:\n"; $s = ; }
- else{ print "To be overwritten.\n"; }
+if (-e "$wkDir/result/summary.txt" or -d "$wkDir/result/HGT") {
+ print "Warning: Summarized report from a previous analysis is detected.\n";
+ if ($interactive) { print "Press Enter to overwrite, or press Ctrl+C to exit:\n"; $s = ; }
+ else{ print "To be overwritten.\n"; }
}
## Read input protein sets ##
-unless (@sets){
- opendir (DIR, "$wkDir/result/detail") or die "Error: result directory not accessible.\n";
- @sets = grep(/\.txt$/,readdir(DIR)) or die "Error: no result found.\n";
- for ($i=0; $i<=$#sets; $i++){ $sets[$i] =~ s/\.txt$//; }
- close DIR;
+unless (@sets) {
+ opendir (DIR, "$wkDir/result/detail") or die "Error: result directory not accessible.\n";
+ @sets = grep(/\.txt$/,readdir(DIR)) or die "Error: no result found.\n";
+ for ($i=0; $i<=$#sets; $i++) { $sets[$i] =~ s/\.txt$//; }
+ close DIR;
}
## Read taxonomy information ##
open IN, "<$wkDir/taxonomy/taxa.db";
-while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split /\t/;
- %h = ('name',$a[1],'rank',$a[2]);
- $i = 3; $h{$_} = $a[$i++] for (@ranks);
- $taxadb{$a[0]} = {%h};
+while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split /\t/;
+ %h = ('name',$a[1],'rank',$a[2]);
+ $i = 3; $h{$_} = $a[$i++] for (@ranks);
+ $taxadb{$a[0]} = {%h};
}
close IN;
open IN, "<$wkDir/taxonomy/ranks.db";
-while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split /\t/;
- $ranksdb{$a[0]} = $a[1];
+while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split /\t/;
+ $ranksdb{$a[0]} = $a[1];
}
close IN;
## Computing and/or reading COG scheme ##
-if ($byOrthology){
- if ($smOrthology){
- unless (-e $smOrthology){
- print "Orthology scheme file $smOrthology not accessible.\n";
- $smOrthology = "$wkDir/taxonomy/orthology.db";
- }
- }else{ $smOrthology = "$wkDir/taxonomy/orthology.db"; }
- unless (-e $smOrthology){ # Identify orthology scenario using BBH
- $s = $0; $s =~ s/analyzer\.pl$/orthologer.pl/;
- system "$^X $s $wkDir";
- }
- unless (-e $smOrthology){
- print "Error: Identification of orthology failed.\n";
- exit 1;
- }
- if ($nameOGs){ # Name OGs if necessary
- $i = 0;
- open IN, "<$smOrthology";
- while (){
- s/\s+$//; next if /^#/; next unless $_;
- if (/\t/){ $i = 1; }
- last;
- }
- close IN;
- unless ($i){
- system "perl scripts/cogNamer.pl $wkDir $smOrthology $wkDir/taxonomy/orthology.named.db";
- }
- $smOrthology = "$wkDir/taxonomy/orthology.named.db" if -e "$wkDir/taxonomy/orthology.named.db";
- }
- open IN, "<$smOrthology";
- while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split (/\t/);
- @b = split (/[\s\/,]/, $a[$#a]);
- next if ($exORFan and !$#b);
- %h = ();
- $h{'name'} = "";
- $h{'name'} = $a[0] if ($#a);
- $h{'data'} = [("") x @sets];
- push @cogs, {%h};
- $cogids{$_} = $#cogs for @b;
- }
- close IN;
- $isCOG = 1 if @cogs;
+if ($byOrthology) {
+ if ($smOrthology) {
+ unless (-e $smOrthology) {
+ print "Orthology scheme file $smOrthology not accessible.\n";
+ $smOrthology = "$wkDir/taxonomy/orthology.db";
+ }
+ } else { $smOrthology = "$wkDir/taxonomy/orthology.db"; }
+ unless (-e $smOrthology) { # Identify orthology scenario using BBH
+ $s = $0; $s =~ s/analyzer\.pl$/orthologer.pl/;
+ system "$^X $s $wkDir";
+ }
+ unless (-e $smOrthology) {
+ print "Error: Identification of orthology failed.\n";
+ exit 1;
+ }
+ if ($nameOGs) { # Name OGs if necessary
+ $i = 0;
+ open IN, "<$smOrthology";
+ while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ if (/\t/) { $i = 1; }
+ last;
+ }
+ close IN;
+ unless ($i) {
+ system "perl scripts/cogNamer.pl $wkDir $smOrthology $wkDir/taxonomy/orthology.named.db";
+ }
+ $smOrthology = "$wkDir/taxonomy/orthology.named.db" if -e "$wkDir/taxonomy/orthology.named.db";
+ }
+ open IN, "<$smOrthology";
+ while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split (/\t/);
+ @b = split (/[\s\/,]/, $a[$#a]);
+ next if ($exORFan and !$#b);
+ %h = ();
+ $h{'name'} = "";
+ $h{'name'} = $a[0] if ($#a);
+ $h{'data'} = [("") x @sets];
+ push @cogs, {%h};
+ $cogids{$_} = $#cogs for @b;
+ }
+ close IN;
+ $isCOG = 1 if @cogs;
}
## Read prediction results ##
-foreach my $set (@sets){
- my %result = ();
- my %titles = ();
- @a = (); $ORFans{$set} = [@a];
- open IN, "<$wkDir/result/detail/$set.txt";
- while (){
- s/\s+$//; next unless $_; next if (/HGTector/);
- my @a = split (/\t/);
- if (/^Query\t/){
- next if ($iLoss+$iPOE+$iMatch);
- for ($i=0; $i<=$#a; $i++){
- $iLoss = $i if ($a[$i] eq "Loss");
- $iPOE = $i if ($a[$i] eq "POE");
- $iMatch = $i if ($a[$i] eq "Match");
- }
- next;
- }
- if ($deHypo){ # ignore hypothetical proteins
- if ($isCOG and exists $cogids{$a[0]}){
- my $name = $cogs[$cogids{$a[0]}]{'name'};
- next if ($name =~ /hypothetical/ or $name =~ /hypotethical/ or $name =~ /hypothetcial/);
- }else{
- next if ($a[2] =~ /hypothetical/ or $a[2] =~ /hypotethical/ or $a[2] =~ /hypothetcial/);
- }
- }
- my %gene = ();
- $gene{'length'} = $a[1];
- $gene{'product'} = $a[2];
- $gene{'hits'} = $a[3];
- if ($isCOG){
- if (exists $cogids{$a[0]}){
- $gene{'cogid'} = $cogids{$a[0]};
- }else{
- push @{$ORFans{$set}}, "$a[0]|$a[2]";
- }
- }
- if ($#a > 4){
- $gene{'self'} = $a[4];
- $gene{'close'} = $a[5];
- $gene{'distal'} = $a[6];
- $gene{'hgt'} = $a[7];
- $gene{'loss'} = $a[$iLoss] if $iLoss;
- $gene{'poe'} = $a[$iPOE] if $iPOE;
- $gene{'match'} = $a[$iMatch] if $iMatch;
- }
- $result{$a[0]} = {%gene};
- }
- close IN;
- $results{$set} = {%result};
- @a = keys %result;
- $totals{$set} = @a;
+foreach my $set (@sets) {
+ my %result = ();
+ my %titles = ();
+ @a = (); $ORFans{$set} = [@a];
+ open IN, "<$wkDir/result/detail/$set.txt";
+ while () {
+ s/\s+$//; next unless $_; next if (/HGTector/);
+ my @a = split (/\t/);
+ if (/^Query\t/) {
+ next if ($iLoss+$iPOE+$iMatch);
+ for ($i=0; $i<=$#a; $i++) {
+ $iLoss = $i if ($a[$i] eq "Loss");
+ $iPOE = $i if ($a[$i] eq "POE");
+ $iMatch = $i if ($a[$i] eq "Match");
+ }
+ next;
+ }
+ if ($deHypo) { # ignore hypothetical proteins
+ if ($isCOG and exists $cogids{$a[0]}) {
+ my $name = $cogs[$cogids{$a[0]}]{'name'};
+ next if ($name =~ /hypothetical/ or $name =~ /hypotethical/ or $name =~ /hypothetcial/);
+ } else {
+ next if ($a[2] =~ /hypothetical/ or $a[2] =~ /hypotethical/ or $a[2] =~ /hypothetcial/);
+ }
+ }
+ my %gene = ();
+ $gene{'length'} = $a[1];
+ $gene{'product'} = $a[2];
+ $gene{'hits'} = $a[3];
+ if ($isCOG) {
+ if (exists $cogids{$a[0]}) {
+ $gene{'cogid'} = $cogids{$a[0]};
+ } else {
+ push @{$ORFans{$set}}, "$a[0]|$a[2]";
+ }
+ }
+ if ($#a > 4) {
+ $gene{'self'} = $a[4];
+ $gene{'close'} = $a[5];
+ $gene{'distal'} = $a[6];
+ $gene{'hgt'} = $a[7];
+ $gene{'loss'} = $a[$iLoss] if $iLoss;
+ $gene{'poe'} = $a[$iPOE] if $iPOE;
+ $gene{'match'} = $a[$iMatch] if $iMatch;
+ }
+ $result{$a[0]} = {%gene};
+ }
+ close IN;
+ $results{$set} = {%result};
+ @a = keys %result;
+ $totals{$set} = @a;
}
@@ -264,90 +264,90 @@
## load module to access Excel ##
-if ($outExcel){
- eval { require Spreadsheet::WriteExcel; Spreadsheet::WriteExcel->import() };
- print "Error: Perl module Spreadsheet::WriteExcel is not installed.\n" and exit 1 if ($@);
- $workbook = Spreadsheet::WriteExcel->new("$wkDir/result/report.xls");
- $excelTitle = $workbook->add_format (bold=>1, size=>12, valign=>'vcenter');
- $excelHeader = $workbook->add_format(bold=>1, align=>'center', top=>1, bottom=>1);
- $excelGrey = $workbook->add_format(align=>'center', fg_color=>'silver', pattern=>2, border=>1, border_color=>'white');
- $excelGreen = $workbook->add_format(align=>'center', fg_color=>'lime', pattern=>4, border=>1, border_color=>'white');
- $excelYellow = $workbook->add_format(align=>'center', fg_color=>'yellow', pattern=>2, border=>1, border_color=>'white');
- $excelRed = $workbook->add_format(align=>'center', fg_color=>'red', pattern=>4, border=>1, border_color=>'white');
+if ($outExcel) {
+ eval { require Spreadsheet::WriteExcel; Spreadsheet::WriteExcel->import() };
+ print "Error: Perl module Spreadsheet::WriteExcel is not installed.\n" and exit 1 if ($@);
+ $workbook = Spreadsheet::WriteExcel->new("$wkDir/result/report.xls");
+ $excelTitle = $workbook->add_format (bold=>1, size=>12, valign=>'vcenter');
+ $excelHeader = $workbook->add_format(bold=>1, align=>'center', top=>1, bottom=>1);
+ $excelGrey = $workbook->add_format(align=>'center', fg_color=>'silver', pattern=>2, border=>1, border_color=>'white');
+ $excelGreen = $workbook->add_format(align=>'center', fg_color=>'lime', pattern=>4, border=>1, border_color=>'white');
+ $excelYellow = $workbook->add_format(align=>'center', fg_color=>'yellow', pattern=>2, border=>1, border_color=>'white');
+ $excelRed = $workbook->add_format(align=>'center', fg_color=>'red', pattern=>4, border=>1, border_color=>'white');
## This is a good example of conditional formatting code, but I cannot use more than three rules.
-# $excelData = $workbook->add_format (num_format=>'[Red][<=0]0;[Green][<=20]0;[Blue]0');
+# $excelData = $workbook->add_format (num_format=>'[Red][<=0]0;[Green][<=20]0;[Blue]0');
}
-if ($outHTML){
- open HTML, ">$wkDir/result/report.html";
- print HTML "\n\n \n $title\n \n \n";
- print HTML " $title
\n";
+if ($outHTML) {
+ open HTML, ">$wkDir/result/report.html";
+ print HTML "\n\n \n $title\n \n \n";
+ print HTML " $title
\n";
}
## Generate general report ##
mkdir "$wkDir/result/HGT" unless -d "$wkDir/result/HGT";
-if ($iLoss){ mkdir "$wkDir/result/loss" unless -d "$wkDir/result/loss"; }
-if ($iPOE){ mkdir "$wkDir/result/POE" unless -d "$wkDir/result/POE"; }
-if ($outText){
- open OUT, ">$wkDir/result/summary.txt";
+if ($iLoss) { mkdir "$wkDir/result/loss" unless -d "$wkDir/result/loss"; }
+if ($iPOE) { mkdir "$wkDir/result/POE" unless -d "$wkDir/result/POE"; }
+if ($outText) {
+ open OUT, ">$wkDir/result/summary.txt";
}
-if ($outHTML){
- print HTML " Summary
";
- print HTML " \n \n Genome | \n Total | \n HGT | \n";
- print HTML " Loss | \n" if $iLoss;
- print HTML " POE | \n" if $iPOE;
- print HTML "
\n";
+if ($outHTML) {
+ print HTML " Summary
";
+ print HTML " \n \n Genome | \n Total | \n HGT | \n";
+ print HTML " Loss | \n" if $iLoss;
+ print HTML " POE | \n" if $iPOE;
+ print HTML "
\n";
}
-if ($outExcel){
- $worksheet = $workbook->add_worksheet ('Summary');
- $worksheet->set_row (0, 24);
- $worksheet->set_column (0, 4, 12);
- $worksheet->write (0, 0, $title, $excelTitle);
- $worksheet->write (1, 0, ['Genome', 'Total', 'HGT'], $excelHeader);
- $worksheet->write (1, 3, 'Loss', $excelHeader) if $iLoss;
- $i = 3; $i ++ if $iLoss;
- $worksheet->write (1, $i, 'POE', $excelHeader) if $iPOE;
- $excelRow = 2;
+if ($outExcel) {
+ $worksheet = $workbook->add_worksheet ('Summary');
+ $worksheet->set_row (0, 24);
+ $worksheet->set_column (0, 4, 12);
+ $worksheet->write (0, 0, $title, $excelTitle);
+ $worksheet->write (1, 0, ['Genome', 'Total', 'HGT'], $excelHeader);
+ $worksheet->write (1, 3, 'Loss', $excelHeader) if $iLoss;
+ $i = 3; $i ++ if $iLoss;
+ $worksheet->write (1, $i, 'POE', $excelHeader) if $iPOE;
+ $excelRow = 2;
}
my $allhgt; # total number of HGT-derived genes
-foreach my $set (@sets){
- my $hgt = 0; my $loss = 0; my $poe = 0;
- open HGT, ">$wkDir/result/HGT/$set.txt";
- open LOSS, ">$wkDir/result/loss/$set.txt" if $iLoss;
- open POE, ">$wkDir/result/POE/$set.txt" if $iPOE;
- my %result = %{$results{$set}};
- my $total = keys %result;
- foreach my $accn (sort keys %result){
- print HGT $accn."\n" and $hgt ++ if (exists $result{$accn}{'hgt'} and $result{$accn}{'hgt'});
- if ($iLoss){ print LOSS $accn."\n" and $loss ++ if (exists $result{$accn}{'loss'} and $result{$accn}{'loss'}); }
- if ($iPOE){ print POE $accn."\n" and $poe ++ if (exists $result{$accn}{'poe'} and $result{$accn}{'poe'}); }
- }
- close HGT; close LOSS if $iLoss; close POE if $iPOE;
- if ($outText){
- print OUT "$set has $total "; print OUT "non-hypothetical " if $deHypo; print OUT "protein-coding genes.\n";
- print OUT " HGT: $hgt";
- print OUT ", Loss: $loss" if $iLoss;
- print OUT ", POE: $poe" if $iPOE;
- print OUT ".\n\n";
- }
- if ($outHTML){
- print HTML " \n $set | \n $total | \n $hgt | \n";
- print HTML " $loss | \n" if $iLoss;
- print HTML " $poe | \n" if $iPOE;
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet->write ($excelRow, 0, [$set, $total, $hgt]);
- $worksheet->write ($excelRow, 3, $loss) if $iLoss;
- $i = 3; $i ++ if $iLoss;
- $worksheet->write ($excelRow, $i, $poe) if $iPOE;
- $excelRow ++;
- }
- $allhgt += $hgt;
+foreach my $set (@sets) {
+ my $hgt = 0; my $loss = 0; my $poe = 0;
+ open HGT, ">$wkDir/result/HGT/$set.txt";
+ open LOSS, ">$wkDir/result/loss/$set.txt" if $iLoss;
+ open POE, ">$wkDir/result/POE/$set.txt" if $iPOE;
+ my %result = %{$results{$set}};
+ my $total = keys %result;
+ foreach my $accn (sort keys %result) {
+ print HGT $accn."\n" and $hgt ++ if (exists $result{$accn}{'hgt'} and $result{$accn}{'hgt'});
+ if ($iLoss) { print LOSS $accn."\n" and $loss ++ if (exists $result{$accn}{'loss'} and $result{$accn}{'loss'}); }
+ if ($iPOE) { print POE $accn."\n" and $poe ++ if (exists $result{$accn}{'poe'} and $result{$accn}{'poe'}); }
+ }
+ close HGT; close LOSS if $iLoss; close POE if $iPOE;
+ if ($outText) {
+ print OUT "$set has $total "; print OUT "non-hypothetical " if $deHypo; print OUT "protein-coding genes.\n";
+ print OUT " HGT: $hgt";
+ print OUT ", Loss: $loss" if $iLoss;
+ print OUT ", POE: $poe" if $iPOE;
+ print OUT ".\n\n";
+ }
+ if ($outHTML) {
+ print HTML " \n $set | \n $total | \n $hgt | \n";
+ print HTML " $loss | \n" if $iLoss;
+ print HTML " $poe | \n" if $iPOE;
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet->write ($excelRow, 0, [$set, $total, $hgt]);
+ $worksheet->write ($excelRow, 3, $loss) if $iLoss;
+ $i = 3; $i ++ if $iLoss;
+ $worksheet->write ($excelRow, $i, $poe) if $iPOE;
+ $excelRow ++;
+ }
+ $allhgt += $hgt;
}
close OUT if ($outText);
print HTML "
\n" if ($outHTML);
@@ -355,454 +355,453 @@
## Generate by donor group report ##
-if ($byDonor){
- foreach my $set (@sets){
- my %result = %{$results{$set}};
- my $count = 0;
- foreach my $accn (keys %result){
- $count ++;
- next unless exists $result{$accn}{'hgt'};
- next unless $result{$accn}{'hgt'};
- next unless exists $result{$accn}{'match'};
- next unless $result{$accn}{'match'};
- my $organism = $result{$accn}{'match'};
- my $group = "";
- if ($sumRank){
- $organism =~ /^(\d+) \(/;
- if (exists $taxadb{$1} and exists $taxadb{$1}{$sumRank} and $taxadb{$1}{$sumRank} and exists $ranksdb{$taxadb{$1}{$sumRank}}){
- # next unless exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'} eq '28211'; # alphaproteobacteria
- # next unless substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"; # bacteria
- # next if exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'};
- # $organism = $ranksdb{$taxadb{$1}{'phylum'}}.",".$ranksdb{$taxadb{$1}{'class'}}.",".$ranksdb{$taxadb{$1}{'order'}};
- $organism = $ranksdb{$taxadb{$1}{$sumRank}};
- }else{ next; }
- # if (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"){ $organism = "Bacteria"; }
- # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2157/"){ $organism = "Archaea"; }
- # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2759/"){ $organism = "Eukaryota"; }
- # else{ next; }
- }
- $results{$set}{$accn}{'group'} = $organism;
- if (exists $organisms{$organism}){
- if ($organisms{$organism}{$set}){ $organisms{$organism}{$set} .= ",$accn"; }
- else { $organisms{$organism}{$set} .= $accn; }
- }else{
- my %h = ();
- $h{$_} = "" for (@sets);
- $h{$set} = $accn;
- $organisms{$organism} = {%h};
- }
- }
- }
-
- # head row
- if ($outText){
- open OUT, ">$wkDir/result/donor.txt";
- print OUT "Organism\tMean\t".join ("\t", @sets)."\n";
- }
- if ($outHTML){
- print HTML "
\n \n HGT by putative donor group
\n";
- print HTML " HGT-derived genes are summarized by putative donor group as indicated by the best distal match. Note that the real donor might be an ancestor of the match organism, therefore, a higher taxonomic rank is recommended to describe the donor group.";
- print HTML "
green: 1-4, yellow: 5-9, red: 10+
\n";
- print HTML " \n \n Donor | \n Mean | \n";
- print HTML " $_ | \n" for (@sets);
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet = $workbook->add_worksheet ('Donor');
- $worksheet->set_row (0, 24);
- $worksheet->set_column (0, 0, 16);
- $worksheet->set_column (1, scalar @sets, 7);
- $worksheet->write (0, 0, "HGT by donor organism", $excelTitle);
- $worksheet->write (1, 0, "Donor", $excelHeader);
- $worksheet->write (1, 1, "Mean", $excelHeader);
- $worksheet->write (1, 2, \@sets, $excelHeader);
- $excelRow = 3;
- }
-
- # total row
- @a = (); push (@a, $totals{$_}) for (@sets);
- $j = 0; $j += $_ for (@a); $j = sprintf("%d", $j/@a);
- if ($outText){
- print OUT "Total\t$j\t".join ("\t", @a)."\n";
- }
- if ($outHTML){
- print HTML " \n Total | \n";
- print HTML " $j | \n";
- print HTML " $_ | \n" for (@a);
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet->write (2, 0, "Total");
- $worksheet->write (2, 1, $j);
- $worksheet->write (2, 2, \@a);
- }
-
- # table content
- foreach my $organism (sort keys %organisms){
- my @accns = (); # protein accns (comma-separated)
- my @nums = (); # numbers only
- foreach (@sets){
- $s = $organisms{$organism}{$_};
- push (@accns, $s);
- @a = split (/,/, $s);
- $s = @a; push (@nums, $s);
- }
- $j = 0; $j += $_ for (@nums); $j = sprintf("%d", $j/@nums);
- if ($outText){
- print OUT "$organism\t$j\t".join("\t", @nums)."\n";
- }
- if ($outHTML){
- print HTML " \n $organism | \n $j | \n";
- for ($i=0; $i<=$#nums; $i++){
- $n = $nums[$i];
- $t = "";
- $s = join ("
", split (/,/, $accns[$i]));
- if ($n > 0 and $n < 5){ $t = "lightgreen"; }
- elsif ($n >= 5 and $n < 10) { $t = "khaki"; }
- elsif ($n >= 10) { $t = "lightpink"; }
- print HTML " $n | \n";
- }
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet->write ($excelRow, 0, $organism);
- $worksheet->write ($excelRow, 1, $j);
- for ($i=0; $i<=$#nums; $i++){
- next unless $nums[$i]; # (=0)
- if ($nums[$i] < 5){ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelGreen); }
- elsif ($nums[$i] < 10){ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelYellow); }
- else{ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelRed); }
- $worksheet->write_comment ($excelRow, $i+2, join ("\n", split (/,/, $accns[$i])));
- }
- $excelRow ++;
- }
- }
- close OUT if ($outText);
- print HTML "
\n" if ($outHTML);
- print "Report by donor organism generated.\n";
+if ($byDonor) {
+ foreach my $set (@sets) {
+ my %result = %{$results{$set}};
+ my $count = 0;
+ foreach my $accn (keys %result) {
+ $count ++;
+ next unless exists $result{$accn}{'hgt'};
+ next unless $result{$accn}{'hgt'};
+ next unless exists $result{$accn}{'match'};
+ next unless $result{$accn}{'match'};
+ my $organism = $result{$accn}{'match'};
+ my $group = "";
+ if ($sumRank) {
+ $organism =~ /^(\d+) \(/;
+ if (exists $taxadb{$1} and exists $taxadb{$1}{$sumRank} and $taxadb{$1}{$sumRank} and exists $ranksdb{$taxadb{$1}{$sumRank}}) {
+ # next unless exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'} eq '28211'; # alphaproteobacteria
+ # next unless substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"; # bacteria
+ # next if exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'};
+ # $organism = $ranksdb{$taxadb{$1}{'phylum'}}.",".$ranksdb{$taxadb{$1}{'class'}}.",".$ranksdb{$taxadb{$1}{'order'}};
+ $organism = $ranksdb{$taxadb{$1}{$sumRank}};
+ } else { next; }
+ # if (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/") { $organism = "Bacteria"; }
+ # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2157/") { $organism = "Archaea"; }
+ # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2759/") { $organism = "Eukaryota"; }
+ # else{ next; }
+ }
+ $results{$set}{$accn}{'group'} = $organism;
+ if (exists $organisms{$organism}) {
+ if ($organisms{$organism}{$set}) { $organisms{$organism}{$set} .= ",$accn"; }
+ else { $organisms{$organism}{$set} .= $accn; }
+ } else {
+ my %h = ();
+ $h{$_} = "" for (@sets);
+ $h{$set} = $accn;
+ $organisms{$organism} = {%h};
+ }
+ }
+ }
+
+ # head row
+ if ($outText) {
+ open OUT, ">$wkDir/result/donor.txt";
+ print OUT "Organism\tMean\t".join ("\t", @sets)."\n";
+ }
+ if ($outHTML) {
+ print HTML "
\n \n HGT by putative donor group
\n";
+ print HTML " HGT-derived genes are summarized by putative donor group as indicated by the best distal match. Note that the real donor might be an ancestor of the match organism, therefore, a higher taxonomic rank is recommended to describe the donor group.";
+ print HTML "
green: 1-4, yellow: 5-9, red: 10+
\n";
+ print HTML " \n \n Donor | \n Mean | \n";
+ print HTML " $_ | \n" for (@sets);
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet = $workbook->add_worksheet ('Donor');
+ $worksheet->set_row (0, 24);
+ $worksheet->set_column (0, 0, 16);
+ $worksheet->set_column (1, scalar @sets, 7);
+ $worksheet->write (0, 0, "HGT by donor organism", $excelTitle);
+ $worksheet->write (1, 0, "Donor", $excelHeader);
+ $worksheet->write (1, 1, "Mean", $excelHeader);
+ $worksheet->write (1, 2, \@sets, $excelHeader);
+ $excelRow = 3;
+ }
+
+ # total row
+ @a = (); push (@a, $totals{$_}) for (@sets);
+ $j = 0; $j += $_ for (@a); $j = sprintf("%d", $j/@a);
+ if ($outText) {
+ print OUT "Total\t$j\t".join ("\t", @a)."\n";
+ }
+ if ($outHTML) {
+ print HTML " \n Total | \n";
+ print HTML " $j | \n";
+ print HTML " $_ | \n" for (@a);
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet->write (2, 0, "Total");
+ $worksheet->write (2, 1, $j);
+ $worksheet->write (2, 2, \@a);
+ }
+
+ # table content
+ foreach my $organism (sort keys %organisms) {
+ my @accns = (); # protein accns (comma-separated)
+ my @nums = (); # numbers only
+ foreach (@sets) {
+ $s = $organisms{$organism}{$_};
+ push (@accns, $s);
+ @a = split (/,/, $s);
+ $s = @a; push (@nums, $s);
+ }
+ $j = 0; $j += $_ for (@nums); $j = sprintf("%d", $j/@nums);
+ if ($outText) {
+ print OUT "$organism\t$j\t".join("\t", @nums)."\n";
+ }
+ if ($outHTML) {
+ print HTML " \n $organism | \n $j | \n";
+ for ($i=0; $i<=$#nums; $i++) {
+ $n = $nums[$i];
+ $t = "";
+ $s = join ("
", split (/,/, $accns[$i]));
+ if ($n > 0 and $n < 5) { $t = "lightgreen"; }
+ elsif ($n >= 5 and $n < 10) { $t = "khaki"; }
+ elsif ($n >= 10) { $t = "lightpink"; }
+ print HTML " $n | \n";
+ }
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet->write ($excelRow, 0, $organism);
+ $worksheet->write ($excelRow, 1, $j);
+ for ($i=0; $i<=$#nums; $i++) {
+ next unless $nums[$i]; # (=0)
+ if ($nums[$i] < 5) { $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelGreen); }
+ elsif ($nums[$i] < 10) { $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelYellow); }
+ else{ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelRed); }
+ $worksheet->write_comment ($excelRow, $i+2, join ("\n", split (/,/, $accns[$i])));
+ }
+ $excelRow ++;
+ }
+ }
+ close OUT if ($outText);
+ print HTML "
\n" if ($outHTML);
+ print "Report by donor organism generated.\n";
}
## Generate by function report ##
-if ($byFunction and -d $dirFunction){
-
- ## read Blast2GO output ##
-
- my %goes = (); # master record of GOes. GOes{ID} = { $group, $term, $nGene, $nHGT, $genes}
- opendir DIR, $dirFunction;
- my @files = readdir(DIR);
- close DIR;
- foreach my $file (@files){
- my $set = "";
- if ($file =~ /(.+)\.[^.]+$/){ $set = $1; }
- else{ $set = $file; }
- next unless exists $results{$set};
- open IN, "<$dirFunction/$file";
- while (){
- s/\s+$//;
- next if /^SeqName/;
- @a = split (/\t/);
- push @a, "" if $#a == 3;
- my $gene = "";
- if ($a[0] =~ /\|/){
- @b = split (/\|/, $a[0]);
- $b[$#b] =~ s/\.\d+$//;
- if ($b[$#b] =~ /cdsid_(.+)$/){ $gene = $1; }
- else{ $gene = $b[$#b]; }
- }else{
- $a[0] =~ s/\.\d+$//;
- $gene = $a[0];
- }
- next unless $gene;
- next unless exists $results{$set}{$gene};
- $a[3] =~ s/^GO://;
- if (exists $goes{$a[3]}){
- $goes{$a[3]}{$set}{'genes'} .= ",$gene";
- }else{
- %h = ('nGene', 0, 'nHGT', 0, 'genes', "");
- my %go = ('group', $a[2], 'term', $a[4]);
- $go{$_} = {%h} for (@sets);
- $go{$set}{'genes'} = $gene;
- $goes{$a[3]} = {%go};
- }
- $goes{$a[3]}{$set}{'nGene'} ++;
- $goes{$a[3]}{$set}{'nHGT'} ++ if $results{$set}{$gene}{'hgt'};
- if (exists $results{$set}{$gene}{'goes'}){ $results{$set}{$gene}{'goes'} .= "; $a[3] ($a[2]) $a[4]"; }
- else{ $results{$set}{$gene}{'goes'} = "$a[3] ($a[2]) $a[4]"; }
- }
- close IN;
- }
-
- ## write report by function ##
-
- if ($outText){
- open OUT, ">$wkDir/result/function.txt";
- print OUT "GO\tGroup\tTerm".join ("\t", @sets)."\n";
- }
- if ($outHTML){
- print HTML "
\n \n HGT by functional annotation
";
- print HTML " HGT-derived genes are summarized by annotated gene ontology (GO). Cell values are ratio of HGT-derived genes associated with a GO versus all genes associated with this GO. Every GO is counted if multiple GOes are associated with one gene.";
- print HTML "
white: 0, green: (0, 0.1), yellow: [0.1, 0.5), red: [0.5, 1]
\n";
- print HTML " \n \n GO | \n Term | \n";
- print HTML " $_ | \n" for (@sets);
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet = $workbook->add_worksheet ('Function');
- $worksheet->set_row (0, 24);
- $worksheet->set_column (0, 0, 8);
- $worksheet->set_column (1, 1, 40);
- $worksheet->set_column (2, 1 + scalar @sets, 7);
- $worksheet->write (0, 0, "HGT by function", $excelTitle);
- $worksheet->write (1, 0, ["GO","Term"], $excelHeader);
- $worksheet->write (1, 2, \@sets, $excelHeader);
- $excelRow = 2;
- }
- foreach my $group ('C', 'P', 'F'){
- if ($outHTML){
- if ($group eq "C"){ print HTML " Cellular component |
\n"; }
- if ($group eq "P"){ print HTML " Biological process |
\n"; }
- if ($group eq "F"){ print HTML " Molecular function |
\n"; }
- }
- if ($outExcel){
- if ($group eq "C"){ $worksheet->write ($excelRow++, 1, "Cellular component", $excelHeader); }
- if ($group eq "P"){ $worksheet->write ($excelRow++, 1, "Biological process", $excelHeader); }
- if ($group eq "F"){ $worksheet->write ($excelRow++, 1, "Molecular function", $excelHeader); }
- }
- foreach my $go (sort {$goes{$a}{'term'} cmp $goes{$b}{'term'}} keys %goes){
- next unless $goes{$go}{'group'} eq $group;
- next if (($goes{$go}{'term'} eq "cellular_component") or ($goes{$go}{'term'} eq "biological_process") or ($goes{$go}{'term'} eq "molecular_function"));
- if ($outText){ print OUT "$go\t$goes{$go}{'group'}\t$goes{$go}{'term'}"; }
- if ($outHTML){ print HTML " \n $go | \n $goes{$go}{'term'} | \n"; }
- if ($outExcel){
- $worksheet->write_string ($excelRow, 0, $go);
- $worksheet->write_string ($excelRow, 1, $goes{$go}{'term'});
- }
- $j = 2;
- foreach (@sets){
- if ($goes{$go}{$_}{'nGene'} and $goes{$go}{$_}{'nHGT'}){ $i = sprintf ("%.2g", $goes{$go}{$_}{'nHGT'}/$goes{$go}{$_}{'nGene'}); }
- else { $i = 0; }
- if ($outText){ print OUT "\t$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"; }
- if ($outHTML){
- print HTML " 0 and $i < 0.1){ $t = "lightgreen"; }
- elsif ($i >= 0.1 and $i < 0.5) { $t = "khaki"; }
- elsif ($i >= 0.5) { $t = "lightpink"; }
- print HTML " style='background-color: $t;'>$i | \n";
- }else{
- print HTML ">$i\n";
- }
- }
- if ($outExcel){ $worksheet->write_number ($excelRow, $j, $i); $worksheet->write_comment ($excelRow, $j++, "$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"); }
- }
- if ($outText){ print OUT "\n"; }
- if ($outHTML){ print HTML "
\n"; }
- if ($outExcel){ $excelRow++; }
- }
- }
- if ($outText){ close OUT; }
- if ($outHTML){ print HTML "
\n";}
- print "Report by functional annotation generated.\n";
+if ($byFunction and -d $dirFunction) {
+
+ ## read Blast2GO output ##
+
+ my %goes = (); # master record of GOes. GOes{ID} = { $group, $term, $nGene, $nHGT, $genes}
+ opendir DIR, $dirFunction;
+ my @files = readdir(DIR);
+ close DIR;
+ foreach my $file (@files) {
+ my $set = "";
+ if ($file =~ /(.+)\.[^.]+$/) { $set = $1; }
+ else{ $set = $file; }
+ next unless exists $results{$set};
+ open IN, "<$dirFunction/$file";
+ while () {
+ s/\s+$//;
+ next if /^SeqName/;
+ @a = split (/\t/);
+ push @a, "" if $#a == 3;
+ my $gene = "";
+ if ($a[0] =~ /\|/) {
+ @b = split (/\|/, $a[0]);
+ $b[$#b] =~ s/\.\d+$//;
+ if ($b[$#b] =~ /cdsid_(.+)$/) { $gene = $1; }
+ else{ $gene = $b[$#b]; }
+ } else {
+ $a[0] =~ s/\.\d+$//;
+ $gene = $a[0];
+ }
+ next unless $gene;
+ next unless exists $results{$set}{$gene};
+ $a[3] =~ s/^GO://;
+ if (exists $goes{$a[3]}) {
+ $goes{$a[3]}{$set}{'genes'} .= ",$gene";
+ } else {
+ %h = ('nGene', 0, 'nHGT', 0, 'genes', "");
+ my %go = ('group', $a[2], 'term', $a[4]);
+ $go{$_} = {%h} for (@sets);
+ $go{$set}{'genes'} = $gene;
+ $goes{$a[3]} = {%go};
+ }
+ $goes{$a[3]}{$set}{'nGene'} ++;
+ $goes{$a[3]}{$set}{'nHGT'} ++ if $results{$set}{$gene}{'hgt'};
+ if (exists $results{$set}{$gene}{'goes'}) { $results{$set}{$gene}{'goes'} .= "; $a[3] ($a[2]) $a[4]"; }
+ else{ $results{$set}{$gene}{'goes'} = "$a[3] ($a[2]) $a[4]"; }
+ }
+ close IN;
+ }
+
+ ## write report by function ##
+
+ if ($outText) {
+ open OUT, ">$wkDir/result/function.txt";
+ print OUT "GO\tGroup\tTerm".join ("\t", @sets)."\n";
+ }
+ if ($outHTML) {
+ print HTML "
\n \n HGT by functional annotation
";
+ print HTML " HGT-derived genes are summarized by annotated gene ontology (GO). Cell values are ratio of HGT-derived genes associated with a GO versus all genes associated with this GO. Every GO is counted if multiple GOes are associated with one gene.";
+ print HTML "
white: 0, green: (0, 0.1), yellow: [0.1, 0.5), red: [0.5, 1]
\n";
+ print HTML " \n \n GO | \n Term | \n";
+ print HTML " $_ | \n" for (@sets);
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet = $workbook->add_worksheet ('Function');
+ $worksheet->set_row (0, 24);
+ $worksheet->set_column (0, 0, 8);
+ $worksheet->set_column (1, 1, 40);
+ $worksheet->set_column (2, 1 + scalar @sets, 7);
+ $worksheet->write (0, 0, "HGT by function", $excelTitle);
+ $worksheet->write (1, 0, ["GO","Term"], $excelHeader);
+ $worksheet->write (1, 2, \@sets, $excelHeader);
+ $excelRow = 2;
+ }
+ foreach my $group ('C', 'P', 'F') {
+ if ($outHTML) {
+ if ($group eq "C") { print HTML " Cellular component |
\n"; }
+ if ($group eq "P") { print HTML " Biological process |
\n"; }
+ if ($group eq "F") { print HTML " Molecular function |
\n"; }
+ }
+ if ($outExcel) {
+ if ($group eq "C") { $worksheet->write ($excelRow++, 1, "Cellular component", $excelHeader); }
+ if ($group eq "P") { $worksheet->write ($excelRow++, 1, "Biological process", $excelHeader); }
+ if ($group eq "F") { $worksheet->write ($excelRow++, 1, "Molecular function", $excelHeader); }
+ }
+ foreach my $go (sort {$goes{$a}{'term'} cmp $goes{$b}{'term'}} keys %goes) {
+ next unless $goes{$go}{'group'} eq $group;
+ next if (($goes{$go}{'term'} eq "cellular_component") or ($goes{$go}{'term'} eq "biological_process") or ($goes{$go}{'term'} eq "molecular_function"));
+ if ($outText) { print OUT "$go\t$goes{$go}{'group'}\t$goes{$go}{'term'}"; }
+ if ($outHTML) { print HTML " \n $go | \n $goes{$go}{'term'} | \n"; }
+ if ($outExcel) {
+ $worksheet->write_string ($excelRow, 0, $go);
+ $worksheet->write_string ($excelRow, 1, $goes{$go}{'term'});
+ }
+ $j = 2;
+ foreach (@sets) {
+ if ($goes{$go}{$_}{'nGene'} and $goes{$go}{$_}{'nHGT'}) { $i = sprintf ("%.2g", $goes{$go}{$_}{'nHGT'}/$goes{$go}{$_}{'nGene'}); }
+ else { $i = 0; }
+ if ($outText) { print OUT "\t$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"; }
+ if ($outHTML) {
+ print HTML " 0 and $i < 0.1) { $t = "lightgreen"; }
+ elsif ($i >= 0.1 and $i < 0.5) { $t = "khaki"; }
+ elsif ($i >= 0.5) { $t = "lightpink"; }
+ print HTML " style='background-color: $t;'>$i | \n";
+ } else {
+ print HTML ">$i\n";
+ }
+ }
+ if ($outExcel) { $worksheet->write_number ($excelRow, $j, $i); $worksheet->write_comment ($excelRow, $j++, "$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"); }
+ }
+ if ($outText) { print OUT "\n"; }
+ if ($outHTML) { print HTML "
\n"; }
+ if ($outExcel) { $excelRow++; }
+ }
+ }
+ if ($outText) { close OUT; }
+ if ($outHTML) { print HTML "
\n";}
+ print "Report by functional annotation generated.\n";
}
## Generate by ortholog report ##
-if ($byOrthology){
- for ($i=0; $i <=$#sets; $i++){
- foreach my $accn (keys %{$results{$sets[$i]}}){
- %h = %{$results{$sets[$i]}{$accn}};
- if (exists $h{'cogid'}){
- if (exists $h{'hgt'} and $h{'hgt'}){
- $cogs[$h{'cogid'}]{'data'}[$i] .= "1:$accn ";
- }else{
- $cogs[$h{'cogid'}]{'data'}[$i] .= "0:$accn ";
- }
- }
- }
- }
- if ($outText){
- open OUT, ">$wkDir/result/orthology.txt";
- print OUT "ID\tName\t".join ("\t", @sets)."\n";
- }
- if ($outHTML){
- print HTML "
\n \n HGT by gene orthology
";
- print HTML " HGT-derived genes are summarized by orthologous groups (OGs).";
- print HTML "
empty: gene not present, 0: gene present but not predicted as HGT, 1: gene present and predicted as HGT, m/n (n>=2): n paralogs are present, in which m are predicted as HGT.
\n";
- if ($cogs[0]{'name'}){ $i = 250; }else{ $i = 50; }
- print HTML " \n \n OG | \n";
- print HTML " $_ | \n" for (@sets);
- print HTML "
\n";
- }
- if ($outExcel){
- $worksheet = $workbook->add_worksheet ('Orthology');
- $worksheet->set_row (0, 24);
- if ($cogs[0]{'name'}){ $i = 40; }else{ $i = 6; }
- $worksheet->set_column (0, 0, $i);
- $worksheet->set_column (1, scalar @sets, 7);
- $worksheet->write (0, 0, "HGT by gene orthology", $excelTitle);
- $worksheet->write (1, 0, "OG", $excelHeader);
- $worksheet->write (1, 1, \@sets, $excelHeader);
- $excelRow = 2;
- }
-
- my $allCOG; my $hgtCOG;
-
- for ($i=0; $i <=$#cogs; $i++){
- @a = @{$cogs[$i]{'data'}};
- $s = 0; foreach (@a){ if (/^1/){ $s = 1; last; } } next unless $s;
-
- for ($j=0; $j<=$#a; $j++){
- next unless $a[$j];
- @b = split (/\s/, $a[$j]);
- $allCOG += scalar @b;
- foreach (@b){ $hgtCOG ++ if /^1/; }
- }
-
- if ($outText){
- print OUT "$i\t$cogs[$i]{'name'}\t".join("\t",@a)."\n";
- }
- if ($outHTML){
- if ($cogs[$i]{'name'}){ $s = $cogs[$i]{'name'}; }else{ $s = $i; }
- print HTML " \n $s | \n";
- for ($j=0; $j<=$#a; $j++){
- if ($a[$j]){
- @b = split (/\s/, $a[$j]);
- my $hgt = 0; my $first; $t = "";
- foreach (@b){
- /^([01]):(.+)$/;
- $first = $2 unless $first;
- $t .= "$2,";
- $hgt ++ if $1 == 1;
- }
- print HTML " ";
- print HTML "";
- unless ($#b){ print HTML "$hgt | \n"; }
- else{ print HTML "$hgt/".scalar(@b)."\n"; }
- }else{
- print HTML " | \n";
- }
- }
- print HTML "
\n";
- }
- if ($outExcel){
- if ($cogs[$i]{'name'}){ $s = $cogs[$i]{'name'}; }else{ $s = $i; }
- $worksheet->write ($excelRow, 0, $s);
- for ($j=0; $j<=$#a; $j++){
- next unless $a[$j];
- @b = split (/\s/, $a[$j]);
- my $hgt = 0; my $first; $t = "";
- foreach (@b){
- /^([01]):(.+)$/;
- $first = $2 unless $first;
- $t .= "$2,";
- $hgt ++ if $1 == 1;
- }
- if ($#b){
- if ($hgt){ $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGreen); }
- else { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGrey); }
- }else{
- if ($hgt){ $worksheet->write_url ($excelRow, $j+1, $2, 1, $excelGreen); }
- else { $worksheet->write_url ($excelRow, $j+1, $2, 0, $excelGrey); }
- }
- $worksheet->write_comment ($excelRow, $j+1, $t);
- }
- $excelRow ++;
- }
- }
- print HTML "
\n" if ($outHTML);
-
- if (%ORFans and 0){
- if ($outText){ print OUT "#Singleton ORFans:\n"; }
- if ($outHTML){ print HTML "
\n \n Singleton ORFans\n"; }
- foreach my $set (@sets){
- if ($outText){ print OUT "$set: "; }
- if ($outHTML){ print HTML "
$set: "; }
- foreach my $gene (@{$ORFans{$set}}){
- if ($outText){ print OUT "$gene "; }
- if ($outHTML){ print HTML "$gene "; }
- }
- if ($outText){ print OUT "\n"; }
- if ($outHTML){ print HTML "
\n"; }
- }
- }
-
- close OUT if ($outText);
-
- print "Report by gene orthology generated.\n";
-
- print " Positive rate = $allhgt / ".($allhgt-$hgtCOG+$allCOG)." = ". (sprintf("%.3f", $allhgt/($allhgt-$hgtCOG+$allCOG))).".\n";
-
+if ($byOrthology) {
+ for ($i=0; $i <=$#sets; $i++) {
+ foreach my $accn (keys %{$results{$sets[$i]}}) {
+ %h = %{$results{$sets[$i]}{$accn}};
+ if (exists $h{'cogid'}) {
+ if (exists $h{'hgt'} and $h{'hgt'}) {
+ $cogs[$h{'cogid'}]{'data'}[$i] .= "1:$accn ";
+ } else {
+ $cogs[$h{'cogid'}]{'data'}[$i] .= "0:$accn ";
+ }
+ }
+ }
+ }
+ if ($outText) {
+ open OUT, ">$wkDir/result/orthology.txt";
+ print OUT "ID\tName\t".join ("\t", @sets)."\n";
+ }
+ if ($outHTML) {
+ print HTML "
\n \n HGT by gene orthology
";
+ print HTML " HGT-derived genes are summarized by orthologous groups (OGs).";
+ print HTML "
empty: gene not present, 0: gene present but not predicted as HGT, 1: gene present and predicted as HGT, m/n (n>=2): n paralogs are present, in which m are predicted as HGT.
\n";
+ if ($cogs[0]{'name'}) { $i = 250; } else { $i = 50; }
+ print HTML " \n \n OG | \n";
+ print HTML " $_ | \n" for (@sets);
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ $worksheet = $workbook->add_worksheet ('Orthology');
+ $worksheet->set_row (0, 24);
+ if ($cogs[0]{'name'}) { $i = 40; } else { $i = 6; }
+ $worksheet->set_column (0, 0, $i);
+ $worksheet->set_column (1, scalar @sets, 7);
+ $worksheet->write (0, 0, "HGT by gene orthology", $excelTitle);
+ $worksheet->write (1, 0, "OG", $excelHeader);
+ $worksheet->write (1, 1, \@sets, $excelHeader);
+ $excelRow = 2;
+ }
+
+ my $allCOG; my $hgtCOG;
+
+ for ($i=0; $i <=$#cogs; $i++) {
+ @a = @{$cogs[$i]{'data'}};
+ $s = 0; foreach (@a) { if (/^1/) { $s = 1; last; } } next unless $s;
+
+ for ($j=0; $j<=$#a; $j++) {
+ next unless $a[$j];
+ @b = split (/\s/, $a[$j]);
+ $allCOG += scalar @b;
+ foreach (@b) { $hgtCOG ++ if /^1/; }
+ }
+
+ if ($outText) {
+ print OUT "$i\t$cogs[$i]{'name'}\t".join("\t",@a)."\n";
+ }
+ if ($outHTML) {
+ if ($cogs[$i]{'name'}) { $s = $cogs[$i]{'name'}; } else { $s = $i; }
+ print HTML " \n $s | \n";
+ for ($j=0; $j<=$#a; $j++) {
+ if ($a[$j]) {
+ @b = split (/\s/, $a[$j]);
+ my $hgt = 0; my $first; $t = "";
+ foreach (@b) {
+ /^([01]):(.+)$/;
+ $first = $2 unless $first;
+ $t .= "$2,";
+ $hgt ++ if $1 == 1;
+ }
+ print HTML " ";
+ print HTML "";
+ unless ($#b) { print HTML "$hgt | \n"; }
+ else{ print HTML "$hgt/".scalar(@b)."\n"; }
+ } else {
+ print HTML " | \n";
+ }
+ }
+ print HTML "
\n";
+ }
+ if ($outExcel) {
+ if ($cogs[$i]{'name'}) { $s = $cogs[$i]{'name'}; } else { $s = $i; }
+ $worksheet->write ($excelRow, 0, $s);
+ for ($j=0; $j<=$#a; $j++) {
+ next unless $a[$j];
+ @b = split (/\s/, $a[$j]);
+ my $hgt = 0; my $first; $t = "";
+ foreach (@b) {
+ /^([01]):(.+)$/;
+ $first = $2 unless $first;
+ $t .= "$2,";
+ $hgt ++ if $1 == 1;
+ }
+ if ($#b) {
+ if ($hgt) { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGreen); }
+ else { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGrey); }
+ } else {
+ if ($hgt) { $worksheet->write_url ($excelRow, $j+1, $2, 1, $excelGreen); }
+ else { $worksheet->write_url ($excelRow, $j+1, $2, 0, $excelGrey); }
+ }
+ $worksheet->write_comment ($excelRow, $j+1, $t);
+ }
+ $excelRow ++;
+ }
+ }
+ print HTML "
\n" if ($outHTML);
+
+ if (%ORFans and 0) {
+ if ($outText) { print OUT "#Singleton ORFans:\n"; }
+ if ($outHTML) { print HTML "
\n \n Singleton ORFans\n"; }
+ foreach my $set (@sets) {
+ if ($outText) { print OUT "$set: "; }
+ if ($outHTML) { print HTML "
$set: "; }
+ foreach my $gene (@{$ORFans{$set}}) {
+ if ($outText) { print OUT "$gene "; }
+ if ($outHTML) { print HTML "$gene "; }
+ }
+ if ($outText) { print OUT "\n"; }
+ if ($outHTML) { print HTML "
\n"; }
+ }
+ }
+
+ close OUT if ($outText);
+
+ print "Report by gene orthology generated.\n";
+
+ print " Positive rate = $allhgt / ".($allhgt-$hgtCOG+$allCOG)." = ". (sprintf("%.3f", $allhgt/($allhgt-$hgtCOG+$allCOG))).".\n";
+
}
## Attach individual reports in spreadsheet ##
-if ($outExcel and $detailExcel){
- foreach my $set (@sets){
- $worksheet = $workbook->add_worksheet ($set);
- $worksheet->set_row (0, 24);
- $worksheet->set_column (0, 0, 14);
- $worksheet->set_column (1, 1, 8);
- $worksheet->set_column (2, 2, 32);
- $worksheet->set_column (3, 9, 8);
- $worksheet->set_column ($iMatch, $iMatch+$byDonor, 32) if $iMatch;
- $worksheet->write (0, 0, $set, $excelTitle);
- $worksheet->write (1, 0, ["Query","Length","Product","Hits","Self","Close","Distal","HGT"], $excelHeader);
- $worksheet->write (1, $iLoss, "Loss", $excelHeader) if $iLoss;
- $worksheet->write (1, $iPOE, "POE", $excelHeader) if $iPOE;
- $worksheet->write (1, $iMatch, "Best distal match", $excelHeader) if $iMatch;
- $worksheet->write (1, $iMatch+1, "Putative donor group", $excelHeader) if $byDonor;
- $worksheet->write (1, $iMatch+$byDonor+1, "Function", $excelHeader) if $byFunction;
- $excelRow = 1;
- %h = %{$results{$set}};
- foreach my $accn (sort keys %h){
- $excelRow ++;
- $worksheet->write_url ($excelRow, 0, "../search/$set/$accn.txt", $accn);
- $worksheet->write_number ($excelRow, 1, $h{$accn}{'length'});
- $worksheet->write ($excelRow, 2, $h{$accn}{'product'});
- $worksheet->write_number ($excelRow, 3, $h{$accn}{'hits'});
- next unless exists $h{$accn}{'self'};
- $worksheet->write_number ($excelRow, 4, $h{$accn}{'self'});
- $worksheet->write_number ($excelRow, 5, $h{$accn}{'close'});
- $worksheet->write_number ($excelRow, 6, $h{$accn}{'distal'});
- $worksheet->write_number ($excelRow, 7, $h{$accn}{'hgt'}) if $h{$accn}{'hgt'};
- $worksheet->write_number ($excelRow, $iLoss, $h{$accn}{'loss'}) if ($iLoss and $h{$accn}{'loss'});
- $worksheet->write_number ($excelRow, $iPOE, $h{$accn}{'poe'}) if ($iPOE and $h{$accn}{'poe'});
- next unless ($iMatch and $h{$accn}{'match'});
- $h{$accn}{'match'} =~ /^(\d+) \((.+)\)$/;
- $worksheet->write ($excelRow, $iMatch, $2);
- if ($byDonor and exists $h{$accn}{'group'}){
- $worksheet->write ($excelRow, $iMatch+1, $h{$accn}{'group'});
- }
- if ($byFunction and exists $h{$accn}{'goes'}){
- $s = $h{$accn}{'goes'};
- $s =~ s/GO:\d+ \(.\) //g;
- $worksheet->write ($excelRow, $iMatch+$byDonor+1, $s);
- }
- # $worksheet->write_comment ($excelRow, 10, $1);
- }
- }
+if ($outExcel and $detailExcel) {
+ foreach my $set (@sets) {
+ $worksheet = $workbook->add_worksheet ($set);
+ $worksheet->set_row (0, 24);
+ $worksheet->set_column (0, 0, 14);
+ $worksheet->set_column (1, 1, 8);
+ $worksheet->set_column (2, 2, 32);
+ $worksheet->set_column (3, 9, 8);
+ $worksheet->set_column ($iMatch, $iMatch+$byDonor, 32) if $iMatch;
+ $worksheet->write (0, 0, $set, $excelTitle);
+ $worksheet->write (1, 0, ["Query","Length","Product","Hits","Self","Close","Distal","HGT"], $excelHeader);
+ $worksheet->write (1, $iLoss, "Loss", $excelHeader) if $iLoss;
+ $worksheet->write (1, $iPOE, "POE", $excelHeader) if $iPOE;
+ $worksheet->write (1, $iMatch, "Best distal match", $excelHeader) if $iMatch;
+ $worksheet->write (1, $iMatch+1, "Putative donor group", $excelHeader) if $byDonor;
+ $worksheet->write (1, $iMatch+$byDonor+1, "Function", $excelHeader) if $byFunction;
+ $excelRow = 1;
+ %h = %{$results{$set}};
+ foreach my $accn (sort keys %h) {
+ $excelRow ++;
+ $worksheet->write_url ($excelRow, 0, "../search/$set/$accn.txt", $accn);
+ $worksheet->write_number ($excelRow, 1, $h{$accn}{'length'});
+ $worksheet->write ($excelRow, 2, $h{$accn}{'product'});
+ $worksheet->write_number ($excelRow, 3, $h{$accn}{'hits'});
+ next unless exists $h{$accn}{'self'};
+ $worksheet->write_number ($excelRow, 4, $h{$accn}{'self'});
+ $worksheet->write_number ($excelRow, 5, $h{$accn}{'close'});
+ $worksheet->write_number ($excelRow, 6, $h{$accn}{'distal'});
+ $worksheet->write_number ($excelRow, 7, $h{$accn}{'hgt'}) if $h{$accn}{'hgt'};
+ $worksheet->write_number ($excelRow, $iLoss, $h{$accn}{'loss'}) if ($iLoss and $h{$accn}{'loss'});
+ $worksheet->write_number ($excelRow, $iPOE, $h{$accn}{'poe'}) if ($iPOE and $h{$accn}{'poe'});
+ next unless ($iMatch and $h{$accn}{'match'});
+ $h{$accn}{'match'} =~ /^(\d+) \((.+)\)$/;
+ $worksheet->write ($excelRow, $iMatch, $2);
+ if ($byDonor and exists $h{$accn}{'group'}) {
+ $worksheet->write ($excelRow, $iMatch+1, $h{$accn}{'group'});
+ }
+ if ($byFunction and exists $h{$accn}{'goes'}) {
+ $s = $h{$accn}{'goes'};
+ $s =~ s/GO:\d+ \(.\) //g;
+ $worksheet->write ($excelRow, $iMatch+$byDonor+1, $s);
+ }
+ # $worksheet->write_comment ($excelRow, 10, $1);
+ }
+ }
}
-if ($outHTML){
- print HTML "\t\n\n";
- close HTML;
+if ($outHTML) {
+ print HTML "\t\n\n";
+ close HTML;
}
-if ($outExcel){
- $worksheet = $workbook->sheets(0);
+if ($outExcel) {
+ $worksheet = $workbook->sheets(0);
$worksheet->set_first_sheet();
$worksheet->activate();
$worksheet->set_selection(0, 0);
- $workbook->close();
+ $workbook->close();
}
exit 0;
-
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
old mode 100755
new mode 100644
index 0a36c35..0c9c48e
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -47,86 +47,84 @@
## global variables ##
-my $wkDir = $ARGV[0]; # working directory
-
-my %ins = (); # the master data structure of the whole procedure
- # set => (
- # file (file)
- # taxid (str)
- # organism (str)
- # prefile (file)
- # done (boolean)
- # prots (array...)
- # prot => (
- # name (accn or user-defined)
- # gi, accn (only for GenBank-style sequence titles)
- # product (after the 1st whitespace)
- # seq (necessary for local searches, optional for remote BLAST)
- # done (boolean)
- # hits (array...)
- # hit => (
- # sseqid, pident, evalue, bitscore, qstart, qend
- # sseq (only for BLAST)
- # name (accn or full name)
-
-my $nProt = 0; # total number of proteins
-my $nDone = 0; # number of proteins that have been searched already
+my $wkDir = $ARGV[0]; # working directory
+
+my %ins = (); # the master data structure of the whole procedure
+ # set => (
+ # file (file)
+ # taxid (str)
+ # organism (str)
+ # prefile (file)
+ # done (boolean)
+ # prots (array...)
+ # prot => (
+ # name (accn or user-defined)
+ # title (after the 1st whitespace)
+ # seq (necessary for local searches, optional for remote BLAST)
+ # done (boolean)
+ # hits (array...)
+ # hit => (
+ # sseqid, pident, evalue, bitscore, qstart, qend
+ # sseq (only for BLAST)
+ # name (accn or full name)
+
+my $nProt = 0; # total number of proteins
+my $nDone = 0; # number of proteins that have been searched already
my $retryFailed = 1;
-my %inSeqs = (); # query protein sequences (if applicable)
-my %taxdumps = (); # local taxonomy database: taxid => (name, parent, rank)
-my %prot2taxids = (); # protein to TaxID dictionary (the protein name may be GI or accn)
-my %dbTaxa :shared = (); # taxa.db
-my %dbRanks :shared = (); # ranks.db
-my %badTaxids = (); # TaxIDs that don't exist in the local taxonomy database
+my %inSeqs = (); # query protein sequences (if applicable)
+my %taxdumps = (); # local taxonomy database: taxid => (name, parent, rank)
+my %prot2taxids = (); # protein name to TaxID dictionary
+my %dbTaxa :shared = (); # taxa.db
+my %dbRanks :shared = (); # ranks.db
+my %badTaxids = (); # TaxIDs that don't exist in the local taxonomy database
## subroutines ##
-sub http_blast; # parameters: query (accn. no.), set; return: 1 - succeeded, 0 - fail
-sub local_search; # same as above
-sub self_align; # search against itself
-sub get_taxonomy; # paramter: array of TaxIDs
-sub stem_name; # get stem file name
-sub seq_title; # parse sequence title
-sub order_accns; # reorder accession number
+sub http_blast; # parameters: query (accn. no.), set; return: 1 - succeeded, 0 - fail
+sub local_search; # same as above
+sub self_align; # search against itself
+sub get_taxonomy; # paramter: array of TaxIDs
+sub stem_name; # get stem file name
+sub order_accns; # reorder accession number
## program parameters ##
-my $interactive = 1; # interactive or automatic mode
-my $searchTool = "blast"; # protein sequence similarity search tool
-my $preSearch = ""; # directory of pre-computed search results.
-my $selfTax = ""; # taxonomy of input protein sets
+my $interactive = 1; # interactive or automatic mode
+my $searchTool = "blast"; # protein sequence similarity search tool
+my $preSearch = ""; # directory of pre-computed search results.
+my $selfTax = ""; # taxonomy of input protein sets
# databases
-my $protdb = ""; # protein database for homolgy search
-my $taxdump = ""; # directory of the NCBI taxonomy database (nodes.dmp and names.dmp)
-my $prot2taxid = ""; # protein name / GI to TaxID dictionary file
+my $protdb = ""; # protein database for homolgy search
+my $taxdump = ""; # directory of the NCBI taxonomy database (nodes.dmp and names.dmp)
+my $prot2taxid = ""; # protein name to TaxID dictionary file
# search cutoffs
-my $nHits = 500; # number of hits to return
-my $maxHits = 0; # maximum number of valid hits to preserve. if 0 then = nHits
-my $evalue = 1e-5; # maximum E-value cutoff
-my $identity = 0; # minimum percent identity cutoff
-my $coverage = 0; # minimum query coverage cutoff
+my $nHits = 500; # number of hits to return
+my $maxHits = 0; # maximum number of valid hits to preserve. if 0 then = nHits
+my $evalue = 1e-5; # maximum E-value cutoff
+my $identity = 0; # minimum percent identity cutoff
+my $coverage = 0; # minimum query coverage cutoff
# taxonomy filters
-my $mergeDuplicates = 1; # ignore hits with same taxon names and bit scores
-my $taxonUCASE = 0; # ignore taxon names that do not start with a capital letter
-my @ignoreTaxa = split (/,/, "unknown,uncultured,unidentified,unclassified,environmental,plasmid,vector,synthetic,phage");
- # ignore taxon names containing these words
-my $ignoreParalogs = 1; # ignore potential paralogs (hits with same taxon names but different bit scores)
-my $ignoreSeqRepeats = 1; # ignore repeated sequences (hits targetting different regions of the same protein)
-my $ignoreSubspecies = 0; # ignore more than one subspecies from the same species
-my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
- # record the TaxIDs on these ranks for each hit
+my $mergeDuplicates = 1; # ignore hits with same taxon names and bit scores
+my $taxonUCASE = 0; # ignore taxon names that do not start with a capital letter
+my @ignoreTaxa = ("unknown", "uncultured", "unidentified", "unclassified", "environmental", "plasmid", "vector", "synthetic", "phage");
+ # ignore taxon names containing these words
+my $ignoreParalogs = 1; # ignore potential paralogs (hits with same taxon names but different bit scores)
+my $ignoreSeqRepeats = 1; # ignore repeated sequences (hits targetting different regions of the same protein)
+my $ignoreSubspecies = 0; # ignore more than one subspecies from the same species
+my @ranks = ("species", "genus", "family", "order", "class", "phylum");
+ # record the TaxIDs on these ranks for each hit
# search tool behavior
-my $threads = 0; # multiple threads (0 for all CPU cores)
-my $queries = 0; # multiple queries per run (0 for all sequences per sample)
-my $getAln = 0; # retrieve aligned part of subject sequence (for BLAST only)
+my $threads = 0; # multiple threads (0 for all CPU cores)
+my $queries = 0; # multiple queries per run (0 for all sequences per sample)
+my $getAln = 0; # retrieve aligned part of subject sequence (for BLAST only)
my $blastdbcmd = "blastdbcmd";
my $blastp = "blastp";
my $rapsearch = "rapsearch";
@@ -134,88 +132,90 @@
my $diamond = "diamond";
# remote BLAST behavior
-my $httpBlast = 0; # call the NCBI server to perform BLAST and to retrieve sequence / taxonomy information
-my $httpDb = "nr"; # NCBI BLAST database
-my $requests = 1; # number of requests for http BLAST (default: 1)
-my $retries = 5; # maximum number of retries
-my $delay = 30; # time (seconds) between two http requests
-my $timeout = 600; # time (seconds) to give up waiting
-my $exUncultured = 1; # exclude uncultured and environmental samples
-my @searchTaxids = (); # search under the following taxon groups (taxids)
-my @ignoreTaxids = (); # ignore organisms under the following taxids
-my $eqText = ""; # entrez query parameter
-my $taxBlast = 0; # retrieve taxonomy report
-my $seqBlast = 0; # retrieve hit sequences
-my $alnBlast = 0; # retrieve multiple sequence alignment (conflicts seqBlast)
-my $blastServer = "http://blast.ncbi.nlm.nih.gov/Blast.cgi";
-my $blastAlignServer = "http://blast.ncbi.nlm.nih.gov/BlastAlign.cgi";
-my $eSearchServer = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
-my $eFetchServer = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
-my $eSummaryServer = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi";
+my $httpBlast = 0; # call the NCBI server to perform BLAST and to retrieve sequence / taxonomy information
+my $httpDb = "nr"; # NCBI BLAST database
+my $requests = 1; # number of requests for http BLAST (default: 1)
+my $retries = 5; # maximum number of retries
+my $delay = 30; # time (seconds) between two http requests
+my $timeout = 600; # time (seconds) to give up waiting
+my $exUncultured = 1; # exclude uncultured and environmental samples
+my @searchTaxids = (); # search under the following taxon groups (taxids)
+my @ignoreTaxids = (); # ignore organisms under the following taxids
+my $eqText = ""; # entrez query parameter
+my $taxBlast = 0; # retrieve taxonomy report
+my $seqBlast = 0; # retrieve hit sequences
+my $alnBlast = 0; # retrieve multiple sequence alignment (conflicts seqBlast)
+
+# remote BLAST server URLs
+my $blastServer = "https://blast.ncbi.nlm.nih.gov/Blast.cgi";
+my $blastAlignServer = "https://blast.ncbi.nlm.nih.gov/BlastAlign.cgi";
+my $eSearchServer = "https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
+my $eFetchServer = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
+my $eSummaryServer = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi";
## read configuration ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- $interactive = $1 if /^interactive=([01])$/;
- $selfTax = $1 if /^selfTax=(.+)$/;
- @ranks = split (/,/, $1) if /^ranks=(.+)$/;
- $preSearch = $1 if /^preSearch=(.+)$/;
- $preSearch =~ s/\/$//;
-
- $searchTool = $1 if /^searchTool=(.+)$/;
- $threads = $1 if /^threads=(\d+)$/;
- $queries = $1 if /^queries=(\d+)$/;
- $requests = $1 if /^requests=(\d+)$/;
- $getAln = $1 if /^getAln=([01])$/;
-
- $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
- $blastp = $1 if /^blastp=(.+)$/;
- $rapsearch = $1 if /^rapsearch=(.+)$/;
- $prerapsearch = $1 if /^prerapsearch=(.+)$/;
- $diamond = $1 if /^diamond=(.+)$/;
-
- $protdb = $1 if /^protdb=(.+)$/;
- $taxdump = $1 if /^taxdump=(.+)$/;
- $taxdump =~ s/\/$//;
- $prot2taxid = $1 if /^prot2taxid=(.+)$/;
-
- $evalue = $1 if /^evalue=(.+)$/;
- $identity = $1 if /^identity=(.+)$/;
- $coverage = $1 if /^coverage=(.+)$/;
- $nHits = $1 if /^nHits=(\d+)$/;
- $maxHits = $1 if /^maxHits=(\d+)$/;
-
- $taxonUCASE = $1 if /^taxonUCASE=([01])$/;
- $mergeDuplicates = $1 if /^mergeDuplicates=([01])$/;
- $ignoreParalogs = $1 if /^ignoreParalogs=([01])$/;
- $ignoreSeqRepeats = $1 if /^ignoreSeqRepeats=([01])$/;
- $ignoreSubspecies = $1 if /^ignoreSubspecies=([01])$/;
- @ignoreTaxa = split(/,/, $1) if /^ignoreTaxa=(.+)$/;
- @ignoreTaxa = () if /^ignoreTaxa=$/;
-
- $httpBlast = $1 if /^httpBlast=(\d)$/;
- $httpDb = $1 if /^httpDb=(.+)$/;
- $retries = $1 if /^retries=(\d+)$/;
- $delay = $1 if /^delay=(\d+)$/;
- $eqText = $1 if /^eqText=(.+)$/;
- $exUncultured = $1 if /^exUncultured=([01])$/;
- @searchTaxids = split(/,/, $1) if /^searchTaxids=(.+)$/;
- @ignoreTaxids = split(/,/, $1) if /^ignoreTaxids=(.+)$/;
- $seqBlast = $1 if /^seqBlast=([01])$/;
- $taxBlast = $1 if /^taxBlast=([01])$/;
- $alnBlast = $1 if /^alnBlast=([01])$/;
-
- $blastServer = $1 if /^blastServer=(.+)$/;
- $blastAlignServer = $1 if /^blastAlignServer=(.+)$/;
- $eSearchServer = $1 if /^eSearchServer=(.+)$/;
- $eFetchServer = $1 if /^eFetchServer=(.+)$/;
- $eSummaryServer = $1 if /^eSummaryServer=(.+)$/;
- }
- close IN;
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ $interactive = $1 if /^interactive=([01])$/;
+ $selfTax = $1 if /^selfTax=(.+)$/;
+ @ranks = split (/,/, $1) if /^ranks=(.+)$/;
+ $preSearch = $1 if /^preSearch=(.+)$/;
+ $preSearch =~ s/\/$//;
+
+ $searchTool = $1 if /^searchTool=(.+)$/;
+ $threads = $1 if /^threads=(\d+)$/;
+ $queries = $1 if /^queries=(\d+)$/;
+ $requests = $1 if /^requests=(\d+)$/;
+ $getAln = $1 if /^getAln=([01])$/;
+
+ $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
+ $blastp = $1 if /^blastp=(.+)$/;
+ $rapsearch = $1 if /^rapsearch=(.+)$/;
+ $prerapsearch = $1 if /^prerapsearch=(.+)$/;
+ $diamond = $1 if /^diamond=(.+)$/;
+
+ $protdb = $1 if /^protdb=(.+)$/;
+ $taxdump = $1 if /^taxdump=(.+)$/;
+ $taxdump =~ s/\/$//;
+ $prot2taxid = $1 if /^prot2taxid=(.+)$/;
+
+ $evalue = $1 if /^evalue=(.+)$/;
+ $identity = $1 if /^identity=(.+)$/;
+ $coverage = $1 if /^coverage=(.+)$/;
+ $nHits = $1 if /^nHits=(\d+)$/;
+ $maxHits = $1 if /^maxHits=(\d+)$/;
+
+ $taxonUCASE = $1 if /^taxonUCASE=([01])$/;
+ $mergeDuplicates = $1 if /^mergeDuplicates=([01])$/;
+ $ignoreParalogs = $1 if /^ignoreParalogs=([01])$/;
+ $ignoreSeqRepeats = $1 if /^ignoreSeqRepeats=([01])$/;
+ $ignoreSubspecies = $1 if /^ignoreSubspecies=([01])$/;
+ @ignoreTaxa = split(/,/, $1) if /^ignoreTaxa=(.+)$/;
+ @ignoreTaxa = () if /^ignoreTaxa=$/;
+
+ $httpBlast = $1 if /^httpBlast=(\d)$/;
+ $httpDb = $1 if /^httpDb=(.+)$/;
+ $retries = $1 if /^retries=(\d+)$/;
+ $delay = $1 if /^delay=(\d+)$/;
+ $eqText = $1 if /^eqText=(.+)$/;
+ $exUncultured = $1 if /^exUncultured=([01])$/;
+ @searchTaxids = split(/,/, $1) if /^searchTaxids=(.+)$/;
+ @ignoreTaxids = split(/,/, $1) if /^ignoreTaxids=(.+)$/;
+ $seqBlast = $1 if /^seqBlast=([01])$/;
+ $taxBlast = $1 if /^taxBlast=([01])$/;
+ $alnBlast = $1 if /^alnBlast=([01])$/;
+
+ $blastServer = $1 if /^blastServer=(.+)$/;
+ $blastAlignServer = $1 if /^blastAlignServer=(.+)$/;
+ $eSearchServer = $1 if /^eSearchServer=(.+)$/;
+ $eFetchServer = $1 if /^eFetchServer=(.+)$/;
+ $eSummaryServer = $1 if /^eSummaryServer=(.+)$/;
+ }
+ close IN;
}
@@ -225,55 +225,55 @@
$identity *= 100 if ($identity and $identity < 1);
$coverage *= 100 if ($coverage and $coverage < 1);
-if (lc($searchTool) eq "blast"){ $searchTool = "BLAST"; }
-elsif (lc($searchTool) eq "rapsearch"){ $searchTool = "RAPSearch2"; }
-elsif (lc($searchTool) eq "rapsearch2"){ $searchTool = "RAPSearch2"; }
-elsif (lc($searchTool) eq "diamond"){ $searchTool = "DIAMOND"; }
-elsif (lc($searchTool) eq "customized"){ $searchTool = "customized"; }
+if (lc($searchTool) eq "blast") { $searchTool = "BLAST"; }
+elsif (lc($searchTool) eq "rapsearch") { $searchTool = "RAPSearch2"; }
+elsif (lc($searchTool) eq "rapsearch2") { $searchTool = "RAPSearch2"; }
+elsif (lc($searchTool) eq "diamond") { $searchTool = "DIAMOND"; }
+elsif (lc($searchTool) eq "customized") { $searchTool = "customized"; }
else{ die "Error: Invalid search tool: $searchTool.\n"; }
-if ($searchTool eq "customized" and not $preSearch){ die "Error: You must provide pre-computed search results for \"customized\" search tool.\n"; }
-if ($preSearch and not -d $preSearch){ die "Error: Invalid directory for pre-computed search results: $preSearch\n"; }
-
-if ($searchTool ne "BLAST"){ $httpBlast = 0; }
-if ($httpBlast and not $protdb){ $protdb = "nr"; }
-
-unless ($threads){ # attempt to get number of CPU cores
- if (-e "/proc/cpuinfo"){
- $threads = `grep -c ^processor /proc/cpuinfo`;
- }elsif ($^O eq "darwin"){
- $threads = `sysctl -n hw.ncpu`;
- }elsif ($^O eq "MSWin32"){
- $threads = `echo %NUMBER_OF_PROCESSORS%`;
- }
- $threads =~ s/\s+$//;
- $threads = 0 unless $threads =~ /^\d+$/;
- unless ($threads){
- print "Cannot determine the number of CPUs. Do single threading.\n";
- $threads = 1;
- }
+if ($searchTool eq "customized" and not $preSearch) { die "Error: You must provide pre-computed search results for \"customized\" search tool.\n"; }
+if ($preSearch and not -d $preSearch) { die "Error: Invalid directory for pre-computed search results: $preSearch\n"; }
+
+if ($searchTool ne "BLAST") { $httpBlast = 0; }
+if ($httpBlast and not $protdb) { $protdb = "nr"; }
+
+unless ($threads) { # attempt to get number of CPU cores
+ if (-e "/proc/cpuinfo") {
+ $threads = `grep -c ^processor /proc/cpuinfo`;
+ } elsif ($^O eq "darwin") {
+ $threads = `sysctl -n hw.ncpu`;
+ } elsif ($^O eq "MSWin32") {
+ $threads = `echo %NUMBER_OF_PROCESSORS%`;
+ }
+ $threads =~ s/\s+$//;
+ $threads = 0 unless $threads =~ /^\d+$/;
+ unless ($threads) {
+ print "Cannot determine the number of CPUs. Do single threading.\n";
+ $threads = 1;
+ }
}
-unless ($eqText){ # generate Entrez query text #
- if ($exUncultured){
- $eqText = "all [filter] NOT(environmental samples[filter] OR metagenomes[orgn])";
- }
- if (@searchTaxids){
- for (my $i=0; $i$file, 'prots'=>[] }
+foreach my $file (grep {!/^\./} readdir DIR) {
+ next unless -s "$wkDir/input/$file";
+ $ins{stem_name($file)} = { 'file'=>$file, 'prots'=>[] }
}
closedir DIR;
die "Error: No data are found in the input/ folder.\n" unless (%ins);
-foreach my $set (sort keys %ins){
- $s = $ins{$set}{'file'};
- my $intype = ""; # type of input file format
- open IN, "<$wkDir/input/$s" or die "\nError: Failed to read input file $s.\n";
- while(){
- s/\s+$//; next unless $_; next if /^#/;
- unless ($intype){
- $intype = "list";
- $intype = "fasta" if (/^>/);
- }
- if ($intype eq 'fasta'){
- if (s/^>//){
- @a = seq_title ($_);
- push @{$ins{$set}{'prots'}}, { 'gi'=>$a[0], 'accn'=>$a[1], 'name'=>$a[2], 'product'=>$a[3], 'seq'=>'', 'hits'=>[], 'done'=>0 };
- }else{
- $ins{$set}{'prots'}[-1]{'seq'} .= $_; # append sequence
- }
- }elsif ($intype eq "list"){
- @a = seq_title ($_);
- push @{$ins{$set}{'prots'}}, { 'gi'=>$a[0], 'accn'=>$a[1], 'name'=>$a[2], 'product'=>$a[3], 'seq'=>'', 'hits'=>[], 'done'=>0 };
- }
- }
- close IN;
- my $i = scalar @{$ins{$set}{'prots'}};
- die "Error: sample $set does not contain any protein entries.\n" unless $i;
- print " $set: $i proteins.\n";
+foreach my $set (sort keys %ins) {
+ $s = $ins{$set}{'file'};
+ my $intype = ""; # type of input file format
+ open IN, "<$wkDir/input/$s" or die "\nError: Failed to read input file $s.\n";
+ while () {
+ s/\s+$//; next unless $_; next if /^#/;
+ $intype = /^>/ ? "fasta" : "list" unless $intype;
+ my %prot = ('name'=>'', 'title'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
+ if ($intype eq 'fasta') {
+ if (s/^>//) {
+ @a = split(/\s+/, $_, 2);
+ $prot{'name'} = $a[0];
+ $prot{'title'} = $a[1] if $#a;
+ push @{$ins{$set}{'prots'}}, {%prot};
+ } else {
+ $ins{$set}{'prots'}[-1]{'seq'} .= $_; # append sequence
+ }
+ } elsif ($intype eq "list") {
+ $prot{'name'} = $_;
+ push @{$ins{$set}{'prots'}}, {%prot};
+ }
+ }
+ close IN;
+ my $n = scalar @{$ins{$set}{'prots'}};
+ die "Error: sample $set does not contain any protein entries.\n" unless $n;
+ print " $set: $n proteins.\n";
}
$nProt += scalar @{$ins{$_}{'prots'}} for (keys %ins);
print "Done. $nProt proteins from ".(scalar keys %ins)." set(s) to query.\n";
@@ -321,525 +321,518 @@
## read search results from previous runs ##
-if (-d "$wkDir/search"){
- print "Reading search results from previous run(s)...\n";
- foreach my $set (keys %ins){
- next unless -d "$wkDir/search/$set";
- my ($goodResults, $goodHits) = (0, 0);
- my ($badResults, $badHits) = (0, 0);
- foreach (my $i=0; $i){
- s/\s+$//;
- push (@out, $_);
- }
- close IN;
- my $badHitsHere = 0;
- my $k = 1; # status of report
- foreach (@out){
- next unless $_;
- if ($k == 1 and (/^#NEXUS/)){ $k = 2; next; }
- if ($k == 2 and /^BEGIN QUERY;/){ $k = 3; next; }
- if ($k == 3 and /^END;$/){ $k = 4; next; }
- if ($k == 4 and /^BEGIN ORGANISM;$/){ $k = 5; next; }
- if ($k == 5 and /^END;$/){ $k = 6; last; }
- if ($k == 5){
- next if /^\[/;
- next if /^;/;
- my @a = split (/\t/);
- if (@a and scalar(@a) < 6){
- $_ = "[deleted]";
- $badHitsHere ++;
- $badHits ++;
- }else{
- $goodHits ++;
- }
- }
- }
- if ($k < 6){ # bad report
- $badResults ++;
- unlink $file;
- print " Warning: Incomplete search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
- }else{
- $ins{$set}{'prots'}[$i]{'done'} = 1;
- $goodResults ++;
- if ($badHitsHere){
- open OUT, ">$file";
- foreach (@out){
- next if /^\[deleted\]/;
- print OUT "$_\n";
- }
- close OUT;
- print " Warning: $badHitsHere invalid hits deleted from $set/".$ins{$set}{'prots'}[$i]{'name'}.".\n";
- }
- }
- }else{
- $badResults ++;
- unlink $file;
- print " Empty search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
- }
- }
- }
- print " $set: $goodHits valid hits for $goodResults proteins.\n";
- $ins{$set}{'done'} = 1 if ($goodResults == scalar(@{$ins{$set}{'prots'}}));
- $nDone += $goodResults;
- }
- print "Done. $nDone results found, remaining ".($nProt - $nDone)." proteins to search.\n";
+if (-d "$wkDir/search") {
+ print "Reading search results from previous run(s)...\n";
+ foreach my $set (keys %ins) {
+ next unless -d "$wkDir/search/$set";
+ my ($goodResults, $goodHits) = (0, 0);
+ my ($badResults, $badHits) = (0, 0);
+ foreach (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+ my $file = "$wkDir/search/$set/".$ins{$set}{'prots'}[$i]{'name'}.".txt";
+ if (-e $file) {
+ if (-s $file) {
+ my @out = ();
+ open IN, "<$file" or next;
+ while () {
+ s/\s+$//;
+ push (@out, $_);
+ }
+ close IN;
+ my $badHitsHere = 0;
+ my $k = 1; # status of report
+ foreach (@out) {
+ next unless $_;
+ if ($k == 1 and (/^#NEXUS/)) { $k = 2; next; }
+ if ($k == 2 and /^BEGIN QUERY;/) { $k = 3; next; }
+ if ($k == 3 and /^END;$/) { $k = 4; next; }
+ if ($k == 4 and /^BEGIN ORGANISM;$/) { $k = 5; next; }
+ if ($k == 5 and /^END;$/) { $k = 6; last; }
+ if ($k == 5) {
+ next if /^\[/;
+ next if /^;/;
+ my @a = split (/\t/);
+ if (@a and scalar(@a) < 6) {
+ $_ = "[deleted]";
+ $badHitsHere ++;
+ $badHits ++;
+ } else {
+ $goodHits ++;
+ }
+ }
+ }
+ if ($k < 6) { # bad report
+ $badResults ++;
+ unlink $file;
+ print " Warning: Incomplete search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
+ } else {
+ $ins{$set}{'prots'}[$i]{'done'} = 1;
+ $goodResults ++;
+ if ($badHitsHere) {
+ open OUT, ">$file";
+ foreach (@out) {
+ next if /^\[deleted\]/;
+ print OUT "$_\n";
+ }
+ close OUT;
+ print " Warning: $badHitsHere invalid hits deleted from $set/".$ins{$set}{'prots'}[$i]{'name'}.".\n";
+ }
+ }
+ } else {
+ $badResults ++;
+ unlink $file;
+ print " Empty search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
+ }
+ }
+ }
+ print " $set: $goodHits valid hits for $goodResults proteins.\n";
+ $ins{$set}{'done'} = 1 if ($goodResults == scalar(@{$ins{$set}{'prots'}}));
+ $nDone += $goodResults;
+ }
+ print "Done. $nDone results found, remaining ".($nProt - $nDone)." proteins to search.\n";
}
-if ($nProt-$nDone <= 0){
- print "Batch homology search completed. searcher.pl exits.\n";
- print "You may proceed with HGT prediction by running analyzer.pl.\n\n";
- exit 0;
+if ($nProt-$nDone <= 0) {
+ print "Batch homology search completed. searcher.pl exits.\n";
+ print "You may proceed with HGT prediction by running analyzer.pl.\n\n";
+ exit 0;
}
## detect pre-computed search results ##
# This function was inspired by Conor Meehan (cmeehan@itg.be) #
-if ($preSearch){
- my $n = 0;
- opendir (DIR, $preSearch);
- foreach my $file (grep {!/^\./} readdir DIR){
- next unless -s "$preSearch/$file";
- $s = stem_name($file);
- if (exists $ins{$s}){
- $ins{$s}{'prefile'} = $file;
- $n ++;
- }
- }
- closedir DIR;
- print "Pre-computed search results are found for $n protein set(s).\n";
+if ($preSearch) {
+ my $n = 0;
+ opendir (DIR, $preSearch);
+ foreach my $file (grep {!/^\./} readdir DIR) {
+ next unless -s "$preSearch/$file";
+ $s = stem_name($file);
+ if (exists $ins{$s}) {
+ $ins{$s}{'prefile'} = $file;
+ $n ++;
+ }
+ }
+ closedir DIR;
+ print "Pre-computed search results are found for $n protein set(s).\n";
}
-unless ($httpBlast){
-
- ## read local taxonomy database ##
-
- unless ($protdb){ print "Warning: A protein database is required for local sequence homology search.\n"; }
- unless ($taxdump){ die "Error: A taxonomy database is required for local sequence homology search.\n"; }
- unless (-d $taxdump){ die "Error: Invalid taxonomy database directory: $taxdump.\n"; }
- unless (-s "$taxdump/nodes.dmp" and -s "$taxdump/names.dmp"){ die "Error: Taxonomy database is not found under $taxdump.\n" ; }
- print "Reading taxonomy database...";
- open IN, "<$taxdump/nodes.dmp";
- while (){
- s/\s+$//;
- @a = split (/\s+\|\s+/);
- %h = ('parent', $a[1], 'rank', $a[2]);
- $taxdumps{$a[0]} = {%h};
- }
- close IN;
- open IN, "<$taxdump/names.dmp";
- while (){
- s/\s+$//;
- next unless (/scientific name\s*\|$/);
- @a = split (/\s+\|\s+/);
- $taxdumps{$a[0]}{'name'} = $a[1] if (exists $taxdumps{$a[0]});
- }
- close IN;
- print " done. ".scalar(keys %taxdumps)." records read.\n";
-
- ## read protein-to-TaxID dictionary ##
-
- if ($prot2taxid){
- unless (-s $prot2taxid){ die "Error: Invalid protein-to-TaxID dictionary: $prot2taxid.\n"; }
- print "Reading protein-to-TaxID dictionary...";
- open IN, "<$prot2taxid";
- while (){
- s/\s+$//; next unless $_;
- @a = split (/\s+/);
- next unless $#a;
- $prot2taxids{$a[0]} = $a[1];
- }
- close IN;
- print " done. ".scalar(keys %prot2taxids)." records read.\n";
- }elsif ($searchTool ne "BLAST"){
- print "Warning: A protein-to-TaxID dictionary is not provided.\n" ;
- if ($interactive){
- print "Press Enter to proceed, or Ctrl+C to exit:";
- $s = ;
- }
- }
+unless ($httpBlast) {
+
+ ## read local taxonomy database ##
+
+ unless ($protdb) { print "Warning: A protein database is required for local sequence homology search.\n"; }
+ unless ($taxdump) { die "Error: A taxonomy database is required for local sequence homology search.\n"; }
+ unless (-d $taxdump) { die "Error: Invalid taxonomy database directory: $taxdump.\n"; }
+ unless (-s "$taxdump/nodes.dmp" and -s "$taxdump/names.dmp") { die "Error: Taxonomy database is not found under $taxdump.\n" ; }
+ print "Reading taxonomy database...";
+ open IN, "<$taxdump/nodes.dmp";
+ while () {
+ s/\s+$//;
+ @a = split (/\s+\|\s+/);
+ %h = ('parent', $a[1], 'rank', $a[2]);
+ $taxdumps{$a[0]} = {%h};
+ }
+ close IN;
+ open IN, "<$taxdump/names.dmp";
+ while () {
+ s/\s+$//;
+ next unless (/scientific name\s*\|$/);
+ @a = split (/\s+\|\s+/);
+ $taxdumps{$a[0]}{'name'} = $a[1] if (exists $taxdumps{$a[0]});
+ }
+ close IN;
+ print " done. ".scalar(keys %taxdumps)." records read.\n";
+
+ ## read protein-to-TaxID dictionary ##
+
+ if ($prot2taxid) {
+ unless (-s $prot2taxid) { die "Error: Invalid protein-to-TaxID dictionary: $prot2taxid.\n"; }
+ print "Reading protein-to-TaxID dictionary...";
+ open IN, "<$prot2taxid";
+ while () {
+ s/\s+$//; next unless $_;
+ @a = split (/\s+/);
+ next unless $#a;
+ $prot2taxids{$a[0]} = $a[1];
+ }
+ close IN;
+ print " done. ".scalar(keys %prot2taxids)." records read.\n";
+ } elsif ($searchTool ne "BLAST") {
+ print "Warning: A protein-to-TaxID dictionary is not provided.\n" ;
+ if ($interactive) {
+ print "Press Enter to proceed, or Ctrl+C to exit:";
+ $s = ;
+ }
+ }
}
## read taxonomic information ##
-if (-d "$wkDir/taxonomy"){
- print "Reading taxonomy records from previous run(s)...";
- if (-s "$wkDir/taxonomy/taxa.db"){
- open IN, "<$wkDir/taxonomy/taxa.db";
- while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split (/\t/);
- next if exists $dbTaxa{$a[0]};
- my %taxon :shared = ('organism'=>$a[1], 'lineage'=>$a[2]);
- my $i = 3; $taxon{$_} = $a[$i++] for (@ranks);
- $dbTaxa{$a[0]} = \%taxon;
- }
- close IN;
- }
- if (-s "$wkDir/taxonomy/ranks.db"){
- open IN, "<$wkDir/taxonomy/ranks.db";
- while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split (/\t/);
- next if exists $dbRanks{$a[0]};
- $dbRanks{$a[0]} = $a[1];
- }
- close IN;
- }
- print " done. ".(scalar keys %dbTaxa)." taxa and ".(scalar keys %dbRanks)." ranks read.\n";
-}else{
- mkdir "$wkDir/taxonomy";
+if (-d "$wkDir/taxonomy") {
+ print "Reading taxonomy records from previous run(s)...";
+ if (-s "$wkDir/taxonomy/taxa.db") {
+ open IN, "<$wkDir/taxonomy/taxa.db";
+ while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split (/\t/);
+ next if exists $dbTaxa{$a[0]};
+ my %taxon :shared = ('organism'=>$a[1], 'lineage'=>$a[2]);
+ my $i = 3; $taxon{$_} = $a[$i++] for (@ranks);
+ $dbTaxa{$a[0]} = \%taxon;
+ }
+ close IN;
+ }
+ if (-s "$wkDir/taxonomy/ranks.db") {
+ open IN, "<$wkDir/taxonomy/ranks.db";
+ while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split (/\t/);
+ next if exists $dbRanks{$a[0]};
+ $dbRanks{$a[0]} = $a[1];
+ }
+ close IN;
+ }
+ print " done. ".(scalar keys %dbTaxa)." taxa and ".(scalar keys %dbRanks)." ranks read.\n";
+} else {
+ mkdir "$wkDir/taxonomy";
}
## collect 'self' information ##
-if (-s "$wkDir/taxonomy/self.info"){
- # The file self.info is like: sample_name | TaxID | organism_name
- my $nRead = 0;
- open IN, "<$wkDir/taxonomy/self.info";
- while (){
- s/\s+$//; next if /^#/; next unless $_;
- @a = split (/\t/);
- if (exists $ins{$a[0]}){
- $ins{$a[0]}{'taxid'} = $a[1];
- $ins{$a[0]}{'organism'} = $a[2];
- }
- }
- close IN;
+if (-s "$wkDir/taxonomy/self.info") {
+ # The file self.info is like: sample_name | TaxID | organism_name
+ my $nRead = 0;
+ open IN, "<$wkDir/taxonomy/self.info";
+ while () {
+ s/\s+$//; next if /^#/; next unless $_;
+ @a = split (/\t/);
+ if (exists $ins{$a[0]}) {
+ $ins{$a[0]}{'taxid'} = $a[1];
+ $ins{$a[0]}{'organism'} = $a[2];
+ }
+ }
+ close IN;
}
-if ($selfTax){ # TaxIDs of input protein sets
- %h = ();
- @a = split (/,/, $selfTax);
- foreach (@a){
- @b = split (/:/);
- next unless $#b;
- $h{$b[0]} = $b[1];
- }
- foreach my $set (keys %ins){
- next if exists $ins{$set}{'taxid'};
- next unless exists $h{$set};
- @a = get_taxonomy (($h{$set}));
- $ins{$set}{'taxid'} = $h{$set};
- $ins{$set}{'organism'} = $a[0];
- open OUT, ">>$wkDir/taxonomy/self.info";
- print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
- close OUT;
- }
+if ($selfTax) { # TaxIDs of input protein sets
+ %h = ();
+ @a = split (/,/, $selfTax);
+ foreach (@a) {
+ @b = split (/:/);
+ next unless $#b;
+ $h{$b[0]} = $b[1];
+ }
+ foreach my $set (keys %ins) {
+ next if exists $ins{$set}{'taxid'};
+ next unless exists $h{$set};
+ @a = get_taxonomy (($h{$set}));
+ $ins{$set}{'taxid'} = $h{$set};
+ $ins{$set}{'organism'} = $a[0];
+ open OUT, ">>$wkDir/taxonomy/self.info";
+ print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
+ close OUT;
+ }
}
-if ($interactive){
- foreach my $set (keys %ins){
- next if exists $ins{$set}{'taxid'};
- print "Enter the TaxID of $set, or press Enter if you don't know:";
- $s = ; chomp $s; next unless $s;
- @a = get_taxonomy (($s));
- $ins{$set}{'taxid'} = $s;
- $ins{$set}{'organism'} = $a[0];
- open OUT, ">>$wkDir/taxonomy/self.info";
- print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
- close OUT;
- }
+if ($interactive) {
+ foreach my $set (keys %ins) {
+ next if exists $ins{$set}{'taxid'};
+ print "Enter the TaxID of $set, or press Enter if you don't know:";
+ $s = ; chomp $s; next unless $s;
+ @a = get_taxonomy (($s));
+ $ins{$set}{'taxid'} = $s;
+ $ins{$set}{'organism'} = $a[0];
+ open OUT, ">>$wkDir/taxonomy/self.info";
+ print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
+ close OUT;
+ }
}
print "Taxonomy of input protein sets:\n";
my @missingTax = ();
-foreach my $set (sort keys %ins){
- if (exists $ins{$set}{'taxid'}){
- print " $set: ", $ins{$set}{'organism'}, " (", $ins{$set}{'taxid'}, ")\n";
- }else{
- push (@missingTax, $set);
- }
+foreach my $set (sort keys %ins) {
+ if (exists $ins{$set}{'taxid'}) {
+ print " $set: ", $ins{$set}{'organism'}, " (", $ins{$set}{'taxid'}, ")\n";
+ } else {
+ push (@missingTax, $set);
+ }
}
-if (@missingTax){
- print "Attempting to identify taxonomy of ".(scalar @missingTax)." protein set(s) :\n";
- print " ", join (",", @missingTax), "\n";
- foreach my $set (@missingTax){
- my $taxid = "";
- if (%prot2taxids){ # look up the dictionary
- foreach my $prot (@{$ins{$set}{'prots'}}){
- $taxid = '';
- my ($gi, $accn, $name) = ($prot->{'gi'}, $prot->{'accn'}, $prot->{'name'});
- if (exists $prot2taxids{$name} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$name}; }
- elsif (exists $prot2taxids{$gi} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$gi}; }
- elsif (exists $prot2taxids{$accn} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$accn}; }
- next unless $taxid;
- next if $taxid =~ /,/;
- last;
- }
- }
- unless ($taxid){ # look up the BLAST database
- my ($gi, $accn, $name) = ($ins{$set}{'prots'}[0]{'gi'}, $ins{$set}{'prots'}[0]{'accn'}, $ins{$set}{'prots'}[0]{'name'});
- if ($searchTool eq "BLAST" and not $httpBlast){
- my $query = "";
-
- if ($accn){ $query = $accn; }
- elsif ($gi){ $query = $gi; }
- else { $query = $name; }
- my @out = `$blastdbcmd -db $protdb -entry $query -outfmt \"%a %g %T %t\"`;
-
- # The command is to query the BLAST database for one or more particular sequences.
- # The four codes represent accn, gi, taxid, title
- # The query may be gi or accn or "gi|###|ref|###" or "title".
- # If the database does not contain TaxID (or if not -taxid_map), %T will be 0.
- # when making database (using makeblastdb), one should do -parse_seqids to enable gi and accn search
-
- my $found = 1;
- foreach (@out){ if (/not found in BLAST database/){ $found = 0; last; } }
- if ($found){
- @a = split (/\s+/, $out[0]);
- $taxid = $a[2] if (scalar(@a) > 3 and $a[2] and $a[2] =~ /^\d+$/);
- }
- unless ($taxid){
- if ($gi){ # look up the NCBI server
- my $iRetry = 0;
- sleep 1;
- while (1){
- $s = get "$eSummaryServer?db=protein&id=$gi";
- last if (defined $s) and ($s =~ //s);
- die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
- $iRetry ++; sleep $delay; next;
- }
- $taxid = $1 if ($s =~ /- (\d+?)<\/Item>/);
- }
- }
- }
- }
- if ($taxid){
- @a = get_taxonomy(($taxid));
- $ins{$set}{'taxid'} = $taxid;
- $ins{$set}{'organism'} = $a[0];
- open OUT, ">>$wkDir/taxonomy/self.info";
- print OUT $set, "\t", $taxid, "\t", $a[0], "\n";
- close OUT;
- print OUT " $set: $taxid ($a[0])\n";
- }else{
- die "Error: Cannot identify the taxonomy of $set.\n";
- }
- }
+if (@missingTax) {
+ print "Attempting to identify taxonomy of ".(scalar @missingTax)." protein set(s) :\n";
+ print " ", join (",", @missingTax), "\n";
+ foreach my $set (@missingTax) {
+ my $taxid = "";
+ if (%prot2taxids) { # look up the dictionary
+ foreach my $prot (@{$ins{$set}{'prots'}}) {
+ $taxid = '';
+ my $name = $prot->{'name'};
+ if (exists $prot2taxids{$name} and $prot2taxids{$name} !~ /,/) { $taxid = $prot2taxids{$name}; }
+ next unless $taxid;
+ next if $taxid =~ /,/;
+ last;
+ }
+ }
+ unless ($taxid) { # look up the BLAST database
+ my $name = $ins{$set}{'prots'}[0]{'name'};
+ if ($searchTool eq "BLAST" and not $httpBlast) {
+ my $query = $name;
+ my @out = `$blastdbcmd -db $protdb -entry $query -outfmt \"%a %T %t\"`;
+
+ # The command is to query the BLAST database for one or more particular sequences.
+ # The four codes represent name (accession), taxid, title
+ # If the database does not contain TaxID (or if not -taxid_map), %T will be 0.
+ # when making database (using makeblastdb), one should do -parse_seqids to enable search by name (instead of sequence)
+
+ my $found = 1;
+ foreach (@out) { if (/not found in BLAST database/) { $found = 0; last; } }
+ if ($found) {
+ @a = split (/\s+/, $out[0]);
+ $taxid = $a[2] if (scalar(@a) > 3 and $a[2] and $a[2] =~ /^\d+$/);
+ }
+ unless ($taxid) {
+ my $iRetry = 0;
+ sleep 1;
+ while (1) {
+ $s = get "$eSummaryServer?db=protein&id=$name";
+ last if (defined $s) and ($s =~ //s);
+ die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
+ $iRetry ++; sleep $delay; next;
+ }
+ $taxid = $1 if ($s =~ /
- (\d+?)<\/Item>/);
+ }
+ }
+ }
+ if ($taxid) {
+ @a = get_taxonomy(($taxid));
+ $ins{$set}{'taxid'} = $taxid;
+ $ins{$set}{'organism'} = $a[0];
+ open OUT, ">>$wkDir/taxonomy/self.info";
+ print OUT $set, "\t", $taxid, "\t", $a[0], "\n";
+ close OUT;
+ print OUT " $set: $taxid ($a[0])\n";
+ } else {
+ die "Error: Cannot identify the taxonomy of $set.\n";
+ }
+ }
}
## perform batch sequence similarity search ##
mkdir "$wkDir/search" unless -d "$wkDir/search";
-if ($requests == 1 or not $httpBlast){
- foreach my $set (sort keys %ins){
- next if exists $ins{$set}{'done'};
- print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
- mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
- @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
- open LOG, ">>$wkDir/search/$set.log";
- print LOG "Program started at $s.\n";
- print LOG "Number of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n";
- close LOG;
-
- unless ($httpBlast){ # local search
-
- %h = ();
- for (my $i=0; $i$wkDir/seqids.txt";
- print OUT $_."\n" for (keys %h);
- close OUT;
- my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %s\"`;
- unlink "$wkDir/seqids.txt";
- if (join('', @out) =~ /not found in BLAST database/){ die "Error: Query sequences not found in $protdb.\n"; }
- foreach (@out){
- s/\s+$//; next unless $_;
- next if /^Error/;
- @a = split (/\s+/); next unless $#a;
- $a[0] =~ s/\.\d+$//;
- if (exists $h{$a[0]}){ $ins{$set}{'prots'}[$h{$a[0]}]{'seq'} = $a[1]; }
- }
- }
- %h = ();
- for (my $i=0; $i$wkDir/tmp.in";
- for (my $i=0; $i".$ins{$set}{'prots'}[$i]{'name'}."\n".$ins{$set}{'prots'}[$i]{'seq'}."\n";
- }
- close OUT;
- if ($searchTool eq "RAPSearch2"){
- `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
- `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp -z $threads`;
- unlink "$wkDir/tmp.in";
- unlink "$wkDir/raptmp";
- unlink "$wkDir/raptmp.info";
- die "Error in running RAPSearch2. Please check." unless -s "$wkDir/tmp.m8";
- open IN, "<$wkDir/tmp.m8";
- while (){
- s/\s+$//; next unless $_; next if /^#/;
- @a = split (/\t/);
- next if ($#a < 11);
- next unless $a[0] eq $a[1];
- next unless exists $h{$a[0]};
- next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
- $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
- }
- close IN;
- unlink "$wkDir/tmp.aln";
- unlink "$wkDir/tmp.m8";
- }elsif ($searchTool eq "DIAMOND"){
- `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
- `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
- my @out = `$diamond view -a $wkDir/tmp.daa`;
- unlink "$wkDir/tmp.in";
- unlink "$wkDir/tmp.dmnd";
- unlink "$wkDir/tmp.daa";
- die "Error in running DIAMOND. Please check." unless @out;
- foreach (@out){
- s/\s+$//;
- @a = split (/\t/);
- next unless $#a == 11;
- next unless $a[0] eq $a[1];
- next unless exists $h{$a[0]};
- next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
- $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
- }
- }
- }
-
- # end of new feature #
-
- my @ids = ();
- for (my $i=0; $i= $retries);
- print " Retrying...";
- $iRetry ++; sleep $delay;
- }
- open LOG, ">>$wkDir/search/$set.log";
- if ($return =~ /\/(\d+)\/1$/){
- print LOG "$query\t$1\n";
- print "done. $1 hits.\n";
- }
- else{ print LOG "$query\tfailed\n"; }
- close LOG;
- sleep $delay;
- }
- }
- @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
- open LOG, ">>$wkDir/search/$set.log";
- print LOG "Program ended at $s.\n";
- close LOG;
- unlink "$wkDir/tmp.in";
- print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
- }
-
-}else{ # multi-process http BLAST #
-
- print "$requests http BLAST threads are running in parallel.\n";
-
- my %running = (); # accn -> 1
- my %retry = (); # accn -> times of retry
- my %rSets = (); # protein sets that have been started
- my %failed = ();
-
- while ($nDone + (scalar keys %failed) < $nProt){
- if (my $n = $requests-scalar(threads->list(threads::running))){
- for (1..$n){
- foreach my $set (keys %ins){
- next if $ins{$set}{'done'};
- unless (exists $rSets{$set}){
- print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
- $rSets{$set} = 1;
- mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
- @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
- open LOG, ">>$wkDir/search/$set.log"; print LOG "Program started at $s.\nNumber of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n"; close LOG;
- }
- my $started = 0;
- for (my $i=0; $icreate(\&http_blast, $set, $i);
- print " BLAST of $query started.\n";
- $running{$query} = 1;
- $started = 1;
- select (undef, undef, undef, 0.25); # this is to delay 0.25 seconds
- last;
- }
- last if $started;
- print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
- $ins{$set}{'done'} = 1;
- delete $rSets{$set};
- @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
- open LOG, ">>$wkDir/search/$set.log"; print LOG "Program ended at $s.\n"; close LOG;
- }
- }
- }
- if (@a = threads->list(threads::joinable)){
- foreach (@a){
- my ($query, $set, $hits, $return) = split (/\//, $_->join());
- delete $running{$query};
- if ($return){
- $nDone ++;
- print " BLAST of $query completed. $hits hits retrieved.\n";
- open LOG, ">>$wkDir/search/$set.log";
- print LOG "$query\t$hits\n";
- close LOG;
- }else{
- $retry{$query} = 0 unless exists $retry{$query};
- $retry{$query} ++;
- if ($retry{$query} >= $retries){
- delete $retry{$query};
- $failed{"$set|$query"} = 1;
- print " BLAST of $query failed.\n";
- open LOG, ">>$wkDir/search/$set.log"; print LOG "$query\tfailed\n"; close LOG;
- }else{
- print " BLAST of $query failed. Scheduled to retry.\n";
- }
- }
- }
- }
- sleep $delay;
- if ($retryFailed and %failed and ($nDone + (scalar keys %failed) == $nProt)){
- $retryFailed = 0;
- %failed = ();
- $ins{$_}{'done'} = 0 for (keys %ins);
- }
- }
+if ($requests == 1 or not $httpBlast) {
+ foreach my $set (sort keys %ins) {
+ next if exists $ins{$set}{'done'};
+ print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
+ mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
+ @a = localtime(time);
+ $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+ open LOG, ">>$wkDir/search/$set.log";
+ print LOG "Program started at $s.\n";
+ print LOG "Number of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n";
+ close LOG;
+
+ unless ($httpBlast) { # local search
+
+ %h = ();
+ for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+ $h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
+ }
+ if (%h) {
+ die "Error: Cannot extract query sequences from a non-BLAST local database.\n" if $searchTool ne "BLAST";
+ open OUT, ">$wkDir/seqids.txt";
+ print OUT $_."\n" for (keys %h);
+ close OUT;
+ my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %s\"`;
+ unlink "$wkDir/seqids.txt";
+ die "Error: Query sequences not found in $protdb.\n" if join('', @out) =~ /not found in BLAST database/;
+ foreach (@out) {
+ s/\s+$//; next unless $_;
+ next if /^Error/;
+ @a = split (/\s+/); next unless $#a;
+ $a[0] =~ s/\.\d+$//;
+ if (exists $h{$a[0]}) { $ins{$set}{'prots'}[$h{$a[0]}]{'seq'} = $a[1]; }
+ }
+ }
+ %h = ();
+ for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+ $h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
+ }
+ if (%h) { die "Error: One or more query sequences cannot be extracted from $protdb.\n"; }
+
+ # new feature: batch self search #
+
+ if ($searchTool eq "RAPSearch2" or $searchTool eq "DIAMOND") {
+ %h = ();
+ open OUT, ">$wkDir/tmp.in";
+ for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+ $h{$ins{$set}{'prots'}[$i]{'name'}} = $i;
+ print OUT ">".$ins{$set}{'prots'}[$i]{'name'}."\n".$ins{$set}{'prots'}[$i]{'seq'}."\n";
+ }
+ close OUT;
+ if ($searchTool eq "RAPSearch2") {
+ `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
+ `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp -z $threads`;
+ unlink "$wkDir/tmp.in";
+ unlink "$wkDir/raptmp";
+ unlink "$wkDir/raptmp.info";
+ die "Error in running RAPSearch2. Please check." unless -s "$wkDir/tmp.m8";
+ open IN, "<$wkDir/tmp.m8";
+ while () {
+ s/\s+$//; next unless $_; next if /^#/;
+ @a = split (/\t/);
+ next if ($#a < 11);
+ next unless $a[0] eq $a[1];
+ next unless exists $h{$a[0]};
+ next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
+ $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
+ }
+ close IN;
+ unlink "$wkDir/tmp.aln";
+ unlink "$wkDir/tmp.m8";
+ } elsif ($searchTool eq "DIAMOND") {
+ `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
+ `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
+ my @out = `$diamond view -a $wkDir/tmp.daa`;
+ unlink "$wkDir/tmp.in";
+ unlink "$wkDir/tmp.dmnd";
+ unlink "$wkDir/tmp.daa";
+ die "Error in running DIAMOND. Please check." unless @out;
+ foreach (@out) {
+ s/\s+$//;
+ @a = split (/\t/);
+ next unless $#a == 11;
+ next unless $a[0] eq $a[1];
+ next unless exists $h{$a[0]};
+ next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
+ $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
+ }
+ }
+ }
+
+ # end of new feature #
+
+ my @ids = ();
+ for (my $i=0; $i= $retries);
+ print " Retrying...";
+ $iRetry ++; sleep $delay;
+ }
+ open LOG, ">>$wkDir/search/$set.log";
+ if ($return =~ /\/(\d+)\/1$/) {
+ print LOG "$query\t$1\n";
+ print "done. $1 hits.\n";
+ } else {
+ print LOG "$query\tfailed\n";
+ }
+ close LOG;
+ sleep $delay;
+ }
+ }
+ @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+ open LOG, ">>$wkDir/search/$set.log";
+ print LOG "Program ended at $s.\n";
+ close LOG;
+ unlink "$wkDir/tmp.in";
+ print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
+ }
+
+} else { # multi-process http BLAST #
+
+ print "$requests http BLAST threads are running in parallel.\n";
+
+ my %running = (); # name -> 1
+ my %retry = (); # name -> times of retry
+ my %rSets = (); # protein sets that have been started
+ my %failed = ();
+
+ while ($nDone + (scalar keys %failed) < $nProt) {
+ if (my $n = $requests-scalar(threads->list(threads::running))) {
+ for (1..$n) {
+ foreach my $set (keys %ins) {
+ next if $ins{$set}{'done'};
+ unless (exists $rSets{$set}) {
+ print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
+ $rSets{$set} = 1;
+ mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
+ @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+ open LOG, ">>$wkDir/search/$set.log"; print LOG "Program started at $s.\nNumber of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n"; close LOG;
+ }
+ my $started = 0;
+ for (my $i=0; $icreate(\&http_blast, $set, $i);
+ print " BLAST of $query started.\n";
+ $running{$query} = 1;
+ $started = 1;
+ select (undef, undef, undef, 0.25); # this is to delay 0.25 seconds
+ last;
+ }
+ last if $started;
+ print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
+ $ins{$set}{'done'} = 1;
+ delete $rSets{$set};
+ @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+ open LOG, ">>$wkDir/search/$set.log"; print LOG "Program ended at $s.\n"; close LOG;
+ }
+ }
+ }
+ if (@a = threads->list(threads::joinable)) {
+ foreach (@a) {
+ my ($query, $set, $hits, $return) = split (/\//, $_->join());
+ delete $running{$query};
+ if ($return) {
+ $nDone ++;
+ print " BLAST of $query completed. $hits hits retrieved.\n";
+ open LOG, ">>$wkDir/search/$set.log";
+ print LOG "$query\t$hits\n";
+ close LOG;
+ } else {
+ $retry{$query} = 0 unless exists $retry{$query};
+ $retry{$query} ++;
+ if ($retry{$query} >= $retries) {
+ delete $retry{$query};
+ $failed{"$set|$query"} = 1;
+ print " BLAST of $query failed.\n";
+ open LOG, ">>$wkDir/search/$set.log"; print LOG "$query\tfailed\n"; close LOG;
+ } else {
+ print " BLAST of $query failed. Scheduled to retry.\n";
+ }
+ }
+ }
+ }
+ sleep $delay;
+ if ($retryFailed and %failed and ($nDone + (scalar keys %failed) == $nProt)) {
+ $retryFailed = 0;
+ %failed = ();
+ $ins{$_}{'done'} = 0 for (keys %ins);
+ }
+ }
}
print "Batch homology search completed. searcher.pl exits.\n";
print "You may re-run searcher.pl to validate the results and finish incomplete searches.\n";
@@ -854,281 +847,260 @@
# return: 0 (success) or 1 (fail)
sub local_search {
- my ($set, $refID) = @_;
- my @ids = @$refID;
- my %name2id = (); # protein name (accn) to index
- my $outfile = $wkDir."/tmp.out"; # search result file name
- if (exists $ins{$set}{'prefile'}){ # read pre-computed results
- print " Importing pre-computed search results of $set...";
- $outfile = $preSearch."/".$ins{$set}{'prefile'};
- $name2id{$ins{$set}{'prots'}[$_]{'name'}} = $_ for (@ids);
- }else{ # de novo search
- if (scalar(@ids) == scalar(@{$ins{$set}{'prots'}})){
- print " $searchTool"."ing all proteins of $set...";
- }else{
- if (scalar(@ids) <= 32){
- @a = ();
- push (@a, $ins{$set}{'prots'}[$_]{'name'}) for (@ids);
- print " $searchTool"."ing ".join(",", @a)."...";
- }else{
- @a = ();
- for (my $i=0; $i<3; $i++){
- push (@a, $ins{$set}{'prots'}[$ids[$i]]{'name'});
- }
- print " $searchTool"."ing ".join(",", @a)."... (".scalar(@ids)." proteins)";
- }
- }
- unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
- open OUT, ">$wkDir/tmp.in";
- foreach my $id (@ids){
- print OUT ">".$ins{$set}{'prots'}[$id]{'name'}."\n".$ins{$set}{'prots'}[$id]{'seq'}."\n";
- $name2id{$ins{$set}{'prots'}[$id]{'name'}} = $id;
- }
- close OUT;
- if ($searchTool eq "BLAST"){
- $s = "$blastp -query $wkDir/tmp.in -db $protdb -out $outfile";
- $s .= " -num_threads $threads" if ($threads > 1);
- $s .= " -evalue $evalue" if ($evalue);
- $s .= " -max_target_seqs $nHits" if ($nHits);
- $s .= " -outfmt \"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sseq\"";
- `$s`;
- # this is the standard tabular format, plus aligned part of subject sequence
- }elsif ($searchTool eq "RAPSearch2"){
- $s = "$rapsearch -q $wkDir/tmp.in -d $protdb -o $wkDir/tmp -t a -s f";
- $s .= " -z $threads" if ($threads > 1);
- $s .= " -e $evalue" if ($evalue);
- $s .= " -v $nHits" if ($nHits);
- `$s`;
- $outfile = $wkDir."/tmp.m8";
- unlink $wkDir."/tmp.aln";
- }elsif ($searchTool eq "DIAMOND"){
- $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
- $s .= " -e $evalue" if ($evalue);
- $s .= " -k $nHits" if ($nHits);
- `$s`;
- $s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
- `$s`;
- unlink $wkDir."/tmp.daa";
- }else{
- unlink "$wkDir/tmp.in";
- die "Error: Search tool not specified and pre-computed results not found for $set.\n";
- }
- unlink "$wkDir/tmp.in";
- }
- unless (-e $outfile){
- print "Warning: Search result missing.\n";
- return 1;
- }
-
- # read search result file
- my (%accn2gi, %gi2accn) = ((), ());
- my %seqid2taxid = (); # need to look up the TaxIDs of these sequence IDs (GI, accn or name)
- open IN, "<$outfile";
- while (){
- s/\s+$//; next unless $_; next if /^#/;
- @a = split (/\t/);
- next unless exists $name2id{$a[0]};
- my $id = $name2id{$a[0]};
- $a[1] =~ s/\s+$//; # RAPSearch2 adds ^M to subject ID. probably a bug.
- next if ($evalue and $a[10] ne "*" and $evalue < $a[10]); # evalue cutoff
- next if ($identity and $a[2] ne "*" and $identity > $a[2]); # % identity cutoff
- next if ($coverage and $a[6] ne "*" and $a[7] ne "*" and $coverage > ($a[7]-$a[6]+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); # % coverage cutoff
- %h = ('sseqid'=>$a[1], 'pident'=>$a[2], 'evalue'=>$a[10], 'bitscore'=>$a[11], 'qstart'=>$a[6], 'qend'=>$a[7], 'sseq'=>'', 'name'=>$a[1]);
- $h{'sseq'} = $a[12] if ($#a >= 12 and $getAln);
- if ($h{'sseqid'} =~ /^gi\|(\d+)\|.+\|([A-Z0-9_]+)\.\d+\|(.*)$/){
- $h{'name'} = $2;
- $accn2gi{$2} = $1 unless exists $accn2gi{$2};
- $gi2accn{$1} = $2 unless exists $gi2accn{$1};
- $seqid2taxid{$1} = '0' unless exists $seqid2taxid{$1};
- $seqid2taxid{$2} = '0' unless exists $seqid2taxid{$2};
- }else{
- $seqid2taxid{$h{'name'}} = '0' unless exists $seqid2taxid{$h{'name'}};
- }
- push (@{$ins{$set}{'prots'}[$id]{'hits'}}, {%h});
- }
- close IN;
- unlink $outfile unless exists $ins{$set}{'prefile'};
-
- # get TaxIDs for hits
- if (%seqid2taxid){
- my @seqids4db = (); # sequence IDs not in dictionary
-
- # look up in the dictionary
- if (%prot2taxids){
- foreach my $seqid (keys %seqid2taxid){
- next if $seqid2taxid{$seqid};
- if (exists $prot2taxids{$seqid}){
- $seqid2taxid{$seqid} = $prot2taxids{$seqid};
- if (exists $accn2gi{$seqid} and not $seqid2taxid{$accn2gi{$seqid}}){ $seqid2taxid{$accn2gi{$seqid}} = $prot2taxids{$seqid}; }
- if (exists $gi2accn{$seqid} and not $seqid2taxid{$gi2accn{$seqid}}){ $seqid2taxid{$gi2accn{$seqid}} = $prot2taxids{$seqid}; }
- }
- }
- foreach my $seqid (keys %seqid2taxid){
- push (@seqids4db, $seqid) unless $seqid2taxid{$seqid};
- }
- }else{
- @seqids4db = keys %seqid2taxid;
- }
-
- # find the rest in database
- if (@seqids4db and $searchTool eq "BLAST" and not $httpBlast){
- open OUT, ">$wkDir/seqids.txt";
- print OUT $_."\n" for (keys %seqid2taxid);
- close OUT;
- my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %g %T\"`;
- unlink "$wkDir/seqids.txt";
- foreach (@out){
- s/\s+$//; next unless $_;
- @a = split (/\s+/); next if $#a < 2;
- next unless $a[2];
- $a[0] =~ s/\.\d+$//;
- if (exists $seqid2taxid{$a[0]} and not $seqid2taxid{$a[0]}){ $seqid2taxid{$a[0]} = $a[2]; }
- elsif ($a[1] ne "N/A" and exists $seqid2taxid{$a[1]} and not $seqid2taxid{$a[1]}){ $seqid2taxid{$a[1]} = $a[2]; }
- }
- }
- }
-
- # get complete taxonomy information
- my %taxid2orgn = ();
- foreach my $seqid (keys %seqid2taxid){
- next unless $seqid2taxid{$seqid};
- @a = split (/,/, $seqid2taxid{$seqid}); # one SeqID may correspond to multiple TaxIDs
- foreach (@a){
- next if exists $taxid2orgn{$_};
- $taxid2orgn{$_} = '';
- }
- }
- if (%taxid2orgn){
- @a = keys %taxid2orgn;
- @b = get_taxonomy (@a);
- for (my $i=0; $i $id2score{$a} } keys %id2score){
- %h = %{$ins{$set}{'prots'}[$id]{'hits'}[$i]};
- my $taxids = '';
- if (exists $seqid2taxid{$h{'name'}}){ $taxids = $seqid2taxid{$h{'name'}}; }
- elsif (exists $accn2gi{$h{'name'}} and exists $seqid2taxid{$accn2gi{$h{'name'}}}){ $taxids = $seqid2taxid{$accn2gi{$h{'name'}}}; }
- next unless $taxids;
- foreach my $taxid (split (/,/, $taxids)){
- next unless exists $taxid2orgn{$taxid};
- my $organism = $taxid2orgn{$taxid};
- if ($ins{$set}{'prots'}[$id]{'name'} eq $h{'name'}){ # self-align result is always retained
- $isQueryIn = 1;
- }else{ # remove redundancy from other hits
- next if $taxid2orgn{$taxid} eq 'na';
- next if exists $usedTaxids{$taxid};
- if ($ignoreSubspecies){
- next unless exists $dbTaxa{$taxid};
- next unless exists $dbTaxa{$taxid}{'species'};
- my $species = $dbTaxa{$taxid}{'species'};
- next unless $species;
- next if exists $usedSpecies{$species};
- $usedSpecies{$species} = $dbRanks{$species};
- }
- $usedTaxids{$taxid} = 1;
- }
- my %hit = ('accn'=>$h{'name'}, 'expect'=>$h{'evalue'}, 'score'=>$h{'bitscore'}, 'identity'=>$h{'pident'}, 'coverage'=>'*', 'taxid'=>$taxid, 'organism'=>$organism, 'sequence'=>$h{'sseq'});
- if ($h{'qstart'} ne "*" and $h{'qend'} ne "*"){ $hit{'coverage'} = sprintf("%.2f", ($h{'qend'}-$h{'qstart'}+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); }
- push (@hits, {%hit});
- last if (scalar @hits >= $nHits);
- }
- }
- }
-
- # perform self search, in case not targetted in previous steps #
- unless ($isQueryIn){
- if (exists $ins{$set}{'prots'}[$id]{'selfalign'}){
- @a = split(/\//, $ins{$set}{'prots'}[$id]{'selfalign'});
- }else{
- @a = self_align ($name, $ins{$set}{'prots'}[$id]{'seq'});
- }
- if (@a != (0,0,0,0)){
- my %hit = ('accn'=>$name, 'expect'=>$a[0], 'score'=>$a[1], 'identity'=>$a[2], 'coverage'=>'100.00', 'taxid'=>$ins{$set}{'taxid'}, 'organism'=>$ins{$set}{'organism'}, 'sequence'=>$ins{$set}{'prots'}[$id]{'seq'});
- unshift (@hits, {%hit});
- pop @hits if (scalar @hits > $nHits);
- }
- }
-
- # output result
- my ($ntax, $nchar) = (0, 0);
- open (OUT, ">>$wkDir/search/$set/$name.txt");
- print OUT "#NEXUS\nBEGIN QUERY;\n";
- if ($ins{$set}{'prots'}[$id]{'accn'}){
- print OUT "\tGI=".$ins{$set}{'prots'}[$id]{'gi'}.";\n\tAccession=".$ins{$set}{'prots'}[$id]{'accn'}.";\n";
- }else{
- print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
- }
- print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
- if ($ins{$set}{'prots'}[$id]{'product'}){
- print OUT "\tProduct=".$ins{$set}{'prots'}[$id]{'product'}.";\n";
- }
- print OUT "\tOrganism=".$ins{$set}{'organism'}.";\n";
- print OUT "END;\n\n";
- print OUT "BEGIN ORGANISM;\n";
- print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
- for (my $i=0; $i $maxHits;
- }
- print OUT ";\nEND;\n\n";
- if ($getAln){
- print OUT "BEGIN DATA;\n";
- print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
- for (my $i=0; $i $maxHits;
- }
- print OUT ";\nEND;\n\n";
- }
- close OUT;
- open LOG, ">>$wkDir/search/$set.log";
- print LOG "$name\t$ntax\n";
- close LOG;
- }
- print " done.\n";
- return 0;
+ my ($set, $refID) = @_;
+ my @ids = @$refID;
+ my %name2id = (); # protein name to index
+ my $outfile = $wkDir."/tmp.out"; # search result file name
+ if (exists $ins{$set}{'prefile'}) { # read pre-computed results
+ print " Importing pre-computed search results of $set...";
+ $outfile = $preSearch."/".$ins{$set}{'prefile'};
+ $name2id{$ins{$set}{'prots'}[$_]{'name'}} = $_ for (@ids);
+ } else { # de novo search
+ if (scalar(@ids) == scalar(@{$ins{$set}{'prots'}})) {
+ print " $searchTool"."ing all proteins of $set...";
+ } else {
+ if (scalar(@ids) <= 32) {
+ @a = ();
+ push (@a, $ins{$set}{'prots'}[$_]{'name'}) for (@ids);
+ print " $searchTool"."ing ".join(",", @a)."...";
+ } else {
+ @a = ();
+ for (my $i=0; $i<3; $i++) {
+ push (@a, $ins{$set}{'prots'}[$ids[$i]]{'name'});
+ }
+ print " $searchTool"."ing ".join(",", @a)."... (".scalar(@ids)." proteins)";
+ }
+ }
+ unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
+ open OUT, ">$wkDir/tmp.in";
+ foreach my $id (@ids) {
+ print OUT ">".$ins{$set}{'prots'}[$id]{'name'}."\n".$ins{$set}{'prots'}[$id]{'seq'}."\n";
+ $name2id{$ins{$set}{'prots'}[$id]{'name'}} = $id;
+ }
+ close OUT;
+ if ($searchTool eq "BLAST") {
+ $s = "$blastp -query $wkDir/tmp.in -db $protdb -out $outfile";
+ $s .= " -num_threads $threads" if ($threads > 1);
+ $s .= " -evalue $evalue" if ($evalue);
+ $s .= " -max_target_seqs $nHits" if ($nHits);
+ $s .= " -outfmt \"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sseq\"";
+ `$s`;
+ # this is the standard tabular format, plus aligned part of subject sequence
+ } elsif ($searchTool eq "RAPSearch2") {
+ $s = "$rapsearch -q $wkDir/tmp.in -d $protdb -o $wkDir/tmp -t a -s f";
+ $s .= " -z $threads" if ($threads > 1);
+ $s .= " -e $evalue" if ($evalue);
+ $s .= " -v $nHits" if ($nHits);
+ `$s`;
+ $outfile = $wkDir."/tmp.m8";
+ unlink $wkDir."/tmp.aln";
+ } elsif ($searchTool eq "DIAMOND") {
+ $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
+ $s .= " -e $evalue" if ($evalue);
+ $s .= " -k $nHits" if ($nHits);
+ `$s`;
+ $s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
+ `$s`;
+ unlink $wkDir."/tmp.daa";
+ } else {
+ unlink "$wkDir/tmp.in";
+ die "Error: Search tool not specified and pre-computed results not found for $set.\n";
+ }
+ unlink "$wkDir/tmp.in";
+ }
+ unless (-e $outfile) {
+ print "Warning: Search result missing.\n";
+ return 1;
+ }
+
+ # read search result file
+ my %seqid2taxid = (); # need to look up the TaxIDs of these sequence IDs
+ open IN, "<$outfile";
+ while () {
+ s/\s+$//; next unless $_; next if /^#/;
+ @a = split (/\t/);
+ next unless exists $name2id{$a[0]};
+ my $id = $name2id{$a[0]};
+ $a[1] =~ s/\s+$//; # RAPSearch2 adds ^M to subject ID. probably a bug.
+ next if ($evalue and $a[10] ne "*" and $evalue < $a[10]); # evalue cutoff
+ next if ($identity and $a[2] ne "*" and $identity > $a[2]); # % identity cutoff
+ next if ($coverage and $a[6] ne "*" and $a[7] ne "*" and $coverage > ($a[7]-$a[6]+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); # % coverage cutoff
+ %h = ('sseqid'=>$a[1], 'pident'=>$a[2], 'evalue'=>$a[10], 'bitscore'=>$a[11], 'qstart'=>$a[6], 'qend'=>$a[7], 'sseq'=>'', 'name'=>$a[1]);
+ $h{'sseq'} = $a[12] if ($#a >= 12 and $getAln);
+ $seqid2taxid{$h{'name'}} = '0' unless exists $seqid2taxid{$h{'name'}};
+ push (@{$ins{$set}{'prots'}[$id]{'hits'}}, {%h});
+ }
+ close IN;
+ unlink $outfile unless exists $ins{$set}{'prefile'};
+
+ # get TaxIDs for hits
+ if (%seqid2taxid) {
+ my @seqids4db = (); # sequence IDs not in dictionary
+
+ # look up in the dictionary
+ if (%prot2taxids) {
+ foreach my $seqid (keys %seqid2taxid) {
+ next if $seqid2taxid{$seqid};
+ if (exists $prot2taxids{$seqid}) {
+ $seqid2taxid{$seqid} = $prot2taxids{$seqid};
+ }
+ }
+ foreach my $seqid (keys %seqid2taxid) {
+ push (@seqids4db, $seqid) unless $seqid2taxid{$seqid};
+ }
+ } else {
+ @seqids4db = keys %seqid2taxid;
+ }
+
+ # find the rest in database
+ if (@seqids4db and $searchTool eq "BLAST" and not $httpBlast) {
+ open OUT, ">$wkDir/seqids.txt";
+ print OUT $_."\n" for (keys %seqid2taxid);
+ close OUT;
+ my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %g %T\"`;
+ unlink "$wkDir/seqids.txt";
+ foreach (@out) {
+ s/\s+$//; next unless $_;
+ @a = split (/\s+/); next if $#a < 2;
+ next unless $a[2];
+ $a[0] =~ s/\.\d+$//;
+ if (exists $seqid2taxid{$a[0]} and not $seqid2taxid{$a[0]}) { $seqid2taxid{$a[0]} = $a[2]; }
+ elsif ($a[1] ne "N/A" and exists $seqid2taxid{$a[1]} and not $seqid2taxid{$a[1]}) { $seqid2taxid{$a[1]} = $a[2]; }
+ }
+ }
+ }
+
+ # get complete taxonomy information
+ my %taxid2orgn = ();
+ foreach my $seqid (keys %seqid2taxid) {
+ next unless $seqid2taxid{$seqid};
+ @a = split (/,/, $seqid2taxid{$seqid}); # one SeqID may correspond to multiple TaxIDs
+ foreach (@a) {
+ next if exists $taxid2orgn{$_};
+ $taxid2orgn{$_} = '';
+ }
+ }
+ if (%taxid2orgn) {
+ @a = keys %taxid2orgn;
+ @b = get_taxonomy (@a);
+ for (my $i=0; $i $id2score{$a} } keys %id2score) {
+ %h = %{$ins{$set}{'prots'}[$id]{'hits'}[$i]};
+ next unless exists $seqid2taxid{$h{'name'}};
+ foreach my $taxid (split (/,/, $seqid2taxid{$h{'name'}})) {
+ next unless exists $taxid2orgn{$taxid};
+ my $organism = $taxid2orgn{$taxid};
+ if ($ins{$set}{'prots'}[$id]{'name'} eq $h{'name'}) { # self-align result is always retained
+ $isQueryIn = 1;
+ } else { # remove redundancy from other hits
+ next if $taxid2orgn{$taxid} eq 'na';
+ next if exists $usedTaxids{$taxid};
+ if ($ignoreSubspecies) {
+ next unless exists $dbTaxa{$taxid};
+ next unless exists $dbTaxa{$taxid}{'species'};
+ my $species = $dbTaxa{$taxid}{'species'};
+ next unless $species;
+ next if exists $usedSpecies{$species};
+ $usedSpecies{$species} = $dbRanks{$species};
+ }
+ $usedTaxids{$taxid} = 1;
+ }
+ my %hit = ('name'=>$h{'name'}, 'expect'=>$h{'evalue'}, 'score'=>$h{'bitscore'}, 'identity'=>$h{'pident'}, 'coverage'=>'*', 'taxid'=>$taxid, 'organism'=>$organism, 'sequence'=>$h{'sseq'});
+ if ($h{'qstart'} ne "*" and $h{'qend'} ne "*") { $hit{'coverage'} = sprintf("%.2f", ($h{'qend'}-$h{'qstart'}+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); }
+ push (@hits, {%hit});
+ last if (scalar @hits >= $nHits);
+ }
+ }
+ }
+
+ # perform self search, in case not targetted in previous steps #
+ unless ($isQueryIn) {
+ if (exists $ins{$set}{'prots'}[$id]{'selfalign'}) {
+ @a = split(/\//, $ins{$set}{'prots'}[$id]{'selfalign'});
+ } else {
+ @a = self_align ($name, $ins{$set}{'prots'}[$id]{'seq'});
+ }
+ if (@a != (0,0,0,0)) {
+ my %hit = ('name'=>$name, 'expect'=>$a[0], 'score'=>$a[1], 'identity'=>$a[2], 'coverage'=>'100.00', 'taxid'=>$ins{$set}{'taxid'}, 'organism'=>$ins{$set}{'organism'}, 'sequence'=>$ins{$set}{'prots'}[$id]{'seq'});
+ unshift (@hits, {%hit});
+ pop @hits if (scalar @hits > $nHits);
+ }
+ }
+
+ # output result
+ my ($ntax, $nchar) = (0, 0);
+ open (OUT, ">>$wkDir/search/$set/$name.txt");
+ print OUT "#NEXUS\nBEGIN QUERY;\n";
+ print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
+ print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
+ print OUT "\tTitle=".$ins{$set}{'prots'}[$id]{'title'}.";\n";
+ print OUT "END;\n\n";
+ print OUT "BEGIN ORGANISM;\n";
+ print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
+ for (my $i=0; $i $maxHits;
+ }
+ print OUT ";\nEND;\n\n";
+ if ($getAln) {
+ print OUT "BEGIN DATA;\n";
+ print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
+ for (my $i=0; $i $maxHits;
+ }
+ print OUT ";\nEND;\n\n";
+ }
+ close OUT;
+ open LOG, ">>$wkDir/search/$set.log";
+ print LOG "$name\t$ntax\n";
+ close LOG;
+ }
+ print " done.\n";
+ return 0;
}
@@ -1140,446 +1112,441 @@ sub local_search {
# note: the Perl thread join function has some problems, that why I used this inconvenient way.
sub http_blast{
- my ($set, $id) = ($_[0], $_[1]);
-
- # generate query information
- my %self = %{$ins{$set}{'prots'}[$id]}; # copy the whole record
- $self{'length'} = 0; # need to know length
- $self{'length'} = length($self{'seq'}) if $self{'seq'}; # ideally, sequence is available, otherwise see below
- my $query = $self{'name'};
-
- # send BLAST request #
- my $isError = 0;
- my $url = "$blastServer?CMD=Put&PROGRAM=blastp&DATABASE=$httpDb&FILTER=m S";
- $url .= "&EXPECT=$evalue" if $evalue;
- $url .= "&MAX_NUM_SEQ=$nHits" if ($nHits != 100);
- $url .= "&EQ_TEXT=$eqText" if $eqText;
- my $querykey = $self{'name'};
- $querykey = $self{'seq'} if $self{'seq'}; # user-defined query sequence
- $url .= "&QUERY=$querykey";
- $s = get $url;
-
- # $url = substr($args,0,2048) if (length($args) > 2048); # In some situations, the max size of a URL is 2048 bytes
- # @a = localtime(time); print "Post: $a[2]:$a[1]:$a[0], ";
- my $starttime = time;
-
- my $rid;
- # @a = localtime(time); print "Respond: $a[2]:$a[1]:$a[0], ";
- if ($s =~ /^ RID = (.*$)/m){ $rid = $1; print "RID=$rid "; }
- else{ return "$query/$set/0/0"; } ##########################
- if ($s =~ /\s\((\d+) letters\)/){ $self{'length'} = $1; }
- if ($s =~ /^ RTOE = (.*$)/m){
- my $rtoe = $1;
- # print "RTOE=$rtoe, ";
- if ($rtoe and $rtoe =~ /^\d+$/){ sleep $rtoe; }else{ sleep $delay; }
- }else{ return "$query/$set/0/0"; }
- while (1){
- sleep 1;
- $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
- if ($s =~ /\s+Status=([A-Z]+)/m){
- if ($1 eq "WAITING"){
- if (time - $starttime > $timeout){
- $isError = 1; last;
- }else{
- print ". "; sleep $delay; next;
- }
- }
- elsif ($1 eq "FAILED"){ $isError = 1; last; }
- elsif ($1 eq "UNKNOWN"){ $isError = 1; last; }
- elsif ($1 eq "READY"){ @a = localtime(time); last; }
- else{ $isError = 1; last; }
- }else{
- $isError = 1; last;
- }
-
- # if ($s =~ /\s+Status=WAITING/m){ next; }
- # if ($s =~ /\s+Status=FAILED/m){ $isError = 1; last; }
- # if ($s =~ /\s+Status=UNKNOWN/m){ $isError = 1; last; }
- # if ($s =~ /\s+Status=READY/m){
- # if ($s =~ /\s+ThereAreHits=yes/m){ last; }
- # else{ last; } # no hits;
- # }
- }
- if ($isError){ return "$query/$set/0/0"; }
-
- # retrieve tabular report #
- sleep 1;
- $url = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
- $url .= "&MAX_NUM_SEQ=$nHits&DESCRIPTIONS=$nHits" if ($nHits != 100);
- $s = get $url;
- if (($s !~ /# blastp/) or ($s !~ /# Query:\s/)){ return "$query/$set/0/0"; }
- my %hits = ();
- my $hitid = 0;
- foreach (split(/\n/, $s)){
- if (/^# Query/ and not $self{'seq'}){
- s/>.*//;
- @a = split (/\|/);
- $self{'gi'} = $a[1];
- $self{'accession'} = $a[3];
- $a[4] =~ /^\s*(.+\S)\s*\[(.+)\]/;
- $self{'product'} = $1;
- $self{'organism'} = $2;
- }
- next if (/^#/);
- @a = split (/\t/);
- next if ($#a < 12);
- next unless $a[12];
- next if ($identity and $identity > $a[2]); # % identity cutoff
- @b = split (/;/, $a[1]);
- $hitid ++;
- foreach (@b){
- next unless (/\|$/);
- @c = split (/\|/);
- $c[3] =~ s/\.\d+$//;
- my %hit = ('id', $hitid, 'accn', $c[3], 'expect', $a[11], 'score', $a[12], 'identity', $a[2], 'qlength', $a[8]-$a[7]+1);
- $hits{$c[1]} = {%hit};
- }
- }
-
- # retrieve taxonomy information #
- my $i = 0; # count
- @a = (); # all results
- @b = (); # subset of GIs
- foreach (keys %hits){
- $i ++;
- push (@b, $_);
- if ($i == 190){ # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
- sleep 1;
- while (1){
- $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
- last if (defined $s);
- sleep $delay;
- }
- push (@a, $1) while ($s =~ s/(.+?)<\/DocSum>//s);
- $i = 0; @b = ();
- }
- }
- if (@b){
- sleep 1;
- while (1){
- $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
- last if (defined $s);
- sleep $delay;
- }
- push (@a, $1) while ($s =~ s/(.+?)<\/DocSum>//s);
- }
-
- foreach (@a){
- /(\d+)<\/Id>/;
- $s = $1;
- /
- (\d+?)<\/Item>/;
- $hits{$s}{'taxid'} = $1;
- /
- .*\[([^\[\]]+?)\]<\/Item>/;
- $hits{$s}{'organism'} = $1;
- /
- (\d+)<\/Item>/;
- $hits{$s}{'length'} = $1;
- }
-
- # discard hits whose taxonomy information is unidentified #
- foreach (keys %hits){
- delete $hits{$_} and next unless exists $hits{$_}{'taxid'};
- delete $hits{$_} unless $hits{$_}{'taxid'};
- delete $hits{$_} and next unless exists $hits{$_}{'organism'};
- delete $hits{$_} unless $hits{$_}{'organism'};
- delete $hits{$_} if ($hits{$_}{'organism'} =~ /^Unresolved/);
- delete $hits{$_} if ($taxonUCASE and ($hits{$_}{'organism'} !~ /^[A-Z]/));
- }
-
- # discard hits whose organism names contain specified strings #
- if (@ignoreTaxa){
- foreach (keys %hits){
- foreach $s (@ignoreTaxa){
- if ($hits{$_}{'organism'} =~ /$s/){
- delete $hits{$_};
- last;
- }
- }
- }
- }
-
- # merge identical proteins (defined by NCBI) from one organism #
- $hitid = 0;
- my ($taxid, $key) = (0, '');
- foreach ( sort {$hits{$a}{'id'} <=> $hits{$b}{'id'}} keys %hits){
- if ($hits{$_}{'id'} != $hitid){
- $hitid = $hits{$_}{'id'};
- $taxid = $hits{$_}{'taxid'};
- $key = $_;
- }else{
- if ($hits{$_}{'taxid'} == $taxid){
- $hits{$key}{'accn'} .= "/".$hits{$_}{'accn'};
- delete $hits{$_};
- }
- }
- }
-
- # merge duplicated hits (proteins with same bit score from one organism) #
- if ($mergeDuplicates){
- my $score = 0;
- ($taxid, $key) = (0, '');
- foreach ( sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'taxid'} <=> $hits{$b}{'taxid'})} keys %hits){
- if (($hits{$_}{'score'} != $score) or ($hits{$_}{'taxid'} != $taxid)){
- $score = $hits{$_}{'score'};
- $taxid = $hits{$_}{'taxid'};
- $key = $_;
- }else{
- $hits{$key}{'accn'} .= "/".$hits{$_}{'accn'};
- delete $hits{$_};
- }
- }
- }
-
- # reorder accession numbers #
- foreach (keys %hits){
- $hits{$_}{'accn'} = order_accns ($hits{$_}{'accn'}, $query);
- }
-
- # retrieve taxonomy information from NCBI server #
- $i = 0;
- @a = ();
- foreach (keys %hits){
- next if exists $dbTaxa{$hits{$_}{'taxid'}};
- push @a, $hits{$_}{'taxid'};
- $i ++;
- if ($i == 150){
- sleep 1;
- get_taxonomy @a;
- $i = 0; @a = (); sleep $delay;
- }
- }
- sleep 1;
- get_taxonomy @a if @a;
-
- # check whether BLAST result contains query itself #
- my $isQueryIn = 0;
- foreach (keys %hits){
- @a = split (/\//, $hits{$_}{'accn'});
- foreach $s (@a){ if ($query eq $s){ $isQueryIn = $hits{$_}{'id'}; last; }}
- if ($isQueryIn){ $self{'length'} = $hits{$_}{'length'} unless $self{'length'}; last; }
- }
-
- # find out length of query sequence, in case not in previous steps #
- unless ($self{'length'} or $self{'seq'}){
- sleep 1;
- while (1){
- $s = get "$eSearchServer?db=protein&term=$query";
- last if (defined $s);
- sleep $delay;
- }
- $s =~ /(\d+)<\/Id>/;
- sleep 1;
- while (1){
- $s = get "$eSummaryServer?db=protein&id=$1";
- last if (defined $s);
- sleep $delay;
- }
- $s =~ /
- (\d+)<\/Item>/;
- $self{'length'} = $1;
- }
- next unless $self{'length'};
- $hits{$_}{'coverage'} = sprintf("%.2f", $hits{$_}{'qlength'}/$self{'length'}*100) for (keys %hits);
-
- # perform self search, in case not targetted in previous steps #
- unless ($isQueryIn){
- sleep 1;
- @a = self_align ($query, '');
- return "$query/$set/0/0" if (@a == (0,0,0,0));
- my %hit = ('id', 0, 'accn', $query, 'expect', $a[0], 'score', $a[1], 'identity', $a[2], 'coverage', '100.00', 'taxid', $ins{$set}{'taxid'}, 'organism', $ins{$set}{'organism'});
- if ($self{'length'}){ $hit{'length'} = $self{'length'}; }
- elsif ($a[3]){ $hit{'length'} = $a[3]; }
- else{ $hit{'length'} = 0; }
- $hits{$query} = {%hit};
- }
-
- # in multiple hits from one species, only keep the hit with the highest bit score #
- if ($ignoreSubspecies){
- my %speciesDB = ();
- foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
- unless (exists $hits{$_}{'taxid'} and exists $dbTaxa{$hits{$_}{'taxid'}} and exists $dbTaxa{$hits{$_}{'taxid'}}{'species'} and $dbTaxa{$hits{$_}{'taxid'}}{'species'}){
- delete $hits{$_}; next;
- }
- $s = $dbTaxa{$hits{$_}{'taxid'}}{'species'};
- unless (exists $speciesDB{$s}){
- $speciesDB{$s} = $dbRanks{$s};
- }else{
- next if ($hits{$_}{'taxid'} == $ins{$set}{'taxid'});
- next if (($hits{$_}{'accn'} eq $query) or ($hits{$_}{'accn'} =~ /^$query\//) or ($hits{$_}{'accn'} =~ /\/$query$/) or ($hits{$_}{'accn'} =~ /\/$query\//));
- delete $hits{$_};
- }
- }
- }
-
- # remove excessive hits #
- if ($maxHits){
- my $i = 0;
- foreach (sort {$hits{$b}{'score'} <=> $hits{$a}{'score'}} keys %hits){
- $i ++;
- delete $hits{$_} if ($i > $maxHits);
- }
- }
-
- # create output file #
- open (OUT, ">$wkDir/search/$set/$query.txt");
-
- print OUT "#NEXUS\nBEGIN QUERY;\n";
- if ($self{'accn'}){
- print OUT "\tGI=".$self{'gi'}.";\n\tAccession=".$self{'accn'}.";\n";
- }else{
- print OUT "\tName=".$self{'name'}.";\n";
- }
- print OUT "\tLength=".$self{'length'}.";\n";
- if ($self{'product'}){
- print OUT "\tProduct=".$self{'product'}.";\n";
- }
- print OUT "\tOrganism=".$ins{$set}{'organism'}.";\n";
- print OUT "END;\n\n";
-
- # retrieve taxonomy report (using TaxBLAST)
- if ($taxBlast){
- sleep 1;
- $s = get "$blastServer?CMD=Get&FORMAT_TYPE=HTML&FORMAT_OBJECT=TaxBlast&RID=$rid&ALIGNMENTS=100000";
- if (($s !~ /Tax BLAST Report/) or ($s !~ /Lineage Report/)){
- print OUT "BEGIN ERROR;\nRetrieval of taxonomy report failed.\n;\nEND;\n\n";
- }else{
- my $reading = 0;
- foreach (split(/\n/, $s)){
- if ($_ eq "Lineage Report
"){
- print OUT "BEGIN LINEAGE;\n";
- $reading = 1; next;
- }
- if (($_ eq "
") and $reading){
- print OUT ";\nEND;\n\n";
- $reading = 0; next;
- }
- if ($reading){
- s/\<.*?>//g;
- s/( hit[ s] \[.*?\])(.*)$/$1/;
- if (@ignoreTaxa){
- $i = 0;
- foreach $s (@ignoreTaxa){
- if (/$s/){ $i = 1; last; }
- }
- next if $i;
- }
- print OUT "$_\n";
- next;
- }
- }
- }
- }
-
- # output hit table #
-
- print OUT "BEGIN ORGANISM;\n";
- print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
- foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
- print OUT $hits{$_}{'accn'}."\t".$hits{$_}{'organism'}."\t".$hits{$_}{'taxid'}."\t".$hits{$_}{'score'}."\t".$hits{$_}{'expect'}."\t".$hits{$_}{'identity'}."\t".$hits{$_}{'coverage'}."\n";
- }
- print OUT ";\nEND;\n\n";
-
- # retrieve hit sequences #
- if ($seqBlast){
- my $i = 0; # count
- my $allinfo = "";
- @b = (); # subset of GIs
- foreach (keys %hits){
- $i ++;
- push (@b, $_);
- if ($i == 190){ # a URI should not exceed ~2000 characters, which is approximately 190 GIs.
- sleep 1;
- while (1){
- $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
- last if (defined $s);
- sleep $delay;
- }
- $allinfo .= $s;
- $i = 0; @b = ();
- }
- }
- if (@b){
- sleep 1;
- while (1){
- $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
- last if (defined $s);
- sleep $delay;
- }
- $allinfo .= $s;
- }
- $i = 0; # current GI
- foreach (split (/\n/, $allinfo)){
- next unless $_;
- if (/^>gi\|(\d+)\|/){
- $i = $1;
- $hits{$i}{'sequence'} = "";
- }else{
- $hits{$i}{'sequence'} .= $_ if $i;
- }
- }
- }
-
- # retrieve multiple sequence alignment (conflicts seqBlast)
- if ($alnBlast and not $seqBlast){
- sleep 1;
- $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=FlatQueryAnchoredNoIdentities&FORMAT_TYPE=Text&RID=$rid&ALIGNMENTS=100000";
- if (($s !~ /blastp/i) or ($s !~ /\nQuery=\s/)){
- print OUT "BEGIN ERROR;\nRetrieval of multiple sequence alignment failed.\n;\nEND;\n\n";
- }else{
- @c = split(/\n/, $s);
- my $reading = 0; # reading status
- my $iBlock = -1; # block ID
- my %seqs; # sequence alignment
- foreach (@c){
- if ($_ eq "ALIGNMENTS"){ $reading = 1; next; }
- if ($reading and /^\s/){ $reading = 0; last; }
- next unless $reading;
- unless ($_){ $iBlock ++; next; } # Start a new block if empty line
- @a = split(/\s+/,substr($_,0,19)); # id
- #next if ($a[0] eq "Query");
- $_ =~ /(\s\s\d*$)/;
- $s = substr($_,19,length($_)-19-length($1)); # sequence
- $s =~ s/\s/-/g;
- $seqs{$a[0]} = "-"x($iBlock*60) unless (exists $seqs{$a[0]}); # add new sequence
- $seqs{$a[0]} .= $s; # add new sequence
- }
- $i = 0;
- foreach $s(keys %seqs){
- $i = length($seqs{$s}) if (length($seqs{$s}) > $i);
- }
- foreach $s(keys %seqs){
- $seqs{$s} .= "-"x($i - length($seqs{$s})) if (length($seqs{$s}) < $i);
- }
- foreach my $hit (keys %hits){
- @a = split /\//, $hits{$hit}{'accn'};
- foreach (@a){
- if (exists $seqs{$_}){
- $hits{$hit}{'sequence'} = $seqs{$_};
- last;
- }
- }
- }
- }
- }
-
- # output sequences #
- if ($seqBlast or $alnBlast){
- my $ntax = 0; # number of sequences
- my $nchar = 0; # maximum length of sequence
- foreach (keys %hits){
- next unless exists $hits{$_}{'sequence'};
- $ntax ++;
- $nchar = length ($hits{$_}{'sequence'}) if (length ($hits{$_}{'sequence'}) > $nchar);
- }
- print OUT "BEGIN DATA;\n";
- print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
- foreach (sort {($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
- next unless exists $hits{$_}{'sequence'};
- @a = split (/\//, $hits{$_}{'accn'});
- print OUT $a[0]."\t".$hits{$_}{'sequence'}."\n";
- }
- print OUT ";\nEND;\n\n";
- }
- close OUT;
- return "$query/$set/".scalar(keys %hits)."/1";
+ my ($set, $id) = ($_[0], $_[1]);
+
+ # generate query information
+ my %self = %{$ins{$set}{'prots'}[$id]}; # copy the whole record
+ $self{'length'} = 0; # need to know length
+ $self{'length'} = length($self{'seq'}) if $self{'seq'}; # ideally, sequence is available, otherwise see below
+ my $query = $self{'name'};
+
+ # send BLAST request #
+ my $isError = 0;
+ my $url = "$blastServer?CMD=Put&PROGRAM=blastp&DATABASE=$httpDb&FILTER=m%20S";
+ $url .= "&EXPECT=$evalue" if $evalue;
+ $url .= "&MAX_NUM_SEQ=$nHits" if ($nHits != 100);
+ $url .= "&EQ_TEXT=$eqText" if $eqText;
+ my $querykey = $self{'name'};
+ $querykey = $self{'seq'} if $self{'seq'}; # user-defined query sequence
+ $url .= "&QUERY=$querykey";
+ $s = get $url;
+
+ # $url = substr($args,0,2048) if (length($args) > 2048); # In some situations, the max size of a URL is 2048 bytes
+ # @a = localtime(time); print "Post: $a[2]:$a[1]:$a[0], ";
+ my $starttime = time;
+
+ my $rid;
+ # @a = localtime(time); print "Respond: $a[2]:$a[1]:$a[0], ";
+ if ($s =~ /^ RID = (.*$)/m) { $rid = $1; print "RID=$rid "; }
+ else{ return "$query/$set/0/0"; } ##########################
+ if ($s =~ /\s\((\d+) letters\)/) { $self{'length'} = $1; }
+ if ($s =~ /^ RTOE = (.*$)/m) {
+ my $rtoe = $1;
+ # print "RTOE=$rtoe, ";
+ if ($rtoe and $rtoe =~ /^\d+$/) { sleep $rtoe; } else { sleep $delay; }
+ } else { return "$query/$set/0/0"; }
+ while (1) {
+ sleep 1;
+ $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+ if ($s =~ /\s+Status=([A-Z]+)/m) {
+ if ($1 eq "WAITING") {
+ if (time - $starttime > $timeout) {
+ $isError = 1; last;
+ } else {
+ print ". "; sleep $delay; next;
+ }
+ }
+ elsif ($1 eq "FAILED") { $isError = 1; last; }
+ elsif ($1 eq "UNKNOWN") { $isError = 1; last; }
+ elsif ($1 eq "READY") { @a = localtime(time); last; }
+ else{ $isError = 1; last; }
+ } else {
+ $isError = 1; last;
+ }
+
+ # if ($s =~ /\s+Status=WAITING/m) { next; }
+ # if ($s =~ /\s+Status=FAILED/m) { $isError = 1; last; }
+ # if ($s =~ /\s+Status=UNKNOWN/m) { $isError = 1; last; }
+ # if ($s =~ /\s+Status=READY/m) {
+ # if ($s =~ /\s+ThereAreHits=yes/m) { last; }
+ # else{ last; } # no hits;
+ # }
+ }
+ if ($isError) { return "$query/$set/0/0"; }
+
+ # retrieve tabular report #
+ sleep 1;
+ $url = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+ $url .= "&MAX_NUM_SEQ=$nHits&DESCRIPTIONS=$nHits" if ($nHits != 100);
+ $s = get $url;
+ if (($s !~ /# blastp/) or ($s !~ /# Query:\s/)) { return "$query/$set/0/0"; }
+ my %hits = ();
+ my $hitIdx = 0;
+ foreach (split(/\n/, $s)) {
+ if (/^#/) {
+ if (s/^# Query: // and not $self{'seq'}) {
+ # s/>.*//;
+ @a = split(/\s+/, $_, 2);
+ $self{'title'} = $a[1] if $#a;
+ }
+ } else {
+ # fields: query id, subject ids, % identity, % positives, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
+ @a = split (/\t/);
+ next if ($#a < 12);
+ next if $identity and $identity > $a[2]; # % identity cutoff
+ $hitIdx ++;
+ foreach (split (/;/, $a[1])) {
+ next unless /\|$/;
+ # as of 2017, the NCBI BLAST server still prints sequence ids like "gi|123456789|ref|NP_123456.1|"
+ my $gi = (split(/\|/))[1];
+ my $name = (split(/\|/))[-1];
+ # length of aligned region of query sequence = q. end - q. start + 1
+ my $qlen = $a[8] - $a[7] + 1;
+ my %hit = ('id'=>$hitIdx, 'name'=>$name, 'expect'=>$a[11], 'score'=>$a[12], 'identity'=>$a[2], 'qlen'=>$qlen);
+ $hits{$gi} = {%hit};
+ }
+ }
+ }
+
+ # retrieve taxonomy information #
+ my $i = 0; # count
+ @a = (); # all results
+ @b = (); # subset of names
+ foreach (keys %hits) {
+ $i ++;
+ push (@b, $_);
+ if ($i == 190) { # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
+ sleep 1;
+ while (1) {
+ $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
+ last if (defined $s);
+ sleep $delay;
+ }
+ push (@a, $1) while ($s =~ s/(.+?)<\/DocSum>//s);
+ $i = 0; @b = ();
+ }
+ }
+ if (@b) {
+ sleep 1;
+ while (1) {
+ $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
+ last if (defined $s);
+ sleep $delay;
+ }
+ push (@a, $1) while ($s =~ s/(.+?)<\/DocSum>//s);
+ }
+
+ foreach (@a) {
+ # as of 2017, this Id is still the GI number
+ /(\d+)<\/Id>/;
+ $s = $1;
+ /- (\d+?)<\/Item>/;
+ $hits{$s}{'taxid'} = $1;
+ /
- .*\[([^\[\]]+?)\]<\/Item>/;
+ $hits{$s}{'organism'} = $1;
+ /
- (\d+)<\/Item>/;
+ $hits{$s}{'length'} = $1;
+ }
+
+ # discard hits whose taxonomy information is unidentified #
+ foreach (keys %hits) {
+ delete $hits{$_} and next unless exists $hits{$_}{'taxid'};
+ delete $hits{$_} unless $hits{$_}{'taxid'};
+ delete $hits{$_} and next unless exists $hits{$_}{'organism'};
+ delete $hits{$_} unless $hits{$_}{'organism'};
+ delete $hits{$_} if ($hits{$_}{'organism'} =~ /^Unresolved/);
+ delete $hits{$_} if ($taxonUCASE and ($hits{$_}{'organism'} !~ /^[A-Z]/));
+ }
+
+ # discard hits whose organism names contain specified strings #
+ if (@ignoreTaxa) {
+ foreach (keys %hits) {
+ foreach $s (@ignoreTaxa) {
+ if ($hits{$_}{'organism'} =~ /$s/) {
+ delete $hits{$_};
+ last;
+ }
+ }
+ }
+ }
+
+ # merge identical proteins (defined by NCBI) from one organism #
+ $hitIdx = 0;
+ my ($taxid, $key) = (0, '');
+ foreach ( sort {$hits{$a}{'id'} <=> $hits{$b}{'id'}} keys %hits) {
+ if ($hits{$_}{'id'} != $hitIdx) {
+ $hitIdx = $hits{$_}{'id'};
+ $taxid = $hits{$_}{'taxid'};
+ $key = $_;
+ } else {
+ if ($hits{$_}{'taxid'} == $taxid) {
+ $hits{$key}{'name'} .= "/".$hits{$_}{'name'};
+ delete $hits{$_};
+ }
+ }
+ }
+
+ # merge duplicated hits (proteins with same bit score from one organism) #
+ if ($mergeDuplicates) {
+ my $score = 0;
+ ($taxid, $key) = (0, '');
+ foreach ( sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'taxid'} <=> $hits{$b}{'taxid'})} keys %hits) {
+ if (($hits{$_}{'score'} != $score) or ($hits{$_}{'taxid'} != $taxid)) {
+ $score = $hits{$_}{'score'};
+ $taxid = $hits{$_}{'taxid'};
+ $key = $_;
+ } else {
+ $hits{$key}{'name'} .= "/".$hits{$_}{'name'};
+ delete $hits{$_};
+ }
+ }
+ }
+
+ # reorder accession numbers #
+ foreach (keys %hits) {
+ $hits{$_}{'name'} = order_accns ($hits{$_}{'name'}, $query);
+ }
+
+ # retrieve taxonomy information from NCBI server #
+ $i = 0;
+ @a = ();
+ foreach (keys %hits) {
+ next if exists $dbTaxa{$hits{$_}{'taxid'}};
+ push @a, $hits{$_}{'taxid'};
+ $i ++;
+ if ($i == 150) {
+ sleep 1;
+ get_taxonomy @a;
+ $i = 0; @a = (); sleep $delay;
+ }
+ }
+ sleep 1;
+ get_taxonomy @a if @a;
+
+ # check whether BLAST result contains query itself #
+ my $isQueryIn = 0;
+ foreach (keys %hits) {
+ @a = split (/\//, $hits{$_}{'name'});
+ foreach $s (@a) { if ($query eq $s) { $isQueryIn = $hits{$_}{'id'}; last; }}
+ if ($isQueryIn) { $self{'length'} = $hits{$_}{'length'} unless $self{'length'}; last; }
+ }
+
+ # find out length of query sequence, in case not in previous steps #
+ unless ($self{'length'} or $self{'seq'}) {
+ sleep 1;
+ while (1) {
+ $s = get "$eSearchServer?db=protein&term=$query";
+ last if (defined $s);
+ sleep $delay;
+ }
+ # as of 2017, this Id is still the GI number
+ $s =~ /(\d+)<\/Id>/;
+ sleep 1;
+ while (1) {
+ $s = get "$eSummaryServer?db=protein&id=$1";
+ last if (defined $s);
+ sleep $delay;
+ }
+ $s =~ /
- (\d+)<\/Item>/;
+ $self{'length'} = $1;
+ }
+ next unless $self{'length'};
+ $hits{$_}{'coverage'} = sprintf("%.2f", $hits{$_}{'qlen'}/$self{'length'}*100) for (keys %hits);
+
+ # perform self search, in case not targetted in previous steps #
+ unless ($isQueryIn) {
+ sleep 1;
+ @a = self_align ($query, '');
+ return "$query/$set/0/0" if (@a == (0,0,0,0));
+ my %hit = ('id', 0, 'name', $query, 'expect', $a[0], 'score', $a[1], 'identity', $a[2], 'coverage', '100.00', 'taxid', $ins{$set}{'taxid'}, 'organism', $ins{$set}{'organism'});
+ if ($self{'length'}) { $hit{'length'} = $self{'length'}; }
+ elsif ($a[3]) { $hit{'length'} = $a[3]; }
+ else{ $hit{'length'} = 0; }
+ $hits{$query} = {%hit};
+ }
+
+ # in multiple hits from one species, only keep the hit with the highest bit score #
+ if ($ignoreSubspecies) {
+ my %speciesDB = ();
+ foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+ unless (exists $hits{$_}{'taxid'} and exists $dbTaxa{$hits{$_}{'taxid'}} and exists $dbTaxa{$hits{$_}{'taxid'}}{'species'} and $dbTaxa{$hits{$_}{'taxid'}}{'species'}) {
+ delete $hits{$_}; next;
+ }
+ $s = $dbTaxa{$hits{$_}{'taxid'}}{'species'};
+ unless (exists $speciesDB{$s}) {
+ $speciesDB{$s} = $dbRanks{$s};
+ } else {
+ next if ($hits{$_}{'taxid'} == $ins{$set}{'taxid'});
+ next if (($hits{$_}{'name'} eq $query) or ($hits{$_}{'name'} =~ /^$query\//) or ($hits{$_}{'name'} =~ /\/$query$/) or ($hits{$_}{'name'} =~ /\/$query\//));
+ delete $hits{$_};
+ }
+ }
+ }
+
+ # remove excessive hits #
+ if ($maxHits) {
+ my $i = 0;
+ foreach (sort {$hits{$b}{'score'} <=> $hits{$a}{'score'}} keys %hits) {
+ $i ++;
+ delete $hits{$_} if ($i > $maxHits);
+ }
+ }
+
+ # create output file #
+ open (OUT, ">$wkDir/search/$set/$query.txt");
+
+ print OUT "#NEXUS\nBEGIN QUERY;\n";
+ print OUT "\tName=".$self{'name'}.";\n";
+ print OUT "\tLength=".$self{'length'}.";\n";
+ print OUT "\tTitle=".$self{'title'}.";\n";
+ print OUT "END;\n\n";
+
+ # retrieve taxonomy report (using TaxBLAST)
+ if ($taxBlast) {
+ sleep 1;
+ $s = get "$blastServer?CMD=Get&FORMAT_TYPE=HTML&FORMAT_OBJECT=TaxBlast&RID=$rid&ALIGNMENTS=100000";
+ if (($s !~ /Tax BLAST Report/) or ($s !~ /Lineage Report/)) {
+ print OUT "BEGIN ERROR;\nRetrieval of taxonomy report failed.\n;\nEND;\n\n";
+ } else {
+ my $reading = 0;
+ foreach (split(/\n/, $s)) {
+ if ($_ eq "Lineage Report
") {
+ print OUT "BEGIN LINEAGE;\n";
+ $reading = 1; next;
+ }
+ if (($_ eq "
") and $reading) {
+ print OUT ";\nEND;\n\n";
+ $reading = 0; next;
+ }
+ if ($reading) {
+ s/\<.*?>//g;
+ s/( hit[ s] \[.*?\])(.*)$/$1/;
+ if (@ignoreTaxa) {
+ $i = 0;
+ foreach $s (@ignoreTaxa) {
+ if (/$s/) { $i = 1; last; }
+ }
+ next if $i;
+ }
+ print OUT "$_\n";
+ next;
+ }
+ }
+ }
+ }
+
+ # output hit table #
+
+ print OUT "BEGIN ORGANISM;\n";
+ print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
+ foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+ print OUT $hits{$_}{'name'}."\t".$hits{$_}{'organism'}."\t".$hits{$_}{'taxid'}."\t".$hits{$_}{'score'}."\t".$hits{$_}{'expect'}."\t".$hits{$_}{'identity'}."\t".$hits{$_}{'coverage'}."\n";
+ }
+ print OUT ";\nEND;\n\n";
+
+ # retrieve hit sequences #
+ if ($seqBlast) {
+ my $i = 0; # count
+ my $allinfo = "";
+ @b = (); # subset of GIs
+ foreach (keys %hits) {
+ $i ++;
+ push (@b, $_);
+ if ($i == 190) { # a URI should not exceed ~2000 characters, which is approximately 190 GIs.
+ sleep 1;
+ while (1) {
+ $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
+ last if (defined $s);
+ sleep $delay;
+ }
+ $allinfo .= $s;
+ $i = 0; @b = ();
+ }
+ }
+ if (@b) {
+ sleep 1;
+ while (1) {
+ $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
+ last if (defined $s);
+ sleep $delay;
+ }
+ $allinfo .= $s;
+ }
+ $i = 0; # current GI
+ foreach (split (/\n/, $allinfo)) {
+ next unless $_;
+ if (/^>gi\|(\d+)\|/) {
+ $i = $1;
+ $hits{$i}{'sequence'} = "";
+ } else {
+ $hits{$i}{'sequence'} .= $_ if $i;
+ }
+ }
+ }
+
+ # retrieve multiple sequence alignment (conflicts seqBlast)
+ if ($alnBlast and not $seqBlast) {
+ sleep 1;
+ $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=FlatQueryAnchoredNoIdentities&FORMAT_TYPE=Text&RID=$rid&ALIGNMENTS=100000";
+ if (($s !~ /blastp/i) or ($s !~ /\nQuery=\s/)) {
+ print OUT "BEGIN ERROR;\nRetrieval of multiple sequence alignment failed.\n;\nEND;\n\n";
+ } else {
+ @c = split(/\n/, $s);
+ my $reading = 0; # reading status
+ my $iBlock = -1; # block ID
+ my %seqs; # sequence alignment
+ foreach (@c) {
+ if ($_ eq "ALIGNMENTS") { $reading = 1; next; }
+ if ($reading and /^\s/) { $reading = 0; last; }
+ next unless $reading;
+ unless ($_) { $iBlock ++; next; } # Start a new block if empty line
+ @a = split(/\s+/,substr($_,0,19)); # id
+ #next if ($a[0] eq "Query");
+ $_ =~ /(\s\s\d*$)/;
+ $s = substr($_,19,length($_)-19-length($1)); # sequence
+ $s =~ s/\s/-/g;
+ $seqs{$a[0]} = "-"x($iBlock*60) unless (exists $seqs{$a[0]}); # add new sequence
+ $seqs{$a[0]} .= $s; # add new sequence
+ }
+ $i = 0;
+ foreach $s(keys %seqs) {
+ $i = length($seqs{$s}) if (length($seqs{$s}) > $i);
+ }
+ foreach $s(keys %seqs) {
+ $seqs{$s} .= "-"x($i - length($seqs{$s})) if (length($seqs{$s}) < $i);
+ }
+ foreach my $hit (keys %hits) {
+ @a = split /\//, $hits{$hit}{'name'};
+ foreach (@a) {
+ if (exists $seqs{$_}) {
+ $hits{$hit}{'sequence'} = $seqs{$_};
+ last;
+ }
+ }
+ }
+ }
+ }
+
+ # output sequences #
+ if ($seqBlast or $alnBlast) {
+ my $ntax = 0; # number of sequences
+ my $nchar = 0; # maximum length of sequence
+ foreach (keys %hits) {
+ next unless exists $hits{$_}{'sequence'};
+ $ntax ++;
+ $nchar = length ($hits{$_}{'sequence'}) if (length ($hits{$_}{'sequence'}) > $nchar);
+ }
+ print OUT "BEGIN DATA;\n";
+ print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
+ foreach (sort {($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+ next unless exists $hits{$_}{'sequence'};
+ @a = split (/\//, $hits{$_}{'name'});
+ print OUT $a[0]."\t".$hits{$_}{'sequence'}."\n";
+ }
+ print OUT ";\nEND;\n\n";
+ }
+ close OUT;
+ return "$query/$set/".scalar(keys %hits)."/1";
}
@@ -1589,91 +1556,91 @@ sub http_blast{
# return (0, 0, 0, 0) if failed
sub self_align {
- my ($name, $seq, $length) = ($_[0], $_[1], 0);
- $length = length($seq) if $seq;
- unless ($httpBlast){ # local mode
- my $fail = 0;
- my @result = ();
- open OUT, ">$wkDir/tmp.in";
- print OUT ">$name\n".$seq."\n";
- close OUT;
- if ($searchTool eq "BLAST"){
- my @out = `$blastp -query $wkDir/tmp.in -subject $wkDir/tmp.in -outfmt \"6 evalue bitscore pident\"`;
- unlink "$wkDir/tmp.in";
- @a = split (/\t/, $out[0]);
- if ($#a == 2){
- $a[2] =~ s/\s+$//;
- @result = ($a[0], $a[1], $a[2], $length);
- }else{ $fail = 1; }
- }elsif ($searchTool eq "RAPSearch2"){
- `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
- `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp`;
- unlink "$wkDir/tmp.in";
- unlink "$wkDir/raptmp";
- unlink "$wkDir/raptmp.info";
- if (-s "$wkDir/tmp.m8"){
- open IN, "<$wkDir/tmp.m8";
- while (){
- s/\s+$//; next unless $_; next if /^#/;
- @a = split (/\t/);
- if ($#a < 11){ $fail = 1; last; }
- @result = ($a[10], $a[11], $a[2], $length);
- last;
- }
- close IN;
- unlink "$wkDir/tmp.aln";
- unlink "$wkDir/tmp.m8";
- }else{ $fail = 1; }
-
- }elsif ($searchTool eq "DIAMOND"){
- `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
- `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
- my @out = `$diamond view -a $wkDir/tmp.daa`;
- unlink "$wkDir/tmp.in";
- unlink "$wkDir/tmp.dmnd";
- unlink "$wkDir/tmp.daa";
- if (@out){
- @a = split (/\t/, $out[0]);
- if ($#a == 11){
- $a[11] =~ s/\s+$//;
- @result = ($a[10], $a[11], $a[2], $length);
- }else{ $fail = 1; }
- }else{ $fail = 1; }
- }else{ $fail = 1; }
- if ($fail){
- unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
- return (0,0,0,0);
- }else{ return @result; }
-
- }else{ # http mode
- my $isError = 0;
- if ($seq){ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=".$seq."&SUBJECTS=".$seq; }
- else{ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
- return (0,0,0,0) unless (defined $s and $s =~ /^ RID = (.*$)/m);
- my $rid = $1;
- if ($s =~ /\s\((\d+) letters\)/ and not $length){ $length = $1; }
- sleep 1;
- while (1){
- $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
- return (0,0,0,0) unless defined $s and $s =~ /\s+Status=(.+)/m;
- if ($1 eq "WAITING"){ sleep $delay; next; }
- if ($1 eq "FAILED" or $1 eq "UNKNOWN"){ $isError = 1; last; }
- if ($1 eq "READY" and $s =~ /\s+ThereAreHits=yes/m){ last; }
- $isError = 1; last;
- }
- return (0,0,0,0) if $isError;
- sleep 1;
- $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
- return (0,0,0,0) unless (defined $s and $s =~ /# blastp/ and $s =~ /# Query:\s/);
- $s =~ /\(.*)\<\/PRE\>/s; $s = $1;
- foreach (split(/\n/, $s)){
- next unless $_;
- next if (/^#/);
- my @a = split (/\t/);
- $a[12] =~ s/^\s+//;
- return ($a[11], $a[12], $a[2], $length);
- }
- }
+ my ($name, $seq, $length) = ($_[0], $_[1], 0);
+ $length = length($seq) if $seq;
+ unless ($httpBlast) { # local mode
+ my $fail = 0;
+ my @result = ();
+ open OUT, ">$wkDir/tmp.in";
+ print OUT ">$name\n".$seq."\n";
+ close OUT;
+ if ($searchTool eq "BLAST") {
+ my @out = `$blastp -query $wkDir/tmp.in -subject $wkDir/tmp.in -outfmt \"6 evalue bitscore pident\"`;
+ unlink "$wkDir/tmp.in";
+ @a = split (/\t/, $out[0]);
+ if ($#a == 2) {
+ $a[2] =~ s/\s+$//;
+ @result = ($a[0], $a[1], $a[2], $length);
+ } else { $fail = 1; }
+ } elsif ($searchTool eq "RAPSearch2") {
+ `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
+ `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp`;
+ unlink "$wkDir/tmp.in";
+ unlink "$wkDir/raptmp";
+ unlink "$wkDir/raptmp.info";
+ if (-s "$wkDir/tmp.m8") {
+ open IN, "<$wkDir/tmp.m8";
+ while () {
+ s/\s+$//; next unless $_; next if /^#/;
+ @a = split (/\t/);
+ if ($#a < 11) { $fail = 1; last; }
+ @result = ($a[10], $a[11], $a[2], $length);
+ last;
+ }
+ close IN;
+ unlink "$wkDir/tmp.aln";
+ unlink "$wkDir/tmp.m8";
+ } else { $fail = 1; }
+
+ } elsif ($searchTool eq "DIAMOND") {
+ `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
+ `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
+ my @out = `$diamond view -a $wkDir/tmp.daa`;
+ unlink "$wkDir/tmp.in";
+ unlink "$wkDir/tmp.dmnd";
+ unlink "$wkDir/tmp.daa";
+ if (@out) {
+ @a = split (/\t/, $out[0]);
+ if ($#a == 11) {
+ $a[11] =~ s/\s+$//;
+ @result = ($a[10], $a[11], $a[2], $length);
+ } else { $fail = 1; }
+ } else { $fail = 1; }
+ } else { $fail = 1; }
+ if ($fail) {
+ unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
+ return (0,0,0,0);
+ } else { return @result; }
+
+ } else { # http mode
+ my $isError = 0;
+ if ($seq) { $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=".$seq."&SUBJECTS=".$seq; }
+ else{ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
+ return (0,0,0,0) unless (defined $s and $s =~ /^ RID = (.*$)/m);
+ my $rid = $1;
+ if ($s =~ /\s\((\d+) letters\)/ and not $length) { $length = $1; }
+ sleep 1;
+ while (1) {
+ $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+ return (0,0,0,0) unless defined $s and $s =~ /\s+Status=(.+)/m;
+ if ($1 eq "WAITING") { sleep $delay; next; }
+ if ($1 eq "FAILED" or $1 eq "UNKNOWN") { $isError = 1; last; }
+ if ($1 eq "READY" and $s =~ /\s+ThereAreHits=yes/m) { last; }
+ $isError = 1; last;
+ }
+ return (0,0,0,0) if $isError;
+ sleep 1;
+ $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+ return (0,0,0,0) unless (defined $s and $s =~ /# blastp/ and $s =~ /# Query:\s/);
+ $s =~ /\(.*)\<\/PRE\>/s; $s = $1;
+ foreach (split(/\n/, $s)) {
+ next unless $_;
+ next if (/^#/);
+ my @a = split (/\t/);
+ $a[12] =~ s/^\s+//;
+ return ($a[11], $a[12], $a[2], $length);
+ }
+ }
}
@@ -1683,181 +1650,156 @@ sub self_align {
# other: write global variable %dbTaxa and %dbRanks, write file taxa.db and ranks.db
sub get_taxonomy{
- my @organisms = ();
- my %taxa2w = (); # taxa to write
- my %ranks2w = (); # ranks to write
- my %hRanks = map { $_ => 1 } @ranks;
-
- unless ($httpBlast){ # local taxonomy database
- foreach my $taxid (@_){
- if (exists $dbTaxa{$taxid}){
- push (@organisms, $dbTaxa{$taxid}{'organism'});
- next;
- }
- if (exists $taxdumps{$taxid}){
- my $id = $taxid;
- my $pid = $taxdumps{$taxid}{'parent'};
- my $name = $taxdumps{$taxid}{'name'};
- my $rank = $taxdumps{$taxid}{'rank'};
- push (@organisms, $name);
- my %taxon :shared = ('organism', $name);
- $taxon{$_} = "" for (@ranks);
- $taxon{$rank} = $id if ($rank and exists $hRanks{$rank});
- while (1){
- last unless ($pid and $pid != $id);
- $name = $taxdumps{$pid}{'name'};
- $rank = $taxdumps{$pid}{'rank'};
- unless (exists $dbRanks{$pid}){
- $dbRanks{$pid} = $name;
- $ranks2w{$pid} = $name;
- }
- $taxon{'lineage'} .= "/$pid";
- $taxon{$rank} = $pid if (exists $hRanks{$rank});
- $id = $pid;
- $pid = $taxdumps{$id}{'parent'};
- }
- $dbTaxa{$taxid} = \%taxon;
- $taxa2w{$taxid} = {%taxon};
- }else{
- push (@organisms, "na");
- unless (exists $badTaxids{$taxid}){
- print " Warning: Invalid TaxID: $taxid.\n";
- $badTaxids{$taxid} = 1;
- open OUT, ">>$wkDir/taxonomy/invalid.taxids";
- print OUT $taxid."\n";
- close OUT;
- }
- }
- }
-
- }else{ # remote taxonomy database
- my $iRetry = 0;
- my @taxids2get = ();
- my %taxid2organism = ();
- foreach (@_){
- if (exists $dbTaxa{$_}){ $taxid2organism{$_} = $dbTaxa{$_}{'organism'}; }
- else{ push (@taxids2get, $_) }
- }
- sleep 1;
- while (1){
- $s = get "$eFetchServer?db=taxonomy&id=".join (",", @taxids2get);
- last if (defined $s);
- die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
- $iRetry ++; sleep $delay; next;
- }
- if ($s =~ /ID list is empty/){
- print "Warning: Invalid TaxIDs:", join (",", @taxids2get), ".\n";
- }else{
- $s =~ s///;
- while ($s =~ s/\n\s+(\d+)<\/TaxId>\s+(.+?)<\/ScientificName>(.+?)\n<\/Taxon>//s){
- my $id = $1; my $t = $3;
- $taxid2organism{$id} = $2;
- my %taxon :shared = ('organism', $2);
- $taxon{$_} = "" for (@ranks);
- while ($t =~ s/\s+(\d+)<\/TaxId>\s+(.+?)<\/ScientificName>\s+(.+?)<\/Rank>\s+<\/Taxon>//s){
- unless (exists $dbRanks{$1}){
- $dbRanks{$1} = $2;
- $ranks2w{$1} = $2;
- }
- $taxon{'lineage'} .= "/$1";
- $taxon{$3} = $1 if (exists $hRanks{$3});
- }
- if ($t =~ /<\/ParentTaxId>\s+(\S+)<\/Rank>/){ # the taxon itself is a rank
- foreach (@ranks){
- next if $taxon{$_};
- if ($1 eq $_){ $taxon{$_} = $id; last; }
- }
- }
- $dbTaxa{$id} = \%taxon;
- $taxa2w{$id} = {%taxon};
- }
- }
- foreach (@_){
- if (exists $taxid2organism{$_}){
- push (@organisms, $taxid2organism{$_});
- }else{
- push (@organisms, "na");
- print "Warning: Invalid TaxID: $_.\n";
- unless (exists $badTaxids{$_}){
- $badTaxids{$_} = 1;
- open OUT, ">>$wkDir/taxonomy/invalid.taxids";
- print OUT $_."\n";
- close OUT;
- }
- }
- }
- }
- if (%taxa2w){
- open OUT, ">>$wkDir/taxonomy/taxa.db";
- foreach my $id (keys %taxa2w){
- print OUT $id."\t".$taxa2w{$id}{'organism'}."\t".$taxa2w{$id}{'lineage'};
- print OUT "\t".$taxa2w{$id}{$_} for (@ranks);
- print OUT "\n";
- }
- close OUT;
- }
- if (%ranks2w){
- open OUT, ">>$wkDir/taxonomy/ranks.db";
- foreach my $id (keys %ranks2w){
- print OUT $id."\t".$ranks2w{$id}."\n";
- }
- close OUT;
- }
- return @organisms;
+ my @organisms = ();
+ my %taxa2w = (); # taxa to write
+ my %ranks2w = (); # ranks to write
+ my %hRanks = map { $_ => 1 } @ranks;
+
+ unless ($httpBlast) { # local taxonomy database
+ foreach my $taxid (@_) {
+ if (exists $dbTaxa{$taxid}) {
+ push (@organisms, $dbTaxa{$taxid}{'organism'});
+ next;
+ }
+ if (exists $taxdumps{$taxid}) {
+ my $id = $taxid;
+ my $pid = $taxdumps{$taxid}{'parent'};
+ my $name = $taxdumps{$taxid}{'name'};
+ my $rank = $taxdumps{$taxid}{'rank'};
+ push (@organisms, $name);
+ my %taxon :shared = ('organism', $name);
+ $taxon{$_} = "" for (@ranks);
+ $taxon{$rank} = $id if ($rank and exists $hRanks{$rank});
+ while (1) {
+ last unless ($pid and $pid != $id);
+ $name = $taxdumps{$pid}{'name'};
+ $rank = $taxdumps{$pid}{'rank'};
+ unless (exists $dbRanks{$pid}) {
+ $dbRanks{$pid} = $name;
+ $ranks2w{$pid} = $name;
+ }
+ $taxon{'lineage'} .= "/$pid";
+ $taxon{$rank} = $pid if (exists $hRanks{$rank});
+ $id = $pid;
+ $pid = $taxdumps{$id}{'parent'};
+ }
+ $dbTaxa{$taxid} = \%taxon;
+ $taxa2w{$taxid} = {%taxon};
+ } else {
+ push (@organisms, "na");
+ unless (exists $badTaxids{$taxid}) {
+ print " Warning: Invalid TaxID: $taxid.\n";
+ $badTaxids{$taxid} = 1;
+ open OUT, ">>$wkDir/taxonomy/invalid.taxids";
+ print OUT $taxid."\n";
+ close OUT;
+ }
+ }
+ }
+
+ } else { # remote taxonomy database
+ my $iRetry = 0;
+ my @taxids2get = ();
+ my %taxid2organism = ();
+ foreach (@_) {
+ if (exists $dbTaxa{$_}) { $taxid2organism{$_} = $dbTaxa{$_}{'organism'}; }
+ else{ push (@taxids2get, $_) }
+ }
+ sleep 1;
+ while (1) {
+ $s = get "$eFetchServer?db=taxonomy&id=".join (",", @taxids2get);
+ last if (defined $s);
+ die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
+ $iRetry ++; sleep $delay; next;
+ }
+ if ($s =~ /ID list is empty/) {
+ print "Warning: Invalid TaxIDs:", join (",", @taxids2get), ".\n";
+ } else {
+ $s =~ s///;
+ while ($s =~ s/\n\s+(\d+)<\/TaxId>\s+(.+?)<\/ScientificName>(.+?)\n<\/Taxon>//s) {
+ my $id = $1; my $t = $3;
+ $taxid2organism{$id} = $2;
+ my %taxon :shared = ('organism', $2);
+ $taxon{$_} = "" for (@ranks);
+ while ($t =~ s/\s+(\d+)<\/TaxId>\s+(.+?)<\/ScientificName>\s+(.+?)<\/Rank>\s+<\/Taxon>//s) {
+ unless (exists $dbRanks{$1}) {
+ $dbRanks{$1} = $2;
+ $ranks2w{$1} = $2;
+ }
+ $taxon{'lineage'} .= "/$1";
+ $taxon{$3} = $1 if (exists $hRanks{$3});
+ }
+ if ($t =~ /<\/ParentTaxId>\s+(\S+)<\/Rank>/) { # the taxon itself is a rank
+ foreach (@ranks) {
+ next if $taxon{$_};
+ if ($1 eq $_) { $taxon{$_} = $id; last; }
+ }
+ }
+ $dbTaxa{$id} = \%taxon;
+ $taxa2w{$id} = {%taxon};
+ }
+ }
+ foreach (@_) {
+ if (exists $taxid2organism{$_}) {
+ push (@organisms, $taxid2organism{$_});
+ } else {
+ push (@organisms, "na");
+ print "Warning: Invalid TaxID: $_.\n";
+ unless (exists $badTaxids{$_}) {
+ $badTaxids{$_} = 1;
+ open OUT, ">>$wkDir/taxonomy/invalid.taxids";
+ print OUT $_."\n";
+ close OUT;
+ }
+ }
+ }
+ }
+ if (%taxa2w) {
+ open OUT, ">>$wkDir/taxonomy/taxa.db";
+ foreach my $id (keys %taxa2w) {
+ print OUT $id."\t".$taxa2w{$id}{'organism'}."\t".$taxa2w{$id}{'lineage'};
+ print OUT "\t".$taxa2w{$id}{$_} for (@ranks);
+ print OUT "\n";
+ }
+ close OUT;
+ }
+ if (%ranks2w) {
+ open OUT, ">>$wkDir/taxonomy/ranks.db";
+ foreach my $id (keys %ranks2w) {
+ print OUT $id."\t".$ranks2w{$id}."\n";
+ }
+ close OUT;
+ }
+ return @organisms;
}
-## parse sequence title ##
- # GenBank-style sequence titles have gi, accn, product and name (accn)
- # plain titles have name and product
-
-sub seq_title {
- my $title = $_[0];
- my ($gi, $accn, $name, $product) = ('', '', '', '');
- if ($title =~ /^gi\|(\d+)\|.+\|([A-Z0-9_]+)\.\d+\|(.*)$/){ # GenBank-style title
- ($gi, $accn, $name) = ($1, $2, $2);
- if ($3){
- $product = $3;
- $product = $1 if ($product =~ /^(.+)\[/);
- $product =~ s/^\s+|\s+$//g;
- }
- }else{ # user-defined name
- $title =~ s/^\s+|\s+$//g;
- if ($title =~ /^(\S+)\s+(.+)$/){
- ($name, $product) = ($1, $2);
- }else{
- $name = $title;
- }
- }
- return ($gi, $accn, $name, $product);
-}
# reorder accession numbers #
sub order_accns {
- # order: NP_ > XP_ = YP_ = ZP_ = AP_ > everything else
- return $_[0] unless $_[0] =~ /\//;
- my @accns0 = split (/\//, $_[0]);
- my @accns1 = ();
- foreach (@accns0){
- if (/_/){ unshift (@accns1, $_); }
- else{ push (@accns1, $_); }
- }
- @accns0 = ();
- foreach (@accns1){
- if (/^NP_/){ unshift (@accns0, $_); }
- else{ push (@accns0, $_); }
- }
- @accns1 = ();
- foreach (@accns0){
- if ($#_ and /^$_[1]$/){ unshift (@accns1, $_); }
- else{ push (@accns1, $_); }
- }
- return join ("/", @accns1);
+ # order: NP_ > ?P_ > everything else
+ return $_[0] unless $_[0] =~ /\//;
+ my @accns0 = split (/\//, $_[0]);
+ my @accns1 = ();
+ foreach (@accns0) {
+ if (/_/) { unshift (@accns1, $_); }
+ else{ push (@accns1, $_); }
+ }
+ @accns0 = ();
+ foreach (@accns1) {
+ if (/^NP_/) { unshift (@accns0, $_); }
+ else{ push (@accns0, $_); }
+ }
+ @accns1 = ();
+ foreach (@accns0) {
+ if ($#_ and /^$_[1]$/) { unshift (@accns1, $_); }
+ else{ push (@accns1, $_); }
+ }
+ return join ("/", @accns1);
}
# get stem file name #
sub stem_name {
- my $stem = $_[0];
- $stem =~ s/\.[^\.]+$//;
- return $stem;
+ my $stem = $_[0];
+ $stem =~ s/\.[^\.]+$//;
+ return $stem;
}
-
diff --git a/scripts/treer.pl b/scripts/treer.pl
old mode 100755
new mode 100644
index e317985..d7f6f34
--- a/scripts/treer.pl
+++ b/scripts/treer.pl
@@ -23,30 +23,30 @@
my $i; my $j; my $n; my $s; my $t; my @a; my @b; my @c; my %h;
-my @sets; # protein sets (genomes)
-my @hits; # blast hits, each element is a hash, including:
- # accn, accns, organism, group, taxid, score, distance, ignore
+my @sets; # protein sets (genomes)
+my @hits; # blast hits, each element is a hash, including:
+ # accn, accns, organism, group, taxid, score, distance, ignore
-my %self; # query information
-my %seqs; # accn -> sequence
-my %names; # accn -> organism name
-my $nSeq; # number of hits with sequences
-my $nChar; # number of aa per sequence in alignment
-my $tree; # phylogenetic tree with annotations
+my %self; # query information
+my %seqs; # accn -> sequence
+my %names; # accn -> organism name
+my $nSeq; # number of hits with sequences
+my $nChar; # number of aa per sequence in alignment
+my $tree; # phylogenetic tree with annotations
## program parameters ##
-my $wkDir = $ARGV[0]; # working directory
+my $wkDir = $ARGV[0]; # working directory
-my $minHits = 3; # minimal number of valid hits for phylogenetic analysis
+my $minHits = 3; # minimal number of valid hits for phylogenetic analysis
-my $trimSeq = 0; # trim unreliable regions from sequence alignment (options: 0, gblocks)
-my $realign = 0; # realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
-my $buildTree = 0; # Build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
-my $bsTree = 0; # perform bootstrap for designated times.
-my $distance = 0; # use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
-my $aaModel = "WAG"; # protein substitution model for RAxML
+my $trimSeq = 0; # trim unreliable regions from sequence alignment (options: 0, gblocks)
+my $realign = 0; # realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
+my $buildTree = 0; # Build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
+my $bsTree = 0; # perform bootstrap for designated times.
+my $distance = 0; # use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
+my $aaModel = "WAG"; # protein substitution model for RAxML
my $gblocks = "Gblocks";
my $clustalw = "clustalw";
@@ -64,26 +64,26 @@
## read configurations ##
-if (-e "$wkDir/config.txt"){
- open IN, "<$wkDir/config.txt";
- my $readMonitor = 0; my $readMiddle = 0;
- while (){
- s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
- $minHits = $1 if /^minHits=(\d+)$/;
- $trimSeq = $1 if /^trimSeq=(.+)$/;
- $realign = $1 if /^realign=(.+)$/;
- $buildTree = $1 if /^buildTree=(.+)$/;
- $bsTree = $1 if /^bsTree=(.+)$/;
- $distance = $1 if /^distance=(.+)$/;
- $aaModel = $1 if /^aaModel=(.+)$/;
- $gblocks = $1 if /^gblocks=(.+)$/;
- $clustalw = $1 if /^clustalw=(.+)$/;
- $mafft = $1 if /^mafft=(.+)$/;
- $raxml = $1 if /^raxml=(.+)$/;
- $phyml = $1 if /^phyml=(.+)$/;
- $fasttree = $1 if /^fasttree=(.+)$/;
- }
- close IN;
+if (-e "$wkDir/config.txt") {
+ open IN, "<$wkDir/config.txt";
+ my $readMonitor = 0; my $readMiddle = 0;
+ while () {
+ s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+ $minHits = $1 if /^minHits=(\d+)$/;
+ $trimSeq = $1 if /^trimSeq=(.+)$/;
+ $realign = $1 if /^realign=(.+)$/;
+ $buildTree = $1 if /^buildTree=(.+)$/;
+ $bsTree = $1 if /^bsTree=(.+)$/;
+ $distance = $1 if /^distance=(.+)$/;
+ $aaModel = $1 if /^aaModel=(.+)$/;
+ $gblocks = $1 if /^gblocks=(.+)$/;
+ $clustalw = $1 if /^clustalw=(.+)$/;
+ $mafft = $1 if /^mafft=(.+)$/;
+ $raxml = $1 if /^raxml=(.+)$/;
+ $phyml = $1 if /^phyml=(.+)$/;
+ $fasttree = $1 if /^fasttree=(.+)$/;
+ }
+ close IN;
}
@@ -94,9 +94,9 @@
@a = readdir(DIR);
close DIR;
-foreach (@a){
- next if (/^\./);
- push @sets, $_ if -d "$wkDir/search/$_";
+foreach (@a) {
+ next if (/^\./);
+ push @sets, $_ if -d "$wkDir/search/$_";
}
print "done. ";
die "No protein sets detected.\n" unless @sets;
@@ -104,348 +104,348 @@
print "\n0-------------25-------------50------------75------------100%\n";
-foreach my $set (@sets){
- my $dir = "$wkDir/search/$set";
- opendir (DIR, "$dir") or next;
- my @files = grep (/\.txt$/, readdir(DIR));
- close DIR;
- next unless @files;
-
- my $iProtein = 0;
- my $iProgress = 0;
- my $nProtein = $#files+1;
- print "$set has $nProtein proteins. Analyzing...\n";
-
- foreach my $file (@files){
- $iProtein ++;
- @hits = (); %seqs = (); %self = (); $nSeq = 0; $nChar = 0; $tree = ""; %names = ();
- open IN, "<$dir/$file" or next;
- my $reading = 0;
- my ($hasCoverage, $hasDistance) = (0, 0);
- while () {
- s/\s+$//;
- if (/^BEGIN QUERY/){ $reading = "query"; next; }
- if (/^BEGIN ORGANISM/){ $reading = "organism"; next; }
- if (/^BEGIN DATA/){ $reading = "data"; next; }
- if (/^END;/){ $reading = 0; next; }
- if ($reading eq "query"){
- $self{'accn'} = $1 if /^\tName=(.+);$/;
- $self{'length'} = $1 if /^\tLength=(.+);$/;
- $self{'product'} = $1 if /^\tProduct=(.+);$/;
- if (/^\tAccession=(.+);$/){ $self{'accn'} = $1; $self{'accn'} =~ s/\.[\d]+$//; }
- }
- if ($reading eq "organism"){ # read organisms
- last unless exists $self{'accn'};
- next if /^;/;
- if (/^\[/){
- $hasCoverage = 1 if /Coverage/;
- $hasDistance = 1 if /Distance/;
- next;
- }
- @a = split (/\t/);
- my %hit = ();
- $hit{'organism'} = $a[1];
- $hit{'taxid'} = $a[2];
- $hit{'score'} = $a[3];
- $hit{'expect'} = $a[4];
- $hit{'identity'} = $a[5];
- $hit{'coverage'} = $a[6] if $hasCoverage;
- $hit{'accns'} = $a[0];
- @a = split(/\//, $a[0]);
- $hit{'accn'} = $a[0];
- push @hits, {%hit};
- unless (exists $self{'taxid'}){ # identify self
- foreach (@a){
- if ($self{'accn'} eq $_){
- $self{'id'} = $#hits;
- $self{'taxid'} = $hit{'taxid'};
- $self{'score'} = $hit{'score'};
- $self{'expect'} = $hit{'expect'};
- $self{'identity'} = $hit{'identity'};
- $self{'coverage'} = $hit{'coverage'} if exists $hit{'coverage'};
- $self{'organism'} = $hit{'organism'};
- last;
- }
- }
- }
- }
- if ($reading eq "data"){ # read sequences
- next if /^;/;
- next if /^\t/;
- @a = split (/\t/);
- $seqs{$a[0]} = $a[1];
- $nSeq ++;
- $nChar = length($a[1]) unless $nChar;
- }
- }
- close IN;
- print "BLAST report of $file is incomplete.\n" and next unless exists $self{'accn'};
- next if @hits < $minHits;
- next if keys %seqs < $minHits;
-
- ## realign with ClustalW ##
-
- if ($realign eq "clustalw"){
- next if write_fasta ("$dir/temp") < 2;
- system "$clustalw -infile=$dir/temp -quicktree -output=fasta -quiet > $dir/buffer";
- die "Execution of ClustalW failed. Please check.\n" unless -s "$dir/temp.fasta";
- read_fasta "$dir/temp.fasta";
- unlink "$dir/temp", "$dir/temp.fasta", "$dir/temp.dnd", "$dir/buffer";
- }
-
- ## trim alignment with GBlocks ##
-
- if ($trimSeq eq "gblocks"){
- write_fasta "$dir/temp";
- system "$gblocks $dir/temp -b2=".(int($nSeq/2)+1)." -b3=3 -b4=6 -b5=a > $dir/buffer";
- die "Execution of Gblocks failed on $file. Please check.\n" unless -s "$dir/temp-gb";
- read_fasta "$dir/temp-gb";
- foreach (keys %seqs){
- $seqs{$_} =~ s/\s//g;
- delete $seqs{$_} unless ($seqs{$_} =~ /[a-zA-Z]/);
- }
- unlink "$dir/temp", "$dir/temp-gb", "$dir/temp-gb.htm";
- }
-
- ## create translation table for tree annotation ##
-
- if ($buildTree){
- for ($i=0; $i<=$#hits; $i++){
- $s = $hits[$i]{'organism'};
- $s =~ s/[^a-zA-Z0-9,\.\-]/ /g;
- $names{$hits[$i]{'accn'}} = $s;
- }
- }
-
- ## build phylogenetic tree and compute distance matrix with ClustalW (Neighbor-Joining) ##
-
- if ($buildTree eq "clustalw"){
- if (write_fasta("$dir/temp")>= 3){
- $s = "$clustalw -infile=$dir/temp -outputtree=dist -quiet";
- if ($bsTree){ $s .= " -bootstrap=$bsTree -seed=12345 -bootlabels=node"; }else{ $s .= " -tree"; }
- $s .= " > $dir/buffer";
- system $s;
- die "Execution of ClustalW failed. Please check.\n" unless (-s "$dir/temp.ph" or "$dir/temp.phb");
- if ($bsTree){ open IN, "<$dir/temp.phb"; }else{ open IN, "<$dir/temp.ph"; }
- while (){ s/\s+$//; $tree .= $_; }
- close IN;
- $tree =~ s/TRICHOTOMY//i;
- $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
- if ($distance){
- $s = "";
- open IN, "<$dir/temp.dst";
- $_ = ; $s = ; $s =~ s/\s+$//;
- while (){
- last unless (/^\s/);
- s/\s+$//; s/^\s+//;
- $s .= " $_";
- }
- close IN;
- @a = split (/\s+/, $s);
- shift @a;
- for ($i=0; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- $hits[$i]{'distance'} = shift (@a);
- }
- }
- unlink "$dir/temp", "$dir/temp.ph", "$dir/temp.phb","$dir/temp.dst", "$dir/buffer";
- }
- }
-
- ## realign sequences, build phylogenetic tree and compute distance matrix with MAFFT (Neighbor-Joining) ##
-
- if ($realign eq "mafft" or $buildTree eq "mafft"){
- if (write_fasta("$dir/temp")>=3){
- system "$mafft --retree 1 --treeout --distout --quiet $dir/temp > $dir/buffer";
- die "Execution of MAFFT failed. Please check.\n" unless -s "$dir/buffer";
- read_fasta "$dir/buffer" if ($realign eq "mafft");
- if ($buildTree eq "mafft"){
- open IN, "<$dir/temp.tree";
- while (){ s/\s+$//; $tree .= $_; }
- close IN;
- $tree =~ s/\s+$/;/;
- $tree =~ s/_+:/:/g;
- $tree =~ s/([,\(])\d{1,4}_/$1/g;
- $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
- $tree .= ";" unless $tree =~ /;$/;
- if ($distance){
- my %mafftIDs = ();
- open IN, "<$dir/temp.hat2";
- $_ = ; $_ = ; $_ = ;
- @a = ();
- while (){
- s/\s+$//; s/^\s+//; next if (/\. =/);
- @a = (@a, split (/\s/));
- last if @a >= $nSeq;
- }
- close IN;
- $j = 0;
- $hits[0]{'distance'} = "0.000";
- for ($i=1; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- $hits[$i]{'distance'} = $a[$j++];
- }
- }
- }
- unlink "$dir/temp", "$dir/buffer", "$dir/temp.tree", "$dir/temp.hat2";
- }
- }
-
- ## build phylogenetic tree with PhyML (Maximum Likelihood) ##
-
- if ($buildTree eq "phyml"){
- if (write_phylip("$dir/temp")>=3){
- $s = "$phyml -i $dir/temp -d aa --quiet";
- $s .= "-b $bsTree" if $bsTree;
- system $s;
- open IN, "<$dir/temp_phyml_tree.txt"; $tree = ; close IN;
- $tree =~ s/\s+$//;
- $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
- $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
- unlink "$dir/temp", "$dir/temp_phyml_stats.txt", "$dir/temp_phyml_tree.txt";
- unlink "$dir/temp_phyml_boot_stats.txt", "$dir/temp_phyml_boot_trees.txt" if $bsTree;
- }
- }
-
- ## build phylogenetic tree with FastTree (Maximum Likelihood) and perform SH test ##
-
- if ($buildTree eq "fasttree"){
- if (write_fasta("$dir/temp")>=3){
- $s = "$fasttree < $dir/temp > fasttree.tmp -quiet";
- $s .= " -wag" if $aaModel eq "WAG";
- system $s;
- open IN, "; close IN;
- $tree =~ s/\s+$//;
- $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
- unlink "$dir/temp", "fasttree.tmp";
- }
- }
-
- ## build phylogenetic tree and compute distance matrix with RAxML (Maximum Likelihood) ##
-
- if ($buildTree eq "raxml"){
- open OUT, ">$dir/temp";
- foreach (keys %seqs){
- $i = length ($seqs{$_});
- last;
- }
- print OUT " $nSeq $i\n";
- for ($i=0; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
- }
- close OUT;
- $s = "$raxml -m PROTGAMMA$aaModel -p 12345 -s $dir/temp -n tmp";
- $s .= " -f a -x 12345 -# $bsTree" if $bsTree;
- $s .= " > RAxML_screen.tmp";
- system $s;
- die "Execution of RAxML failed. Please check.\n" unless ((-s "RAxML_result.tmp") or (-s "RAxML_bipartitions.tmp"));
- if ($bsTree){ open IN, "RAxML_bipartitions.tmp"; }
- else { open IN, "RAxML_result.tmp"; }
- $tree = ; close IN;
- $tree =~ s/\s+$//;
- $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
- $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
- if ($distance){
- $s = "$raxml -f x -m PROTGAMMA$aaModel -s $dir/temp -n tmp2";
- if ($bsTree){ $s .= " -t RAxML_bipartitions.tmp"; }
- else { open IN, " -t RAxML_result.tmp"; }
- $s .= " > RAxML_screen.tmp";
- system $s;
- open IN, "){
- s/\s+$//; next unless $_;
- next unless (/^t0/);
- @b = split (/\s+/);
- push @a, $b[2];
- }
- close IN;
- $hits[0]{'distance'} = "0.000000";
- for ($i=1; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- $hits[$i]{'distance'} = shift (@a);
- }
- unlink "RAxML_info.tmp2", "RAxML_distances.tmp2";
- }
- unlink "$dir/temp", "$dir/temp.reduced";
- unlink "RAxML_bestTree.tmp", "RAxML_info.tmp", "RAxML_log.tmp", "RAxML_parsimonyTree.tmp", "RAxML_result.tmp", "RAxML_screen.tmp";
- unlink "RAxML_bootstrap.tmp", "RAxML_bipartitions.tmp", "RAxML_bipartitionsBranchLabels.tmp" if $bsTree;
- }
-
- ## print realigned sequences ##
-
- if ($realign or $trimSeq or $distance){
- foreach (keys %seqs){
- $nChar = length ($seqs{$_});
- last;
- }
- $s = "";
- $reading = 1;
- open IN, "<$dir/$file" or next;
- while (){
- if (/^BEGIN TREE/ and $buildTree){ $reading = 0; next; }
- if ($reading) { $s .= $_; }
- if (/^END;/ and not $reading) { $reading = 1; next; }
- if (/^BEGIN ORGANISM/ and $distance){
- $reading = 0;
- for ($i=0; $i<=$#hits; $i++){
- $s .= $hits[$i]{'accns'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'};
- $s .= "\t".$hits[$i]{'coverage'} if exists $hits[$i]{'coverage'};
- if (exists $hits[$i]{'ignore'}){
- $s .= "\tx\n";
- }else{
- if ($distance and exists $hits[$i]{'distance'}){
- $s .= "\t".$hits[$i]{'distance'}."\n";
- }else{
- $s .= "\t\n";
- }
- }
- }
- $s .= ";\nEND;\n";
- }
- if (/^BEGIN DATA/){
- $reading = 0;
- $s .= "\tDIMENSIONS NTAX=$nSeq NCHAR=$nChar;\n";
- $s .= "\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n";
- $s .= "\tMATRIX\n";
- for ($i=0; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- $s .= $hits[$i]{'accn'}."\t".$seqs{$hits[$i]{'accn'}}."\n";
- }
- $s .= ";\nEND;\n";
- }
- }
- close IN;
- open OUT, ">$dir/$file" or next;
- print OUT $s;
- close OUT;
- }
-
- ## print tree ##
-
- if ($buildTree){
- open OUT, ">>$dir/$file" or next;
- print OUT "BEGIN TREES;\n\tTREE 1 = $tree\nEND;\n";
- close OUT;
- }
-
- ## show progress ##
-
- if ($iProtein/$nProtein >= $iProgress/60){
- print ".";
- $iProgress++;
- }
- }
-
- unlink "RAxML_screen.tmp" if -e "RAxML_screen.tmp";
-
- print " done.\n";
+foreach my $set (@sets) {
+ my $dir = "$wkDir/search/$set";
+ opendir (DIR, "$dir") or next;
+ my @files = grep (/\.txt$/, readdir(DIR));
+ close DIR;
+ next unless @files;
+
+ my $iProtein = 0;
+ my $iProgress = 0;
+ my $nProtein = $#files+1;
+ print "$set has $nProtein proteins. Analyzing...\n";
+
+ foreach my $file (@files) {
+ $iProtein ++;
+ @hits = (); %seqs = (); %self = (); $nSeq = 0; $nChar = 0; $tree = ""; %names = ();
+ open IN, "<$dir/$file" or next;
+ my $reading = 0;
+ my ($hasCoverage, $hasDistance) = (0, 0);
+ while () {
+ s/\s+$//;
+ if (/^BEGIN QUERY/) { $reading = "query"; next; }
+ if (/^BEGIN ORGANISM/) { $reading = "organism"; next; }
+ if (/^BEGIN DATA/) { $reading = "data"; next; }
+ if (/^END;/) { $reading = 0; next; }
+ if ($reading eq "query") {
+ $self{'accn'} = $1 if /^\tName=(.+);$/;
+ $self{'length'} = $1 if /^\tLength=(.+);$/;
+ $self{'product'} = $1 if /^\tProduct=(.+);$/;
+ if (/^\tAccession=(.+);$/) { $self{'accn'} = $1; $self{'accn'} =~ s/\.[\d]+$//; }
+ }
+ if ($reading eq "organism") { # read organisms
+ last unless exists $self{'accn'};
+ next if /^;/;
+ if (/^\[/) {
+ $hasCoverage = 1 if /Coverage/;
+ $hasDistance = 1 if /Distance/;
+ next;
+ }
+ @a = split (/\t/);
+ my %hit = ();
+ $hit{'organism'} = $a[1];
+ $hit{'taxid'} = $a[2];
+ $hit{'score'} = $a[3];
+ $hit{'expect'} = $a[4];
+ $hit{'identity'} = $a[5];
+ $hit{'coverage'} = $a[6] if $hasCoverage;
+ $hit{'accns'} = $a[0];
+ @a = split(/\//, $a[0]);
+ $hit{'accn'} = $a[0];
+ push @hits, {%hit};
+ unless (exists $self{'taxid'}) { # identify self
+ foreach (@a) {
+ if ($self{'accn'} eq $_) {
+ $self{'id'} = $#hits;
+ $self{'taxid'} = $hit{'taxid'};
+ $self{'score'} = $hit{'score'};
+ $self{'expect'} = $hit{'expect'};
+ $self{'identity'} = $hit{'identity'};
+ $self{'coverage'} = $hit{'coverage'} if exists $hit{'coverage'};
+ $self{'organism'} = $hit{'organism'};
+ last;
+ }
+ }
+ }
+ }
+ if ($reading eq "data") { # read sequences
+ next if /^;/;
+ next if /^\t/;
+ @a = split (/\t/);
+ $seqs{$a[0]} = $a[1];
+ $nSeq ++;
+ $nChar = length($a[1]) unless $nChar;
+ }
+ }
+ close IN;
+ print "BLAST report of $file is incomplete.\n" and next unless exists $self{'accn'};
+ next if @hits < $minHits;
+ next if keys %seqs < $minHits;
+
+ ## realign with ClustalW ##
+
+ if ($realign eq "clustalw") {
+ next if write_fasta ("$dir/temp") < 2;
+ system "$clustalw -infile=$dir/temp -quicktree -output=fasta -quiet > $dir/buffer";
+ die "Execution of ClustalW failed. Please check.\n" unless -s "$dir/temp.fasta";
+ read_fasta "$dir/temp.fasta";
+ unlink "$dir/temp", "$dir/temp.fasta", "$dir/temp.dnd", "$dir/buffer";
+ }
+
+ ## trim alignment with GBlocks ##
+
+ if ($trimSeq eq "gblocks") {
+ write_fasta "$dir/temp";
+ system "$gblocks $dir/temp -b2=".(int($nSeq/2)+1)." -b3=3 -b4=6 -b5=a > $dir/buffer";
+ die "Execution of Gblocks failed on $file. Please check.\n" unless -s "$dir/temp-gb";
+ read_fasta "$dir/temp-gb";
+ foreach (keys %seqs) {
+ $seqs{$_} =~ s/\s//g;
+ delete $seqs{$_} unless ($seqs{$_} =~ /[a-zA-Z]/);
+ }
+ unlink "$dir/temp", "$dir/temp-gb", "$dir/temp-gb.htm";
+ }
+
+ ## create translation table for tree annotation ##
+
+ if ($buildTree) {
+ for ($i=0; $i<=$#hits; $i++) {
+ $s = $hits[$i]{'organism'};
+ $s =~ s/[^a-zA-Z0-9,\.\-]/ /g;
+ $names{$hits[$i]{'accn'}} = $s;
+ }
+ }
+
+ ## build phylogenetic tree and compute distance matrix with ClustalW (Neighbor-Joining) ##
+
+ if ($buildTree eq "clustalw") {
+ if (write_fasta("$dir/temp")>= 3) {
+ $s = "$clustalw -infile=$dir/temp -outputtree=dist -quiet";
+ if ($bsTree) { $s .= " -bootstrap=$bsTree -seed=12345 -bootlabels=node"; } else { $s .= " -tree"; }
+ $s .= " > $dir/buffer";
+ system $s;
+ die "Execution of ClustalW failed. Please check.\n" unless (-s "$dir/temp.ph" or "$dir/temp.phb");
+ if ($bsTree) { open IN, "<$dir/temp.phb"; } else { open IN, "<$dir/temp.ph"; }
+ while () { s/\s+$//; $tree .= $_; }
+ close IN;
+ $tree =~ s/TRICHOTOMY//i;
+ $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+ if ($distance) {
+ $s = "";
+ open IN, "<$dir/temp.dst";
+ $_ = ; $s = ; $s =~ s/\s+$//;
+ while () {
+ last unless (/^\s/);
+ s/\s+$//; s/^\s+//;
+ $s .= " $_";
+ }
+ close IN;
+ @a = split (/\s+/, $s);
+ shift @a;
+ for ($i=0; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ $hits[$i]{'distance'} = shift (@a);
+ }
+ }
+ unlink "$dir/temp", "$dir/temp.ph", "$dir/temp.phb","$dir/temp.dst", "$dir/buffer";
+ }
+ }
+
+ ## realign sequences, build phylogenetic tree and compute distance matrix with MAFFT (Neighbor-Joining) ##
+
+ if ($realign eq "mafft" or $buildTree eq "mafft") {
+ if (write_fasta("$dir/temp")>=3) {
+ system "$mafft --retree 1 --treeout --distout --quiet $dir/temp > $dir/buffer";
+ die "Execution of MAFFT failed. Please check.\n" unless -s "$dir/buffer";
+ read_fasta "$dir/buffer" if ($realign eq "mafft");
+ if ($buildTree eq "mafft") {
+ open IN, "<$dir/temp.tree";
+ while () { s/\s+$//; $tree .= $_; }
+ close IN;
+ $tree =~ s/\s+$/;/;
+ $tree =~ s/_+:/:/g;
+ $tree =~ s/([,\(])\d{1,4}_/$1/g;
+ $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+ $tree .= ";" unless $tree =~ /;$/;
+ if ($distance) {
+ my %mafftIDs = ();
+ open IN, "<$dir/temp.hat2";
+ $_ = ; $_ = ; $_ = ;
+ @a = ();
+ while () {
+ s/\s+$//; s/^\s+//; next if (/\. =/);
+ @a = (@a, split (/\s/));
+ last if @a >= $nSeq;
+ }
+ close IN;
+ $j = 0;
+ $hits[0]{'distance'} = "0.000";
+ for ($i=1; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ $hits[$i]{'distance'} = $a[$j++];
+ }
+ }
+ }
+ unlink "$dir/temp", "$dir/buffer", "$dir/temp.tree", "$dir/temp.hat2";
+ }
+ }
+
+ ## build phylogenetic tree with PhyML (Maximum Likelihood) ##
+
+ if ($buildTree eq "phyml") {
+ if (write_phylip("$dir/temp")>=3) {
+ $s = "$phyml -i $dir/temp -d aa --quiet";
+ $s .= "-b $bsTree" if $bsTree;
+ system $s;
+ open IN, "<$dir/temp_phyml_tree.txt"; $tree = ; close IN;
+ $tree =~ s/\s+$//;
+ $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
+ $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+ unlink "$dir/temp", "$dir/temp_phyml_stats.txt", "$dir/temp_phyml_tree.txt";
+ unlink "$dir/temp_phyml_boot_stats.txt", "$dir/temp_phyml_boot_trees.txt" if $bsTree;
+ }
+ }
+
+ ## build phylogenetic tree with FastTree (Maximum Likelihood) and perform SH test ##
+
+ if ($buildTree eq "fasttree") {
+ if (write_fasta("$dir/temp")>=3) {
+ $s = "$fasttree < $dir/temp > fasttree.tmp -quiet";
+ $s .= " -wag" if $aaModel eq "WAG";
+ system $s;
+ open IN, "; close IN;
+ $tree =~ s/\s+$//;
+ $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+ unlink "$dir/temp", "fasttree.tmp";
+ }
+ }
+
+ ## build phylogenetic tree and compute distance matrix with RAxML (Maximum Likelihood) ##
+
+ if ($buildTree eq "raxml") {
+ open OUT, ">$dir/temp";
+ foreach (keys %seqs) {
+ $i = length ($seqs{$_});
+ last;
+ }
+ print OUT " $nSeq $i\n";
+ for ($i=0; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
+ }
+ close OUT;
+ $s = "$raxml -m PROTGAMMA$aaModel -p 12345 -s $dir/temp -n tmp";
+ $s .= " -f a -x 12345 -# $bsTree" if $bsTree;
+ $s .= " > RAxML_screen.tmp";
+ system $s;
+ die "Execution of RAxML failed. Please check.\n" unless ((-s "RAxML_result.tmp") or (-s "RAxML_bipartitions.tmp"));
+ if ($bsTree) { open IN, "RAxML_bipartitions.tmp"; }
+ else { open IN, "RAxML_result.tmp"; }
+ $tree = ; close IN;
+ $tree =~ s/\s+$//;
+ $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
+ $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+ if ($distance) {
+ $s = "$raxml -f x -m PROTGAMMA$aaModel -s $dir/temp -n tmp2";
+ if ($bsTree) { $s .= " -t RAxML_bipartitions.tmp"; }
+ else { open IN, " -t RAxML_result.tmp"; }
+ $s .= " > RAxML_screen.tmp";
+ system $s;
+ open IN, ") {
+ s/\s+$//; next unless $_;
+ next unless (/^t0/);
+ @b = split (/\s+/);
+ push @a, $b[2];
+ }
+ close IN;
+ $hits[0]{'distance'} = "0.000000";
+ for ($i=1; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ $hits[$i]{'distance'} = shift (@a);
+ }
+ unlink "RAxML_info.tmp2", "RAxML_distances.tmp2";
+ }
+ unlink "$dir/temp", "$dir/temp.reduced";
+ unlink "RAxML_bestTree.tmp", "RAxML_info.tmp", "RAxML_log.tmp", "RAxML_parsimonyTree.tmp", "RAxML_result.tmp", "RAxML_screen.tmp";
+ unlink "RAxML_bootstrap.tmp", "RAxML_bipartitions.tmp", "RAxML_bipartitionsBranchLabels.tmp" if $bsTree;
+ }
+
+ ## print realigned sequences ##
+
+ if ($realign or $trimSeq or $distance) {
+ foreach (keys %seqs) {
+ $nChar = length ($seqs{$_});
+ last;
+ }
+ $s = "";
+ $reading = 1;
+ open IN, "<$dir/$file" or next;
+ while () {
+ if (/^BEGIN TREE/ and $buildTree) { $reading = 0; next; }
+ if ($reading) { $s .= $_; }
+ if (/^END;/ and not $reading) { $reading = 1; next; }
+ if (/^BEGIN ORGANISM/ and $distance) {
+ $reading = 0;
+ for ($i=0; $i<=$#hits; $i++) {
+ $s .= $hits[$i]{'accns'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'};
+ $s .= "\t".$hits[$i]{'coverage'} if exists $hits[$i]{'coverage'};
+ if (exists $hits[$i]{'ignore'}) {
+ $s .= "\tx\n";
+ } else {
+ if ($distance and exists $hits[$i]{'distance'}) {
+ $s .= "\t".$hits[$i]{'distance'}."\n";
+ } else {
+ $s .= "\t\n";
+ }
+ }
+ }
+ $s .= ";\nEND;\n";
+ }
+ if (/^BEGIN DATA/) {
+ $reading = 0;
+ $s .= "\tDIMENSIONS NTAX=$nSeq NCHAR=$nChar;\n";
+ $s .= "\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n";
+ $s .= "\tMATRIX\n";
+ for ($i=0; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ $s .= $hits[$i]{'accn'}."\t".$seqs{$hits[$i]{'accn'}}."\n";
+ }
+ $s .= ";\nEND;\n";
+ }
+ }
+ close IN;
+ open OUT, ">$dir/$file" or next;
+ print OUT $s;
+ close OUT;
+ }
+
+ ## print tree ##
+
+ if ($buildTree) {
+ open OUT, ">>$dir/$file" or next;
+ print OUT "BEGIN TREES;\n\tTREE 1 = $tree\nEND;\n";
+ close OUT;
+ }
+
+ ## show progress ##
+
+ if ($iProtein/$nProtein >= $iProgress/60) {
+ print ".";
+ $iProgress++;
+ }
+ }
+
+ unlink "RAxML_screen.tmp" if -e "RAxML_screen.tmp";
+
+ print " done.\n";
}
print "Execution of treer completed.\n";
exit 0;
@@ -453,51 +453,50 @@
## public subroutines ##
-sub write_fasta ($){
- my $count = 0;
- open OUT, ">$_[0]";
- for ($i=0; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- print OUT ">".$hits[$i]{'accn'}."\n".$seqs{$hits[$i]{'accn'}}."\n";
- $count ++;
- }
- close OUT;
- return $count;
+sub write_fasta ($) {
+ my $count = 0;
+ open OUT, ">$_[0]";
+ for ($i=0; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ print OUT ">".$hits[$i]{'accn'}."\n".$seqs{$hits[$i]{'accn'}}."\n";
+ $count ++;
+ }
+ close OUT;
+ return $count;
}
-sub write_phylip ($){
- my $count = 0;
- open OUT, ">$_[0]";
- foreach (keys %seqs){
- $i = length ($seqs{$_});
- last;
- }
- print OUT " $nSeq $i\n";
- for ($i=0; $i<=$#hits; $i++){
- next if exists $hits[$i]{'ignore'};
- next unless exists $seqs{$hits[$i]{'accn'}};
- print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
- $count ++;
- }
- close OUT;
- return $count;
+sub write_phylip ($) {
+ my $count = 0;
+ open OUT, ">$_[0]";
+ foreach (keys %seqs) {
+ $i = length ($seqs{$_});
+ last;
+ }
+ print OUT " $nSeq $i\n";
+ for ($i=0; $i<=$#hits; $i++) {
+ next if exists $hits[$i]{'ignore'};
+ next unless exists $seqs{$hits[$i]{'accn'}};
+ print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
+ $count ++;
+ }
+ close OUT;
+ return $count;
}
-sub read_fasta ($){
- open IN, "<$_[0]";
- my $name = "";
- while (){
- s/\s+$//;
- next unless $_;
- if (s/^>//){
- $name = $_;
- $seqs{$name} = "";
- }else{
- $seqs{$name} .= $_;
- }
- }
- close IN;
+sub read_fasta ($) {
+ open IN, "<$_[0]";
+ my $name = "";
+ while () {
+ s/\s+$//;
+ next unless $_;
+ if (s/^>//) {
+ $name = $_;
+ $seqs{$name} = "";
+ } else {
+ $seqs{$name} .= $_;
+ }
+ }
+ close IN;
}
-
From d369dbd57fec04aa9f73e8bd8b99eb357f75f5cb Mon Sep 17 00:00:00 2001
From: qiyunzhu
Date: Wed, 23 Aug 2017 00:35:16 -0700
Subject: [PATCH 2/9] changed diamond support to version 0.8+
---
scripts/searcher.pl | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index 0c9c48e..eee435f 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -691,11 +691,9 @@
unlink "$wkDir/tmp.m8";
} elsif ($searchTool eq "DIAMOND") {
`$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
- `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
- my @out = `$diamond view -a $wkDir/tmp.daa`;
+ my @out = `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp.dmnd`;
unlink "$wkDir/tmp.in";
unlink "$wkDir/tmp.dmnd";
- unlink "$wkDir/tmp.daa";
die "Error in running DIAMOND. Please check." unless @out;
foreach (@out) {
s/\s+$//;
@@ -895,13 +893,10 @@ sub local_search {
$outfile = $wkDir."/tmp.m8";
unlink $wkDir."/tmp.aln";
} elsif ($searchTool eq "DIAMOND") {
- $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
+ $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -o $outfile";
$s .= " -e $evalue" if ($evalue);
$s .= " -k $nHits" if ($nHits);
`$s`;
- $s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
- `$s`;
- unlink $wkDir."/tmp.daa";
} else {
unlink "$wkDir/tmp.in";
die "Error: Search tool not specified and pre-computed results not found for $set.\n";
@@ -1594,11 +1589,9 @@ sub self_align {
} elsif ($searchTool eq "DIAMOND") {
`$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
- `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
- my @out = `$diamond view -a $wkDir/tmp.daa`;
+ my @out = `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp.dmnd`;
unlink "$wkDir/tmp.in";
unlink "$wkDir/tmp.dmnd";
- unlink "$wkDir/tmp.daa";
if (@out) {
@a = split (/\t/, $out[0]);
if ($#a == 11) {
From b16c36b507b61ea2bdc556ecbb00b8b8e8fbd540 Mon Sep 17 00:00:00 2001
From: qiyunzhu
Date: Wed, 23 Aug 2017 01:02:55 -0700
Subject: [PATCH 3/9] removed GI parser from analyzer
---
scripts/analyzer.pl | 4 +---
scripts/searcher.pl | 21 ++++++++++++++-------
2 files changed, 15 insertions(+), 10 deletions(-)
diff --git a/scripts/analyzer.pl b/scripts/analyzer.pl
index 8e41bfc..2137ce1 100644
--- a/scripts/analyzer.pl
+++ b/scripts/analyzer.pl
@@ -478,11 +478,9 @@
if (/^END;/) { $reading = 0; next; }
if ($reading eq "query") { # read query (self)
$result{'accn'} = $1 if /^\tName=(.+);$/;
- $result{'gi'} = $1 if /^\tGI=(\d+);$/;
$result{'length'} = $1 if /^\tLength=(.+);$/;
$result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
- $result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
- if (/^\tAccession=(.+);$/) { $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
+ $result{'accn'} = $1 if /^\tAccession=(.+);$/;
}
if ($reading eq "organism") { # read organisms
next if /^;/;
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index eee435f..a163358 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -59,7 +59,7 @@
# prots (array...)
# prot => (
# name (accn or user-defined)
- # title (after the 1st whitespace)
+ # product (after 1st whitespace, excluding [organism name])
# seq (necessary for local searches, optional for remote BLAST)
# done (boolean)
# hits (array...)
@@ -87,6 +87,7 @@
sub local_search; # same as above
sub self_align; # search against itself
sub get_taxonomy; # paramter: array of TaxIDs
+sub get_product; # get product from protein title
sub stem_name; # get stem file name
sub order_accns; # reorder accession number
@@ -295,12 +296,12 @@
while () {
s/\s+$//; next unless $_; next if /^#/;
$intype = /^>/ ? "fasta" : "list" unless $intype;
- my %prot = ('name'=>'', 'title'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
+ my %prot = ('name'=>'', 'product'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
if ($intype eq 'fasta') {
if (s/^>//) {
@a = split(/\s+/, $_, 2);
$prot{'name'} = $a[0];
- $prot{'title'} = $a[1] if $#a;
+ $prot{'product'} = get_product($a[1]) if $#a;
push @{$ins{$set}{'prots'}}, {%prot};
} else {
$ins{$set}{'prots'}[-1]{'seq'} .= $_; # append sequence
@@ -1066,7 +1067,7 @@ sub local_search {
print OUT "#NEXUS\nBEGIN QUERY;\n";
print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
- print OUT "\tTitle=".$ins{$set}{'prots'}[$id]{'title'}.";\n";
+ print OUT "\tProduct=".$ins{$set}{'prots'}[$id]{'product'}.";\n";
print OUT "END;\n\n";
print OUT "BEGIN ORGANISM;\n";
print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
@@ -1180,9 +1181,8 @@ sub http_blast{
foreach (split(/\n/, $s)) {
if (/^#/) {
if (s/^# Query: // and not $self{'seq'}) {
- # s/>.*//;
@a = split(/\s+/, $_, 2);
- $self{'title'} = $a[1] if $#a;
+ $self{'product'} = get_product($a[1]) if $#a;
}
} else {
# fields: query id, subject ids, % identity, % positives, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
@@ -1393,7 +1393,7 @@ sub http_blast{
print OUT "#NEXUS\nBEGIN QUERY;\n";
print OUT "\tName=".$self{'name'}.";\n";
print OUT "\tLength=".$self{'length'}.";\n";
- print OUT "\tTitle=".$self{'title'}.";\n";
+ print OUT "\tProduct=".$self{'product'}.";\n";
print OUT "END;\n\n";
# retrieve taxonomy report (using TaxBLAST)
@@ -1790,6 +1790,13 @@ sub order_accns {
return join ("/", @accns1);
}
+# get product from protein title
+sub get_product {
+ my $title = $_[0];
+ $title =~ s/^\s+|\s+$//g;
+ $title =~ s/\[.+\]$//;
+}
+
# get stem file name #
sub stem_name {
my $stem = $_[0];
From ca103d0d2be0cd69f7eae7ce9b518344d2b9bb2f Mon Sep 17 00:00:00 2001
From: qiyunzhu
Date: Wed, 23 Aug 2017 01:04:56 -0700
Subject: [PATCH 4/9] changed version number
---
ChangeLog | 6 +++---
GUI.html | 37 ++++++++++++++++++-------------------
LICENSE | 4 ++--
README.md | 2 +-
config.txt | 9 ++++-----
5 files changed, 28 insertions(+), 30 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index f390080..e25693f 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,12 @@
ChangeLog of HGTector
-== Version 0.2.2 (8/22/2017) ==
+== Version 0.2.2 (8/23/2017) ==
New features:
-- Adopt the new NCBI format (accession instead of GI as sequence identifier)
+- Adopt new NCBI standard (accession instead of GI as sequence identifier)
Bug fixes:
-- Reformatted all Perl scripts using Perltidy.
+- Reformatted all Perl scripts.
- Replaced 'http' in NCBI URLs with 'https'.
diff --git a/GUI.html b/GUI.html
index a9fcdf0..508f8cf 100755
--- a/GUI.html
+++ b/GUI.html
@@ -1,10 +1,10 @@
- HGTector v0.2.1 GUI
+ HGTector v0.2.2 GUI