From 060a07b8ed59c7017646370d945552d8b486184a Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Tue, 22 Aug 2017 17:01:14 -0700
Subject: [PATCH 1/9] replaced tab with 4 spaces, removed reliance on outdated
 NCBI GI

---
 ChangeLog             |   10 +
 HGTector.pl           |  220 +--
 scripts/analyzer.pl   | 2645 +++++++++++++++++----------------
 scripts/bbh.pl        |  309 ++--
 scripts/orthologer.pl |  353 +++--
 scripts/reporter.pl   | 1333 +++++++++--------
 scripts/searcher.pl   | 3278 ++++++++++++++++++++---------------------
 scripts/treer.pl      |  847 ++++++-----
 8 files changed, 4471 insertions(+), 4524 deletions(-)
 mode change 100755 => 100644 HGTector.pl
 mode change 100755 => 100644 scripts/analyzer.pl
 mode change 100755 => 100644 scripts/bbh.pl
 mode change 100755 => 100644 scripts/orthologer.pl
 mode change 100755 => 100644 scripts/reporter.pl
 mode change 100755 => 100644 scripts/searcher.pl
 mode change 100755 => 100644 scripts/treer.pl
diff --git a/ChangeLog b/ChangeLog
index 8b569cc..f390080 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
 ChangeLog of HGTector
 
+== Version 0.2.2 (8/22/2017) ==
+
+New features:
+- Adopt the new NCBI format (accession instead of GI as sequence identifier)
+
+Bug fixes:
+- Reformatted all Perl scripts using Perltidy.
+- Replaced 'http' in NCBI URLs with 'https'.
+
+
 == Incremental update (1/28/2017) ==
 Bug fixes:
 - Fixed a bug in databaser.py which incorrectly deals the case with subsampling off.
diff --git a/HGTector.pl b/HGTector.pl
old mode 100755
new mode 100644
index a2ec136..f2dfceb
--- a/HGTector.pl
+++ b/HGTector.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/perl
 
-# HGTector (version 0.2.1): Genome-wide detection of horizontal gene transfer based on sequence similarity search result distribution statistics
+# HGTector (version 0.2.2): Genome-wide detection of horizontal gene transfer based on sequence similarity search result distribution statistics
 # Copyright (C) 2013-2017, Qiyun Zhu, Katharina Dittmar. All rights reserved.
 # Licensed under BSD 2-clause license.
 
@@ -11,9 +11,9 @@
 $| = 1;
 
 print "
-┌---------------------┐
-|   HGTector v0.2.1   |
-└---------------------┘
++---------------------+
+|   HGTector v0.2.2   |
++---------------------+
 ";
 
 my $usage = "
@@ -35,116 +35,118 @@
 my $scripts = abs_path($0); $scripts =~ s/HGTector\.pl$/scripts/;
 my $wkDir = abs_path($0); $wkDir =~ s/HGTector\.pl$/sample/;
 
-if (not @ARGV or $ARGV[0] eq "-h" or $ARGV[0] eq "--help"){
-	if (-d $db){
-		print $usage;
-	}else{
-		print "\nWelcome! This seems to be the first time you run HGTector on this device.\n";
-		print "I will guide you through the basics of the program.\n";
-		print "\nWould you like to test on a small sample dataset (yes/NO)? ";
-		my $s = <STDIN>; chomp $s;
-		if ($s =~ /^yes$/i or $s =~ /^y$/i){
-			print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\nProceed (yes) or opt out (NO): ";
-			$s = <STDIN>; chomp $s;
-			if ($s =~ /^yes$/i or $s =~ /^y$/i){
-				pipeline;
-			}
-		}
-		print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
-		print "\n";
-		print "Now I will assist you to create necessary databases for future analyses.\nProceed (YES/no)? ";
-		$s = <STDIN>; chomp $s;
-		if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i){
-			print "HGTector needs three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for effective analyses. Please read about the details of choice of databases in the GUI.\n";
-			print "Since release 0.2.0, a Python script databaser.py is implemented to automate the database construction process.\n";
-			die "Error: databaser.py is not found in the scripts/ subdirectory.\n" unless -e "$scripts/databaser.py";
-			print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
-			$s = <STDIN>; chomp $s; $s =~ s/\/$//;
-			$db = $s if $s;
-			mkdir $db;
-			chdir $db;
-			print "\nBy default, I will download NCBI RefSeq proteomic data of microbial organisms (archaea, bacteria, fungi and protozoa), keep one proteome per species, plus all NCBI-defined representative proteomes. The data will be merged into a master multi-Fasta file, and compiled into a BLAST database.\n";
-			print "As of 2015-10-25, the database contained around 30 million non-redundant protein sequences from more than 8000 organisms. Depending on the Internet connection quality, it may take up to 12 hr to download and compile.\n";
-			my ($format, $range, $represent, $subsample, $out) = ("blast", "microbe", "auto", "1", "stdb");
-			print "Accept (YES) or change options (no): ";
-			$s = <STDIN>; chomp $s;
-			if ($s =~ /^no$/i or $s =~ /^n$/i){
-				print "\nNCBI RefSeq genomes are categorized as: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral. Type one or more desired categories separated by comma, or press Enter to accept the default (microbe): ";
-				$s = <STDIN>; chomp $s; $s =~ s/\s+//g;
-				$range = $s if $s;
-				print "\nI will keep one representative organism per species by default, in order to save computing time. Otherwise, enter the number of organisms you want to keep, or type '0' to keep all organisms (no subsampling): ";
-				$s = <STDIN>; chomp $s;
-				if ($s eq "0"){ $subsample = "0"; }
-				elsif ($s and $s =~ /^\d+$/){ $subsample = $s; }
-				print "\nAll NCBI-defined representative organisms will be kept. Press Enter to accept, type 'no' to opt out, or enter a file name that contains a list of TaxID you wish to keep: ";
-				$s = <STDIN>; chomp $s;
-				$represent = $s if $s;
-				print "\nAfter downloading the proteomic data, do you want to build a BLAST database? Note: you need makeblastdb from the NCBI-BLAST+ package to enable this function. You may also build database manually using tools of choice later. (YES/no) ";
-				$s = <STDIN>; chomp $s;
-				$format = "none" if ($s =~ /^no$/i or $s =~ /^n$/i);
-				print "\nEnter a stem file name for your database (default: 'stdb'): ";
-				$s = <STDIN>; chomp $s;
-				$out = $s if $s;
-			}
-			my $i = system "python $scripts/databaser.py -format=$format -range=$range -represent=$represent -subsample=$subsample -out=$out";
-			chdir $cwd;
-			print "\nThere appear to be some problems in database building.\n" unless -s "$db/$out.faa";
-			print "\nNow that databases are created. Your may link HGTector to them in future analyses, by setting the following parameters in config.txt:\n";
-			if ($format eq "blast"){ print "protdb=$db/blast/$out\n"; }
-			else{ print "# You still need to build a searchable database based on $db/$out.faa"; }
-			print "taxdump=$db/taxdump\n";
-			print "prot2taxid=$db/gi2taxid.txt\n";
-			print "\n";
-		}
-		print "You may start to use HGTector to analyze your data.\n";
-		print $usage;
-	}
-}else{
-	$wkDir = $ARGV[0];
-	pipeline;
+if (not @ARGV or $ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
+    if (-d $db) {
+        print $usage;
+    } else {
+        print "\nWelcome! This seems to be the first time you run HGTector on this device.\n";
+        print "I will guide you through the basics of the program.\n";
+        print "\nWould you like to test on a small sample dataset (yes/NO)? ";
+        my $s = <STDIN>; chomp $s;
+        if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+            print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\nProceed (yes) or opt out (NO): ";
+            $s = <STDIN>; chomp $s;
+            if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+                pipeline;
+            }
+        }
+        print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
+        print "\n";
+        print "Now I will assist you to create necessary databases for future analyses.\nProceed (YES/no)? ";
+        $s = <STDIN>; chomp $s;
+        if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i) {
+            print "HGTector needs three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for effective analyses. Please read about the details of choice of databases in the GUI.\n";
+            print "Since release 0.2.0, a Python script databaser.py is implemented to automate the database construction process.\n";
+            die "Error: databaser.py is not found in the scripts/ subdirectory.\n" unless -e "$scripts/databaser.py";
+            print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
+            $s = <STDIN>; chomp $s; $s =~ s/\/$//;
+            $db = $s if $s;
+            mkdir $db;
+            chdir $db;
+            print "\nBy default, I will download NCBI RefSeq proteomic data of microbial organisms (archaea, bacteria, fungi and protozoa), keep one proteome per species, plus all NCBI-defined representative proteomes. The data will be merged into a master multi-Fasta file, and compiled into a BLAST database.\n";
+            print "As of 2015-10-25, the database contained around 30 million non-redundant protein sequences from more than 8000 organisms. Depending on the Internet connection quality, it may take up to 12 hr to download and compile.\n";
+            my ($format, $range, $represent, $subsample, $out) = ("blast", "microbe", "auto", "1", "stdb");
+            print "Accept (YES) or change options (no): ";
+            $s = <STDIN>; chomp $s;
+            if ($s =~ /^no$/i or $s =~ /^n$/i) {
+                print "\nNCBI RefSeq genomes are categorized as: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral. Type one or more desired categories separated by comma, or press Enter to accept the default (microbe): ";
+                $s = <STDIN>; chomp $s; $s =~ s/\s+//g;
+                $range = $s if $s;
+                print "\nI will keep one representative organism per species by default, in order to save computing time. Otherwise, enter the number of organisms you want to keep, or type '0' to keep all organisms (no subsampling): ";
+                $s = <STDIN>; chomp $s;
+                if ($s eq "0") { $subsample = "0"; }
+                elsif ($s and $s =~ /^\d+$/) { $subsample = $s; }
+                print "\nAll NCBI-defined representative organisms will be kept. Press Enter to accept, type 'no' to opt out, or enter a file name that contains a list of TaxID you wish to keep: ";
+                $s = <STDIN>; chomp $s;
+                $represent = $s if $s;
+                print "\nAfter downloading the proteomic data, do you want to build a BLAST database? Note: you need makeblastdb from the NCBI-BLAST+ package to enable this function. You may also build database manually using tools of choice later. (YES/no) ";
+                $s = <STDIN>; chomp $s;
+                $format = "none" if ($s =~ /^no$/i or $s =~ /^n$/i);
+                print "\nEnter a stem file name for your database (default: 'stdb'): ";
+                $s = <STDIN>; chomp $s;
+                $out = $s if $s;
+            }
+            my $i = system "python $scripts/databaser.py -format=$format -range=$range -represent=$represent -subsample=$subsample -out=$out";
+            chdir $cwd;
+            print "\nThere appear to be some problems in database building.\n" unless -s "$db/$out.faa";
+            print "\nNow that databases are created. Your may link HGTector to them in future analyses, by setting the following parameters in config.txt:\n";
+            if ($format eq "blast") {
+                print "protdb=$db/blast/$out\n";
+            } else {
+                print "# You still need to build a searchable database based on $db/$out.faa";
+            }
+            print "taxdump=$db/taxdump\n";
+            print "prot2taxid=$db/gi2taxid.txt\n";
+            print "\n";
+        }
+        print "You may start to use HGTector to analyze your data.\n";
+        print $usage;
+    }
+} else {
+    $wkDir = $ARGV[0];
+    pipeline;
 }
 exit 0;
 
 sub pipeline {
-	print "\nValidating task...\n";
-	die "Error: Invalid working directory $wkDir.\n" unless -d $wkDir;
-	my $interactive = 1;
-	if (-e "$wkDir/config.txt"){
-		open IN, "<$wkDir/config.txt" or die "Error: Invalid configuration file $wkDir/config.txt.\n";
-		while (<IN>){
-			s/#.*$//; s/\s+$//; s/^\s+//; next unless $_;
-			$interactive = 0 if /^interactive=0$/;
-		}
-	}else{
-		print "Warning: Configuration file $wkDir/config.txt is not found. HGTector will use default settings.\n";
-		if ($interactive){
-			print "Press Enter to proceed, or Ctrl+C to exit:";
-			my $s = <STDIN>;
-		}
-	}
-	die "Error: Input folder $wkDir/input is not found. Please prepare input data before running HGTector.\n" unless -d "$wkDir/input";
-	print "Done.\n\n";
+    print "\nValidating task...\n";
+    die "Error: Invalid working directory $wkDir.\n" unless -d $wkDir;
+    my $interactive = 1;
+    if (-e "$wkDir/config.txt") {
+        open IN, "<$wkDir/config.txt" or die "Error: Invalid configuration file $wkDir/config.txt.\n";
+        while (<IN>) {
+            s/#.*$//; s/\s+$//; s/^\s+//; next unless $_;
+            $interactive = 0 if /^interactive=0$/;
+        }
+    } else {
+        print "Warning: Configuration file $wkDir/config.txt is not found. HGTector will use default settings.\n";
+        if ($interactive) {
+            print "Press Enter to proceed, or Ctrl+C to exit:";
+            my $s = <STDIN>;
+        }
+    }
+    die "Error: Input folder $wkDir/input is not found. Please prepare input data before running HGTector.\n" unless -d "$wkDir/input";
+    print "Done.\n\n";
 
-	print "Step 1: Searcher - batch protein sequence homology search.\n";
-	die "Error: searcher.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/searcher.pl";
-	my $i = system "$^X $scripts/searcher.pl $wkDir";
-	die "Error: Execution of searcher.pl failed. HGTector exists.\n" if $i;
-	die "Error: No search result detected. HGTector exists.\n" unless -d "$wkDir/search";
-	print "\n";
+    print "Step 1: Searcher - batch protein sequence homology search.\n";
+    die "Error: searcher.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/searcher.pl";
+    my $i = system "$^X $scripts/searcher.pl $wkDir";
+    die "Error: Execution of searcher.pl failed. HGTector exists.\n" if $i;
+    die "Error: No search result detected. HGTector exists.\n" unless -d "$wkDir/search";
+    print "\n";
 
-	print "Step 2: Analyzer - predict HGT based on hit distribution statistics.\n";
-	die "Error: analyzer.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/analyzer.pl";
-	$i = system "$^X $scripts/analyzer.pl $wkDir";
-	die "Execution of analyzer.pl failed.\n" if $i;
-	die "Error: No analysis result detected. HGTector exists.\n" unless -d "$wkDir/result";
-	print "\n";
+    print "Step 2: Analyzer - predict HGT based on hit distribution statistics.\n";
+    die "Error: analyzer.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/analyzer.pl";
+    $i = system "$^X $scripts/analyzer.pl $wkDir";
+    die "Execution of analyzer.pl failed.\n" if $i;
+    die "Error: No analysis result detected. HGTector exists.\n" unless -d "$wkDir/result";
+    print "\n";
 
-	print "Step 3: Reporter - generate report for prediction results.\n";
-	die "Error: reporter.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/reporter.pl";
-	$i = system "$^X $scripts/reporter.pl $wkDir";
-	die "Execution of analyzer.pl failed.\n" if $i;
-	print "\n";
-	
-	print "All steps completed.\n";
-}
+    print "Step 3: Reporter - generate report for prediction results.\n";
+    die "Error: reporter.pl is not found in the scripts/ subdirectory.\n" unless -e "$scripts/reporter.pl";
+    $i = system "$^X $scripts/reporter.pl $wkDir";
+    die "Execution of analyzer.pl failed.\n" if $i;
+    print "\n";
 
+    print "All steps completed.\n";
+}
diff --git a/scripts/analyzer.pl b/scripts/analyzer.pl
old mode 100755
new mode 100644
index 24f8e8b..8e41bfc
--- a/scripts/analyzer.pl
+++ b/scripts/analyzer.pl
@@ -50,86 +50,86 @@
 
 ## global variables ##
 
-my @sets;										# proteins sets to analyze
-my %lv = ();											# taxonomic grouping scenario
-my %selves = ();								# taxonomic information of self group
-my %lvList = ();								# list of taxids of each level
+my @sets;                                        # proteins sets to analyze
+my %lv = ();                                            # taxonomic grouping scenario
+my %selves = ();                                # taxonomic information of self group
+my %lvList = ();                                # list of taxids of each level
 
-my %taxadb = ();								# taxa.db
-my %ranksdb = ();								# ranks.db
-my %selfinfo = ();								# self.info
+my %taxadb = ();                                # taxa.db
+my %ranksdb = ();                                # ranks.db
+my %selfinfo = ();                                # self.info
 
-my %proteins = ();								# accn -> name (identified by COG)
+my %proteins = ();                                # accn -> name (identified by COG)
 
-my @files;										# search report files for each protein set
+my @files;                                        # search report files for each protein set
 
 ## the master table storing everything. It's an array of hashes. Each row is a record.
 my %results = ();
 
 ## the phyletic pattern of the whole genome, aka "fingerprint"
-my %fpN = ();									# number of hits per protein
+my %fpN = ();                                    # number of hits per protein
 
 ## program parameters ##
 
-my $wkDir = $ARGV[0];							# working directory
-my $interactive = 1;							# interactive or automatic mode
+my $wkDir = $ARGV[0];                            # working directory
+my $interactive = 1;                            # interactive or automatic mode
 
-my $minHits = 0;								# minimal number of hits a valid search result should contain
-my $maxHits = 0;								# maximal number of hits to retain for one protein, 0 means infinite
-my $minSize = 0;								# minimal size (aa) of a valid protein (0 means infinite)
-my $evalue = 1e-5;								# E-value cutoff
-my $identity = 0;								# percent identity cutoff
-my $coverage = 0;								# query coverage cutoff
+my $minHits = 0;                                # minimal number of hits a valid search result should contain
+my $maxHits = 0;                                # maximal number of hits to retain for one protein, 0 means infinite
+my $minSize = 0;                                # minimal size (aa) of a valid protein (0 means infinite)
+my $evalue = 1e-5;                                # E-value cutoff
+my $identity = 0;                                # percent identity cutoff
+my $coverage = 0;                                # query coverage cutoff
 
 # algorithm
-my $selfRank = 0;								# taxonomic rank(s) on which the program analyzes
-my $normalize = 1;								# use relative bit score (bit score of subject / bit score of query)
-my $unite = 1;									# hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
+my $selfRank = 0;                                # taxonomic rank(s) on which the program analyzes
+my $normalize = 1;                                # use relative bit score (bit score of subject / bit score of query)
+my $unite = 1;                                    # hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
 
-my $useDistance = 0;							# use phylogenetic distance instead of bit scores
-my $useWeight = 1;								# use weight (sum of scores) instead of number of hits
+my $useDistance = 0;                            # use phylogenetic distance instead of bit scores
+my $useWeight = 1;                                # use weight (sum of scores) instead of number of hits
 
 # fingerprints
-my $outRaw = 1;									# output raw number/weight data
-my $outFp = 1;									# output fingerprint
-my $graphFp = 0;								# graph fingerprint (requires R)
+my $outRaw = 1;                                    # output raw number/weight data
+my $outFp = 1;                                    # output fingerprint
+my $graphFp = 0;                                # graph fingerprint (requires R)
 
-my $plotRef = "";								# file name of reference set of genes
+my $plotRef = "";                                # file name of reference set of genes
 
-my $boxPlot = 1;								# box plot
-my $histogram = 1;								# histogram
-my $densityPlot = 1;							# density plot
-my $scatterPlot = 1;							# scatter plot
-my $plot3D = 0;									# 3-way scatter plot
+my $boxPlot = 1;                                # box plot
+my $histogram = 1;                                # histogram
+my $densityPlot = 1;                            # density plot
+my $scatterPlot = 1;                            # scatter plot
+my $plot3D = 0;                                    # 3-way scatter plot
 
 # cutoffs
-my $howCO = 4;									# how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
-													# cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
-													# 5: hierarchical clustering)
-my $globalCO = 0.25;							# arbitrary global cutoff (%)
-my ($selfCO, $closeCO, $distalCO) = (0, 0, 0);	# user-defined cutoffs for individual groups
+my $howCO = 4;                                    # how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
+                                                    # cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
+                                                    # 5: hierarchical clustering)
+my $globalCO = 0.25;                            # arbitrary global cutoff (%)
+my ($selfCO, $closeCO, $distalCO) = (0, 0, 0);    # user-defined cutoffs for individual groups
 
-my $exOutlier = 0;								# exclude outliers, hits distant from other hits of the same group
+my $exOutlier = 0;                                # exclude outliers, hits distant from other hits of the same group
 
-my $nBin = 20;									# number of bins in histogram
-my $plotHist = 0;								# plot histogram on screen
+my $nBin = 20;                                    # number of bins in histogram
+my $plotHist = 0;                                # plot histogram on screen
 
-my $toolKDE = 0;								# computational tool for kernel density estimation
-my $bwF = 1;									# bandwidth selection factor
-my $plotKDE = 0;								# plot density function on screen
-my $toolExtrema = 0;							# computational tool for identifying local extrema of density function (0: Perl code, 1: R package "pastecs")
-my $whichPeak = 0;								# definition of "typical" and "atypical" regions
-my $modKCO = 1;									# location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
-my $qKCO = 0.5;									# horizontal/vertical quantile from 1st pit toward 1st peak
+my $toolKDE = 0;                                # computational tool for kernel density estimation
+my $bwF = 1;                                    # bandwidth selection factor
+my $plotKDE = 0;                                # plot density function on screen
+my $toolExtrema = 0;                            # computational tool for identifying local extrema of density function (0: Perl code, 1: R package "pastecs")
+my $whichPeak = 0;                                # definition of "typical" and "atypical" regions
+my $modKCO = 1;                                    # location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
+my $qKCO = 0.5;                                    # horizontal/vertical quantile from 1st pit toward 1st peak
 
-my $dipTest = 0;								# perform non-unimodality test (Hartigan's dip test) and report p-value
-my $dipSig = 0;									# use global cutoff if dip test's result is not significant
+my $dipTest = 0;                                # perform non-unimodality test (Hartigan's dip test) and report p-value
+my $dipSig = 0;                                    # use global cutoff if dip test's result is not significant
 
-my $selfLow = 0;								# HGT-derived genes must have low self weight (an optional criterion)
+my $selfLow = 0;                                # HGT-derived genes must have low self weight (an optional criterion)
 
-my $BBH = 0;									# use conventional best match method instead
-my $loss = 0;									# also report gene loss events
-my $POE = 0;									# also report POE
+my $BBH = 0;                                    # use conventional best match method instead
+my $loss = 0;                                    # also report gene loss events
+my $POE = 0;                                    # also report POE
 
 my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
 
@@ -139,109 +139,109 @@
 my @inSets = ();
 my @exSets = ();
 
-my $R;											# Statistics::R instance
+my $R;                                            # Statistics::R instance
 
 ## read configurations ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		$interactive = $1 if /^interactive=([01])$/;
-		$minHits = $1 if /^minHits=(\d+)$/;
-		$maxHits = $1 if /^maxHits=(\d+)$/;
-		$minSize = $1 if /^minSize=(\d+)$/;
-		$evalue = $1 if /^evalue=(.+)$/;
-		$identity = $1 if /^identity=(.+)$/;
-		$identity = $1 if /^percIdent=(.+)$/; # backward compatibility
-		$coverage = $1 if /^coverage=(.+)$/;
-
-		$selfRank = $1 if /^selfRank=(.+)$/;
-		$normalize = $1 if /^normalize=(.+)$/;
-		$unite = $1 if /^unite=([01])$/;
-		$useDistance = $1 if /^useDistance=([01])$/;
-		$useWeight = $1 if /^useWeight=([01])$/;
-
-		$howCO = $1 if /^howCO=(\d)$/;
-		$globalCO = $1 if /^globalCO=(.+)$/;
-		$selfCO = $1 if /^selfCO=(.+)$/;
-		$closeCO = $1 if /^closeCO=(.+)$/;
-		$distalCO = $1 if /^distalCO=(.+)$/;
-
-		$exOutlier = $1 if /^exOutlier=([0123])$/;
-
-		$nBin = $1 if /^nBin=(\d+)$/;
-		$plotHist = $1 if /^plotHist=([01])$/;
-
-		$toolKDE = $1 if /^toolKDE=([01])$/;
-		$bwF = $1 if /^bwF=(.+)$/;
-		$plotKDE = $1 if /^plotKDE=([01])$/;
-		$toolExtrema = $1 if /^toolExtrema=([01])$/;
-		$whichPeak = $1 if /^whichPeak=([0123])$/;
-		$modKCO = $1 if /^modKCO=([0123])$/;
-		$qKCO = $1 if /^qKCO=(.+)$/;
-
-		$dipTest = $1 if /^dipTest=([01])$/;
-		$dipSig = $1 if /^dipSig=(.+)$/;
-
-		$selfLow = $1 if /^selfLow=([01])$/;
-		
-		$outRaw = $1 if /^outRaw=([01])$/;
-		$outFp = $1 if /^outFp=([01])$/;
-		$graphFp = $1 if /^graphFp=([01])$/;
-		$plotRef = $1 if /^plotRef=(.+)$/;
-		
-		$boxPlot = $1 if /^boxPlot=([01])$/;
-		$histogram = $1 if /^histogram=([01])$/;
-		$densityPlot = $1 if /^densityPlot=([01])$/;
-		$scatterPlot = $1 if /^scatterPlot=([01])$/;
-		$plot3D = $1 if /^plot3D=([01])$/;
-	
-		$BBH = $1 if /^BBH=([012])$/;
-		$loss = $1 if /^loss=([01])$/;
-		$POE = $1 if /^POE=([01])$/;
-		
-		@ranks = split (/\s*,\s*/, $1) if /^ranks=(.+)$/;
-
-		@selfGroup = split (/\s*,\s*/, $1) if /^selfGroup=(.+)$/;
-		@closeGroup = split (/\s*,\s*/, $1) if /^closeGroup=(.+)$/;
-		@excludeGroup = split (/\s*,\s*/, $1) if /^excludeGroup=(.+)$/;
-		@inSets = split (/\s*,\s*/, $1) if /^inSets=(.+)$/;
-		@exSets = split (/\s*,\s*/, $1) if /^exSets=(.+)$/;
-	}
-	close IN;
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        $interactive = $1 if /^interactive=([01])$/;
+        $minHits = $1 if /^minHits=(\d+)$/;
+        $maxHits = $1 if /^maxHits=(\d+)$/;
+        $minSize = $1 if /^minSize=(\d+)$/;
+        $evalue = $1 if /^evalue=(.+)$/;
+        $identity = $1 if /^identity=(.+)$/;
+        $identity = $1 if /^percIdent=(.+)$/; # backward compatibility
+        $coverage = $1 if /^coverage=(.+)$/;
+
+        $selfRank = $1 if /^selfRank=(.+)$/;
+        $normalize = $1 if /^normalize=(.+)$/;
+        $unite = $1 if /^unite=([01])$/;
+        $useDistance = $1 if /^useDistance=([01])$/;
+        $useWeight = $1 if /^useWeight=([01])$/;
+
+        $howCO = $1 if /^howCO=(\d)$/;
+        $globalCO = $1 if /^globalCO=(.+)$/;
+        $selfCO = $1 if /^selfCO=(.+)$/;
+        $closeCO = $1 if /^closeCO=(.+)$/;
+        $distalCO = $1 if /^distalCO=(.+)$/;
+
+        $exOutlier = $1 if /^exOutlier=([0123])$/;
+
+        $nBin = $1 if /^nBin=(\d+)$/;
+        $plotHist = $1 if /^plotHist=([01])$/;
+
+        $toolKDE = $1 if /^toolKDE=([01])$/;
+        $bwF = $1 if /^bwF=(.+)$/;
+        $plotKDE = $1 if /^plotKDE=([01])$/;
+        $toolExtrema = $1 if /^toolExtrema=([01])$/;
+        $whichPeak = $1 if /^whichPeak=([0123])$/;
+        $modKCO = $1 if /^modKCO=([0123])$/;
+        $qKCO = $1 if /^qKCO=(.+)$/;
+
+        $dipTest = $1 if /^dipTest=([01])$/;
+        $dipSig = $1 if /^dipSig=(.+)$/;
+
+        $selfLow = $1 if /^selfLow=([01])$/;
+
+        $outRaw = $1 if /^outRaw=([01])$/;
+        $outFp = $1 if /^outFp=([01])$/;
+        $graphFp = $1 if /^graphFp=([01])$/;
+        $plotRef = $1 if /^plotRef=(.+)$/;
+
+        $boxPlot = $1 if /^boxPlot=([01])$/;
+        $histogram = $1 if /^histogram=([01])$/;
+        $densityPlot = $1 if /^densityPlot=([01])$/;
+        $scatterPlot = $1 if /^scatterPlot=([01])$/;
+        $plot3D = $1 if /^plot3D=([01])$/;
+
+        $BBH = $1 if /^BBH=([012])$/;
+        $loss = $1 if /^loss=([01])$/;
+        $POE = $1 if /^POE=([01])$/;
+
+        @ranks = split (/\s*,\s*/, $1) if /^ranks=(.+)$/;
+
+        @selfGroup = split (/\s*,\s*/, $1) if /^selfGroup=(.+)$/;
+        @closeGroup = split (/\s*,\s*/, $1) if /^closeGroup=(.+)$/;
+        @excludeGroup = split (/\s*,\s*/, $1) if /^excludeGroup=(.+)$/;
+        @inSets = split (/\s*,\s*/, $1) if /^inSets=(.+)$/;
+        @exSets = split (/\s*,\s*/, $1) if /^exSets=(.+)$/;
+    }
+    close IN;
 }
-if ($identity and $identity < 1){ $identity *= 100; }
-if ($coverage and $coverage < 1){ $coverage *= 100; }
+if ($identity and $identity < 1) { $identity *= 100; }
+if ($coverage and $coverage < 1) { $coverage *= 100; }
 
 ## check previous result ##
 
-if (-d "$wkDir/result/detail" and -d "$wkDir/result/statistics"){
-	print "Warning: Prediction result from a previous analysis is detected.\n";
-	if ($interactive){ print "Press Enter to overwrite, or Ctrl+C to exit:\n"; $s = <STDIN>; }
-	else{ print "To be overwritten.\n"; }
+if (-d "$wkDir/result/detail" and -d "$wkDir/result/statistics") {
+    print "Warning: Prediction result from a previous analysis is detected.\n";
+    if ($interactive) { print "Press Enter to overwrite, or Ctrl+C to exit:\n"; $s = <STDIN>; }
+    else { print "To be overwritten.\n"; }
 }
 
 ## verify configurations ##
 
-if ($globalCO){
-	$globalCO = $globalCO / 100 if ($globalCO =~ s/%$//);
-	die "Error: Global cutoff must be between 0 and 1.\n" if ($globalCO <= 0 or $globalCO >=1);
+if ($globalCO) {
+    $globalCO = $globalCO / 100 if ($globalCO =~ s/%$//);
+    die "Error: Global cutoff must be between 0 and 1.\n" if ($globalCO <= 0 or $globalCO >=1);
 }
 
 # conditionally use Statistics::R for Perl-R communication;
-if ($graphFp or ($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
-	eval{ require Statistics::R; Statistics::R->import() };
-	die "Error: Perl module Statistics::R is not available.\n" if ($@);
-	$R = Statistics::R->new();
+if ($graphFp or ($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+    eval { require Statistics::R; Statistics::R->import() };
+    die "Error: Perl module Statistics::R is not available.\n" if ($@);
+    $R = Statistics::R->new();
 }
 
 ## initiate global variables ##
 
-if ($unite){
-	@b = (); @a = ('0','1','2');
-	$fpN{'0'}{$_}{'data'} = [@b] for (@a);
-	# $fpS{'0'}{$_}{'data'} = [@b] for (@a);
+if ($unite) {
+    @b = (); @a = ('0','1','2');
+    $fpN{'0'}{$_}{'data'} = [@b] for (@a);
+    # $fpS{'0'}{$_}{'data'} = [@b] for (@a);
 }
 
 
@@ -249,136 +249,136 @@
 
 print "Reading taxonomic information...";
 open IN, "<$wkDir/taxonomy/taxa.db";
-while (<IN>){
-	next if /^#/;
-	@a = split /\t/;
-	next unless $#a;
-	$a[$#a] =~ s/\s+$//;
-	%h = ('name',$a[1],'rank',$a[2]);
-	$i = 3; $h{$_} = $a[$i++] for (@ranks);
-	$taxadb{$a[0]} = {%h};
+while (<IN>) {
+    next if /^#/;
+    @a = split /\t/;
+    next unless $#a;
+    $a[$#a] =~ s/\s+$//;
+    %h = ('name',$a[1],'rank',$a[2]);
+    $i = 3; $h{$_} = $a[$i++] for (@ranks);
+    $taxadb{$a[0]} = {%h};
 }
 close IN;
 open IN, "<$wkDir/taxonomy/ranks.db";
-while (<IN>){
-	s/\s+$//; next if /^#/; next unless $_;
-	@a = split /\t/;
-	$ranksdb{$a[0]} = $a[1];
+while (<IN>) {
+    s/\s+$//; next if /^#/; next unless $_;
+    @a = split /\t/;
+    $ranksdb{$a[0]} = $a[1];
 }
 close IN;
 open IN, "<$wkDir/taxonomy/self.info";
-while (<IN>){
-	s/\s+$//; next if /^#/; next unless $_;
-	@a = split /\t/;
-	next if (scalar(@a) < 3);
-	%h = ('taxid',$a[1],'name',$a[2]);
-	$selfinfo{$a[0]} = {%h};
+while (<IN>) {
+    s/\s+$//; next if /^#/; next unless $_;
+    @a = split /\t/;
+    next if (scalar(@a) < 3);
+    %h = ('taxid',$a[1],'name',$a[2]);
+    $selfinfo{$a[0]} = {%h};
 }
 close IN;
 print " done.\n";
 
 print "Analyzing taxonomic information...";
-for ($i=0; $i<=$#ranks; $i++){
-	$j = 1; # if all names are the same
-	$s = 0; # last name of this rank
-	foreach my $key (keys %selfinfo){
-		$t = $taxadb{$selfinfo{$key}{'taxid'}}{$ranks[$i]};
-		$s = $t unless $s;
-		if ($t != $s){ $j = 0; last; }
-	}
-	last if $j;
+for ($i=0; $i<=$#ranks; $i++) {
+    $j = 1; # if all names are the same
+    $s = 0; # last name of this rank
+    foreach my $key (keys %selfinfo) {
+        $t = $taxadb{$selfinfo{$key}{'taxid'}}{$ranks[$i]};
+        $s = $t unless $s;
+        if ($t != $s) { $j = 0; last; }
+    }
+    last if $j;
 }
 print " done.\n";
 $selves{'rank'} = $ranks[$i];
 $selves{'rank_id'} = $i;
 $selves{'rank_taxid'} = $s;
-if (exists $ranksdb{$s}){ $selves{'rank_name'} = $ranksdb{$s}; }
-elsif (exists $taxadb{$s}){ $selves{'rank_name'} = $taxadb{$s}{'name'}; }
-else{ die "Unknown TaxID $s.\n"; }
+if (exists $ranksdb{$s}) { $selves{'rank_name'} = $ranksdb{$s}; }
+elsif (exists $taxadb{$s}) { $selves{'rank_name'} = $taxadb{$s}{'name'}; }
+else { die "Unknown TaxID $s.\n"; }
 print "  All input genomes belong to $selves{'rank'} $selves{'rank_name'} (TaxID: $selves{'rank_taxid'}).\n";
 
-if (@selfGroup){ # user-defined self group
-	$lv{'rank'} = "(user-defined self)"; $lv{'id'} = -1; $lv{'taxid'} = join (",", @selfGroup);
-	@a = ();
-	foreach (@selfGroup){
-		if (exists $ranksdb{$_}){ push @a, $ranksdb{$_}; }
-		elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
-		else { push @a, "unknown"; }
-	}
-	$lv{'name'} = join (",", @a);
-}else{ # lowest rank that contains all input organisms
-	if ($selfRank =~ /\d/){ # relative level number
-		$lv{'rank'} = $ranks[$selves{'rank_id'} + $selfRank];
-		$lv{'id'} = $selves{'rank_id'} + $selfRank;
-	}else{ # rank name
-		$lv{'rank'} = $selfRank;
-		for ($j=0; $j<=$#ranks; $j++){
-			if ($ranks[$j] eq $selfRank){ $lv{'id'} = $j; last; }
-		}
-	}
-	$s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
-	$lv{'taxid'} = $taxadb{$s}{$lv{'rank'}};
-	if (exists $ranksdb{$lv{'taxid'}}){ $lv{'name'} = $ranksdb{$lv{'taxid'}}; }
-	elsif (exists $taxadb{$lv{'taxid'}}){ $lv{'name'} = $taxadb{$lv{'taxid'}}{'name'}; }
-	else{ die "Unknown TaxID $lv{'taxid'}.\n"; }
+if (@selfGroup) { # user-defined self group
+    $lv{'rank'} = "(user-defined self)"; $lv{'id'} = -1; $lv{'taxid'} = join (",", @selfGroup);
+    @a = ();
+    foreach (@selfGroup) {
+        if (exists $ranksdb{$_}) { push @a, $ranksdb{$_}; }
+        elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
+        else { push @a, "unknown"; }
+    }
+    $lv{'name'} = join (",", @a);
+} else { # lowest rank that contains all input organisms
+    if ($selfRank =~ /\d/) { # relative level number
+        $lv{'rank'} = $ranks[$selves{'rank_id'} + $selfRank];
+        $lv{'id'} = $selves{'rank_id'} + $selfRank;
+    } else { # rank name
+        $lv{'rank'} = $selfRank;
+        for ($j=0; $j<=$#ranks; $j++) {
+            if ($ranks[$j] eq $selfRank) { $lv{'id'} = $j; last; }
+        }
+    }
+    $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
+    $lv{'taxid'} = $taxadb{$s}{$lv{'rank'}};
+    if (exists $ranksdb{$lv{'taxid'}}) { $lv{'name'} = $ranksdb{$lv{'taxid'}}; }
+    elsif (exists $taxadb{$lv{'taxid'}}) { $lv{'name'} = $taxadb{$lv{'taxid'}}{'name'}; }
+    else { die "Unknown TaxID $lv{'taxid'}.\n"; }
 }
 %h = ();
-foreach my $key (keys %taxadb){
-	next unless $taxadb{$key}{'rank'};
-	foreach (split (/,/, $lv{'taxid'})){
-		$h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
-	}
+foreach my $key (keys %taxadb) {
+    next unless $taxadb{$key}{'rank'};
+    foreach (split (/,/, $lv{'taxid'})) {
+        $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
+    }
 }
 $lvList{$lv{'rank'}} = {%h};
 
-if (@closeGroup){ # user-defined close group
-	$lv{'prank'} = "(user-defined close)"; $lv{'ptaxid'} = join (",", @closeGroup);
-	@a = ();
-	foreach (@closeGroup){
-		if (exists $ranksdb{$_}){ push @a, $ranksdb{$_}; }
-		elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
-		else { push @a, "unknown"; }
-	}
-	$lv{'pname'} = join (",", @a);
-}else{ # lowest parent rank that has adequate members for statistical analysis
-	if ($lv{'id'} >= $#ranks){ die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
-	print "  Choose one of the following parental taxonomic ranks as the close group:\n";
-	my @pranks = ();
-	$s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
-	for ($i=$lv{'id'}+1; $i<=$#ranks; $i++){
-		next unless exists $taxadb{$s}{$ranks[$i]};
-		$t = $taxadb{$s}{$ranks[$i]};
-		%h = ('rank' => $ranks[$i], 'taxid' => $t, 'name' => '', 'n' => 0, 'm' => 0); # m is the number of members - number of self members
-		if (exists $ranksdb{$t}){ $h{'name'} = $ranksdb{$t}; }
-		elsif (exists $taxadb{$t}){ $h{'name'} = $taxadb{$t}{'name'}; }
-		else{ next; }
-		foreach my $key (keys %taxadb){
-			if ($key eq $t or $taxadb{$key}{'rank'} =~ /\/$t$/ or $taxadb{$key}{'rank'} =~ /\/$t\//){
-				$h{'n'} ++;
-				$h{'m'} ++ unless exists $lvList{$lv{'rank'}}{$key};
-			}
-		}
-		push @pranks, {%h};
-		print "    ".$h{'rank'}." ".$h{'name'}." (TaxID: ".$h{'taxid'}.") (".$h{'n'}." members).\n";
-	}
-	unless (scalar @pranks){ die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
-	for ($i=0; $i<=$#pranks; $i++){
-		if ($pranks[$i]{'m'} >= 10 or $i == $#pranks){ ######## new rule: close group should have >=10 members
-			print "  The program intelligently chose $pranks[$i]{'rank'} $pranks[$i]{'name'}.\n";
-			if ($interactive){ print "Press Enter to accept, or Ctrl+C to exit:\n"; $s = <STDIN>; }
-			$lv{'prank'} = $pranks[$i]{'rank'};
-			$lv{'ptaxid'} = $pranks[$i]{'taxid'};
-			$lv{'pname'} = $pranks[$i]{'name'};
-			last;
-		}
-	}
+if (@closeGroup) { # user-defined close group
+    $lv{'prank'} = "(user-defined close)"; $lv{'ptaxid'} = join (",", @closeGroup);
+    @a = ();
+    foreach (@closeGroup) {
+        if (exists $ranksdb{$_}) { push @a, $ranksdb{$_}; }
+        elsif (exists $taxadb{$_}) { push @a, $taxadb{$_}{'name'}; }
+        else { push @a, "unknown"; }
+    }
+    $lv{'pname'} = join (",", @a);
+} else { # lowest parent rank that has adequate members for statistical analysis
+    if ($lv{'id'} >= $#ranks) { die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
+    print "  Choose one of the following parental taxonomic ranks as the close group:\n";
+    my @pranks = ();
+    $s = $selfinfo{(keys %selfinfo)[0]}{'taxid'};
+    for ($i=$lv{'id'}+1; $i<=$#ranks; $i++) {
+        next unless exists $taxadb{$s}{$ranks[$i]};
+        $t = $taxadb{$s}{$ranks[$i]};
+        %h = ('rank' => $ranks[$i], 'taxid' => $t, 'name' => '', 'n' => 0, 'm' => 0); # m is the number of members - number of self members
+        if (exists $ranksdb{$t}) { $h{'name'} = $ranksdb{$t}; }
+        elsif (exists $taxadb{$t}) { $h{'name'} = $taxadb{$t}{'name'}; }
+        else { next; }
+        foreach my $key (keys %taxadb) {
+            if ($key eq $t or $taxadb{$key}{'rank'} =~ /\/$t$/ or $taxadb{$key}{'rank'} =~ /\/$t\//) {
+                $h{'n'} ++;
+                $h{'m'} ++ unless exists $lvList{$lv{'rank'}}{$key};
+            }
+        }
+        push @pranks, {%h};
+        print "    ".$h{'rank'}." ".$h{'name'}." (TaxID: ".$h{'taxid'}.") (".$h{'n'}." members).\n";
+    }
+    unless (scalar @pranks) { die "Cannot find a parent taxonomic rank of $lv{'taxid'}.\n"; }
+    for ($i=0; $i<=$#pranks; $i++) {
+        if ($pranks[$i]{'m'} >= 10 or $i == $#pranks) { ######## new rule: close group should have >=10 members
+            print "  The program intelligently chose $pranks[$i]{'rank'} $pranks[$i]{'name'}.\n";
+            if ($interactive) { print "Press Enter to accept, or Ctrl+C to exit:\n"; $s = <STDIN>; }
+            $lv{'prank'} = $pranks[$i]{'rank'};
+            $lv{'ptaxid'} = $pranks[$i]{'taxid'};
+            $lv{'pname'} = $pranks[$i]{'name'};
+            last;
+        }
+    }
 }
 %h = ();
-foreach my $key (keys %taxadb){
-	next unless $taxadb{$key}{'rank'};
-	foreach (split (/,/, $lv{'ptaxid'})){
-		$h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
-	}
+foreach my $key (keys %taxadb) {
+    next unless $taxadb{$key}{'rank'};
+    foreach (split (/,/, $lv{'ptaxid'})) {
+        $h{$key} = 1 if ($key eq $_ or $taxadb{$key}{'rank'} =~ /\/$_$/ or $taxadb{$key}{'rank'} =~ /\/$_\//);
+    }
 }
 $lvList{$lv{'prank'}} = {%h};
 
@@ -386,25 +386,25 @@
 print "  Self: $lv{'rank'} $lv{'name'} (TaxID: $lv{'taxid'}) (".(keys %{$lvList{$lv{'rank'}}})." members),\n";
 print "  Close: $lv{'prank'} $lv{'pname'} (TaxID: $lv{'ptaxid'}) (".(keys %{$lvList{$lv{'prank'}}})." members),\n";
 print "  Distal: all other organisms.\n";
-if ($interactive){
-	print "Press Enter to continue, or Ctrl+C to exit:";
-	$s = <STDIN>;
+if ($interactive) {
+    print "Press Enter to continue, or Ctrl+C to exit:";
+    $s = <STDIN>;
 }
 
 ## Read reference proteins ##
 
 my %refs = ();
-if ($plotRef){
-	if (-e $plotRef){ open IN, "<$plotRef"; }
-	elsif (-e "$wkDir/$plotRef"){ open IN, "<$wkDir/$plotRef"; }
-	else{ die "Reference file $plotRef does not exist.\n"; }
-	while (<IN>){
-		s/\s+$//; next unless $_;
-		@a = split (/\t/);
-		$a[0] =~ s/\.\d+$//;
-		$refs{$a[0]} = [0,0,0,0]; # exist, self, close, distal
-	}
-	close IN;
+if ($plotRef) {
+    if (-e $plotRef) { open IN, "<$plotRef"; }
+    elsif (-e "$wkDir/$plotRef") { open IN, "<$wkDir/$plotRef"; }
+    else { die "Reference file $plotRef does not exist.\n"; }
+    while (<IN>) {
+        s/\s+$//; next unless $_;
+        @a = split (/\t/);
+        $a[0] =~ s/\.\d+$//;
+        $refs{$a[0]} = [0,0,0,0]; # exist, self, close, distal
+    }
+    close IN;
 }
 
 
@@ -414,9 +414,9 @@
 opendir (DIR, "$wkDir/search");
 @a = readdir(DIR);
 close DIR;
-foreach (@a){
-	next if (/^\./);
-	push @sets, $_ if -d "$wkDir/search/$_";
+foreach (@a) {
+    next if (/^\./);
+    push @sets, $_ if -d "$wkDir/search/$_";
 }
 print "done. ";
 die "No genome detected.\n" unless @sets;
@@ -428,248 +428,248 @@
 print "Analyzing search results...\n";
 print "0-------------25-------------50------------75------------100%\n";
 
-foreach my $set (@sets){
-	if (@inSets){ $i = 0; foreach (@inSets){ if ($set eq $_){ $i = 1; last; } } next unless $i; }
-	if (@exSets){ $i = 0; foreach (@exSets){ if ($set eq $_){ $i = 1; last; } } next if $i; }
-	opendir (DIR, "$wkDir/search/$set");
-	@files = grep(/\.txt$/,readdir(DIR));
-	close DIR;
-	print "No protein found in $set\n" and next unless @files;
-
-	## varibles to show a progress bar
-	my $iProtein = 0;
-	my $iProgress = 0;
-	my $nProtein = $#files+1;
-	print "$set has $nProtein proteins. Analyzing...\n";
-
-	unless ($unite){
-		@a = ('0','1','2'); @b = ();
-		$fpN{$set}{$_}{'data'} = [@b] for (@a);
-	}	
-	
-	## information of self
-	my %self = ();
-
-	foreach my $file (@files){
-		$iProtein ++;
-		
-		my %result = ();				# a record to store everything about this search										
-		my %scores = ();				# scores of each hit by category, as a buffer for computing the statistics above
-		my @hits;						# parameters of the hits. one hit contains:
-										# accn, organism, group, taxid, genus, score
-		$file =~ /(.+)\.[^.]+$/;
-		$result{'query'} = $1;
-
-		my $nHits = 0;					# total number of hits. just for convenience
-		my $nScore = 0;					# total score
-		my $lastHit = "";				# store the last hit in the organism table, for the identification of duplicated taxa
-		my $selfScore = 0;
-
-		# read hit table #
-		
-		open IN, "<$wkDir/search/$set/$file" or next;
-		my $reading = 0;
-		my ($hasCoverage, $hasDistance) = (0, 0);
-		while (<IN>) {
-			s/\s+$//;
-			if (/^BEGIN QUERY/){ $reading = "query"; next; }
-			if (/^BEGIN ORGANISM/){ $reading = "organism"; next; }
-			if (/^BEGIN DATA/){ $reading = "data"; next; }
-			if (/^END;/){ $reading = 0; next; }
-			if ($reading eq "query"){ # read query (self)
-				$result{'accn'} = $1 if /^\tName=(.+);$/;
-				$result{'gi'} = $1 if /^\tGI=(\d+);$/;
-				$result{'length'} = $1 if /^\tLength=(.+);$/;
-				$result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
-				$result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
-				if (/^\tAccession=(.+);$/){ $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
-			}
-			if ($reading eq "organism"){ # read organisms
-				next if /^;/;
-				if (/^\[/){
-					$hasCoverage = 1 if /Coverage/;
-					$hasDistance = 1 if /Distance/;
-					next;
-				}
-				@a = split (/\t/);
-				if ($#a < 5){
-					print "\nIncomplete hit record $a[0] in $file of $set.\n";
-					if ($interactive){
-						print "Press Enter to continue, or Ctrl+C to exit:";
-						$s = <STDIN>;
-					}
-					next;
-				}
-				
-				# filter out low-quality hits
-				next if ($a[$#a] eq "x");
-				next if ($evalue and $a[4] ne "*" and $a[4] > $evalue);
-				next if ($identity and $a[5] ne "*" and $a[5] < $identity);
-				next if ($coverage and $hasCoverage and $a[6] and $a[6] ne "*" and $a[6] < $coverage);
-				
-				# filter out unwanted taxonomy groups
-				if (@excludeGroup){
-					my $isExclude = 0;
-					foreach (@excludeGroup){
-						if ($a[2] == $_ or $taxadb{$a[2]}{'rank'} =~ /\/$_$/ or $taxadb{$a[2]}{'rank'} =~ /\/$_\//){
-							$isExclude = 1;
-							last;
-						}
-					}
-					next if $isExclude;
-				}
-				
-				my %hit = ();
-				$hit{'accns'} = $a[0];
-				$hit{'organism'} = $a[1];
-				$hit{'taxid'} = $a[2];
-				$hit{'score'} = $a[3];
-				$hit{'evalue'} = $a[4];
-				$hit{'identity'} = $a[5];
-				$hit{'coverage'} = $a[6] if $hasCoverage;
-				
-				# use phylogenetic distance instead of bit score
-				if ($useDistance){
-					if ($#a >= 6){ $hit{'score'} = 1 - $a[6+$hasCoverage]; }
-					elsif (!@hits){ $hit{'score'} = 0; }
-					else { $hit{'score'} = $hits[$#hits]{'score'}; }
-				}
-				
-				@a = split(/\//, $a[0]);
-				$hit{'accn'} = $a[0];
-				push @hits, {%hit};
-			}
-			last if ($maxHits and $#hits >= $maxHits-1);
-		}
-		close IN;
-		
-		# skip if there is no hit
-		next unless @hits;
-		
-		# next if ($minSize and ($result{'length'} < $minSize));
-		unless (exists $result{'query'} and exists $result{'length'}){
-			print "\nIncomplete search result: $set/$file.\n" ;
-			if ($interactive){
-				print "Press Enter to continue, or Ctrl+C to exit:";
-				$s = <STDIN>;
-			}
-		}
-		$result{'product'} = '' unless exists $result{'product'};
-
-		## Intepret hit table ##
-
-		# total number of hits
-		$result{'n'} = $#hits+1;
-
-		# sort hits by bit score or phylogenetic distance
-		@hits = sort {$b->{'score'} <=> $a->{'score'}} @hits;
-
-		# identify self (query) information
-		my $isQuery = 0;
-		for ($i=0; $i<=$#hits; $i++){
-			@a = split(/\//, $hits[$i]{'accns'});
-			foreach (@a){
-				if ($result{'accn'} eq $_){
-					$result{'id'} = $i;
-					$result{'taxid'} = $hits[$i]{'taxid'};
-					$result{'score'} = $hits[$i]{'score'};
-					$result{'organism'} = $hits[$i]{'organism'};
-					$isQuery = 1;
-					last;
-				}
-			}
-			last if exists $result{'id'};
-		}
-		unless (exists $result{'id'}){
-			$result{'id'} = 0;
-			$result{'taxid'} = $hits[0]{'taxid'};
-			$result{'score'} = $hits[0]{'score'};
-			$result{'organism'} = $hits[0]{'organism'};		
-		}
-		next unless $result{'score'};
-
-		# Use absolute or relative bit scores
-		
-		if ($normalize and not $useDistance){
-			for ($i=0; $i<=$#hits; $i++){
-				$hits[$i]{'score'} = sprintf("%.3f", $hits[$i]{'score'}/$result{'score'});
-			}
-		}
-
-		# initialize values of prediction results
-		$result{'in'} = "";					# whether incoming HGT or origination took place within the group
-		$result{'loss'} = "";				# gene loss event
-		$result{'origin'} = "";				# gene origination event
-		$result{'income'} = "";				# incoming HGT event
-		$result{'outcome'} = "";			# outcoming HGT event
-
-		# Summarize numbers and scores #
-		## 0 - self group, 1 - close groups, 2 - distal gs
-		## N - number, S - scores
-		## hit1 - first close hit, hit 2 - first distal hit
-		
-		my ($topCloseScore, $topDistalScore) = (0, 0);
-		$result{'N0'} = 0; $result{'N1'} = 0; $result{'N2'} = 0;
-		@a = (); $result{'S0'} = [@a]; $result{'S1'} = [@a]; $result{'S2'} = [@a];
-		for ($i=0; $i<=$#hits; $i++){
-			if (exists $lvList{$lv{'rank'}}{$hits[$i]{'taxid'}}){
-				next unless $isQuery;
-				if ($useWeight){ $result{'N0'} += $hits[$i]{'score'}; }
-				else { $result{'N0'} ++; }
-				push @{$result{'S0'}}, $hits[$i]{'score'};
-			}elsif (exists $lvList{$lv{'prank'}}{$hits[$i]{'taxid'}}){
-				if ($useWeight){ $result{'N1'} += $hits[$i]{'score'}; }
-				else { $result{'N1'} ++; }
-				push @{$result{'S1'}}, $hits[$i]{'score'};
-				$result{'hit1'} = $hits[$i]{'taxid'} unless exists $result{'hit1'};
-				$topCloseScore = $hits[$i]{'score'} unless $topCloseScore;
-				$result{'BBH'} = "" unless exists $result{'BBH'};
-			}else{
-				if ($useWeight){ $result{'N2'} += $hits[$i]{'score'}; }
-				else { $result{'N2'} ++; }
-				push @{$result{'S2'}}, $hits[$i]{'score'};
-				unless (exists $result{'hit2'}){
-					$result{'hit2'} = $hits[$i]{'taxid'};
-					$result{'hit2accn'} = $hits[$i]{'accn'};
-				}
-				$topDistalScore = $hits[$i]{'score'} unless $topDistalScore;
-				$result{'BBH'} = 1 unless exists $result{'BBH'};
-			}
-		}
-		# more stringent BBH method:
-		# $result{'BBH'} = 1 if ($topDistalScore > $topCloseScore);
-		
-		$result{'N0'} = sprintf("%.3f", $result{'N0'});
-		$result{'N1'} = sprintf("%.3f", $result{'N1'});
-		$result{'N2'} = sprintf("%.3f", $result{'N2'});
-		
-		if (exists $refs{$result{'accn'}}){
-			$refs{$result{'accn'}} = [1,$result{'N0'},$result{'N1'},$result{'N2'}];
-		}
-
-		# proteins without non-self hits are considered as de novo originated and not considered for statistics
-		$result{'origin'} = 1 if $result{'N1'}+$result{'N2'} < 0.000001;
-
-		# proteins with hits lower than minimum cutoff are considered as de novo originated and not considered for statistics
-		if ($minHits and ($result{'n'} < $minHits)){ $result{'origin'} = 1; $result{'BBH'} = ""; }
-
-		# Put this record into the master record
-		push @{$results{$set}}, {%result};
-		
-		# Record fingerprint #
-		if ($unite){ $s = '0'; }
-		else { $s = $set; }
-
-		if ($result{'N0'} and not $result{'origin'}){ # skip in case distance is used and no distance is available
-			push @{$fpN{$s}{'0'}{'data'}}, $result{'N0'};
-			push @{$fpN{$s}{'1'}{'data'}}, $result{'N1'};
-			push @{$fpN{$s}{'2'}{'data'}}, $result{'N2'};
-		}
-		
-		# Show progress
-		print "." and $iProgress++ if ($iProtein/$nProtein >= $iProgress/60);
-	}
-	print "\n";
+foreach my $set (@sets) {
+    if (@inSets) { $i = 0; foreach (@inSets) { if ($set eq $_) { $i = 1; last; } } next unless $i; }
+    if (@exSets) { $i = 0; foreach (@exSets) { if ($set eq $_) { $i = 1; last; } } next if $i; }
+    opendir (DIR, "$wkDir/search/$set");
+    @files = grep(/\.txt$/,readdir(DIR));
+    close DIR;
+    print "No protein found in $set\n" and next unless @files;
+
+    ## varibles to show a progress bar
+    my $iProtein = 0;
+    my $iProgress = 0;
+    my $nProtein = $#files+1;
+    print "$set has $nProtein proteins. Analyzing...\n";
+
+    unless ($unite) {
+        @a = ('0','1','2'); @b = ();
+        $fpN{$set}{$_}{'data'} = [@b] for (@a);
+    }
+
+    ## information of self
+    my %self = ();
+
+    foreach my $file (@files) {
+        $iProtein ++;
+
+        my %result = ();                # a record to store everything about this search
+        my %scores = ();                # scores of each hit by category, as a buffer for computing the statistics above
+        my @hits;                        # parameters of the hits. one hit contains:
+                                        # accn, organism, group, taxid, genus, score
+        $file =~ /(.+)\.[^.]+$/;
+        $result{'query'} = $1;
+
+        my $nHits = 0;                    # total number of hits. just for convenience
+        my $nScore = 0;                    # total score
+        my $lastHit = "";                # store the last hit in the organism table, for the identification of duplicated taxa
+        my $selfScore = 0;
+
+        # read hit table #
+
+        open IN, "<$wkDir/search/$set/$file" or next;
+        my $reading = 0;
+        my ($hasCoverage, $hasDistance) = (0, 0);
+        while (<IN>) {
+            s/\s+$//;
+            if (/^BEGIN QUERY/) { $reading = "query"; next; }
+            if (/^BEGIN ORGANISM/) { $reading = "organism"; next; }
+            if (/^BEGIN DATA/) { $reading = "data"; next; }
+            if (/^END;/) { $reading = 0; next; }
+            if ($reading eq "query") { # read query (self)
+                $result{'accn'} = $1 if /^\tName=(.+);$/;
+                $result{'gi'} = $1 if /^\tGI=(\d+);$/;
+                $result{'length'} = $1 if /^\tLength=(.+);$/;
+                $result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
+                $result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
+                if (/^\tAccession=(.+);$/) { $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
+            }
+            if ($reading eq "organism") { # read organisms
+                next if /^;/;
+                if (/^\[/) {
+                    $hasCoverage = 1 if /Coverage/;
+                    $hasDistance = 1 if /Distance/;
+                    next;
+                }
+                @a = split (/\t/);
+                if ($#a < 5) {
+                    print "\nIncomplete hit record $a[0] in $file of $set.\n";
+                    if ($interactive) {
+                        print "Press Enter to continue, or Ctrl+C to exit:";
+                        $s = <STDIN>;
+                    }
+                    next;
+                }
+
+                # filter out low-quality hits
+                next if ($a[$#a] eq "x");
+                next if ($evalue and $a[4] ne "*" and $a[4] > $evalue);
+                next if ($identity and $a[5] ne "*" and $a[5] < $identity);
+                next if ($coverage and $hasCoverage and $a[6] and $a[6] ne "*" and $a[6] < $coverage);
+
+                # filter out unwanted taxonomy groups
+                if (@excludeGroup) {
+                    my $isExclude = 0;
+                    foreach (@excludeGroup) {
+                        if ($a[2] == $_ or $taxadb{$a[2]}{'rank'} =~ /\/$_$/ or $taxadb{$a[2]}{'rank'} =~ /\/$_\//) {
+                            $isExclude = 1;
+                            last;
+                        }
+                    }
+                    next if $isExclude;
+                }
+
+                my %hit = ();
+                $hit{'accns'} = $a[0];
+                $hit{'organism'} = $a[1];
+                $hit{'taxid'} = $a[2];
+                $hit{'score'} = $a[3];
+                $hit{'evalue'} = $a[4];
+                $hit{'identity'} = $a[5];
+                $hit{'coverage'} = $a[6] if $hasCoverage;
+
+                # use phylogenetic distance instead of bit score
+                if ($useDistance) {
+                    if ($#a >= 6) { $hit{'score'} = 1 - $a[6+$hasCoverage]; }
+                    elsif (!@hits) { $hit{'score'} = 0; }
+                    else { $hit{'score'} = $hits[$#hits]{'score'}; }
+                }
+
+                @a = split(/\//, $a[0]);
+                $hit{'accn'} = $a[0];
+                push @hits, {%hit};
+            }
+            last if ($maxHits and $#hits >= $maxHits-1);
+        }
+        close IN;
+
+        # skip if there is no hit
+        next unless @hits;
+
+        # next if ($minSize and ($result{'length'} < $minSize));
+        unless (exists $result{'query'} and exists $result{'length'}) {
+            print "\nIncomplete search result: $set/$file.\n" ;
+            if ($interactive) {
+                print "Press Enter to continue, or Ctrl+C to exit:";
+                $s = <STDIN>;
+            }
+        }
+        $result{'product'} = '' unless exists $result{'product'};
+
+        ## Intepret hit table ##
+
+        # total number of hits
+        $result{'n'} = $#hits+1;
+
+        # sort hits by bit score or phylogenetic distance
+        @hits = sort {$b->{'score'} <=> $a->{'score'}} @hits;
+
+        # identify self (query) information
+        my $isQuery = 0;
+        for ($i=0; $i<=$#hits; $i++) {
+            @a = split(/\//, $hits[$i]{'accns'});
+            foreach (@a) {
+                if ($result{'accn'} eq $_) {
+                    $result{'id'} = $i;
+                    $result{'taxid'} = $hits[$i]{'taxid'};
+                    $result{'score'} = $hits[$i]{'score'};
+                    $result{'organism'} = $hits[$i]{'organism'};
+                    $isQuery = 1;
+                    last;
+                }
+            }
+            last if exists $result{'id'};
+        }
+        unless (exists $result{'id'}) {
+            $result{'id'} = 0;
+            $result{'taxid'} = $hits[0]{'taxid'};
+            $result{'score'} = $hits[0]{'score'};
+            $result{'organism'} = $hits[0]{'organism'};
+        }
+        next unless $result{'score'};
+
+        # Use absolute or relative bit scores
+
+        if ($normalize and not $useDistance) {
+            for ($i=0; $i<=$#hits; $i++) {
+                $hits[$i]{'score'} = sprintf("%.3f", $hits[$i]{'score'}/$result{'score'});
+            }
+        }
+
+        # initialize values of prediction results
+        $result{'in'} = "";                    # whether incoming HGT or origination took place within the group
+        $result{'loss'} = "";                # gene loss event
+        $result{'origin'} = "";                # gene origination event
+        $result{'income'} = "";                # incoming HGT event
+        $result{'outcome'} = "";            # outcoming HGT event
+
+        # Summarize numbers and scores #
+        ## 0 - self group, 1 - close groups, 2 - distal gs
+        ## N - number, S - scores
+        ## hit1 - first close hit, hit 2 - first distal hit
+
+        my ($topCloseScore, $topDistalScore) = (0, 0);
+        $result{'N0'} = 0; $result{'N1'} = 0; $result{'N2'} = 0;
+        @a = (); $result{'S0'} = [@a]; $result{'S1'} = [@a]; $result{'S2'} = [@a];
+        for ($i=0; $i<=$#hits; $i++) {
+            if (exists $lvList{$lv{'rank'}}{$hits[$i]{'taxid'}}) {
+                next unless $isQuery;
+                if ($useWeight) { $result{'N0'} += $hits[$i]{'score'}; }
+                else { $result{'N0'} ++; }
+                push @{$result{'S0'}}, $hits[$i]{'score'};
+            } elsif (exists $lvList{$lv{'prank'}}{$hits[$i]{'taxid'}}) {
+                if ($useWeight) { $result{'N1'} += $hits[$i]{'score'}; }
+                else { $result{'N1'} ++; }
+                push @{$result{'S1'}}, $hits[$i]{'score'};
+                $result{'hit1'} = $hits[$i]{'taxid'} unless exists $result{'hit1'};
+                $topCloseScore = $hits[$i]{'score'} unless $topCloseScore;
+                $result{'BBH'} = "" unless exists $result{'BBH'};
+            } else {
+                if ($useWeight) { $result{'N2'} += $hits[$i]{'score'}; }
+                else { $result{'N2'} ++; }
+                push @{$result{'S2'}}, $hits[$i]{'score'};
+                unless (exists $result{'hit2'}) {
+                    $result{'hit2'} = $hits[$i]{'taxid'};
+                    $result{'hit2accn'} = $hits[$i]{'accn'};
+                }
+                $topDistalScore = $hits[$i]{'score'} unless $topDistalScore;
+                $result{'BBH'} = 1 unless exists $result{'BBH'};
+            }
+        }
+        # more stringent BBH method:
+        # $result{'BBH'} = 1 if ($topDistalScore > $topCloseScore);
+
+        $result{'N0'} = sprintf("%.3f", $result{'N0'});
+        $result{'N1'} = sprintf("%.3f", $result{'N1'});
+        $result{'N2'} = sprintf("%.3f", $result{'N2'});
+
+        if (exists $refs{$result{'accn'}}) {
+            $refs{$result{'accn'}} = [1,$result{'N0'},$result{'N1'},$result{'N2'}];
+        }
+
+        # proteins without non-self hits are considered as de novo originated and not considered for statistics
+        $result{'origin'} = 1 if $result{'N1'}+$result{'N2'} < 0.000001;
+
+        # proteins with hits lower than minimum cutoff are considered as de novo originated and not considered for statistics
+        if ($minHits and ($result{'n'} < $minHits)) { $result{'origin'} = 1; $result{'BBH'} = ""; }
+
+        # Put this record into the master record
+        push @{$results{$set}}, {%result};
+
+        # Record fingerprint #
+        if ($unite) { $s = '0'; }
+        else { $s = $set; }
+
+        if ($result{'N0'} and not $result{'origin'}) { # skip in case distance is used and no distance is available
+            push @{$fpN{$s}{'0'}{'data'}}, $result{'N0'};
+            push @{$fpN{$s}{'1'}{'data'}}, $result{'N1'};
+            push @{$fpN{$s}{'2'}{'data'}}, $result{'N2'};
+        }
+
+        # Show progress
+        print "." and $iProgress++ if ($iProtein/$nProtein >= $iProgress/60);
+    }
+    print "\n";
 }
 print " done.\n";
 
@@ -680,622 +680,622 @@
 
 ## output raw data for further statistical analysis ##
 
-if ($outRaw){
-	open OUT, ">$wkDir/result/statistics/rawdata.txt";
-	print OUT "Query\tSet\tLength\tHits\tSelf\tClose\tDistal\n";
-	foreach my $set (sort keys %results){
-		$n = @{$results{$set}};
-		for ($i=0; $i<$n; $i++){
-			my %res = %{$results{$set}[$i]};
-			next if $res{'origin'};
-			next unless $res{'N0'};
-			print OUT "$res{'accn'}\t$set\t$res{'length'}\t$res{'n'}\t$res{'N0'}\t$res{'N1'}\t$res{'N2'}\n";
-		}
-	}
-	close OUT;
-	print "Raw data are saved in result/statistics/rawdata.txt.\n";
-	print "You may conduct further analyses on these data.\n";
-	if ($interactive){
-		print "Press Enter to continue, or Ctrl+C to exit:";
-		$s = <STDIN>;
-	}
+if ($outRaw) {
+    open OUT, ">$wkDir/result/statistics/rawdata.txt";
+    print OUT "Query\tSet\tLength\tHits\tSelf\tClose\tDistal\n";
+    foreach my $set (sort keys %results) {
+        $n = @{$results{$set}};
+        for ($i=0; $i<$n; $i++) {
+            my %res = %{$results{$set}[$i]};
+            next if $res{'origin'};
+            next unless $res{'N0'};
+            print OUT "$res{'accn'}\t$set\t$res{'length'}\t$res{'n'}\t$res{'N0'}\t$res{'N1'}\t$res{'N2'}\n";
+        }
+    }
+    close OUT;
+    print "Raw data are saved in result/statistics/rawdata.txt.\n";
+    print "You may conduct further analyses on these data.\n";
+    if ($interactive) {
+        print "Press Enter to continue, or Ctrl+C to exit:";
+        $s = <STDIN>;
+    }
 }
 
 ## graph fingerprints with R ##
 
-if ($graphFp){
-	print "\nGraphing fingerprints with R...";
-	$R->startR;
-	print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
-	if ($plot3D){
-		$R->send("library('rgl')");
-		print "\n  You chose to display interactive 3D scatter plots. They will be displayed sequentially. Use mouse to rotate plots. Press Enter in the terminal to move to the next plot.\n";
-	}
-	foreach my $set (sort keys %fpN){
-		my $fpre = "$wkDir/result/statistics/".("$set." x ($set ne "0")); # prefix of filename
-		my $tpost = " of $set" x ($set ne "0"); # postfix of title
-		my @gcode = ('Self', 'Close', 'Distal');
-		for (0..2){ # send data to R
-			@b = @{$fpN{$set}{$_}{'data'}};
-			$_ = sprintf("%.3f", $_) for (@b);
-			$R->send("x$_<-c(".join (",", @b).")");
-			@b = sort{$a<=>$b}@b;
-			$R->send("lim$_<-".$b[$#b]); # find proper xlim
-			if ($exOutlier){
-				@c = boxplot(@b) if ($exOutlier == 1);
-				@c = adjusted_boxplot(@b) if ($exOutlier == 2);
-				@c = modified_z(@b) if ($exOutlier == 3);
-				if ($b[$#b] > $c[1]){
-					for ($i=$#b; $i>=0; $i--){
-						if ($b[$i] <= $c[1]){
-							$R->send("lim$_<-".$b[$i]);
-							last;
-						}
-					}
-				}
-			}
-		}
-		my @xr = ([], [], []); # reference positives
-		if (keys %refs){
-			foreach my $key (keys %refs){
-				if ($refs{$key}[0]){
-					push (@{$xr[$_]}, $refs{$key}[$_+1]) for (0..2);
-				}
-			}
-			for (0..2){
-				@b = @{$xr[$_]};
-				$_ = sprintf("%.3f", $_) for (@b);
-				$R->send("xr$_<-c(".join (",", @b).")");
-			}
-		}
-		if ($boxPlot){
-			$R->send("pdf('$fpre"."box.pdf',useDingbats=F)");
-			$R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
-			$R->send("boxplot(x0,x1,x2,names=c('self','close','distal'),xlab='Group',ylab='Weight',main='')");
-			$R->send("dev.off()");
-		}
-		if ($selfLow){ @b = (0, 1, 2); }
-		else{ @b = (1, 2); }
-		if ($histogram){
-			for (@b){
-				$R->send("pdf('$fpre"."hist.".lc($gcode[$_]).".pdf',useDingbats=F)");
-				$R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
-				$R->send("hist(x$_, breaks=$nBin,freq=F,col='lightgrey',xlab='$gcode[$_] weight',ylab='Probability density',main=''".",xlim=range(0:lim$_)" x ($exOutlier and $exOutlier > 0).")");
-				$R->send("dev.off()");
-			}
-		}
-		if ($densityPlot){
-			for (@b){
-				$R->send("pdf('$fpre"."density.".lc($gcode[$_]).".pdf',useDingbats=F)");
-				if ($plotRef){ $R->send("par(mar=c(4,4,1,3)+0.1,mgp=c(2.5,0.75,0))"); }
-				else{ $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))"); }
-				$R->send("d<-density(x$_".(",bw=bw.nrd0(x$_)*$bwF" x ($bwF and $bwF != 1)).")");
-				if ($exOutlier){ $R->send("lim<-range(min(d\$x):lim$_)"); }
-				else{ $R->send("lim<-range(d\$x[is.finite(d\$x)])"); }
-				$R->send("plot(d,lwd=2,xlim=lim,xlab='Weight',ylab='Probability density',main='')");
-				if (scalar @{$xr[0]}){
-					$R->send("par(new=TRUE)");
-					$R->send("plot(density(xr$_".(",bw=bw.nrd0(ab)*$bwF" x ($bwF and $bwF != 1))."),xlim=lim,xaxt='n',yaxt='n',xlab='',ylab='',main='',col=2)");
-					$R->send("axis(4,col=2,col.ticks=2)");
-					$R->send("mtext('Probability density of true positives',side=4,line=-2,col=2)");
-					$R->send("rug(xr$_,ticksize=0.04,lwd=1,col=rgb(1,0,0,0.25))");
-				}
-				$R->send("dev.off()");
-			}
-		}
-		if ($scatterPlot){
-			if ($selfLow){
-				for (@b){
-					($i, $j) = ($_, $_+1);
-					$j = 0 if ($j == 3);
-					$R->send("pdf('$fpre"."scatter.".lc($gcode[$i])."-".lc($gcode[$j]).".pdf',useDingbats=F)");
-					$R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
-					$R->send("plot(x$i,x$j,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim$i),ylim=range(0:lim$j),xlab='$gcode[$i] weight',ylab='$gcode[$j] weight')");
-					if (scalar @{$xr[0]}){
-						$R->send("points(xr$i,xr$j,pch=16,col=rgb(1,0,0,0.5))");
-						$R->send("legend('topright','true positive',pch=16,col='red')");
-					}
-					$R->send("dev.off()");
-				}
-			}else{
-				$R->send("pdf(\"$fpre"."scatter.pdf\",useDingbats=F)");
-				$R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
-				$R->send("plot(x1,x2,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim1),ylim=range(0:lim2),xlab='Close weight',ylab='Distal weight',main='')");
-				if (scalar @{$xr[0]}){
-					$R->send("points(xr1,xr2,pch=16,col=rgb(1,0,0,0.5))");
-					$R->send("legend('topright','true positive',pch=16,col='red')");
-				}
-				$R->send("dev.off()");
-			}
-		}
-		if ($plot3D){ # This function is not functioning properly
-			# if ($^O=~/Win/){ $R->send("windows()"); }elsif ($^O=~/Mac/){ $R->send("quartz()"); }else{ $R->send("x11()"); }
-			$R->send("plot3d(x0,x1,x2,xlab='Self weight',ylab='Close weight',zlab='Distal weight')");
-			print "Displaying 3D plot$tpost. Press Enter to move on.";
-			$s = <STDIN>;
-		}
-	}
-	$R->stopR();
-	print " done.\n";
-	print "Graphs are saved in result/statistics/.\n";
-	if ($interactive){
-		print "You may take a look at the graphs before proceeding.\n";
-		print "Press Enter to continue, or Ctrl+C to exit:";
-		$s = <STDIN>;
-	}
+if ($graphFp) {
+    print "\nGraphing fingerprints with R...";
+    $R->startR;
+    print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
+    if ($plot3D) {
+        $R->send("library('rgl')");
+        print "\n  You chose to display interactive 3D scatter plots. They will be displayed sequentially. Use mouse to rotate plots. Press Enter in the terminal to move to the next plot.\n";
+    }
+    foreach my $set (sort keys %fpN) {
+        my $fpre = "$wkDir/result/statistics/".("$set." x ($set ne "0")); # prefix of filename
+        my $tpost = " of $set" x ($set ne "0"); # postfix of title
+        my @gcode = ('Self', 'Close', 'Distal');
+        for (0..2) { # send data to R
+            @b = @{$fpN{$set}{$_}{'data'}};
+            $_ = sprintf("%.3f", $_) for (@b);
+            $R->send("x$_<-c(".join (",", @b).")");
+            @b = sort{$a<=>$b}@b;
+            $R->send("lim$_<-".$b[$#b]); # find proper xlim
+            if ($exOutlier) {
+                @c = boxplot(@b) if ($exOutlier == 1);
+                @c = adjusted_boxplot(@b) if ($exOutlier == 2);
+                @c = modified_z(@b) if ($exOutlier == 3);
+                if ($b[$#b] > $c[1]) {
+                    for ($i=$#b; $i>=0; $i--) {
+                        if ($b[$i] <= $c[1]) {
+                            $R->send("lim$_<-".$b[$i]);
+                            last;
+                        }
+                    }
+                }
+            }
+        }
+        my @xr = ([], [], []); # reference positives
+        if (keys %refs) {
+            foreach my $key (keys %refs) {
+                if ($refs{$key}[0]) {
+                    push (@{$xr[$_]}, $refs{$key}[$_+1]) for (0..2);
+                }
+            }
+            for (0..2) {
+                @b = @{$xr[$_]};
+                $_ = sprintf("%.3f", $_) for (@b);
+                $R->send("xr$_<-c(".join (",", @b).")");
+            }
+        }
+        if ($boxPlot) {
+            $R->send("pdf('$fpre"."box.pdf',useDingbats=F)");
+            $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+            $R->send("boxplot(x0,x1,x2,names=c('self','close','distal'),xlab='Group',ylab='Weight',main='')");
+            $R->send("dev.off()");
+        }
+        if ($selfLow) { @b = (0, 1, 2); }
+        else { @b = (1, 2); }
+        if ($histogram) {
+            for (@b) {
+                $R->send("pdf('$fpre"."hist.".lc($gcode[$_]).".pdf',useDingbats=F)");
+                $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+                $R->send("hist(x$_, breaks=$nBin,freq=F,col='lightgrey',xlab='$gcode[$_] weight',ylab='Probability density',main=''".",xlim=range(0:lim$_)" x ($exOutlier and $exOutlier > 0).")");
+                $R->send("dev.off()");
+            }
+        }
+        if ($densityPlot) {
+            for (@b) {
+                $R->send("pdf('$fpre"."density.".lc($gcode[$_]).".pdf',useDingbats=F)");
+                if ($plotRef) { $R->send("par(mar=c(4,4,1,3)+0.1,mgp=c(2.5,0.75,0))"); }
+                else { $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))"); }
+                $R->send("d<-density(x$_".(",bw=bw.nrd0(x$_)*$bwF" x ($bwF and $bwF != 1)).")");
+                if ($exOutlier) { $R->send("lim<-range(min(d\$x):lim$_)"); }
+                else { $R->send("lim<-range(d\$x[is.finite(d\$x)])"); }
+                $R->send("plot(d,lwd=2,xlim=lim,xlab='Weight',ylab='Probability density',main='')");
+                if (scalar @{$xr[0]}) {
+                    $R->send("par(new=TRUE)");
+                    $R->send("plot(density(xr$_".(",bw=bw.nrd0(ab)*$bwF" x ($bwF and $bwF != 1))."),xlim=lim,xaxt='n',yaxt='n',xlab='',ylab='',main='',col=2)");
+                    $R->send("axis(4,col=2,col.ticks=2)");
+                    $R->send("mtext('Probability density of true positives',side=4,line=-2,col=2)");
+                    $R->send("rug(xr$_,ticksize=0.04,lwd=1,col=rgb(1,0,0,0.25))");
+                }
+                $R->send("dev.off()");
+            }
+        }
+        if ($scatterPlot) {
+            if ($selfLow) {
+                for (@b) {
+                    ($i, $j) = ($_, $_+1);
+                    $j = 0 if ($j == 3);
+                    $R->send("pdf('$fpre"."scatter.".lc($gcode[$i])."-".lc($gcode[$j]).".pdf',useDingbats=F)");
+                    $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+                    $R->send("plot(x$i,x$j,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim$i),ylim=range(0:lim$j),xlab='$gcode[$i] weight',ylab='$gcode[$j] weight')");
+                    if (scalar @{$xr[0]}) {
+                        $R->send("points(xr$i,xr$j,pch=16,col=rgb(1,0,0,0.5))");
+                        $R->send("legend('topright','true positive',pch=16,col='red')");
+                    }
+                    $R->send("dev.off()");
+                }
+            } else {
+                $R->send("pdf(\"$fpre"."scatter.pdf\",useDingbats=F)");
+                $R->send("par(mar=c(4,4,1,1)+0.1,mgp=c(2.5,0.75,0))");
+                $R->send("plot(x1,x2,pch=16,col=rgb(0,0,0,0.25),xlim=range(0:lim1),ylim=range(0:lim2),xlab='Close weight',ylab='Distal weight',main='')");
+                if (scalar @{$xr[0]}) {
+                    $R->send("points(xr1,xr2,pch=16,col=rgb(1,0,0,0.5))");
+                    $R->send("legend('topright','true positive',pch=16,col='red')");
+                }
+                $R->send("dev.off()");
+            }
+        }
+        if ($plot3D) { # This function is not functioning properly
+            # if ($^O=~/Win/) { $R->send("windows()"); } elsif ($^O=~/Mac/) { $R->send("quartz()"); } else { $R->send("x11()"); }
+            $R->send("plot3d(x0,x1,x2,xlab='Self weight',ylab='Close weight',zlab='Distal weight')");
+            print "Displaying 3D plot$tpost. Press Enter to move on.";
+            $s = <STDIN>;
+        }
+    }
+    $R->stopR();
+    print " done.\n";
+    print "Graphs are saved in result/statistics/.\n";
+    if ($interactive) {
+        print "You may take a look at the graphs before proceeding.\n";
+        print "Press Enter to continue, or Ctrl+C to exit:";
+        $s = <STDIN>;
+    }
 }
 
 
 ## Compute the statistics for the whole genome(s), i.e., phyletic pattern, or "fingerprint" ##
 
 print "\nComputing statistics...";
-if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
-	$R->startR;
-	print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
-	$R->send("library(pastecs)") if ($howCO == 4 and $toolExtrema);
-	$R->send("library(diptest)") if $dipTest;
+if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+    $R->startR;
+    print "R cannot be started. Make sure it is properly installed in the system.\n" and exit 1 unless $R->is_started();
+    $R->send("library(pastecs)") if ($howCO == 4 and $toolExtrema);
+    $R->send("library(diptest)") if $dipTest;
 }
 
-foreach my $set (keys %fpN){
-	if ($set eq "0"){ print "\n  All protein sets:\n"; }else{ print "\n  Protein set $set:\n"; }
-	foreach my $key ('0','1','2'){
-		next unless @{$fpN{$set}{$key}{'data'}};
-		
-		if ($key eq "0"){ print "    Self group:\n"; }elsif ($key eq "1"){ print "    Close group:\n"; }elsif ($key eq "2"){ print "    Distal group:\n"; }
-		
-		@a = sort {$a<=>$b} @{$fpN{$set}{$key}{'data'}}; # sort low to high
-
-		my $global_cutoff;
-		my $computed_cutoff;
-		my $use_global = 0;
-		my $half_way = median(@a); ########## if computed cutoff > median, use global cutoff
-
-		# compute basic statistical parameters
-		
-		$fpN{$set}{$key}{'n'} = @a;
-		$s = 0; $s += $_ for @a;
-		$fpN{$set}{$key}{'mean'} = $s/$fpN{$set}{$key}{'n'};
-		$s = 0; $s += ($fpN{$set}{$key}{'mean'} - $_)**2 for @a;
-		$fpN{$set}{$key}{'stdev'} = sqrt($s/($fpN{$set}{$key}{'n'}-1));
-		$fpN{$set}{$key}{'min'} = $a[0];
-		$fpN{$set}{$key}{'max'} = $a[$#a];
-		$fpN{$set}{$key}{'median'} = median(@a);
-		$fpN{$set}{$key}{'mad'} = mad(@a);
-		($fpN{$set}{$key}{'q1'}, $fpN{$set}{$key}{'q3'}) = quantiles(@a);
-		
-		if ($key eq '0' and not $selfLow){
-			$fpN{$set}{$key}{'cutoff'} = 0;
-			print "      Skipped.\n";
-			next;
-		}
-		
-		# determine cutoff using global cutoff
-
-		$i = $fpN{$set}{$key}{'n'}*$globalCO;
-		if (int($i) == $i){
-			$global_cutoff = ($a[$i-1]+$a[$i])/2;
-		}else{
-			if ($i-int($i) <= int($i)-$i+1){
-				$global_cutoff = $a[int($i)];
-			}else{
-				$global_cutoff = $a[int($i)-1];
-			}
-		}
-		print "      Global cutoff ($globalCO) = $global_cutoff.\n";
-
-		# override individual cutoff with global cutoff
-
-		$use_global = 1 if ((($key eq '0') and ($selfCO eq 'G')) or (($key eq '1') and ($closeCO eq 'G')) or (($key eq '2') and ($distalCO eq 'G')));
-		
-		# exclude outliers
-
-		if ($exOutlier){
-			@c = boxplot(@a) if ($exOutlier == 1);
-			@c = adjusted_boxplot(@a) if ($exOutlier == 2);
-			@c = modified_z(@a) if ($exOutlier == 3);
-			if ($a[$#a] > $c[1]){
-				for ($i=$#a; $i>=0; $i--){
-					if ($a[$i] <= $c[1]){
-						@a = @a[0..$i];
-						last;
-					}
-				}
-			}
-		}
-
-		# perform Hartigan's dip test to assess non-unimodality
-
-		if ($dipTest){
-			print "      Performing Hartigan's dip test...";
-			$R->send("x<-c(".join (",", @a).")");
-			$R->send("dip.test(x)");
-			$s = $R->read;
-			open OUT, ">$key.out";
-			print OUT join(' ', @a);
-			close OUT;
-			
-			if ($s =~ /D = (\S+), p-value [<=] (\S+)\n/){
-				print " done.\n";
-				print "      D = $1, p-value = $2\n";
-				if ($dipSig){
-					if ($2 >= $dipSig){
-						print "      The weight distribution is NOT significantly non-unimodal.\n";
-						if ($howCO >= 3){
-							if ($interactive){
-								print "      Proceed with statistical analysis anyway (yes) or use global cutoff ($globalCO) instead (NO)? ";
-								while (<STDIN>){
-									chomp;
-									unless ($_){ $use_global = 1; last; }
-									if (/^y$/i or /^yes$/i){ last; }
-									if (/^n$/i or /^no$/i){ $use_global = 1; last; }
-								}
-							}else{ $use_global = 1; }
-						}
-					}else{ print "      The weight distribution is significantly non-unimodal.\n"; }
-				}else{ print "      The weight distribution is ". ("NOT " x ($2 >= 0.05)). "significantly non-unimodal.\n"; }
-			}else{ print " failed.\n"; }
-		}
-
-		# determine cutoff using histogram
-
-		print "      Generating histogram..." if ($howCO == 3 and not $use_global);
-
-		my @freqs = (0)x$nBin;
-		my $interval = ($a[$#a]-$a[0])/$nBin;
-		my $cid = 0;
-		for ($j=1; $j<$nBin; $j++){
-			my $high_bound = $interval*$j;
-			for ($i=$cid; $i<=$#a; $i++){
-				if ($a[$i] < $high_bound){
-					$freqs[$j-1] ++;
-				}else{
-					$cid = $i;
-					last;
-				}
-			}
-		}
-		$freqs[$nBin-1] = $#a-$cid+1;
-		my $local_min = 0; # index of the lowest bar from left
-		for ($i=1; $i<$nBin-1; $i++){
-			if ($freqs[$i]<$freqs[$i-1] and $freqs[$i]<=$freqs[$i+1]){
-				$computed_cutoff = $interval*$i;
-				$local_min = $i;
-				last;
-			}
-		}
-		
-		print " done.\n" if ($howCO == 3 and not $use_global);
-		
-		# draw histogram
-		
-		if ($plotHist){
-			print "  Histogram:";
-			@c = sort {$b<=>$a} @freqs;
-			$s = 50/$c[0];
-			my @widths = (0)x$nBin;
-			my @labels = (0)x$nBin;
-			for ($i=0; $i<$nBin; $i++){
-				$widths[$i] = int($freqs[$i]*$s);
-				$labels[$i] = sprintf("%.2f", $interval*$i)."-".sprintf("%.2f", $interval*($i+1));
-			}
-			@c = sort {length($b)<=>length($a)} @labels;
-			$s = length($c[0]);
-			print "\n";
-			for ($i=0; $i<$nBin; $i++){
-				print " "x($s-length($labels[$i]))."$labels[$i] ".("*"x$widths[$i])."$freqs[$i]".(" (local minimum)"x($local_min and $local_min==$i))."\n";
-			}
-		}
-
-		# determine cutoff using kernel density estimation (KDE)
-
-		if ($howCO == 4 and not $use_global){
-		
-			# perform kernel density estimation (KDE)
-		
-			print "      Performing kernel density estimation...";
-			my (@dx, @dy); # x- and y-coornidates of density function
-			my $KDEstats = "";
-			
-			# perform KDE using basic R command "density"
-			
-			if ($toolKDE){
-				$R->send("x<-c(".join (",", @a).")");
-				if ($bwF and $bwF != 1){
-					$R->send("bwx<-bw.nrd0(x)*$bwF");
-					$R->send("d<-density(x,bw=bwx)");
-				}else{
-					$R->send("d<-density(x)");
-				}
-				@dx = @{$R->get('d$x')};
-				@dy = @{$R->get('d$y')};
-			}
-			
-			# perform KDE using self-written Perl code
-			
-			else{
-				my $n = scalar @a;
-				my $mean; $mean += $_ for @a; $mean = $mean/$n;
-				my $stdev; $stdev += ($mean-$_)**2 for @a; $stdev = sqrt($stdev/($n-1));
-				my @Q = quantiles(@a); my $iqr = $Q[1]-$Q[0];
-				my $bw;
-				if ($stdev == 0 and $iqr == 0){ $bw = 1; }
-				elsif ($stdev == 0){ $bw = $iqr/1.34; }
-				elsif ($iqr == 0){ $bw = $stdev; }
-				elsif ($stdev <= $iqr/1.34){ $bw = $stdev; }
-				else{ $bw = $iqr/1.34; }
-				$bw = 0.9*$bw*$n**(-1/5); # select bandwidth by Silverman's ¡°rule of thumb¡± (1986)
-				$bw = $bw*$bwF if ($bwF and $bwF != 1);
-				my ($min, $max) = ($a[0]-3*$bw, $a[$#a]+3*$bw); # cut = 3
-				$KDEstats = "      N = $n, bandwidth = ".sprintf("%.3f", $bw).".\n";
-				for (my $x=$min; $x<=$max; $x+=($max-$min)/511) { # 512 points
-					my $e = 0; $e += exp(-(($x-$_)/$bw)**2/2)/sqrt(2*3.1415926536) for @a; # Gaussian kernel
-					push @dx, $x;
-					push @dy, 1/$n/$bw*$e;
-				}
-			}
-			
-			print " done.\n";
-			print $KDEstats;
-			
-			# plot density function on screen
-
-			if ($plotKDE){
-				print "  Density function:\n";
-				my $k_x = int(@dx/100);
-				$k_x = int(@dx/$plotKDE) if ($plotKDE > 1);
-				my $k_y = 0;
-				foreach (@dy){
-					$k_y = $_ if ($_ > $k_y);
-				}
-				$k_y = 64/$k_y;
-				for ($i=0; $i<=$#dy; $i+=$k_x){
-					print " "x(7-length(sprintf("%.2f", $dx[$i]))).sprintf("%.2f", $dx[$i]);
-					print " "x (int($dy[$i]*$k_y)+1)."*\n";
-				}
-			}
-
-			my $peak1st;
-			my $failed;
-			my ($peak_i, $peak_x, $peak_y);
-			my ($pit_i, $pit_x, $pit_y);
-
-			# identify local extrema using R package "pastecs"
-			
-			if ($toolExtrema){
-				if ($toolKDE < 2){
-					$R->send("d<-data.frame(x=c(".join (",", @dx)."),y=c(".join (",", @dy)."))");
-				}
-				$R->send("tp<-turnpoints(ts(d\$y))");
-				if ($R->get('tp$firstispeak') eq "TRUE"){
-					if ($R->get('tp$nturns') == 1){ $failed = 1; }
-					else{ $peak1st = 1; }
-				}else{
-					$peak1st = 0;
-				}
-				unless ($failed){
-					my @tpx = @{$R->get("d\$x[tp\$tppos]")};
-					my @tpy = @{$R->get("d\$y[tp\$tppos]")};
-					if ($peak1st){
-						($peak_x, $peak_y) = ($tpx[0], $tpy[0]);
-						($pit_x, $pit_y) = ($tpx[1], $tpy[1]);
-					}else{
-						($peak_x, $peak_y) = ($dx[0], $dy[0]);
-						($pit_x, $pit_y) = ($tpx[0], $tpy[0]);
-					}
-				}
-			}
-			
-			# identify local extrema using self-written Perl code
-			
-			else{
-				for ($i=0; $i<$#dy; $i++){
-					if ($dy[$i] < $dy[$i+1]){ $peak1st = 1; last; }
-					elsif ($dy[$i] > $dy[$i+1]){ $peak1st = 0; last; }
-					else{ next; }
-				}
-				if ($peak1st){
-					for ($i=1; $i<$#dy; $i++){
-						if (($dy[$i-1] <= $dy[$i]) and ($dy[$i] > $dy[$i+1])){
-							($peak_i, $peak_x, $peak_y) = ($i, $dx[$i], $dy[$i]);
-						}
-						if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])){
-							($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
-						}
-						last if ($peak_i and $pit_i);
-					}
-					$failed = 1 unless $pit_i;
-				}else{
-					($peak_i, $peak_x, $peak_y) = (0, $dx[0], $dy[0]);
-					for (my $i=1; $i<$#dx; $i++){
-						if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])){
-							($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
-						}
-						last if $pit_i;
-					}
-				}
-			}
-			
-			if ($failed){
-				print "      The weight population cannot be clustered using kernel density estimation. This is typically caused by an even distribution of all weights.\n";
-				$use_global = 1;
-			}else{
-			
-				# locate cutoff point
-			
-				if ($modKCO == 0){ # pit
-					$computed_cutoff = $pit_x;
-				}elsif ($modKCO == 1){ # horizontal midpoint
-					$computed_cutoff = ($pit_x+$peak_x)/2;
-				}elsif ($modKCO == 2){ # horizontal quantile
-					$computed_cutoff = $pit_x-($pit_x-$peak_x)*$qKCO;
-				}else{ # vertical quantile
-					my $vCO = 0;
-					$vCO = $pit_y+($peak_y-$pit_y)*$qKCO;
-					for (my $i=$peak_i; $i<$pit_i; $i++){
-						if (($dy[$i] >= $vCO) and ($vCO > $dy[$i+1])){
-							$computed_cutoff = $dx[$i]+($dx[$i+1]-$dx[$i])*($dy[$i]-$vCO)/($dy[$i]-$dy[$i+1]);
-							last;
-						}
-					}
-				}
-			}
-		}
-		
-		# determine cutoff using hierarchical clustering
-
-		if ($howCO == 5 and not $use_global){
-			print "      Performing hierarchical clustering...";
-			$R->send("x<-c(".join (",", @a).")");
-			$R->send("d<-dist(x,method=\"euclidean\")");
-			$R->send("fit<-hclust(d,method=\"ward\")");
-			print " done.\n";
-			my $nCluster = 1;
-			while ($nCluster++){
-				$R->send("c<-cutree(fit,k=$nCluster)");
-				my $clusters = $R->get('c'); # tip: return vector from R
-				@c = (()) x $nCluster; # data of clusters
-				for ($i=0; $i<=@{$clusters}-1; $i++){
-					foreach (1..$nCluster){
-						if (@{$clusters}[$i] == $_){
-							push @{$c[$_-1]}, $a[$i];
-							last;
-						}
-					}
-				}
-				for ($i=1; $i<=$#c; $i++){
-					@{$c[$i]} = sort {$a <=> $b} @{$c[$i]};
-				}
-				@b = sort {$a->[0] <=> $b->[0]} @c;
-				print "      With $nCluster clusters, cutoff is $b[1][0]. Accept? (YES/no) ";
-				my $user_okay = 1;
-				while (<STDIN>){
-					chomp; last unless $_;
-					last if (/^y$/i or /^yes$/i);
-					if (/^n$/i or /^no$/i){ $user_okay = 0; last; }
-				}
-				last if $user_okay;
-			}
-			$computed_cutoff = $b[1][0];
-		}
-		
-		# report cutoff to user
-		
-		if ($howCO == 0 or ($howCO >= 3 and $use_global)){ # user-defined global cutoff
-			$fpN{$set}{$key}{'cutoff'} = $global_cutoff;
-			print "      Cutoff is $global_cutoff (determined by global cutoff $globalCO)\n";
-		}
-		if ($howCO == 1 and $unite){ # user-defined individual cutoff
-			if ($key eq '0'){ $s = $selfCO; }
-			elsif ($key eq '1'){ $s = $closeCO; }
-			elsif ($key eq '2'){ $s = $distalCO; }
-			if ($s){
-				$fpN{$set}{$key}{'cutoff'} = $s;
-				print "      Cutoff is $s (user-defined).\n";
-			}else{
-				$fpN{$set}{$key}{'cutoff'} = $global_cutoff;
-				print "      User-defined cutoff is not available. Use global cutoff $global_cutoff instead.\n";
-			}
-		}
-		if ($howCO >= 3 and not $use_global){ # computed cutoff
-			$s = 0; if ($howCO == 3){ $s = "histogram"; }
-			elsif ($howCO == 4){ $s = "kernel density estimation"; }
-			elsif ($howCO == 5){ $s = "clustering analysis"; }
-			if ($computed_cutoff){
-				if ($computed_cutoff <= $half_way){
-					$fpN{$set}{$key}{'cutoff'} = $computed_cutoff;
-					print "      Cutoff is ".sprintf("%.3f", $computed_cutoff)." (determined by $s).\n";
-				}else{
-					$fpN{$set}{$key}{'cutoff'} = $global_cutoff;
-					print "      ". ucfirst($s) ." identified a cutoff ".sprintf("%.3f", $computed_cutoff)." which is too large. Use global cutoff $global_cutoff instead.\n";
-				}
-			}else{
-				$fpN{$set}{$key}{'cutoff'} = $global_cutoff;
-				print "      ". ucfirst($s) ." failed to identify a cutoff. Use global cutoff $global_cutoff instead.\n";
-			}
-		}
-		# $fpN{$set}{$key}{'cutoff'} = 0.00001 unless $fpN{$set}{$key}{'cutoff'};
-		
-		# ask user to enter cutoff
-		
-		my $user_enter;
-		if ($interactive and $fpN{$set}{$key}{'cutoff'}){
-			print "      Accept? (YES/no) ";
-			while (<STDIN>){
-				chomp;
-				last unless $_;
-				last if (/^y$/i or /^yes$/i);
-				if (/^n$/i or /^no$/i){ $user_enter = 1; last; }
-			}
-		}
-		if ($user_enter or $howCO == 2){
-			print "      Enter user-specified cutoff: ";
-			while (<STDIN>){
-				chomp;
-				last if /^\d+\.?\d*$/;
-				print "      Invalid cutoff value. Re-enter: ";
-			}
-			$fpN{$set}{$key}{'cutoff'} = $_;
-		}
-	}
+foreach my $set (keys %fpN) {
+    if ($set eq "0") { print "\n  All protein sets:\n"; } else { print "\n  Protein set $set:\n"; }
+    foreach my $key ('0','1','2') {
+        next unless @{$fpN{$set}{$key}{'data'}};
+
+        if ($key eq "0") { print "    Self group:\n"; } elsif ($key eq "1") { print "    Close group:\n"; } elsif ($key eq "2") { print "    Distal group:\n"; }
+
+        @a = sort {$a<=>$b} @{$fpN{$set}{$key}{'data'}}; # sort low to high
+
+        my $global_cutoff;
+        my $computed_cutoff;
+        my $use_global = 0;
+        my $half_way = median(@a); ########## if computed cutoff > median, use global cutoff
+
+        # compute basic statistical parameters
+
+        $fpN{$set}{$key}{'n'} = @a;
+        $s = 0; $s += $_ for @a;
+        $fpN{$set}{$key}{'mean'} = $s/$fpN{$set}{$key}{'n'};
+        $s = 0; $s += ($fpN{$set}{$key}{'mean'} - $_)**2 for @a;
+        $fpN{$set}{$key}{'stdev'} = sqrt($s/($fpN{$set}{$key}{'n'}-1));
+        $fpN{$set}{$key}{'min'} = $a[0];
+        $fpN{$set}{$key}{'max'} = $a[$#a];
+        $fpN{$set}{$key}{'median'} = median(@a);
+        $fpN{$set}{$key}{'mad'} = mad(@a);
+        ($fpN{$set}{$key}{'q1'}, $fpN{$set}{$key}{'q3'}) = quantiles(@a);
+
+        if ($key eq '0' and not $selfLow) {
+            $fpN{$set}{$key}{'cutoff'} = 0;
+            print "      Skipped.\n";
+            next;
+        }
+
+        # determine cutoff using global cutoff
+
+        $i = $fpN{$set}{$key}{'n'}*$globalCO;
+        if (int($i) == $i) {
+            $global_cutoff = ($a[$i-1]+$a[$i])/2;
+        } else {
+            if ($i-int($i) <= int($i)-$i+1) {
+                $global_cutoff = $a[int($i)];
+            } else {
+                $global_cutoff = $a[int($i)-1];
+            }
+        }
+        print "      Global cutoff ($globalCO) = $global_cutoff.\n";
+
+        # override individual cutoff with global cutoff
+
+        $use_global = 1 if ((($key eq '0') and ($selfCO eq 'G')) or (($key eq '1') and ($closeCO eq 'G')) or (($key eq '2') and ($distalCO eq 'G')));
+
+        # exclude outliers
+
+        if ($exOutlier) {
+            @c = boxplot(@a) if ($exOutlier == 1);
+            @c = adjusted_boxplot(@a) if ($exOutlier == 2);
+            @c = modified_z(@a) if ($exOutlier == 3);
+            if ($a[$#a] > $c[1]) {
+                for ($i=$#a; $i>=0; $i--) {
+                    if ($a[$i] <= $c[1]) {
+                        @a = @a[0..$i];
+                        last;
+                    }
+                }
+            }
+        }
+
+        # perform Hartigan's dip test to assess non-unimodality
+
+        if ($dipTest) {
+            print "      Performing Hartigan's dip test...";
+            $R->send("x<-c(".join (",", @a).")");
+            $R->send("dip.test(x)");
+            $s = $R->read;
+            open OUT, ">$key.out";
+            print OUT join(' ', @a);
+            close OUT;
+
+            if ($s =~ /D = (\S+), p-value [<=] (\S+)\n/) {
+                print " done.\n";
+                print "      D = $1, p-value = $2\n";
+                if ($dipSig) {
+                    if ($2 >= $dipSig) {
+                        print "      The weight distribution is NOT significantly non-unimodal.\n";
+                        if ($howCO >= 3) {
+                            if ($interactive) {
+                                print "      Proceed with statistical analysis anyway (yes) or use global cutoff ($globalCO) instead (NO)? ";
+                                while (<STDIN>) {
+                                    chomp;
+                                    unless ($_) { $use_global = 1; last; }
+                                    if (/^y$/i or /^yes$/i) { last; }
+                                    if (/^n$/i or /^no$/i) { $use_global = 1; last; }
+                                }
+                            } else { $use_global = 1; }
+                        }
+                    } else { print "      The weight distribution is significantly non-unimodal.\n"; }
+                } else { print "      The weight distribution is ". ("NOT " x ($2 >= 0.05)). "significantly non-unimodal.\n"; }
+            } else { print " failed.\n"; }
+        }
+
+        # determine cutoff using histogram
+
+        print "      Generating histogram..." if ($howCO == 3 and not $use_global);
+
+        my @freqs = (0)x$nBin;
+        my $interval = ($a[$#a]-$a[0])/$nBin;
+        my $cid = 0;
+        for ($j=1; $j<$nBin; $j++) {
+            my $high_bound = $interval*$j;
+            for ($i=$cid; $i<=$#a; $i++) {
+                if ($a[$i] < $high_bound) {
+                    $freqs[$j-1] ++;
+                } else {
+                    $cid = $i;
+                    last;
+                }
+            }
+        }
+        $freqs[$nBin-1] = $#a-$cid+1;
+        my $local_min = 0; # index of the lowest bar from left
+        for ($i=1; $i<$nBin-1; $i++) {
+            if ($freqs[$i]<$freqs[$i-1] and $freqs[$i]<=$freqs[$i+1]) {
+                $computed_cutoff = $interval*$i;
+                $local_min = $i;
+                last;
+            }
+        }
+
+        print " done.\n" if ($howCO == 3 and not $use_global);
+
+        # draw histogram
+
+        if ($plotHist) {
+            print "  Histogram:";
+            @c = sort {$b<=>$a} @freqs;
+            $s = 50/$c[0];
+            my @widths = (0)x$nBin;
+            my @labels = (0)x$nBin;
+            for ($i=0; $i<$nBin; $i++) {
+                $widths[$i] = int($freqs[$i]*$s);
+                $labels[$i] = sprintf("%.2f", $interval*$i)."-".sprintf("%.2f", $interval*($i+1));
+            }
+            @c = sort {length($b)<=>length($a)} @labels;
+            $s = length($c[0]);
+            print "\n";
+            for ($i=0; $i<$nBin; $i++) {
+                print " "x($s-length($labels[$i]))."$labels[$i] ".("*"x$widths[$i])."$freqs[$i]".(" (local minimum)"x($local_min and $local_min==$i))."\n";
+            }
+        }
+
+        # determine cutoff using kernel density estimation (KDE)
+
+        if ($howCO == 4 and not $use_global) {
+
+            # perform kernel density estimation (KDE)
+
+            print "      Performing kernel density estimation...";
+            my (@dx, @dy); # x- and y-coornidates of density function
+            my $KDEstats = "";
+
+            # perform KDE using basic R command "density"
+
+            if ($toolKDE) {
+                $R->send("x<-c(".join (",", @a).")");
+                if ($bwF and $bwF != 1) {
+                    $R->send("bwx<-bw.nrd0(x)*$bwF");
+                    $R->send("d<-density(x,bw=bwx)");
+                } else {
+                    $R->send("d<-density(x)");
+                }
+                @dx = @{$R->get('d$x')};
+                @dy = @{$R->get('d$y')};
+            }
+
+            # perform KDE using self-written Perl code
+
+            else {
+                my $n = scalar @a;
+                my $mean; $mean += $_ for @a; $mean = $mean/$n;
+                my $stdev; $stdev += ($mean-$_)**2 for @a; $stdev = sqrt($stdev/($n-1));
+                my @Q = quantiles(@a); my $iqr = $Q[1]-$Q[0];
+                my $bw;
+                if ($stdev == 0 and $iqr == 0) { $bw = 1; }
+                elsif ($stdev == 0) { $bw = $iqr/1.34; }
+                elsif ($iqr == 0) { $bw = $stdev; }
+                elsif ($stdev <= $iqr/1.34) { $bw = $stdev; }
+                else { $bw = $iqr/1.34; }
+                $bw = 0.9*$bw*$n**(-1/5); # select bandwidth by Silverman's ¡°rule of thumb¡± (1986)
+                $bw = $bw*$bwF if ($bwF and $bwF != 1);
+                my ($min, $max) = ($a[0]-3*$bw, $a[$#a]+3*$bw); # cut = 3
+                $KDEstats = "      N = $n, bandwidth = ".sprintf("%.3f", $bw).".\n";
+                for (my $x=$min; $x<=$max; $x+=($max-$min)/511) { # 512 points
+                    my $e = 0; $e += exp(-(($x-$_)/$bw)**2/2)/sqrt(2*3.1415926536) for @a; # Gaussian kernel
+                    push @dx, $x;
+                    push @dy, 1/$n/$bw*$e;
+                }
+            }
+
+            print " done.\n";
+            print $KDEstats;
+
+            # plot density function on screen
+
+            if ($plotKDE) {
+                print "  Density function:\n";
+                my $k_x = int(@dx/100);
+                $k_x = int(@dx/$plotKDE) if ($plotKDE > 1);
+                my $k_y = 0;
+                foreach (@dy) {
+                    $k_y = $_ if ($_ > $k_y);
+                }
+                $k_y = 64/$k_y;
+                for ($i=0; $i<=$#dy; $i+=$k_x) {
+                    print " "x(7-length(sprintf("%.2f", $dx[$i]))).sprintf("%.2f", $dx[$i]);
+                    print " "x (int($dy[$i]*$k_y)+1)."*\n";
+                }
+            }
+
+            my $peak1st;
+            my $failed;
+            my ($peak_i, $peak_x, $peak_y);
+            my ($pit_i, $pit_x, $pit_y);
+
+            # identify local extrema using R package "pastecs"
+
+            if ($toolExtrema) {
+                if ($toolKDE < 2) {
+                    $R->send("d<-data.frame(x=c(".join (",", @dx)."),y=c(".join (",", @dy)."))");
+                }
+                $R->send("tp<-turnpoints(ts(d\$y))");
+                if ($R->get('tp$firstispeak') eq "TRUE") {
+                    if ($R->get('tp$nturns') == 1) { $failed = 1; }
+                    else { $peak1st = 1; }
+                } else {
+                    $peak1st = 0;
+                }
+                unless ($failed) {
+                    my @tpx = @{$R->get("d\$x[tp\$tppos]")};
+                    my @tpy = @{$R->get("d\$y[tp\$tppos]")};
+                    if ($peak1st) {
+                        ($peak_x, $peak_y) = ($tpx[0], $tpy[0]);
+                        ($pit_x, $pit_y) = ($tpx[1], $tpy[1]);
+                    } else {
+                        ($peak_x, $peak_y) = ($dx[0], $dy[0]);
+                        ($pit_x, $pit_y) = ($tpx[0], $tpy[0]);
+                    }
+                }
+            }
+
+            # identify local extrema using self-written Perl code
+
+            else {
+                for ($i=0; $i<$#dy; $i++) {
+                    if ($dy[$i] < $dy[$i+1]) { $peak1st = 1; last; }
+                    elsif ($dy[$i] > $dy[$i+1]) { $peak1st = 0; last; }
+                    else { next; }
+                }
+                if ($peak1st) {
+                    for ($i=1; $i<$#dy; $i++) {
+                        if (($dy[$i-1] <= $dy[$i]) and ($dy[$i] > $dy[$i+1])) {
+                            ($peak_i, $peak_x, $peak_y) = ($i, $dx[$i], $dy[$i]);
+                        }
+                        if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])) {
+                            ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
+                        }
+                        last if ($peak_i and $pit_i);
+                    }
+                    $failed = 1 unless $pit_i;
+                } else {
+                    ($peak_i, $peak_x, $peak_y) = (0, $dx[0], $dy[0]);
+                    for (my $i=1; $i<$#dx; $i++) {
+                        if (($dy[$i-1] > $dy[$i]) and ($dy[$i] <= $dy[$i+1])) {
+                            ($pit_i, $pit_x, $pit_y) = ($i, $dx[$i], $dy[$i]);
+                        }
+                        last if $pit_i;
+                    }
+                }
+            }
+
+            if ($failed) {
+                print "      The weight population cannot be clustered using kernel density estimation. This is typically caused by an even distribution of all weights.\n";
+                $use_global = 1;
+            } else {
+
+                # locate cutoff point
+
+                if ($modKCO == 0) { # pit
+                    $computed_cutoff = $pit_x;
+                } elsif ($modKCO == 1) { # horizontal midpoint
+                    $computed_cutoff = ($pit_x+$peak_x)/2;
+                } elsif ($modKCO == 2) { # horizontal quantile
+                    $computed_cutoff = $pit_x-($pit_x-$peak_x)*$qKCO;
+                } else { # vertical quantile
+                    my $vCO = 0;
+                    $vCO = $pit_y+($peak_y-$pit_y)*$qKCO;
+                    for (my $i=$peak_i; $i<$pit_i; $i++) {
+                        if (($dy[$i] >= $vCO) and ($vCO > $dy[$i+1])) {
+                            $computed_cutoff = $dx[$i]+($dx[$i+1]-$dx[$i])*($dy[$i]-$vCO)/($dy[$i]-$dy[$i+1]);
+                            last;
+                        }
+                    }
+                }
+            }
+        }
+
+        # determine cutoff using hierarchical clustering
+
+        if ($howCO == 5 and not $use_global) {
+            print "      Performing hierarchical clustering...";
+            $R->send("x<-c(".join (",", @a).")");
+            $R->send("d<-dist(x,method=\"euclidean\")");
+            $R->send("fit<-hclust(d,method=\"ward\")");
+            print " done.\n";
+            my $nCluster = 1;
+            while ($nCluster++) {
+                $R->send("c<-cutree(fit,k=$nCluster)");
+                my $clusters = $R->get('c'); # tip: return vector from R
+                @c = (()) x $nCluster; # data of clusters
+                for ($i=0; $i<=@{$clusters}-1; $i++) {
+                    foreach (1..$nCluster) {
+                        if (@{$clusters}[$i] == $_) {
+                            push @{$c[$_-1]}, $a[$i];
+                            last;
+                        }
+                    }
+                }
+                for ($i=1; $i<=$#c; $i++) {
+                    @{$c[$i]} = sort {$a <=> $b} @{$c[$i]};
+                }
+                @b = sort {$a->[0] <=> $b->[0]} @c;
+                print "      With $nCluster clusters, cutoff is $b[1][0]. Accept? (YES/no) ";
+                my $user_okay = 1;
+                while (<STDIN>) {
+                    chomp; last unless $_;
+                    last if (/^y$/i or /^yes$/i);
+                    if (/^n$/i or /^no$/i) { $user_okay = 0; last; }
+                }
+                last if $user_okay;
+            }
+            $computed_cutoff = $b[1][0];
+        }
+
+        # report cutoff to user
+
+        if ($howCO == 0 or ($howCO >= 3 and $use_global)) { # user-defined global cutoff
+            $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+            print "      Cutoff is $global_cutoff (determined by global cutoff $globalCO)\n";
+        }
+        if ($howCO == 1 and $unite) { # user-defined individual cutoff
+            if ($key eq '0') { $s = $selfCO; }
+            elsif ($key eq '1') { $s = $closeCO; }
+            elsif ($key eq '2') { $s = $distalCO; }
+            if ($s) {
+                $fpN{$set}{$key}{'cutoff'} = $s;
+                print "      Cutoff is $s (user-defined).\n";
+            } else {
+                $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+                print "      User-defined cutoff is not available. Use global cutoff $global_cutoff instead.\n";
+            }
+        }
+        if ($howCO >= 3 and not $use_global) { # computed cutoff
+            $s = 0; if ($howCO == 3) { $s = "histogram"; }
+            elsif ($howCO == 4) { $s = "kernel density estimation"; }
+            elsif ($howCO == 5) { $s = "clustering analysis"; }
+            if ($computed_cutoff) {
+                if ($computed_cutoff <= $half_way) {
+                    $fpN{$set}{$key}{'cutoff'} = $computed_cutoff;
+                    print "      Cutoff is ".sprintf("%.3f", $computed_cutoff)." (determined by $s).\n";
+                } else {
+                    $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+                    print "      ". ucfirst($s) ." identified a cutoff ".sprintf("%.3f", $computed_cutoff)." which is too large. Use global cutoff $global_cutoff instead.\n";
+                }
+            } else {
+                $fpN{$set}{$key}{'cutoff'} = $global_cutoff;
+                print "      ". ucfirst($s) ." failed to identify a cutoff. Use global cutoff $global_cutoff instead.\n";
+            }
+        }
+        # $fpN{$set}{$key}{'cutoff'} = 0.00001 unless $fpN{$set}{$key}{'cutoff'};
+
+        # ask user to enter cutoff
+
+        my $user_enter;
+        if ($interactive and $fpN{$set}{$key}{'cutoff'}) {
+            print "      Accept? (YES/no) ";
+            while (<STDIN>) {
+                chomp;
+                last unless $_;
+                last if (/^y$/i or /^yes$/i);
+                if (/^n$/i or /^no$/i) { $user_enter = 1; last; }
+            }
+        }
+        if ($user_enter or $howCO == 2) {
+            print "      Enter user-specified cutoff: ";
+            while (<STDIN>) {
+                chomp;
+                last if /^\d+\.?\d*$/;
+                print "      Invalid cutoff value. Re-enter: ";
+            }
+            $fpN{$set}{$key}{'cutoff'} = $_;
+        }
+    }
 }
-if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest){
-	$R->stopR;
+if (($howCO == 4 and ($toolKDE or $toolExtrema)) or ($howCO == 5) or $dipTest) {
+    $R->stopR;
 }
 print " done.\n";
 
 
 ## output fingerprint ##
 
-if ($outFp){
-	open OUT, ">$wkDir/result/statistics/fingerprint.txt";
-	print OUT "#NEXUS\nBEGIN STATISTICS;\n";
-	print OUT "\tGroup\tNumber\tMean\tSD\tMin\tMax\tMedian\tMAD\tQ1\tQ3\tCutoff\n";
-	foreach my $set (sort keys %fpN){
-		foreach ('0','1','2'){
-			%h = %{$fpN{$set}{$_}};
-			if ($_ eq '0'){ $s = "self"; }elsif ($_ eq '1'){ $s = "close"; }elsif ($_ eq '2'){ $s = "distal"; }
-			if ($set eq '0'){ print OUT "all"; }else{ print OUT $set; }
-			print OUT "\t$s\t$h{'n'}\t".sprintf("%.2f", $h{'mean'})."\t".sprintf("%.2f", $h{'stdev'})."\t".sprintf("%.2f", $h{'min'})."\t".sprintf("%.2f", $h{'max'})."\t".sprintf("%.2f", $h{'median'})."\t".sprintf("%.2f", $h{'mad'})."\t".sprintf("%.2f", $h{'q1'})."\t".sprintf("%.2f", $h{'q3'})."\t".sprintf("%.2f", $h{'cutoff'})."\n";
-		}
-	}
-	print OUT "END;\n";
-	close OUT;
-	print "Result is saved in result/statistics/fingerprint.txt.\n";
+if ($outFp) {
+    open OUT, ">$wkDir/result/statistics/fingerprint.txt";
+    print OUT "#NEXUS\nBEGIN STATISTICS;\n";
+    print OUT "\tGroup\tNumber\tMean\tSD\tMin\tMax\tMedian\tMAD\tQ1\tQ3\tCutoff\n";
+    foreach my $set (sort keys %fpN) {
+        foreach ('0','1','2') {
+            %h = %{$fpN{$set}{$_}};
+            if ($_ eq '0') { $s = "self"; } elsif ($_ eq '1') { $s = "close"; } elsif ($_ eq '2') { $s = "distal"; }
+            if ($set eq '0') { print OUT "all"; } else { print OUT $set; }
+            print OUT "\t$s\t$h{'n'}\t".sprintf("%.2f", $h{'mean'})."\t".sprintf("%.2f", $h{'stdev'})."\t".sprintf("%.2f", $h{'min'})."\t".sprintf("%.2f", $h{'max'})."\t".sprintf("%.2f", $h{'median'})."\t".sprintf("%.2f", $h{'mad'})."\t".sprintf("%.2f", $h{'q1'})."\t".sprintf("%.2f", $h{'q3'})."\t".sprintf("%.2f", $h{'cutoff'})."\n";
+        }
+    }
+    print OUT "END;\n";
+    close OUT;
+    print "Result is saved in result/statistics/fingerprint.txt.\n";
 }
 
-if ($interactive){
-	print "Press Enter to proceed with prediction, or Ctrl+C to exit:";
-	$s = <STDIN>;
+if ($interactive) {
+    print "Press Enter to proceed with prediction, or Ctrl+C to exit:";
+    $s = <STDIN>;
 }
 
 
 ## conduct bidirectional best hit (BBH) search ##
 
-if ($BBH == 2){
-	print "Conducting bidirectional best hit (BBH) search...\n";
-	unless (-e "$wkDir/result/bbh_input.txt"){
-		open OUT, ">$wkDir/result/bbh_input.txt";
-		foreach my $set (keys %results){
-			for ($i=0; $i<@{$results{$set}}; $i++){
-				my %res = %{$results{$set}[$i]};
-				if (exists $res{'BBH'} and $res{'BBH'}){
-					# set id query_accn query_taxid subject_accn subject_taxid
-					print OUT $set."\t".$i."\t".$res{'accn'}."\t".$selfinfo{$set}{'taxid'}."\t".$res{'hit2accn'}."\t".$res{'hit2'}."\n";
-				}
-			}
-		}
-	}
-	$s = $0; $s =~ s/analyzer\.pl$/bbh.pl/;
-	system "$^X $s $wkDir";
-	unlink "$wkDir/result/bbh_input.txt";
-	open IN, "<$wkDir/result/bbh.txt";
-	while (<IN>){
-		s/\s+$//;
-		@a = split (/\t/);
-		$results{$a[0]}[$a[1]]{'BBH'} = "" if ($a[$#a] ne '1');
-	}
-	close IN;
-	print " done.\n";
+if ($BBH == 2) {
+    print "Conducting bidirectional best hit (BBH) search...\n";
+    unless (-e "$wkDir/result/bbh_input.txt") {
+        open OUT, ">$wkDir/result/bbh_input.txt";
+        foreach my $set (keys %results) {
+            for ($i=0; $i<@{$results{$set}}; $i++) {
+                my %res = %{$results{$set}[$i]};
+                if (exists $res{'BBH'} and $res{'BBH'}) {
+                    # set id query_accn query_taxid subject_accn subject_taxid
+                    print OUT $set."\t".$i."\t".$res{'accn'}."\t".$selfinfo{$set}{'taxid'}."\t".$res{'hit2accn'}."\t".$res{'hit2'}."\n";
+                }
+            }
+        }
+    }
+    $s = $0; $s =~ s/analyzer\.pl$/bbh.pl/;
+    system "$^X $s $wkDir";
+    unlink "$wkDir/result/bbh_input.txt";
+    open IN, "<$wkDir/result/bbh.txt";
+    while (<IN>) {
+        s/\s+$//;
+        @a = split (/\t/);
+        $results{$a[0]}[$a[1]]{'BBH'} = "" if ($a[$#a] ne '1');
+    }
+    close IN;
+    print " done.\n";
 }
 
 
@@ -1306,103 +1306,103 @@
 mkdir "$wkDir/result/detail" unless -d "$wkDir/result/detail";
 
 print "Predicting...";
-foreach my $set (keys %results){
-	$n = @{$results{$set}};
-	for ($i=0; $i<$n; $i++){
-		my %res = %{$results{$set}[$i]};
-		next if ($minHits && ($res{'n'} < $minHits));
-		next if ($minSize && ($res{'length'} < $minSize));
-
-		my $fp = $set; 	$fp = '0' if $unite;
-
-		# principle: fewer hits suggests income or loss
-		# check if self hits are low (indicating the gene is not prevalent within the group)
-		if (($res{'N0'} < $fpN{$fp}{'0'}{'cutoff'}) or ($res{'N0'} == 0)){
-			$res{'in'} = 1;
-		}
-
-		## predict incoming HGT events ##
-		# hits from close sister groups are low, indicating there's no vertical ancestor
-		if (($res{'N1'} < $fpN{$fp}{'1'}{'cutoff'}) or ($res{'N1'} == 0)){
-			# overall non-self hits are normal, indicating it's not an origination event
-			if ($res{'N2'} and $res{'N2'} >= $fpN{$fp}{'2'}{'cutoff'}){
-				$res{'income'} = 1;
-				$res{'income'} = "" if ($selfLow and (($res{'N0'} > 0) and ($res{'N0'} >= $fpN{$fp}{'0'}{'cutoff'})));
-			}
-		}
-		
-		## predict gene loss events ##
-		# hits from close sister groups are normal,
-		elsif ($res{'in'}){
-			# self hits are low, suggesting the gene is absent in some lineages
-			$res{'loss'} = 1;
-		}
-
-		if (0){ # not available in this version
-			## predict gene origination event ##
-			# detection won't work for saturated search result.
-			if ($maxHits and ($res{'N0'}+$res{'N1'}+$res{'N2'}) < $maxHits){	
-				# if every hit is self, then must be origination.
-				unless ($res{'N1'}+$res{'N2'}){	
-					$res{'origin'} = 1;							
-				}else{
-					# if non-self hits are significantly weak.
-					$s = 0;
-					$s = $res{'S1'}[0] if $res{'N1'};
-					$s = $res{'S2'}[0] if ($res{'N2'} and $res{'S2'}[0] > $s);
-				}
-			}
-		}
-
-		# predicting gene outcoming events
-		# self hits must be normal, excluding any paralogs
-		unless ($res{'in'}){
-			if ($res{'N1'}+$res{'N2'} and exists($res{'sGenus'}{'q1'})){
-				if ($res{'nGenus'}{'max'} > $res{'sGenus'}{'q1'}){		# the best hit from other genera fall within the range of self genus hits
-					@a = @{$res{'nGenus'}{'scores'}};
-					$s = 0;
-					foreach (@a){
-						if ($_ < $res{'sGenus'}{'min'}){ last;}
-						else{ $s = $_;}
-					}
-					if ($s > $res{'sGenus'}{'q1'}){
-						$res{'genus_outcome'} = $res{'nGenusHit'};
-					}
-				}
-			}
-		}
-		$results{$set}[$i] = {%res};
-	}
-
-	###### Output report ######
-
-	open (OUT, ">$wkDir/result/detail/$set.txt");
-	@a = ('Query','Length','Product','Hits','Self','Close','Distal','HGT');
-	unless ($BBH){
-		push (@a, "Loss") if $loss;
-		push (@a, "POE") if $POE;
-	}
-	push @a, "Match";
-	print OUT "HGTector result of $set\n".join("\t",@a)."\n";
-	for ($i=0; $i<$n; $i++){
-		%h = %{$results{$set}[$i]};
-		print OUT $h{'query'}."\t".$h{'length'}."\t".$h{'product'}."\t".$h{'n'}."\t";
-		print OUT "\n" and next if ($minHits and ($h{'n'} < $minHits)) or ($minSize and ($h{'length'} < $minSize)); #####??????#######
-		print OUT sprintf("%.2f", $h{'N0'})."\t".sprintf("%.2f", $h{'N1'})."\t".sprintf("%.2f", $h{'N2'})."\t";
-		unless ($BBH){
-			print OUT $h{'income'};
-			print OUT "\t".$h{'loss'} if $loss;
-			print OUT "\t".$h{'origin'} if $POE;
-		}else{
-			print OUT $h{'BBH'} if exists $h{'BBH'} and $h{'BBH'};
-		}
-		print OUT "\t";
-		if (exists $h{'hit2'} and exists $taxadb{$h{'hit2'}}){
-			print OUT $h{'hit2'}." (".$taxadb{$h{'hit2'}}{'name'}.")";
-		}
-		print OUT "\n";
-	}
-	close OUT;
+foreach my $set (keys %results) {
+    $n = @{$results{$set}};
+    for ($i=0; $i<$n; $i++) {
+        my %res = %{$results{$set}[$i]};
+        next if ($minHits && ($res{'n'} < $minHits));
+        next if ($minSize && ($res{'length'} < $minSize));
+
+        my $fp = $set;     $fp = '0' if $unite;
+
+        # principle: fewer hits suggests income or loss
+        # check if self hits are low (indicating the gene is not prevalent within the group)
+        if (($res{'N0'} < $fpN{$fp}{'0'}{'cutoff'}) or ($res{'N0'} == 0)) {
+            $res{'in'} = 1;
+        }
+
+        ## predict incoming HGT events ##
+        # hits from close sister groups are low, indicating there's no vertical ancestor
+        if (($res{'N1'} < $fpN{$fp}{'1'}{'cutoff'}) or ($res{'N1'} == 0)) {
+            # overall non-self hits are normal, indicating it's not an origination event
+            if ($res{'N2'} and $res{'N2'} >= $fpN{$fp}{'2'}{'cutoff'}) {
+                $res{'income'} = 1;
+                $res{'income'} = "" if ($selfLow and (($res{'N0'} > 0) and ($res{'N0'} >= $fpN{$fp}{'0'}{'cutoff'})));
+            }
+        }
+
+        ## predict gene loss events ##
+        # hits from close sister groups are normal,
+        elsif ($res{'in'}) {
+            # self hits are low, suggesting the gene is absent in some lineages
+            $res{'loss'} = 1;
+        }
+
+        if (0) { # not available in this version
+            ## predict gene origination event ##
+            # detection won't work for saturated search result.
+            if ($maxHits and ($res{'N0'}+$res{'N1'}+$res{'N2'}) < $maxHits) {
+                # if every hit is self, then must be origination.
+                unless ($res{'N1'}+$res{'N2'}) {
+                    $res{'origin'} = 1;
+                } else {
+                    # if non-self hits are significantly weak.
+                    $s = 0;
+                    $s = $res{'S1'}[0] if $res{'N1'};
+                    $s = $res{'S2'}[0] if ($res{'N2'} and $res{'S2'}[0] > $s);
+                }
+            }
+        }
+
+        # predicting gene outcoming events
+        # self hits must be normal, excluding any paralogs
+        unless ($res{'in'}) {
+            if ($res{'N1'}+$res{'N2'} and exists($res{'sGenus'}{'q1'})) {
+                if ($res{'nGenus'}{'max'} > $res{'sGenus'}{'q1'}) {        # the best hit from other genera fall within the range of self genus hits
+                    @a = @{$res{'nGenus'}{'scores'}};
+                    $s = 0;
+                    foreach (@a) {
+                        if ($_ < $res{'sGenus'}{'min'}) { last;}
+                        else { $s = $_;}
+                    }
+                    if ($s > $res{'sGenus'}{'q1'}) {
+                        $res{'genus_outcome'} = $res{'nGenusHit'};
+                    }
+                }
+            }
+        }
+        $results{$set}[$i] = {%res};
+    }
+
+    ###### Output report ######
+
+    open (OUT, ">$wkDir/result/detail/$set.txt");
+    @a = ('Query','Length','Product','Hits','Self','Close','Distal','HGT');
+    unless ($BBH) {
+        push (@a, "Loss") if $loss;
+        push (@a, "POE") if $POE;
+    }
+    push @a, "Match";
+    print OUT "HGTector result of $set\n".join("\t",@a)."\n";
+    for ($i=0; $i<$n; $i++) {
+        %h = %{$results{$set}[$i]};
+        print OUT $h{'query'}."\t".$h{'length'}."\t".$h{'product'}."\t".$h{'n'}."\t";
+        print OUT "\n" and next if ($minHits and ($h{'n'} < $minHits)) or ($minSize and ($h{'length'} < $minSize)); #####??????#######
+        print OUT sprintf("%.2f", $h{'N0'})."\t".sprintf("%.2f", $h{'N1'})."\t".sprintf("%.2f", $h{'N2'})."\t";
+        unless ($BBH) {
+            print OUT $h{'income'};
+            print OUT "\t".$h{'loss'} if $loss;
+            print OUT "\t".$h{'origin'} if $POE;
+        } else {
+            print OUT $h{'BBH'} if exists $h{'BBH'} and $h{'BBH'};
+        }
+        print OUT "\t";
+        if (exists $h{'hit2'} and exists $taxadb{$h{'hit2'}}) {
+            print OUT $h{'hit2'}." (".$taxadb{$h{'hit2'}}{'name'}.")";
+        }
+        print OUT "\n";
+    }
+    close OUT;
 }
 print " done.\n";
 print "Prediction results are saved in result/detail/.\n";
@@ -1414,170 +1414,169 @@
 ## sub routines ##
 ##################
 
-sub median (@){
-	my $mid = int(@_/2);
-	return $_[0] unless $mid;
-	if (@_ % 2) {
-		return $_[$mid];
-	}else{
-		return ($_[$mid-1] + $_[$mid])/2;
-	}
+sub median (@) {
+    my $mid = int(@_/2);
+    return $_[0] unless $mid;
+    if (@_ % 2) {
+        return $_[$mid];
+    } else {
+        return ($_[$mid-1] + $_[$mid])/2;
+    }
 }
 
-sub mad (@){
-	my @absdev;
-	my $median = median(@_);
-	foreach my $x(@_){
-		push @absdev, abs($x - $median);
-	}
-	return median(sort{$a<=>$b}@absdev);
+sub mad (@) {
+    my @absdev;
+    my $median = median(@_);
+    foreach my $x(@_) {
+        push @absdev, abs($x - $median);
+    }
+    return median(sort{$a<=>$b}@absdev);
 }
 
-sub quantiles (@){
-	my $Q1, my $Q3;
-	my $mid = int(@_/2);
-	return ($_[0],$_[0]) unless $mid;
-	if (@_ % 2) {
-		$Q1 = median(@_[0..$mid]);
-		$Q3 = median(@_[$mid..$#_]);
-	}else{
-		$Q1 = median(@_[0..($mid-1)]);
-		$Q3 = median(@_[$mid..$#_]);
-	}
-	return ($Q1,$Q3);
+sub quantiles (@) {
+    my $Q1, my $Q3;
+    my $mid = int(@_/2);
+    return ($_[0],$_[0]) unless $mid;
+    if (@_ % 2) {
+        $Q1 = median(@_[0..$mid]);
+        $Q3 = median(@_[$mid..$#_]);
+    } else {
+        $Q1 = median(@_[0..($mid-1)]);
+        $Q3 = median(@_[$mid..$#_]);
+    }
+    return ($Q1,$Q3);
 }
 
 
 # compute Z-score (Z = (xi-x^)/s)
 
-sub z_scores(@){
-	my $mean = 0;
-	$mean += $_ for @_;
-	$mean = $mean / @_;
-	my $stdev = 0;
-	$stdev += ($mean-$_)**2 for @_;
-	$stdev = sqrt($stdev/(@_-1));
-	my @z = ();
-	push (@z, ($_-$mean)/$stdev) for @_;
-	return @z;
+sub z_scores(@) {
+    my $mean = 0;
+    $mean += $_ for @_;
+    $mean = $mean / @_;
+    my $stdev = 0;
+    $stdev += ($mean-$_)**2 for @_;
+    $stdev = sqrt($stdev/(@_-1));
+    my @z = ();
+    push (@z, ($_-$mean)/$stdev) for @_;
+    return @z;
 }
 
 
 # Z-score test for outliers (|Z| > 3)
 
-sub z_test(@){
-	my @data = sort{$a<=>$b}@_;
-	my @z = z_scores(@data);
-	my $lower_fence = $data[0];
-	my $upper_fence = $data[$#data];
-	for (my $i=0; $i<=$#data; $i++){
-		if (abs($z[$i]) <= 3){
-			$lower_fence = $data[$i];
-			last;
-		}
-	}
-	for (my $i=$#data; $i>=0; $i--){
-		if (abs($z[$i]) <= 3){
-			$upper_fence = $data[$i];
-			last;
-		}
-	}
-	return ($lower_fence, $upper_fence);
+sub z_test(@) {
+    my @data = sort{$a<=>$b}@_;
+    my @z = z_scores(@data);
+    my $lower_fence = $data[0];
+    my $upper_fence = $data[$#data];
+    for (my $i=0; $i<=$#data; $i++) {
+        if (abs($z[$i]) <= 3) {
+            $lower_fence = $data[$i];
+            last;
+        }
+    }
+    for (my $i=$#data; $i>=0; $i--) {
+        if (abs($z[$i]) <= 3) {
+            $upper_fence = $data[$i];
+            last;
+        }
+    }
+    return ($lower_fence, $upper_fence);
 }
 
 # modified Z-score test for outliers (|modified_Z| > 3.5) (Iglewicz and Hoaglin, 1993)
 
-sub modified_z(@){
-	my @data = sort{$a<=>$b}@_;
-	my $lower_fence = $data[0];
-	my $upper_fence = $data[$#data];
-	my $median = median(@data);
-	my $mad = mad(@data);
-	return ($data[0],$data[$#data]) unless $mad;
-	for (my $i=0; $i<=$#data; $i++){
-		if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5){
-			$lower_fence = $data[$i];
-			last;
-		}
-	}
-	for (my $i=$#data; $i>=0; $i--){
-		if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5){
-			$upper_fence = $data[$i];
-			last;
-		}
-	}
-	return ($lower_fence, $upper_fence);
+sub modified_z(@) {
+    my @data = sort{$a<=>$b}@_;
+    my $lower_fence = $data[0];
+    my $upper_fence = $data[$#data];
+    my $median = median(@data);
+    my $mad = mad(@data);
+    return ($data[0],$data[$#data]) unless $mad;
+    for (my $i=0; $i<=$#data; $i++) {
+        if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5) {
+            $lower_fence = $data[$i];
+            last;
+        }
+    }
+    for (my $i=$#data; $i>=0; $i--) {
+        if (abs(0.6745*($data[$i]-$median)/$mad) <= 3.5) {
+            $upper_fence = $data[$i];
+            last;
+        }
+    }
+    return ($lower_fence, $upper_fence);
 }
 
 
 # boxplot test for outliers
 
-sub boxplot(@){
-	my $lower_fence, my $upper_fence;
-	my @data = sort{$a<=>$b}@_;
-	my @Q = quantiles(@data);
-	my $iqr = $Q[1]-$Q[0];
-	my $f = 3*exp(10/@data);
-	$lower_fence = $Q[0]-$f*$iqr;
-	$upper_fence = $Q[1]+$f*$iqr;
-	return ($lower_fence, $upper_fence);
+sub boxplot(@) {
+    my $lower_fence, my $upper_fence;
+    my @data = sort{$a<=>$b}@_;
+    my @Q = quantiles(@data);
+    my $iqr = $Q[1]-$Q[0];
+    my $f = 3*exp(10/@data);
+    $lower_fence = $Q[0]-$f*$iqr;
+    $upper_fence = $Q[1]+$f*$iqr;
+    return ($lower_fence, $upper_fence);
 }
 
 
 # adjusted boxplot test for outliers
 
-sub adjusted_boxplot(@){
-	my $lower_fence, my $upper_fence;
-	my @data = sort{$a<=>$b}@_;
-	my $median = median(@data);
-	my @lower; my @upper;
-	foreach (@data){
-		push (@lower, $_) if ($_ <= $median);
-		push (@upper, $_) if ($_ >= $median);
-	}
-	my @kernel;
-	foreach my $i (@lower){
-		foreach my $j (@upper){
-			next if ($i == $j);
-			push @kernel, (($j-$median)-($median-$i))/($j-$i);
-		}
-	}
-	my $mc = median(sort{$a<=>$b}@kernel);
-	my @Q = quantiles(@data);
-	my $iqr = $Q[1]-$Q[0];
-	my $f = 1.5; # *exp(10/@data);
-	if ($mc >= 0){
-		$lower_fence = $Q[0]-$f*exp(-3.5*$mc)*$iqr;
-		$upper_fence = $Q[1]+$f*exp(4*$mc)*$iqr;
-	}else{
-		$lower_fence = $Q[0]-$f*exp(-4*$mc)*$iqr;
-		$upper_fence = $Q[1]+$f*exp(3.5*$mc)*$iqr;
-	}
-	return ($lower_fence, $upper_fence);
+sub adjusted_boxplot(@) {
+    my $lower_fence, my $upper_fence;
+    my @data = sort{$a<=>$b}@_;
+    my $median = median(@data);
+    my @lower; my @upper;
+    foreach (@data) {
+        push (@lower, $_) if ($_ <= $median);
+        push (@upper, $_) if ($_ >= $median);
+    }
+    my @kernel;
+    foreach my $i (@lower) {
+        foreach my $j (@upper) {
+            next if ($i == $j);
+            push @kernel, (($j-$median)-($median-$i))/($j-$i);
+        }
+    }
+    my $mc = median(sort{$a<=>$b}@kernel);
+    my @Q = quantiles(@data);
+    my $iqr = $Q[1]-$Q[0];
+    my $f = 1.5; # *exp(10/@data);
+    if ($mc >= 0) {
+        $lower_fence = $Q[0]-$f*exp(-3.5*$mc)*$iqr;
+        $upper_fence = $Q[1]+$f*exp(4*$mc)*$iqr;
+    } else {
+        $lower_fence = $Q[0]-$f*exp(-4*$mc)*$iqr;
+        $upper_fence = $Q[1]+$f*exp(3.5*$mc)*$iqr;
+    }
+    return ($lower_fence, $upper_fence);
 }
 
-sub recurse_deOutlier(@){ # assume data are high to low
-	my @data = @_;
-	my @fences = adjusted_boxplot @data;
-	for ($i=0; $i<=$#data; $i++){
-		if ($data[$i] < $fences[0]){
-			$i--;
-			last;
-		}
-	}
-	if ($i < $#data){
-		@data = @data[0..$i];
-		@data = recurse_deOutlier @data;
-	}
-	return @data;
+sub recurse_deOutlier(@) { # assume data are high to low
+    my @data = @_;
+    my @fences = adjusted_boxplot @data;
+    for ($i=0; $i<=$#data; $i++) {
+        if ($data[$i] < $fences[0]) {
+            $i--;
+            last;
+        }
+    }
+    if ($i < $#data) {
+        @data = @data[0..$i];
+        @data = recurse_deOutlier @data;
+    }
+    return @data;
 }
 
-sub recurse_Z(@){
-	my @data = @_;
-	if (modified_z(@data) > 3.5){
-		pop @data;
-		@data = recurse_Z(@data);
-	}
-	return @data;
+sub recurse_Z(@) {
+    my @data = @_;
+    if (modified_z(@data) > 3.5) {
+        pop @data;
+        @data = recurse_Z(@data);
+    }
+    return @data;
 }
-
diff --git a/scripts/bbh.pl b/scripts/bbh.pl
old mode 100755
new mode 100644
index facbe79..24a416d
--- a/scripts/bbh.pl
+++ b/scripts/bbh.pl
@@ -25,17 +25,17 @@
 
 ## global variables ##
 
-my @queries = ();									# list of query proteins
-my %done = ();										# queries that is already done.
+my @queries = ();                                    # list of query proteins
+my %done = ();                                        # queries that is already done.
 
-my $query;											# current query protein (accession number)
-my $taxid;											# target organism
-my $subject;										# subject protein (accession number)
-my $eqText;											# entrez query text
-my $isBBH;											# whether a hit is BBH
+my $query;                                            # current query protein (accession number)
+my $taxid;                                            # target organism
+my $subject;                                        # subject protein (accession number)
+my $eqText;                                            # entrez query text
+my $isBBH;                                            # whether a hit is BBH
 
-my $iRetry = 0;										# current number of retries
-my $isError = 0;									# if error
+my $iRetry = 0;                                        # current number of retries
+my $isError = 0;                                    # if error
 
 use LWP::UserAgent;
 use HTTP::Request::Common qw(POST);
@@ -50,97 +50,97 @@
 
 my $wkDir = $ARGV[0];
 
-my $httpBlast = 0;									# BLAST mode (0: http, 1: local, 2: remote)
-my $maxHits = 0;									# maximum number of valid hits to preserve. if 0 then = nHit
+my $httpBlast = 0;                                    # BLAST mode (0: http, 1: local, 2: remote)
+my $maxHits = 0;                                    # maximum number of valid hits to preserve. if 0 then = nHit
 
-my $retries = 10;									# maximum number of retries
-my $nHit = 100;										# number of hits to return
-my $evalue = 0.01;									# maximum E-value cutoff
+my $retries = 10;                                    # maximum number of retries
+my $nHit = 100;                                        # number of hits to return
+my $evalue = 0.01;                                    # maximum E-value cutoff
 
 my $blastServer = "http://blast.ncbi.nlm.nih.gov/Blast.cgi";
 my $protdb = "nr";
 
 my $blastdbcmd = "blastdbcmd";
 my $blastp = "blastp";
-my $threads = 1;									# Multiple threads for local BLAST program
+my $threads = 1;                                    # Multiple threads for local BLAST program
 
 ## read configurations ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		$httpBlast = $1 if /^httpBlast=(\d)$/;
-		$nHit = $1 if /^nHit=(\d+)$/;
-		$evalue = $1 if /^evalue=(.+)$/;
-		$retries = $1 if /^retries=(\d+)$/;
-		$maxHits = $1 if /^maxHits=(\d+)$/;
-		$dbBlast = $1 if /^dbBlast=(.+)$/;
-		$eqText = $1 if /^eqText=(.+)$/;
-		$blastServer = $1 if /^blastServer=(.+)$/;
-		$blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
-		$blastp = $1 if /^blastp=(.+)$/;
-		$threads = $1 if /^threads=(\d+)$/;
-	}
-	close IN;
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        $httpBlast = $1 if /^httpBlast=(\d)$/;
+        $nHit = $1 if /^nHit=(\d+)$/;
+        $evalue = $1 if /^evalue=(.+)$/;
+        $retries = $1 if /^retries=(\d+)$/;
+        $maxHits = $1 if /^maxHits=(\d+)$/;
+        $dbBlast = $1 if /^dbBlast=(.+)$/;
+        $eqText = $1 if /^eqText=(.+)$/;
+        $blastServer = $1 if /^blastServer=(.+)$/;
+        $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
+        $blastp = $1 if /^blastp=(.+)$/;
+        $threads = $1 if /^threads=(\d+)$/;
+    }
+    close IN;
 }
 
 
 ## read query list ##
 
 open IN, "<$wkDir/result/bbh_input.txt" or die "Error: input file not accessible.\n";
-while(<IN>){
-	s/\s+$//;
-	@a = split (/\t/);
-	# '1' is subject and '2' is query
-	%h = ('set', $a[0], 'id', $a[1], 'accn1', $a[2], 'taxid1', $a[3], 'accn2', $a[4], 'taxid2', $a[5]);
-	push @queries, {%h};
+while(<IN>) {
+    s/\s+$//;
+    @a = split (/\t/);
+    # '1' is subject and '2' is query
+    %h = ('set', $a[0], 'id', $a[1], 'accn1', $a[2], 'taxid1', $a[3], 'accn2', $a[4], 'taxid2', $a[5]);
+    push @queries, {%h};
 }
 close IN;
 
 ## read previous results ##
 
-if (-e "$wkDir/result/bbh.txt"){
-	open IN, "<$wkDir/result/bbh.txt";
-	while (<IN>){
-		s/\s+$//;
-		@a = split (/\t/);
-		$done{$a[0]."_".$a[2]} = 1;
-	}
-	close IN;
-	my $nDone = 0;
-	for ($i=0; $i<=$#queries; $i++){
-		if (exists $done{$queries[$i]{'set'}."_".$queries[$i]{'accn1'}}){
-			$queries[$i]{'done'} = 1;
-			$nDone ++;
-		}
-	}
-	if ((scalar @queries) <= $nDone){
-		print "Reversal BLAST is already completed. Exiting...\n";
-		exit 0;
-	}else{
-		print "".(scalar @queries)." proteins in total, $nDone completed, remaining ".(scalar @queries - $nDone)." to BLAST.\n";
-	}
-}else{
-	print "".(scalar @queries)." proteins to BLAST.\n";
+if (-e "$wkDir/result/bbh.txt") {
+    open IN, "<$wkDir/result/bbh.txt";
+    while (<IN>) {
+        s/\s+$//;
+        @a = split (/\t/);
+        $done{$a[0]."_".$a[2]} = 1;
+    }
+    close IN;
+    my $nDone = 0;
+    for ($i=0; $i<=$#queries; $i++) {
+        if (exists $done{$queries[$i]{'set'}."_".$queries[$i]{'accn1'}}) {
+            $queries[$i]{'done'} = 1;
+            $nDone ++;
+        }
+    }
+    if ((scalar @queries) <= $nDone) {
+        print "Reversal BLAST is already completed. Exiting...\n";
+        exit 0;
+    } else {
+        print "".(scalar @queries)." proteins in total, $nDone completed, remaining ".(scalar @queries - $nDone)." to BLAST.\n";
+    }
+} else {
+    print "".(scalar @queries)." proteins to BLAST.\n";
 }
 
 ## perform batch BLAST ##
 
-for ($i=0; $i<=$#queries; $i++){
-	next if exists $queries[$i]{'done'};
-	$isBBH = 0;
-	$query = $queries[$i]{'accn2'};
-	$query =~ s/\.[\d]+$//;
-	$subject = $queries[$i]{'accn1'};
-	$subject =~ s/\.[\d]+$//;
-	$taxid = $queries[$i]{'taxid1'};
-	$eqText = "txid$taxid [ORGN]";
-	print "BLASTing $query ...";
-	blast;
-	open OUT, ">>$wkDir/result/bbh.txt";
-	print OUT $queries[$i]{'set'}."\t".$queries[$i]{'id'}."\t".$queries[$i]{'accn1'}."\t".$queries[$i]{'taxid1'}."\t".$queries[$i]{'accn2'}."\t".$queries[$i]{'taxid2'}."\t$isBBH\n";	close OUT;
-	print " $isBBH.\n";
+for ($i=0; $i<=$#queries; $i++) {
+    next if exists $queries[$i]{'done'};
+    $isBBH = 0;
+    $query = $queries[$i]{'accn2'};
+    $query =~ s/\.[\d]+$//;
+    $subject = $queries[$i]{'accn1'};
+    $subject =~ s/\.[\d]+$//;
+    $taxid = $queries[$i]{'taxid1'};
+    $eqText = "txid$taxid [ORGN]";
+    print "BLASTing $query ...";
+    blast;
+    open OUT, ">>$wkDir/result/bbh.txt";
+    print OUT $queries[$i]{'set'}."\t".$queries[$i]{'id'}."\t".$queries[$i]{'accn1'}."\t".$queries[$i]{'taxid1'}."\t".$queries[$i]{'accn2'}."\t".$queries[$i]{'taxid2'}."\t$isBBH\n";    close OUT;
+    print " $isBBH.\n";
 }
 unlink "$wkDir/blast.seq";
 print "Reversal BLAST is completed.\n";
@@ -152,89 +152,86 @@
 
 ## perform BLAST ##
 
-sub blast (){
-
-	## BLAST using standalone ncbi-blast+ program ##
-
-	if ($httpBlast){
-		my @hits;
-		# the report contains: accession, gi, length, taxid, sequence, title
-		my @out = `$blastdbcmd -db $dbBlast -entry $query -outfmt \"%a %g %l %T %s %t\"`;
-		foreach (@out){
-			s/\s+$//; @b = split (/\s+/);
-			last if ($b[0] eq $query or $b[0] =~/^$query\.\d+$/);
-		}
-		my $length = $b[2];
-		open TMP, ">$wkDir/blast.seq"; print TMP $b[4]; close TMP;
-		# the report contains: subject accessions (all), E-value, bit score, aligned part of subject sequence
-		@out = `$blastp -query $wkDir/blast.seq -db $dbBlast -entrez_query \"$eqText\" -remote -outfmt \"6 sallacc evalue bitscore\"`;
-		@a = split (/\t/, $out[0]);
-		my @accns = split (/;/, $a[0]);
-		foreach (@accns){
-			if ($_ eq $subject){ $isBBH = 1; last; }
-		}
-	}
-
-	## BLAST via http connection to NCBI server ##
-
-	else{
-		$isError = 0;
-		my $args = "CMD=Put&PROGRAM=blastp&DATABASE=$dbBlast&QUERY=$query&EQ_TEXT=$eqText";
-		my $req = new HTTP::Request POST => $blastServer;
-		$req->content_type('application/x-www-form-urlencoded');
-		$req->content($args);
-		my $response = $ua->request($req);
-		my $rid;
-		if ($response->content =~ /^    RID = (.*$)/m){ $rid = $1; }else{ retry; return; };
-		if ($response->content =~ /^    RTOE = (.*$)/m){ sleep $1; }else{ retry; return; };
-		while (1){
-			sleep 5;
-			$args = "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
-			$req = new HTTP::Request GET => $args;
-			$response = $ua->request($req);
-			if ($response->content =~ /\s+Status=WAITING/m){ next; }
-			if ($response->content =~ /\s+Status=FAILED/m){ $isError = 1; last; }
-			if ($response->content =~ /\s+Status=UNKNOWN/m){ $isError = 1; last; }
-			if ($response->content =~ /\s+Status=READY/m){
-				if ($response->content =~ /\s+ThereAreHits=yes/m){ last; }
-				else{ last; } # no hits;
-			}
-			$isError = 1;
-			last;
-		}
-		if ($isError){ retry; return; }
-		$args = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
-		$req = new HTTP::Request GET => $args;
-		$response = $ua->request($req);
-		if ($response->content !~ /blastp/){ retry; return; }
-		my @out = split(/\n/, $response->content);
-		my $read = 0;
-		foreach (@out){
-			if (/hits? found/){ $read = 1; next; }
-			next unless $read;
-#			print;
-			@a = split (/\t/);
-			if ($a[1] =~ $subject){ $isBBH = 1; }
-			last;
-		}
-		unless ($read){ retry; return; }
-	}
-	return;
+sub blast () {
+
+    ## BLAST using standalone ncbi-blast+ program ##
+
+    if ($httpBlast) {
+        my @hits;
+        # the report contains: accession, gi, length, taxid, sequence, title
+        my @out = `$blastdbcmd -db $dbBlast -entry $query -outfmt \"%a %g %l %T %s %t\"`;
+        foreach (@out) {
+            s/\s+$//; @b = split (/\s+/);
+            last if ($b[0] eq $query or $b[0] =~/^$query\.\d+$/);
+        }
+        my $length = $b[2];
+        open TMP, ">$wkDir/blast.seq"; print TMP $b[4]; close TMP;
+        # the report contains: subject accessions (all), E-value, bit score, aligned part of subject sequence
+        @out = `$blastp -query $wkDir/blast.seq -db $dbBlast -entrez_query \"$eqText\" -remote -outfmt \"6 sallacc evalue bitscore\"`;
+        @a = split (/\t/, $out[0]);
+        my @accns = split (/;/, $a[0]);
+        foreach (@accns) {
+            if ($_ eq $subject) { $isBBH = 1; last; }
+        }
+    }
+
+    ## BLAST via http connection to NCBI server ##
+
+    else{
+        $isError = 0;
+        my $args = "CMD=Put&PROGRAM=blastp&DATABASE=$dbBlast&QUERY=$query&EQ_TEXT=$eqText";
+        my $req = new HTTP::Request POST => $blastServer;
+        $req->content_type('application/x-www-form-urlencoded');
+        $req->content($args);
+        my $response = $ua->request($req);
+        my $rid;
+        if ($response->content =~ /^    RID = (.*$)/m) { $rid = $1; } else { retry; return; };
+        if ($response->content =~ /^    RTOE = (.*$)/m) { sleep $1; } else { retry; return; };
+        while (1) {
+            sleep 5;
+            $args = "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+            $req = new HTTP::Request GET => $args;
+            $response = $ua->request($req);
+            if ($response->content =~ /\s+Status=WAITING/m) { next; }
+            if ($response->content =~ /\s+Status=FAILED/m) { $isError = 1; last; }
+            if ($response->content =~ /\s+Status=UNKNOWN/m) { $isError = 1; last; }
+            if ($response->content =~ /\s+Status=READY/m) {
+                if ($response->content =~ /\s+ThereAreHits=yes/m) { last; }
+                else{ last; } # no hits;
+            }
+            $isError = 1;
+            last;
+        }
+        if ($isError) { retry; return; }
+        $args = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+        $req = new HTTP::Request GET => $args;
+        $response = $ua->request($req);
+        if ($response->content !~ /blastp/) { retry; return; }
+        my @out = split(/\n/, $response->content);
+        my $read = 0;
+        foreach (@out) {
+            if (/hits? found/) { $read = 1; next; }
+            next unless $read;
+#            print;
+            @a = split (/\t/);
+            if ($a[1] =~ $subject) { $isBBH = 1; }
+            last;
+        }
+        unless ($read) { retry; return; }
+    }
+    return;
 }
 
 ## retry BLAST ##
 
-sub retry (){
-	if ($iRetry < $retries){ # retry
-		print ".";
-		$iRetry ++;
-		sleep 10;
-		blast;
-	}else{ # fail
-		$iRetry = 0;
-		$isBBH = "error";
-	}
+sub retry () {
+    if ($iRetry < $retries) { # retry
+        print ".";
+        $iRetry ++;
+        sleep 10;
+        blast;
+    } else { # fail
+        $iRetry = 0;
+        $isBBH = "error";
+    }
 }
-
-
-
diff --git a/scripts/orthologer.pl b/scripts/orthologer.pl
old mode 100755
new mode 100644
index bae019a..0f90675
--- a/scripts/orthologer.pl
+++ b/scripts/orthologer.pl
@@ -26,72 +26,72 @@
 
 ## Program parameters ##
 
-my $wkDir = $ARGV[0];								# working directory
-my @sets = ();										# list of protein sets
+my $wkDir = $ARGV[0];                                # working directory
+my @sets = ();                                        # list of protein sets
 
-my %taxa = ();										# set -> taxid
-my %proteins = ();									# all input proteins (accn -> id)
+my %taxa = ();                                        # set -> taxid
+my %proteins = ();                                    # all input proteins (accn -> id)
 
-my @cogs = ();										# all COGs. array of hashes, including: 
-														# accns -> accn1/accn2/accn3...
-														# names -> name1	name2	name3...
-														# nProtein -> number of proteins
-														# nSet -> number of genomes
-														# name -> best name
-my %names = ();										# id -> names
-my %accns = ();										# id -> accns
-my $cogid = 1;										# current id
+my @cogs = ();                                        # all COGs. array of hashes, including:
+                                                        # accns -> accn1/accn2/accn3...
+                                                        # names -> name1    name2    name3...
+                                                        # nProtein -> number of proteins
+                                                        # nSet -> number of genomes
+                                                        # name -> best name
+my %names = ();                                        # id -> names
+my %accns = ();                                        # id -> accns
+my $cogid = 1;                                        # current id
 
-my $evalue = 1e-5;									# maximum E-value threshold
+my $evalue = 1e-5;                                    # maximum E-value threshold
 
-my $pairRule = 3;									# pairing rule
+my $pairRule = 3;                                    # pairing rule
 
 ## Read configuration ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		@sets = split (/,/, $1) if /^inSets=(.+)$/;
-		$evalue = $1 if /^evalue=(.+)$/;
-		$pairRule = $1 if /^pairRule=(\d)$/;
-	}
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        @sets = split (/,/, $1) if /^inSets=(.+)$/;
+        $evalue = $1 if /^evalue=(.+)$/;
+        $pairRule = $1 if /^pairRule=(\d)$/;
+    }
 }
 
-unless (@sets){
-	opendir (DIR, "$wkDir/search");
-	@a = readdir(DIR);
-	close DIR;
-	foreach (@a){
-		next if /^\./;
-		next unless -d "$wkDir/search/$_";
-		push @sets, $_;
-	}
+unless (@sets) {
+    opendir (DIR, "$wkDir/search");
+    @a = readdir(DIR);
+    close DIR;
+    foreach (@a) {
+        next if /^\./;
+        next unless -d "$wkDir/search/$_";
+        push @sets, $_;
+    }
 }
 
 ## Read input proteins ##
 
 print "Identifying orthologous groups (OGs)...\n";
 print "  Reading input proteins... ";
-foreach my $set (@sets){
-	next unless -d "$wkDir/search/$set";
-	opendir (DIR, "$wkDir/search/$set");
-	my @blasts = grep(/\.txt$/,readdir(DIR));
-	close DIR;
-	foreach (@blasts){
-		/^(.+)\.txt$/;
-		%h = ('set', $set, 'id', "");
-		$proteins{$1} = {%h};	
-	}
+foreach my $set (@sets) {
+    next unless -d "$wkDir/search/$set";
+    opendir (DIR, "$wkDir/search/$set");
+    my @blasts = grep(/\.txt$/,readdir(DIR));
+    close DIR;
+    foreach (@blasts) {
+        /^(.+)\.txt$/;
+        %h = ('set', $set, 'id', "");
+        $proteins{$1} = {%h};
+    }
 }
 print " done. ".(scalar keys %proteins)." proteins read.\n";
 
 
 ## Read input taxonomy ##
 open IN, "<$wkDir/taxonomy/self.info";
-while (<IN>){
-	@a = split(/\t/);
-	$taxa{$a[0]} = $a[2];
+while (<IN>) {
+    @a = split(/\t/);
+    $taxa{$a[0]} = $a[2];
 }
 close IN;
 
@@ -101,99 +101,99 @@
 ##########################################################
 
 print "  Parsing BLAST results...\n    ";
-foreach my $set (@sets){
-	next unless -d "$wkDir/search/$set";
-	opendir (DIR, "$wkDir/search/$set");
-	my @blasts = grep(/\.txt$/,readdir(DIR));
-	close DIR;
-	my $myTaxon = $taxa{$set};
-	print "$set ";
-	foreach (@blasts){
-		/^(.+)\.txt$/;
-		my $id; # array id of OGs
-		my $myAccn = $1;
-		next unless exists $proteins{$myAccn};
-		open IN, "<$wkDir/search/$set/$_" or next;
-		my $product;
-		my $reading = 0;
-		@b = (); # all accns occurred in this report
-		%h = (); # store used taxids, to rule out duplicates.
-		while (<IN>){
-			s/\s+$//; next unless $_;
-			last if $reading and /^;/;
-			if (/^\tProduct=(.+);$/){ $product = $1; $product =~ s/\s+$//; }
-			if (/BEGIN ORGANISM;/){ $reading = 1; next; }
-			next unless $reading;
-			@a = split (/\t/);
-			next if $a[4] > $evalue;
-			if ($pairRule == 2 or $pairRule == 4 or $pairRule == 5){ # must be best hit from another organism
-				next if $a[2] eq $myTaxon;
-				next if exists $h{$a[2]};
-				$h{$a[2]} = 1;
-			}
-			@a = split (/\//, $a[0]);
-			foreach (@a){
-				next if $_ eq $myAccn;
-				if (exists $proteins{$_}){
-					next if $proteins{$_}{'id'}; # if a protein is already assigned an OG ID, then skip
-					push (@b, $_);
-				}
-			}
-		}
-		close IN;
-
-		if ($proteins{$myAccn}{'id'}){ # existed OG
-			$id = $proteins{$myAccn}{'id'};
-		}else{ # new OG
-			%h = ('accns', $myAccn, 'names', $product);
-			push @cogs, {%h};
-			$id = $#cogs;
-			$proteins{$myAccn}{'id'} = $id;
-		}
-		
-		# check reversal Blast results
-		
-		foreach my $accn (@b){
-			next if $proteins{$accn}{'id'};
-			$product = "";
-			$reading = 0;
-			my $found = 0; # the source gene is found in the reverse Blast report
-			open IN, "<$wkDir/search/$proteins{$accn}{'set'}/$accn.txt" or next;
-			while (<IN>){
-				s/\s+$//; next unless $_;
-				last if $reading and /^;/;
-				if (/^\tProduct=(.+);$/){
-					$product = $1;
-					$product =~ s/\s+$//;
-					last if ($pairRule < 3); # single direction
-				}
-				if (/BEGIN ORGANISM;/){ $reading = 1; next; }
-				next unless $reading;
-				@a = split (/\t/);
-				next if $a[4] > $evalue;
-				my $taxon = $a[2];
-				@a = split (/\//, $a[0]);
-				foreach (@a){
-					if ($myAccn eq $_){
-						$found = 1;
-						last;
-					}
-				}
-				last if $found;
-				last if (($pairRule == 4) and ($myTaxon eq $taxon) and !$found);
-			}
-			close IN;
-			if ($pairRule >= 3){ # bidirectional hits
-				unless ($found){
-					# print "not match! $set $myAccn <> $accn.\n";
-					next;
-				}
-			}
-			$proteins{$accn}{'id'} = $id;
-			$cogs[$id]{'names'} .= "\t$product" if $product;
-			$cogs[$id]{'accns'} .= "/$accn";
-		}
-	}
+foreach my $set (@sets) {
+    next unless -d "$wkDir/search/$set";
+    opendir (DIR, "$wkDir/search/$set");
+    my @blasts = grep(/\.txt$/,readdir(DIR));
+    close DIR;
+    my $myTaxon = $taxa{$set};
+    print "$set ";
+    foreach (@blasts) {
+        /^(.+)\.txt$/;
+        my $id; # array id of OGs
+        my $myAccn = $1;
+        next unless exists $proteins{$myAccn};
+        open IN, "<$wkDir/search/$set/$_" or next;
+        my $product;
+        my $reading = 0;
+        @b = (); # all accns occurred in this report
+        %h = (); # store used taxids, to rule out duplicates.
+        while (<IN>) {
+            s/\s+$//; next unless $_;
+            last if $reading and /^;/;
+            if (/^\tProduct=(.+);$/) { $product = $1; $product =~ s/\s+$//; }
+            if (/BEGIN ORGANISM;/) { $reading = 1; next; }
+            next unless $reading;
+            @a = split (/\t/);
+            next if $a[4] > $evalue;
+            if ($pairRule == 2 or $pairRule == 4 or $pairRule == 5) { # must be best hit from another organism
+                next if $a[2] eq $myTaxon;
+                next if exists $h{$a[2]};
+                $h{$a[2]} = 1;
+            }
+            @a = split (/\//, $a[0]);
+            foreach (@a) {
+                next if $_ eq $myAccn;
+                if (exists $proteins{$_}) {
+                    next if $proteins{$_}{'id'}; # if a protein is already assigned an OG ID, then skip
+                    push (@b, $_);
+                }
+            }
+        }
+        close IN;
+
+        if ($proteins{$myAccn}{'id'}) { # existed OG
+            $id = $proteins{$myAccn}{'id'};
+        } else { # new OG
+            %h = ('accns', $myAccn, 'names', $product);
+            push @cogs, {%h};
+            $id = $#cogs;
+            $proteins{$myAccn}{'id'} = $id;
+        }
+
+        # check reversal Blast results
+
+        foreach my $accn (@b) {
+            next if $proteins{$accn}{'id'};
+            $product = "";
+            $reading = 0;
+            my $found = 0; # the source gene is found in the reverse Blast report
+            open IN, "<$wkDir/search/$proteins{$accn}{'set'}/$accn.txt" or next;
+            while (<IN>) {
+                s/\s+$//; next unless $_;
+                last if $reading and /^;/;
+                if (/^\tProduct=(.+);$/) {
+                    $product = $1;
+                    $product =~ s/\s+$//;
+                    last if ($pairRule < 3); # single direction
+                }
+                if (/BEGIN ORGANISM;/) { $reading = 1; next; }
+                next unless $reading;
+                @a = split (/\t/);
+                next if $a[4] > $evalue;
+                my $taxon = $a[2];
+                @a = split (/\//, $a[0]);
+                foreach (@a) {
+                    if ($myAccn eq $_) {
+                        $found = 1;
+                        last;
+                    }
+                }
+                last if $found;
+                last if (($pairRule == 4) and ($myTaxon eq $taxon) and !$found);
+            }
+            close IN;
+            if ($pairRule >= 3) { # bidirectional hits
+                unless ($found) {
+                    # print "not match! $set $myAccn <> $accn.\n";
+                    next;
+                }
+            }
+            $proteins{$accn}{'id'} = $id;
+            $cogs[$id]{'names'} .= "\t$product" if $product;
+            $cogs[$id]{'accns'} .= "/$accn";
+        }
+    }
 }
 print "\n  done. ".(scalar @cogs)." OGs identified.\n";
 
@@ -202,51 +202,51 @@
 # rule: the shortest name without "hypothetical"
 
 print "  Naming OGs...";
-for ($i=0; $i<=$#cogs; $i++){
-	@a = split(/\t/, $cogs[$i]{'names'});
-	@a = sort {length($a) <=> length($b)}(@a);
-	my @noHypo = ();
-	foreach (@a){
-		push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
-	}
-	if (@noHypo){ $cogs[$i]{'name'} = "$noHypo[0]"; }
-	else{ $cogs[$i]{'name'} = "$a[0]"; }
+for ($i=0; $i<=$#cogs; $i++) {
+    @a = split(/\t/, $cogs[$i]{'names'});
+    @a = sort {length($a) <=> length($b)}(@a);
+    my @noHypo = ();
+    foreach (@a) {
+        push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
+    }
+    if (@noHypo) { $cogs[$i]{'name'} = "$noHypo[0]"; }
+    else{ $cogs[$i]{'name'} = "$a[0]"; }
 }
 print " done.\n";
 
 
-#	my $shortName; my $longName;
-#	my @noHypo; # subset of names which does not contain "hypothetical"
-#	foreach (@a){
-#		push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
-#	}
-#	if (@noHypo){
-#		foreach (@noHypo){
-#			if (/^([A-Za-z0-9]+) gene product$/ or /^([A-Za-z0-9]+) protein$/ or /^protein ([A-Za-z0-9]+)$/){
-#				$shortName = $1; last;
-#			}
-#		}
-#		@noHypo = sort {length($a) <=> length($b)}(@noHypo);
-#		$longName = $noHypo[$#noHypo];
-#		$shortName or $shortName = $noHypo[0];
-#	}else{
-#		@a = sort {length($a) <=> length($b)}(@a);
-#		$longName = $a[$#a];
-#		$shortName or $shortName = $a[0];
-#	}
-#	$names{$key} = $shortName;
-#	# $names{$key} = "$shortName\t$longName";
+#    my $shortName; my $longName;
+#    my @noHypo; # subset of names which does not contain "hypothetical"
+#    foreach (@a) {
+#        push (@noHypo, $_) unless (/hypothetical/ or /hypotethical/ or /hypothetcial/);
+#    }
+#    if (@noHypo) {
+#        foreach (@noHypo) {
+#            if (/^([A-Za-z0-9]+) gene product$/ or /^([A-Za-z0-9]+) protein$/ or /^protein ([A-Za-z0-9]+)$/) {
+#                $shortName = $1; last;
+#            }
+#        }
+#        @noHypo = sort {length($a) <=> length($b)}(@noHypo);
+#        $longName = $noHypo[$#noHypo];
+#        $shortName or $shortName = $noHypo[0];
+#    } else {
+#        @a = sort {length($a) <=> length($b)}(@a);
+#        $longName = $a[$#a];
+#        $shortName or $shortName = $a[0];
+#    }
+#    $names{$key} = $shortName;
+#    # $names{$key} = "$shortName\t$longName";
 
 
 ## Sort COGs ##
 
 print "  Sorting OGs...";
-for ($i=0; $i<=$#cogs; $i++){
-	@a = split(/\//, $cogs[$i]{'accns'});
-	%h = ();
-	$h{$proteins{$_}{'set'}} = 1 for @a;
-	$cogs[$i]{'nProtein'} = scalar (@a);
-	$cogs[$i]{'nSet'} = scalar (keys %h);
+for ($i=0; $i<=$#cogs; $i++) {
+    @a = split(/\//, $cogs[$i]{'accns'});
+    %h = ();
+    $h{$proteins{$_}{'set'}} = 1 for @a;
+    $cogs[$i]{'nProtein'} = scalar (@a);
+    $cogs[$i]{'nSet'} = scalar (keys %h);
 }
 print " done.\n";
 
@@ -256,12 +256,11 @@
 ## Export COGs ##
 
 open OUT, ">$wkDir/taxonomy/orthology.db";
-for ($i=0; $i<=$#cogs; $i++){
-	print OUT ($i+1)."|$cogs[$i]{'name'}\t$cogs[$i]{'accns'}\n";
+for ($i=0; $i<=$#cogs; $i++) {
+    print OUT ($i+1)."|$cogs[$i]{'name'}\t$cogs[$i]{'accns'}\n";
 }
 close OUT;
 
 print "  OG names and members exported as taxonomy/orthology.db.\n";
 
 exit 0;
-
diff --git a/scripts/reporter.pl b/scripts/reporter.pl
old mode 100755
new mode 100644
index 295ab3a..2ce63ec
--- a/scripts/reporter.pl
+++ b/scripts/reporter.pl
@@ -31,228 +31,228 @@
 my $wkDir = $ARGV[0];
 my $interactive = 1;
 
-my $deHypo = 0;					# ignore hypothetical proteins
+my $deHypo = 0;                    # ignore hypothetical proteins
 
-my $byDonor = 1;					# summarize HGT events by organism
+my $byDonor = 1;                    # summarize HGT events by organism
 my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
-my $sumRank = "order";			# on this rank
-my $defOnly = 1;					# ignore those without this rank defined
+my $sumRank = "order";            # on this rank
+my $defOnly = 1;                    # ignore those without this rank defined
 
-my $byFunction = 0;				# summarize HGT by function
-my $dirFunction;					# directory containing functional annotations
+my $byFunction = 0;                # summarize HGT by function
+my $dirFunction;                    # directory containing functional annotations
 
-my $byOrthology = 0;				# generate report by ortholog
-my $smOrthology;					# file containing scheme of gene orthology
-my $nameOGs = 1;					# Name COGs if necessary
-my $exORFan = 1;					# Exclude ORFans
-my @cogs;							# clusters of orthologs (COGs) [ID] -> name short (long), predicted?
-my %cogids = ();					# clusters of orthologs (COGs): accn -> ID
-my $isCOG = 0;						# whether use COG
-my %ORFans = ();					# ORFans set -> (array)
+my $byOrthology = 0;                # generate report by ortholog
+my $smOrthology;                    # file containing scheme of gene orthology
+my $nameOGs = 1;                    # Name COGs if necessary
+my $exORFan = 1;                    # Exclude ORFans
+my @cogs;                            # clusters of orthologs (COGs) [ID] -> name short (long), predicted?
+my %cogids = ();                    # clusters of orthologs (COGs): accn -> ID
+my $isCOG = 0;                        # whether use COG
+my %ORFans = ();                    # ORFans set -> (array)
 
-my $outText = 1;					# generate report in plain text
-my $outHTML = 1;					# generate report in web page (HTML)
-my $outExcel = 0;					# generate report in Excel spreadsheet
-my $detailExcel = 1;				# attach detailed output in Excel workbook
+my $outText = 1;                    # generate report in plain text
+my $outHTML = 1;                    # generate report in web page (HTML)
+my $outExcel = 0;                    # generate report in Excel spreadsheet
+my $detailExcel = 1;                # attach detailed output in Excel workbook
 
 
 ## global variables ##
 
-my $title;							# title of the analysis
-my @sets;							# protein sets (genomes)
+my $title;                            # title of the analysis
+my @sets;                            # protein sets (genomes)
 
-my %results = ();					# master variable of results: $results{$set}{$accn}{$match}
+my %results = ();                    # master variable of results: $results{$set}{$accn}{$match}
 
-my %totals = ();					# total number of genes per genome
+my %totals = ();                    # total number of genes per genome
 my %organisms = ();
 
-my %taxadb = ();					# taxa.db
-my %ranksdb = ();					# ranks.db
+my %taxadb = ();                    # taxa.db
+my %ranksdb = ();                    # ranks.db
 
-my $workbook;						# Excel workbook
-my $worksheet;						# Excel worksheet
-my $excelRow;						# active row number
-my $excelTitle;					# Excel title format
-my $excelHeader;					# Excel header format
-my ($excelGrey, $excelGreen, $excelYellow, $excelRed);				# Excel data formats
+my $workbook;                        # Excel workbook
+my $worksheet;                        # Excel worksheet
+my $excelRow;                        # active row number
+my $excelTitle;                    # Excel title format
+my $excelHeader;                    # Excel header format
+my ($excelGrey, $excelGreen, $excelYellow, $excelRed);                # Excel data formats
 
 my $iLoss = 0; my $iPOE = 0; my $iMatch = 0;
 
 
 ## Read configuration ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		@sets = split (/,/, $1) if /^inSets=(.+)$/;
-		$title = $1 if /^title=(.+)$/;
-		$deHypo = $1 if /^deHypo=([012])$/;
-		$detailExcel = $1 if /^detailExcel=([01])$/;
-
-		$byDonor = $1 if /^byDonor=([01])$/;
-		@ranks = split (/,/, $1) if /^ranks=(.+)$/;
-		$sumRank = $1 if /^sumRank=(.+)$/;
-		$defOnly = $1 if /^definedOnly=([01])$/;
-		
-		$byOrthology = $1 if /^byOrthology=([01])$/;
-		$smOrthology = $1 if /^smOrthology=(.+)$/;
-		$nameOGs = $1 if /^nameOGs=([01])$/;
-		$exORFan = $1 if /^exORFan=([01])$/;
-		
-		$byFunction = $1 if /^byFunction=([01])$/;
-		$dirFunction = $1 if /^dirFunction=(.+)$/;
-		
-		$outText = $1 if /^outText=([01])$/;
-		$outExcel = $1 if /^outExcel=([01])$/;
-		$outHTML = $1 if /^outHTML=([01])$/;
-		
-		$interactive = $1 if /^interactive=([01])$/;
-	}
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        @sets = split (/,/, $1) if /^inSets=(.+)$/;
+        $title = $1 if /^title=(.+)$/;
+        $deHypo = $1 if /^deHypo=([012])$/;
+        $detailExcel = $1 if /^detailExcel=([01])$/;
+
+        $byDonor = $1 if /^byDonor=([01])$/;
+        @ranks = split (/,/, $1) if /^ranks=(.+)$/;
+        $sumRank = $1 if /^sumRank=(.+)$/;
+        $defOnly = $1 if /^definedOnly=([01])$/;
+
+        $byOrthology = $1 if /^byOrthology=([01])$/;
+        $smOrthology = $1 if /^smOrthology=(.+)$/;
+        $nameOGs = $1 if /^nameOGs=([01])$/;
+        $exORFan = $1 if /^exORFan=([01])$/;
+
+        $byFunction = $1 if /^byFunction=([01])$/;
+        $dirFunction = $1 if /^dirFunction=(.+)$/;
+
+        $outText = $1 if /^outText=([01])$/;
+        $outExcel = $1 if /^outExcel=([01])$/;
+        $outHTML = $1 if /^outHTML=([01])$/;
+
+        $interactive = $1 if /^interactive=([01])$/;
+    }
 }
 
 
 ## check previous result ##
 
-if (-e "$wkDir/result/summary.txt" or -d "$wkDir/result/HGT"){
-	print "Warning: Summarized report from a previous analysis is detected.\n";
-	if ($interactive){ print "Press Enter to overwrite, or press Ctrl+C to exit:\n"; $s = <STDIN>; }
-	else{ print "To be overwritten.\n"; }
+if (-e "$wkDir/result/summary.txt" or -d "$wkDir/result/HGT") {
+    print "Warning: Summarized report from a previous analysis is detected.\n";
+    if ($interactive) { print "Press Enter to overwrite, or press Ctrl+C to exit:\n"; $s = <STDIN>; }
+    else{ print "To be overwritten.\n"; }
 }
 
 
 ## Read input protein sets ##
 
-unless (@sets){
-	opendir (DIR, "$wkDir/result/detail") or die "Error: result directory not accessible.\n";
-	@sets = grep(/\.txt$/,readdir(DIR)) or die "Error: no result found.\n";
-	for ($i=0; $i<=$#sets; $i++){ $sets[$i] =~ s/\.txt$//; }
-	close DIR;
+unless (@sets) {
+    opendir (DIR, "$wkDir/result/detail") or die "Error: result directory not accessible.\n";
+    @sets = grep(/\.txt$/,readdir(DIR)) or die "Error: no result found.\n";
+    for ($i=0; $i<=$#sets; $i++) { $sets[$i] =~ s/\.txt$//; }
+    close DIR;
 }
 
 
 ## Read taxonomy information ##
 
 open IN, "<$wkDir/taxonomy/taxa.db";
-while (<IN>){
-	s/\s+$//; next if /^#/; next unless $_;
-	@a = split /\t/;
-	%h = ('name',$a[1],'rank',$a[2]);
-	$i = 3; $h{$_} = $a[$i++] for (@ranks);
-	$taxadb{$a[0]} = {%h};
+while (<IN>) {
+    s/\s+$//; next if /^#/; next unless $_;
+    @a = split /\t/;
+    %h = ('name',$a[1],'rank',$a[2]);
+    $i = 3; $h{$_} = $a[$i++] for (@ranks);
+    $taxadb{$a[0]} = {%h};
 }
 close IN;
 open IN, "<$wkDir/taxonomy/ranks.db";
-while (<IN>){
-	s/\s+$//; next if /^#/; next unless $_;
-	@a = split /\t/;
-	$ranksdb{$a[0]} = $a[1];
+while (<IN>) {
+    s/\s+$//; next if /^#/; next unless $_;
+    @a = split /\t/;
+    $ranksdb{$a[0]} = $a[1];
 }
 close IN;
 
 
 ## Computing and/or reading COG scheme ##
 
-if ($byOrthology){
-	if ($smOrthology){
-		unless (-e $smOrthology){
-			print "Orthology scheme file $smOrthology not accessible.\n";
-			$smOrthology = "$wkDir/taxonomy/orthology.db";
-		}
-	}else{ $smOrthology = "$wkDir/taxonomy/orthology.db"; }
-	unless (-e $smOrthology){ # Identify orthology scenario using BBH
-		$s = $0; $s =~ s/analyzer\.pl$/orthologer.pl/;
-		system "$^X $s $wkDir";
-	}
-	unless (-e $smOrthology){
-		print "Error: Identification of orthology failed.\n";
-		exit 1;
-	}
-	if ($nameOGs){ # Name OGs if necessary
-		$i = 0;
-		open IN, "<$smOrthology";
-		while (<IN>){
-			s/\s+$//; next if /^#/; next unless $_;
-			if (/\t/){ $i = 1; }
-			last;
-		}
-		close IN;
-		unless ($i){
-			system "perl scripts/cogNamer.pl $wkDir $smOrthology $wkDir/taxonomy/orthology.named.db";
-		}
-		$smOrthology = "$wkDir/taxonomy/orthology.named.db" if -e "$wkDir/taxonomy/orthology.named.db";
-	}
-	open IN, "<$smOrthology";
-	while (<IN>){
-		s/\s+$//; next if /^#/; next unless $_;
-		@a = split (/\t/);
-		@b = split (/[\s\/,]/, $a[$#a]);
-		next if ($exORFan and !$#b);
-		%h = ();
-		$h{'name'} = "";
-		$h{'name'} = $a[0] if ($#a);
-		$h{'data'} = [("") x @sets];
-		push @cogs, {%h};
-		$cogids{$_} = $#cogs for @b;
-	}
-	close IN;
-	$isCOG = 1 if @cogs;
+if ($byOrthology) {
+    if ($smOrthology) {
+        unless (-e $smOrthology) {
+            print "Orthology scheme file $smOrthology not accessible.\n";
+            $smOrthology = "$wkDir/taxonomy/orthology.db";
+        }
+    } else { $smOrthology = "$wkDir/taxonomy/orthology.db"; }
+    unless (-e $smOrthology) { # Identify orthology scenario using BBH
+        $s = $0; $s =~ s/analyzer\.pl$/orthologer.pl/;
+        system "$^X $s $wkDir";
+    }
+    unless (-e $smOrthology) {
+        print "Error: Identification of orthology failed.\n";
+        exit 1;
+    }
+    if ($nameOGs) { # Name OGs if necessary
+        $i = 0;
+        open IN, "<$smOrthology";
+        while (<IN>) {
+            s/\s+$//; next if /^#/; next unless $_;
+            if (/\t/) { $i = 1; }
+            last;
+        }
+        close IN;
+        unless ($i) {
+            system "perl scripts/cogNamer.pl $wkDir $smOrthology $wkDir/taxonomy/orthology.named.db";
+        }
+        $smOrthology = "$wkDir/taxonomy/orthology.named.db" if -e "$wkDir/taxonomy/orthology.named.db";
+    }
+    open IN, "<$smOrthology";
+    while (<IN>) {
+        s/\s+$//; next if /^#/; next unless $_;
+        @a = split (/\t/);
+        @b = split (/[\s\/,]/, $a[$#a]);
+        next if ($exORFan and !$#b);
+        %h = ();
+        $h{'name'} = "";
+        $h{'name'} = $a[0] if ($#a);
+        $h{'data'} = [("") x @sets];
+        push @cogs, {%h};
+        $cogids{$_} = $#cogs for @b;
+    }
+    close IN;
+    $isCOG = 1 if @cogs;
 }
 
 
 ## Read prediction results ##
 
-foreach my $set (@sets){
-	my %result = ();
-	my %titles = ();
-	@a = (); $ORFans{$set} = [@a];
-	open IN, "<$wkDir/result/detail/$set.txt";
-	while (<IN>){
-		s/\s+$//; next unless $_; next if (/HGTector/);
-		my @a = split (/\t/);
-		if (/^Query\t/){
-			next if ($iLoss+$iPOE+$iMatch);
-			for ($i=0; $i<=$#a; $i++){
-				$iLoss = $i if ($a[$i] eq "Loss");
-				$iPOE = $i if ($a[$i] eq "POE");
-				$iMatch = $i if ($a[$i] eq "Match");
-			}
-			next;
-		}
-		if ($deHypo){ # ignore hypothetical proteins
-			if ($isCOG and exists $cogids{$a[0]}){
-				my $name = $cogs[$cogids{$a[0]}]{'name'};
-				next if ($name =~ /hypothetical/ or $name =~ /hypotethical/ or $name =~ /hypothetcial/);
-			}else{
-				next if ($a[2] =~ /hypothetical/ or $a[2] =~ /hypotethical/ or $a[2] =~ /hypothetcial/);
-			}
-		}
-		my %gene = ();
-		$gene{'length'} = $a[1];
-		$gene{'product'} = $a[2];
-		$gene{'hits'} = $a[3];
-		if ($isCOG){
-			if (exists $cogids{$a[0]}){
-				$gene{'cogid'} = $cogids{$a[0]};
-			}else{
-				push @{$ORFans{$set}}, "$a[0]|$a[2]";
-			}
-		}
-		if ($#a > 4){
-			$gene{'self'} = $a[4];
-			$gene{'close'} = $a[5];
-			$gene{'distal'} = $a[6];
-			$gene{'hgt'} = $a[7];
-			$gene{'loss'} = $a[$iLoss] if $iLoss;
-			$gene{'poe'} = $a[$iPOE] if $iPOE;
-			$gene{'match'} = $a[$iMatch] if $iMatch;
-		}
-		$result{$a[0]} = {%gene};
-	}
-	close IN;
-	$results{$set} = {%result};
-	@a = keys %result;
-	$totals{$set} = @a;
+foreach my $set (@sets) {
+    my %result = ();
+    my %titles = ();
+    @a = (); $ORFans{$set} = [@a];
+    open IN, "<$wkDir/result/detail/$set.txt";
+    while (<IN>) {
+        s/\s+$//; next unless $_; next if (/HGTector/);
+        my @a = split (/\t/);
+        if (/^Query\t/) {
+            next if ($iLoss+$iPOE+$iMatch);
+            for ($i=0; $i<=$#a; $i++) {
+                $iLoss = $i if ($a[$i] eq "Loss");
+                $iPOE = $i if ($a[$i] eq "POE");
+                $iMatch = $i if ($a[$i] eq "Match");
+            }
+            next;
+        }
+        if ($deHypo) { # ignore hypothetical proteins
+            if ($isCOG and exists $cogids{$a[0]}) {
+                my $name = $cogs[$cogids{$a[0]}]{'name'};
+                next if ($name =~ /hypothetical/ or $name =~ /hypotethical/ or $name =~ /hypothetcial/);
+            } else {
+                next if ($a[2] =~ /hypothetical/ or $a[2] =~ /hypotethical/ or $a[2] =~ /hypothetcial/);
+            }
+        }
+        my %gene = ();
+        $gene{'length'} = $a[1];
+        $gene{'product'} = $a[2];
+        $gene{'hits'} = $a[3];
+        if ($isCOG) {
+            if (exists $cogids{$a[0]}) {
+                $gene{'cogid'} = $cogids{$a[0]};
+            } else {
+                push @{$ORFans{$set}}, "$a[0]|$a[2]";
+            }
+        }
+        if ($#a > 4) {
+            $gene{'self'} = $a[4];
+            $gene{'close'} = $a[5];
+            $gene{'distal'} = $a[6];
+            $gene{'hgt'} = $a[7];
+            $gene{'loss'} = $a[$iLoss] if $iLoss;
+            $gene{'poe'} = $a[$iPOE] if $iPOE;
+            $gene{'match'} = $a[$iMatch] if $iMatch;
+        }
+        $result{$a[0]} = {%gene};
+    }
+    close IN;
+    $results{$set} = {%result};
+    @a = keys %result;
+    $totals{$set} = @a;
 }
 
 
@@ -264,90 +264,90 @@
 
 ## load module to access Excel ##
 
-if ($outExcel){
-	eval { require Spreadsheet::WriteExcel; Spreadsheet::WriteExcel->import() };
-	print "Error: Perl module Spreadsheet::WriteExcel is not installed.\n" and exit 1 if ($@);
-	$workbook  = Spreadsheet::WriteExcel->new("$wkDir/result/report.xls");
-	$excelTitle = $workbook->add_format (bold=>1, size=>12, valign=>'vcenter');
-	$excelHeader = $workbook->add_format(bold=>1, align=>'center', top=>1, bottom=>1);
-	$excelGrey = $workbook->add_format(align=>'center', fg_color=>'silver', pattern=>2, border=>1, border_color=>'white');
-	$excelGreen = $workbook->add_format(align=>'center', fg_color=>'lime', pattern=>4, border=>1, border_color=>'white');
-	$excelYellow = $workbook->add_format(align=>'center', fg_color=>'yellow', pattern=>2, border=>1, border_color=>'white');
-	$excelRed = $workbook->add_format(align=>'center', fg_color=>'red', pattern=>4, border=>1, border_color=>'white');
+if ($outExcel) {
+    eval { require Spreadsheet::WriteExcel; Spreadsheet::WriteExcel->import() };
+    print "Error: Perl module Spreadsheet::WriteExcel is not installed.\n" and exit 1 if ($@);
+    $workbook  = Spreadsheet::WriteExcel->new("$wkDir/result/report.xls");
+    $excelTitle = $workbook->add_format (bold=>1, size=>12, valign=>'vcenter');
+    $excelHeader = $workbook->add_format(bold=>1, align=>'center', top=>1, bottom=>1);
+    $excelGrey = $workbook->add_format(align=>'center', fg_color=>'silver', pattern=>2, border=>1, border_color=>'white');
+    $excelGreen = $workbook->add_format(align=>'center', fg_color=>'lime', pattern=>4, border=>1, border_color=>'white');
+    $excelYellow = $workbook->add_format(align=>'center', fg_color=>'yellow', pattern=>2, border=>1, border_color=>'white');
+    $excelRed = $workbook->add_format(align=>'center', fg_color=>'red', pattern=>4, border=>1, border_color=>'white');
 ## This is a good example of conditional formatting code, but I cannot use more than three rules.
-#	$excelData = $workbook->add_format (num_format=>'[Red][<=0]0;[Green][<=20]0;[Blue]0');
+#    $excelData = $workbook->add_format (num_format=>'[Red][<=0]0;[Green][<=20]0;[Blue]0');
 }
 
 
-if ($outHTML){
-	open HTML, ">$wkDir/result/report.html";
-	print HTML "<!DOCTYPE html>\n<html>\n <head>\n  <title>$title</title>\n </head>\n <body>\n";
-	print HTML "  <p><h1>$title</h1></p>\n";
+if ($outHTML) {
+    open HTML, ">$wkDir/result/report.html";
+    print HTML "<!DOCTYPE html>\n<html>\n <head>\n  <title>$title</title>\n </head>\n <body>\n";
+    print HTML "  <p><h1>$title</h1></p>\n";
 }
 
 
 ## Generate general report ##
 
 mkdir "$wkDir/result/HGT" unless -d "$wkDir/result/HGT";
-if ($iLoss){ mkdir "$wkDir/result/loss" unless -d "$wkDir/result/loss"; }
-if ($iPOE){ mkdir "$wkDir/result/POE" unless -d "$wkDir/result/POE"; }
-if ($outText){
-	open OUT, ">$wkDir/result/summary.txt";
+if ($iLoss) { mkdir "$wkDir/result/loss" unless -d "$wkDir/result/loss"; }
+if ($iPOE) { mkdir "$wkDir/result/POE" unless -d "$wkDir/result/POE"; }
+if ($outText) {
+    open OUT, ">$wkDir/result/summary.txt";
 }
-if ($outHTML){
-	print HTML "  <p><h2>Summary</h2></p>";
-	print HTML "  <table border=1>\n   <tr>\n    <th width=120>Genome</th>\n    <th width=120>Total</th>\n    <th width=120>HGT</th>\n";
-	print HTML "    <th width=120>Loss</th>\n" if $iLoss;
-	print HTML "    <th width=120>POE</th>\n" if $iPOE;
-	print HTML "   </tr>\n";
+if ($outHTML) {
+    print HTML "  <p><h2>Summary</h2></p>";
+    print HTML "  <table border=1>\n   <tr>\n    <th width=120>Genome</th>\n    <th width=120>Total</th>\n    <th width=120>HGT</th>\n";
+    print HTML "    <th width=120>Loss</th>\n" if $iLoss;
+    print HTML "    <th width=120>POE</th>\n" if $iPOE;
+    print HTML "   </tr>\n";
 }
-if ($outExcel){
-	$worksheet = $workbook->add_worksheet ('Summary');
-	$worksheet->set_row (0, 24);
-	$worksheet->set_column (0, 4, 12);
-	$worksheet->write (0, 0,  $title, $excelTitle);
-	$worksheet->write (1, 0, ['Genome', 'Total', 'HGT'], $excelHeader);
-	$worksheet->write (1, 3, 'Loss', $excelHeader) if $iLoss;
-	$i = 3; $i ++ if $iLoss;
-	$worksheet->write (1, $i, 'POE', $excelHeader) if $iPOE;
-	$excelRow = 2;
+if ($outExcel) {
+    $worksheet = $workbook->add_worksheet ('Summary');
+    $worksheet->set_row (0, 24);
+    $worksheet->set_column (0, 4, 12);
+    $worksheet->write (0, 0,  $title, $excelTitle);
+    $worksheet->write (1, 0, ['Genome', 'Total', 'HGT'], $excelHeader);
+    $worksheet->write (1, 3, 'Loss', $excelHeader) if $iLoss;
+    $i = 3; $i ++ if $iLoss;
+    $worksheet->write (1, $i, 'POE', $excelHeader) if $iPOE;
+    $excelRow = 2;
 }
 
 my $allhgt; # total number of HGT-derived genes
-foreach my $set (@sets){
-	my $hgt = 0; my $loss = 0; my $poe = 0;
-	open HGT, ">$wkDir/result/HGT/$set.txt";
-	open LOSS, ">$wkDir/result/loss/$set.txt" if $iLoss;
-	open POE, ">$wkDir/result/POE/$set.txt" if $iPOE;
-	my %result = %{$results{$set}};
-	my $total = keys %result;
-	foreach my $accn (sort keys %result){
-		print HGT $accn."\n" and $hgt ++ if (exists $result{$accn}{'hgt'} and $result{$accn}{'hgt'});
-		if ($iLoss){ print LOSS $accn."\n" and $loss ++ if (exists $result{$accn}{'loss'} and $result{$accn}{'loss'}); }
-		if ($iPOE){ print POE $accn."\n" and $poe ++ if (exists $result{$accn}{'poe'} and $result{$accn}{'poe'}); }
-	}
-	close HGT; close LOSS if $iLoss; close POE if $iPOE;
-	if ($outText){
-		print OUT "$set has $total "; print OUT "non-hypothetical " if $deHypo; print OUT "protein-coding genes.\n";
-		print OUT "  HGT: $hgt";
-		print OUT ", Loss: $loss" if $iLoss;
-		print OUT ", POE: $poe" if $iPOE;
-		print OUT ".\n\n";
-	}
-	if ($outHTML){
-		print HTML "   <tr>\n    <td>$set</td>\n    <td><a href=\"detail/$set.txt\">$total</a></td>\n    <td><a href=\"HGT/$set.txt\">$hgt</a></td>\n";
-		print HTML "    <td><a href=\"loss/$set.txt\">$loss</a></td>\n" if $iLoss;
-		print HTML "    <td><a href=\"POE/$set.txt\">$poe</a></td>\n" if $iPOE;
-		print HTML "   </tr>\n";
-	}
-	if ($outExcel){
-		$worksheet->write ($excelRow, 0, [$set, $total, $hgt]);
-		$worksheet->write ($excelRow, 3, $loss) if $iLoss;
-		$i = 3; $i ++ if $iLoss;
-		$worksheet->write ($excelRow, $i, $poe) if $iPOE;
-		$excelRow ++;
-	}
-	$allhgt += $hgt;
+foreach my $set (@sets) {
+    my $hgt = 0; my $loss = 0; my $poe = 0;
+    open HGT, ">$wkDir/result/HGT/$set.txt";
+    open LOSS, ">$wkDir/result/loss/$set.txt" if $iLoss;
+    open POE, ">$wkDir/result/POE/$set.txt" if $iPOE;
+    my %result = %{$results{$set}};
+    my $total = keys %result;
+    foreach my $accn (sort keys %result) {
+        print HGT $accn."\n" and $hgt ++ if (exists $result{$accn}{'hgt'} and $result{$accn}{'hgt'});
+        if ($iLoss) { print LOSS $accn."\n" and $loss ++ if (exists $result{$accn}{'loss'} and $result{$accn}{'loss'}); }
+        if ($iPOE) { print POE $accn."\n" and $poe ++ if (exists $result{$accn}{'poe'} and $result{$accn}{'poe'}); }
+    }
+    close HGT; close LOSS if $iLoss; close POE if $iPOE;
+    if ($outText) {
+        print OUT "$set has $total "; print OUT "non-hypothetical " if $deHypo; print OUT "protein-coding genes.\n";
+        print OUT "  HGT: $hgt";
+        print OUT ", Loss: $loss" if $iLoss;
+        print OUT ", POE: $poe" if $iPOE;
+        print OUT ".\n\n";
+    }
+    if ($outHTML) {
+        print HTML "   <tr>\n    <td>$set</td>\n    <td><a href=\"detail/$set.txt\">$total</a></td>\n    <td><a href=\"HGT/$set.txt\">$hgt</a></td>\n";
+        print HTML "    <td><a href=\"loss/$set.txt\">$loss</a></td>\n" if $iLoss;
+        print HTML "    <td><a href=\"POE/$set.txt\">$poe</a></td>\n" if $iPOE;
+        print HTML "   </tr>\n";
+    }
+    if ($outExcel) {
+        $worksheet->write ($excelRow, 0, [$set, $total, $hgt]);
+        $worksheet->write ($excelRow, 3, $loss) if $iLoss;
+        $i = 3; $i ++ if $iLoss;
+        $worksheet->write ($excelRow, $i, $poe) if $iPOE;
+        $excelRow ++;
+    }
+    $allhgt += $hgt;
 }
 close OUT if ($outText);
 print HTML "  </table>\n" if ($outHTML);
@@ -355,454 +355,453 @@
 
 ## Generate by donor group report ##
 
-if ($byDonor){
-	foreach my $set (@sets){
-		my %result = %{$results{$set}};
-		my $count = 0;
-		foreach my $accn (keys %result){
-			$count ++;
-			next unless exists $result{$accn}{'hgt'};
-			next unless $result{$accn}{'hgt'};
-			next unless exists $result{$accn}{'match'};
-			next unless $result{$accn}{'match'};
-			my $organism = $result{$accn}{'match'};
-			my $group = "";
-			if ($sumRank){
-				$organism =~ /^(\d+) \(/;
-				if (exists $taxadb{$1} and exists $taxadb{$1}{$sumRank} and $taxadb{$1}{$sumRank} and exists $ranksdb{$taxadb{$1}{$sumRank}}){
-					# next unless exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'} eq '28211'; # alphaproteobacteria
-					# next unless substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"; # bacteria
-					# next if exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'};
-					# $organism = $ranksdb{$taxadb{$1}{'phylum'}}.",".$ranksdb{$taxadb{$1}{'class'}}.",".$ranksdb{$taxadb{$1}{'order'}};
-					$organism = $ranksdb{$taxadb{$1}{$sumRank}};
-				}else{ next; }
-				# if (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"){ $organism = "Bacteria"; }
-				# elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2157/"){ $organism = "Archaea"; }
-				# elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2759/"){ $organism = "Eukaryota"; }
-				# else{ next; }
-			}
-			$results{$set}{$accn}{'group'} = $organism;
-			if (exists $organisms{$organism}){
-				if ($organisms{$organism}{$set}){ $organisms{$organism}{$set} .= ",$accn"; }
-				else { $organisms{$organism}{$set} .= $accn; } 
-			}else{
-				my %h = ();
-				$h{$_} = "" for (@sets);
-				$h{$set} = $accn;
-				$organisms{$organism} = {%h};
-			}
-		}
-	}
-	
-	# head row
-	if ($outText){
-		open OUT, ">$wkDir/result/donor.txt";
-		print OUT "Organism\tMean\t".join ("\t", @sets)."\n";
-	}
-	if ($outHTML){
-		print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by putative donor group</h2></p>\n";
-		print HTML "  <p><font size='-1'>HGT-derived genes are summarized by putative donor group as indicated by the best distal match. Note that the real donor might be an ancestor of the match organism, therefore, a higher taxonomic rank is recommended to describe the donor group.";
-		print HTML "<br><font style='background-color: lightgreen'>green</font>: 1-4, <font style='background-color: khaki'>yellow</font>: 5-9, <font style='background-color: lightpink'>red</font>: 10+</font></p>\n";
-		print HTML "  <table border=1>\n   <tr>\n    <th width=250>Donor</th>\n    <th width=80>Mean</th>\n";
-		print HTML "    <th width=120>$_</th>\n" for (@sets);
-		print HTML "   </tr>\n";
-	}
-	if ($outExcel){
-		$worksheet = $workbook->add_worksheet ('Donor');
-		$worksheet->set_row (0, 24);
-		$worksheet->set_column (0, 0, 16);
-		$worksheet->set_column (1, scalar @sets, 7);
-		$worksheet->write (0, 0,  "HGT by donor organism", $excelTitle);
-		$worksheet->write (1, 0, "Donor", $excelHeader);
-		$worksheet->write (1, 1, "Mean", $excelHeader);
-		$worksheet->write (1, 2, \@sets, $excelHeader);
-		$excelRow = 3;
-	}
-	
-	# total row
-	@a = (); push (@a, $totals{$_}) for (@sets);
-	$j = 0; $j += $_ for (@a); $j = sprintf("%d", $j/@a);
-	if ($outText){
-		print OUT "Total\t$j\t".join ("\t", @a)."\n";
-	}
-	if ($outHTML){
-		print HTML "   <tr>\n    <td>Total</td>\n";
-		print HTML "    <td>$j</td>\n";
-		print HTML "    <td>$_</td>\n" for (@a);
-		print HTML "   </tr>\n";
-	}
-	if ($outExcel){
-		$worksheet->write (2, 0, "Total");
-		$worksheet->write (2, 1, $j);
-		$worksheet->write (2, 2, \@a);
-	}
-	
-	# table content
-	foreach my $organism (sort keys %organisms){
-		my @accns = (); # protein accns (comma-separated)
-		my @nums = (); # numbers only
-		foreach (@sets){
-			$s = $organisms{$organism}{$_};
-			push (@accns, $s);
-			@a = split (/,/, $s);
-			$s = @a; push (@nums, $s);
-		}
-		$j = 0; $j += $_ for (@nums); $j = sprintf("%d", $j/@nums);
-		if ($outText){
-			print OUT "$organism\t$j\t".join("\t", @nums)."\n";
-		}
-		if ($outHTML){
-			print HTML "   <tr>\n    <td>$organism</td>\n    <td>$j</td>\n";
-			for ($i=0; $i<=$#nums; $i++){
-				$n = $nums[$i];
-				$t = "";
-				$s = join ("&#013;", split (/,/, $accns[$i]));
-				if ($n > 0 and $n < 5){ $t = "lightgreen"; }
-				elsif ($n >= 5 and $n < 10) { $t = "khaki"; }
-				elsif ($n >= 10) { $t = "lightpink"; }
-				print HTML "    <td title='$s' style='background-color: $t;'>$n</td>\n";
-			}
-			print HTML "   </tr>\n";
-		}
-		if ($outExcel){
-			$worksheet->write ($excelRow, 0, $organism);
-			$worksheet->write ($excelRow, 1, $j);
-			for ($i=0; $i<=$#nums; $i++){
-				next unless $nums[$i]; # (=0)
-				if ($nums[$i] < 5){ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelGreen); }
-				elsif ($nums[$i] < 10){ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelYellow); }
-				else{ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelRed); }
-				$worksheet->write_comment ($excelRow, $i+2, join ("\n", split (/,/, $accns[$i])));
-			}
-			$excelRow ++;
-		}
-	}
-	close OUT if ($outText);
-	print HTML "  </table>\n" if ($outHTML);
-	print "Report by donor organism generated.\n";
+if ($byDonor) {
+    foreach my $set (@sets) {
+        my %result = %{$results{$set}};
+        my $count = 0;
+        foreach my $accn (keys %result) {
+            $count ++;
+            next unless exists $result{$accn}{'hgt'};
+            next unless $result{$accn}{'hgt'};
+            next unless exists $result{$accn}{'match'};
+            next unless $result{$accn}{'match'};
+            my $organism = $result{$accn}{'match'};
+            my $group = "";
+            if ($sumRank) {
+                $organism =~ /^(\d+) \(/;
+                if (exists $taxadb{$1} and exists $taxadb{$1}{$sumRank} and $taxadb{$1}{$sumRank} and exists $ranksdb{$taxadb{$1}{$sumRank}}) {
+                    # next unless exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'} eq '28211'; # alphaproteobacteria
+                    # next unless substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/"; # bacteria
+                    # next if exists $taxadb{$1}{'class'} and $taxadb{$1}{'class'};
+                    # $organism = $ranksdb{$taxadb{$1}{'phylum'}}.",".$ranksdb{$taxadb{$1}{'class'}}.",".$ranksdb{$taxadb{$1}{'order'}};
+                    $organism = $ranksdb{$taxadb{$1}{$sumRank}};
+                } else { next; }
+                # if (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 3) eq "/2/") { $organism = "Bacteria"; }
+                # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2157/") { $organism = "Archaea"; }
+                # elsif (exists $taxadb{$1} and substr ($taxadb{$1}{'rank'}, 0, 6) eq "/2759/") { $organism = "Eukaryota"; }
+                # else{ next; }
+            }
+            $results{$set}{$accn}{'group'} = $organism;
+            if (exists $organisms{$organism}) {
+                if ($organisms{$organism}{$set}) { $organisms{$organism}{$set} .= ",$accn"; }
+                else { $organisms{$organism}{$set} .= $accn; }
+            } else {
+                my %h = ();
+                $h{$_} = "" for (@sets);
+                $h{$set} = $accn;
+                $organisms{$organism} = {%h};
+            }
+        }
+    }
+
+    # head row
+    if ($outText) {
+        open OUT, ">$wkDir/result/donor.txt";
+        print OUT "Organism\tMean\t".join ("\t", @sets)."\n";
+    }
+    if ($outHTML) {
+        print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by putative donor group</h2></p>\n";
+        print HTML "  <p><font size='-1'>HGT-derived genes are summarized by putative donor group as indicated by the best distal match. Note that the real donor might be an ancestor of the match organism, therefore, a higher taxonomic rank is recommended to describe the donor group.";
+        print HTML "<br><font style='background-color: lightgreen'>green</font>: 1-4, <font style='background-color: khaki'>yellow</font>: 5-9, <font style='background-color: lightpink'>red</font>: 10+</font></p>\n";
+        print HTML "  <table border=1>\n   <tr>\n    <th width=250>Donor</th>\n    <th width=80>Mean</th>\n";
+        print HTML "    <th width=120>$_</th>\n" for (@sets);
+        print HTML "   </tr>\n";
+    }
+    if ($outExcel) {
+        $worksheet = $workbook->add_worksheet ('Donor');
+        $worksheet->set_row (0, 24);
+        $worksheet->set_column (0, 0, 16);
+        $worksheet->set_column (1, scalar @sets, 7);
+        $worksheet->write (0, 0,  "HGT by donor organism", $excelTitle);
+        $worksheet->write (1, 0, "Donor", $excelHeader);
+        $worksheet->write (1, 1, "Mean", $excelHeader);
+        $worksheet->write (1, 2, \@sets, $excelHeader);
+        $excelRow = 3;
+    }
+
+    # total row
+    @a = (); push (@a, $totals{$_}) for (@sets);
+    $j = 0; $j += $_ for (@a); $j = sprintf("%d", $j/@a);
+    if ($outText) {
+        print OUT "Total\t$j\t".join ("\t", @a)."\n";
+    }
+    if ($outHTML) {
+        print HTML "   <tr>\n    <td>Total</td>\n";
+        print HTML "    <td>$j</td>\n";
+        print HTML "    <td>$_</td>\n" for (@a);
+        print HTML "   </tr>\n";
+    }
+    if ($outExcel) {
+        $worksheet->write (2, 0, "Total");
+        $worksheet->write (2, 1, $j);
+        $worksheet->write (2, 2, \@a);
+    }
+
+    # table content
+    foreach my $organism (sort keys %organisms) {
+        my @accns = (); # protein accns (comma-separated)
+        my @nums = (); # numbers only
+        foreach (@sets) {
+            $s = $organisms{$organism}{$_};
+            push (@accns, $s);
+            @a = split (/,/, $s);
+            $s = @a; push (@nums, $s);
+        }
+        $j = 0; $j += $_ for (@nums); $j = sprintf("%d", $j/@nums);
+        if ($outText) {
+            print OUT "$organism\t$j\t".join("\t", @nums)."\n";
+        }
+        if ($outHTML) {
+            print HTML "   <tr>\n    <td>$organism</td>\n    <td>$j</td>\n";
+            for ($i=0; $i<=$#nums; $i++) {
+                $n = $nums[$i];
+                $t = "";
+                $s = join ("&#013;", split (/,/, $accns[$i]));
+                if ($n > 0 and $n < 5) { $t = "lightgreen"; }
+                elsif ($n >= 5 and $n < 10) { $t = "khaki"; }
+                elsif ($n >= 10) { $t = "lightpink"; }
+                print HTML "    <td title='$s' style='background-color: $t;'>$n</td>\n";
+            }
+            print HTML "   </tr>\n";
+        }
+        if ($outExcel) {
+            $worksheet->write ($excelRow, 0, $organism);
+            $worksheet->write ($excelRow, 1, $j);
+            for ($i=0; $i<=$#nums; $i++) {
+                next unless $nums[$i]; # (=0)
+                if ($nums[$i] < 5) { $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelGreen); }
+                elsif ($nums[$i] < 10) { $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelYellow); }
+                else{ $worksheet->write_number ($excelRow, $i+2, $nums[$i], $excelRed); }
+                $worksheet->write_comment ($excelRow, $i+2, join ("\n", split (/,/, $accns[$i])));
+            }
+            $excelRow ++;
+        }
+    }
+    close OUT if ($outText);
+    print HTML "  </table>\n" if ($outHTML);
+    print "Report by donor organism generated.\n";
 }
 
 
 ## Generate by function report ##
 
-if ($byFunction and -d $dirFunction){
-
-	## read Blast2GO output ##
-
-	my %goes = ();							# master record of GOes. GOes{ID} = { $group, $term, $nGene, $nHGT, $genes}
-	opendir DIR, $dirFunction;
-	my @files = readdir(DIR);
-	close DIR;
-	foreach my $file (@files){
-		my $set = "";
-		if ($file =~ /(.+)\.[^.]+$/){ $set = $1; }
-		else{ $set = $file; }
-		next unless exists $results{$set};
-		open IN, "<$dirFunction/$file";
-		while (<IN>){
-			s/\s+$//;
-			next if /^SeqName/;
-			@a = split (/\t/);
-			push @a, "" if $#a == 3;
-			my $gene = "";
-			if ($a[0] =~ /\|/){
-				@b = split (/\|/, $a[0]);
-				$b[$#b] =~ s/\.\d+$//;
-				if ($b[$#b] =~ /cdsid_(.+)$/){ $gene = $1; }
-				else{ $gene = $b[$#b]; }
-			}else{
-				$a[0] =~ s/\.\d+$//;
-				$gene = $a[0];
-			}
-			next unless $gene;
-			next unless exists $results{$set}{$gene};
-			$a[3] =~ s/^GO://;
-			if (exists $goes{$a[3]}){
-				$goes{$a[3]}{$set}{'genes'} .= ",$gene";
-			}else{
-				%h = ('nGene', 0, 'nHGT', 0, 'genes', "");
-				my %go = ('group', $a[2], 'term', $a[4]);
-				$go{$_} = {%h} for (@sets);
-				$go{$set}{'genes'} = $gene;
-				$goes{$a[3]} = {%go};
-			}
-			$goes{$a[3]}{$set}{'nGene'} ++;
-			$goes{$a[3]}{$set}{'nHGT'} ++ if $results{$set}{$gene}{'hgt'};
-			if (exists $results{$set}{$gene}{'goes'}){ $results{$set}{$gene}{'goes'} .= "; $a[3] ($a[2]) $a[4]"; }
-			else{ $results{$set}{$gene}{'goes'} = "$a[3] ($a[2]) $a[4]"; }
-		}
-		close IN;
-	}
-	
-	## write report by function ##
-
-	if ($outText){
-		open OUT, ">$wkDir/result/function.txt";
-		print OUT "GO\tGroup\tTerm".join ("\t", @sets)."\n";
-	}
-	if ($outHTML){
-		print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by functional annotation</h2></p>";
-		print HTML "  <p><font size='-1'>HGT-derived genes are summarized by annotated gene ontology (GO). Cell values are ratio of HGT-derived genes associated with a GO versus all genes associated with this GO. Every GO is counted if multiple GOes are associated with one gene.";
-		print HTML "<br>white: 0, <font style='background-color: lightgreen'>green</font>: (0, 0.1), <font style='background-color: khaki'>yellow</font>: [0.1, 0.5), <font style='background-color: lightpink'>red</font>: [0.5, 1]</font></p>\n";
-		print HTML "  <table border=1>\n   <tr>\n    <th width=60>GO</th>\n    <th width=240>Term</th>\n";
-		print HTML "    <th width=120>$_</th>\n" for (@sets);
-		print HTML "   </tr>\n";
-	}
-	if ($outExcel){
-		$worksheet = $workbook->add_worksheet ('Function');
-		$worksheet->set_row (0, 24);
-		$worksheet->set_column (0, 0, 8);
-		$worksheet->set_column (1, 1, 40);
-		$worksheet->set_column (2, 1 + scalar @sets, 7);
-		$worksheet->write (0, 0,  "HGT by function", $excelTitle);
-		$worksheet->write (1, 0, ["GO","Term"], $excelHeader);
-		$worksheet->write (1, 2, \@sets, $excelHeader);
-		$excelRow = 2;
-	}
-	foreach my $group ('C', 'P', 'F'){
-		if ($outHTML){
-			if ($group eq "C"){ print HTML "    <tr><th colspan = '2'>Cellular component</th></tr>\n"; }
-			if ($group eq "P"){ print HTML "    <tr><th colspan = '2'>Biological process</th></tr>\n"; }
-			if ($group eq "F"){ print HTML "    <tr><th colspan = '2'>Molecular function</th></tr>\n"; }
-		}
-		if ($outExcel){
-			if ($group eq "C"){ $worksheet->write ($excelRow++, 1, "Cellular component", $excelHeader); }
-			if ($group eq "P"){ $worksheet->write ($excelRow++, 1, "Biological process", $excelHeader); }
-			if ($group eq "F"){ $worksheet->write ($excelRow++, 1, "Molecular function", $excelHeader); }
-		}
-		foreach my $go (sort {$goes{$a}{'term'} cmp $goes{$b}{'term'}} keys %goes){
-			next unless $goes{$go}{'group'} eq $group;
-			next if (($goes{$go}{'term'} eq "cellular_component") or ($goes{$go}{'term'} eq "biological_process") or ($goes{$go}{'term'} eq "molecular_function"));
-			if ($outText){ print OUT "$go\t$goes{$go}{'group'}\t$goes{$go}{'term'}"; }
-			if ($outHTML){ print HTML "   <tr>\n    <td>$go</td>\n    <td>$goes{$go}{'term'}</td>\n"; }
-			if ($outExcel){
-				$worksheet->write_string ($excelRow, 0, $go);
-				$worksheet->write_string ($excelRow, 1, $goes{$go}{'term'});
-			}
-			$j = 2;
-			foreach (@sets){
-				if ($goes{$go}{$_}{'nGene'} and $goes{$go}{$_}{'nHGT'}){ $i = sprintf ("%.2g", $goes{$go}{$_}{'nHGT'}/$goes{$go}{$_}{'nGene'}); }
-				else { $i = 0; }
-				if ($outText){ print OUT "\t$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"; }
-				if ($outHTML){
-					print HTML "    <td title='$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}'";
-					if ($i){
-						if ($i > 0 and $i < 0.1){ $t = "lightgreen"; }
-						elsif ($i >= 0.1 and $i < 0.5) { $t = "khaki"; }
-						elsif ($i >= 0.5) { $t = "lightpink"; }
-						print HTML " style='background-color: $t;'>$i</td>\n";
-					}else{
-						print HTML ">$i</td>\n";
-					}
-				}
-				if ($outExcel){ $worksheet->write_number ($excelRow, $j, $i); $worksheet->write_comment ($excelRow, $j++, "$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"); }
-			}
-			if ($outText){ print OUT "\n"; }
-			if ($outHTML){ print HTML "   </tr>\n"; }
-			if ($outExcel){ $excelRow++; }
-		}
-	}
-	if ($outText){ close OUT; }
-	if ($outHTML){ print HTML "  </table>\n";}
-	print "Report by functional annotation generated.\n";
+if ($byFunction and -d $dirFunction) {
+
+    ## read Blast2GO output ##
+
+    my %goes = ();                            # master record of GOes. GOes{ID} = { $group, $term, $nGene, $nHGT, $genes}
+    opendir DIR, $dirFunction;
+    my @files = readdir(DIR);
+    close DIR;
+    foreach my $file (@files) {
+        my $set = "";
+        if ($file =~ /(.+)\.[^.]+$/) { $set = $1; }
+        else{ $set = $file; }
+        next unless exists $results{$set};
+        open IN, "<$dirFunction/$file";
+        while (<IN>) {
+            s/\s+$//;
+            next if /^SeqName/;
+            @a = split (/\t/);
+            push @a, "" if $#a == 3;
+            my $gene = "";
+            if ($a[0] =~ /\|/) {
+                @b = split (/\|/, $a[0]);
+                $b[$#b] =~ s/\.\d+$//;
+                if ($b[$#b] =~ /cdsid_(.+)$/) { $gene = $1; }
+                else{ $gene = $b[$#b]; }
+            } else {
+                $a[0] =~ s/\.\d+$//;
+                $gene = $a[0];
+            }
+            next unless $gene;
+            next unless exists $results{$set}{$gene};
+            $a[3] =~ s/^GO://;
+            if (exists $goes{$a[3]}) {
+                $goes{$a[3]}{$set}{'genes'} .= ",$gene";
+            } else {
+                %h = ('nGene', 0, 'nHGT', 0, 'genes', "");
+                my %go = ('group', $a[2], 'term', $a[4]);
+                $go{$_} = {%h} for (@sets);
+                $go{$set}{'genes'} = $gene;
+                $goes{$a[3]} = {%go};
+            }
+            $goes{$a[3]}{$set}{'nGene'} ++;
+            $goes{$a[3]}{$set}{'nHGT'} ++ if $results{$set}{$gene}{'hgt'};
+            if (exists $results{$set}{$gene}{'goes'}) { $results{$set}{$gene}{'goes'} .= "; $a[3] ($a[2]) $a[4]"; }
+            else{ $results{$set}{$gene}{'goes'} = "$a[3] ($a[2]) $a[4]"; }
+        }
+        close IN;
+    }
+
+    ## write report by function ##
+
+    if ($outText) {
+        open OUT, ">$wkDir/result/function.txt";
+        print OUT "GO\tGroup\tTerm".join ("\t", @sets)."\n";
+    }
+    if ($outHTML) {
+        print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by functional annotation</h2></p>";
+        print HTML "  <p><font size='-1'>HGT-derived genes are summarized by annotated gene ontology (GO). Cell values are ratio of HGT-derived genes associated with a GO versus all genes associated with this GO. Every GO is counted if multiple GOes are associated with one gene.";
+        print HTML "<br>white: 0, <font style='background-color: lightgreen'>green</font>: (0, 0.1), <font style='background-color: khaki'>yellow</font>: [0.1, 0.5), <font style='background-color: lightpink'>red</font>: [0.5, 1]</font></p>\n";
+        print HTML "  <table border=1>\n   <tr>\n    <th width=60>GO</th>\n    <th width=240>Term</th>\n";
+        print HTML "    <th width=120>$_</th>\n" for (@sets);
+        print HTML "   </tr>\n";
+    }
+    if ($outExcel) {
+        $worksheet = $workbook->add_worksheet ('Function');
+        $worksheet->set_row (0, 24);
+        $worksheet->set_column (0, 0, 8);
+        $worksheet->set_column (1, 1, 40);
+        $worksheet->set_column (2, 1 + scalar @sets, 7);
+        $worksheet->write (0, 0,  "HGT by function", $excelTitle);
+        $worksheet->write (1, 0, ["GO","Term"], $excelHeader);
+        $worksheet->write (1, 2, \@sets, $excelHeader);
+        $excelRow = 2;
+    }
+    foreach my $group ('C', 'P', 'F') {
+        if ($outHTML) {
+            if ($group eq "C") { print HTML "    <tr><th colspan = '2'>Cellular component</th></tr>\n"; }
+            if ($group eq "P") { print HTML "    <tr><th colspan = '2'>Biological process</th></tr>\n"; }
+            if ($group eq "F") { print HTML "    <tr><th colspan = '2'>Molecular function</th></tr>\n"; }
+        }
+        if ($outExcel) {
+            if ($group eq "C") { $worksheet->write ($excelRow++, 1, "Cellular component", $excelHeader); }
+            if ($group eq "P") { $worksheet->write ($excelRow++, 1, "Biological process", $excelHeader); }
+            if ($group eq "F") { $worksheet->write ($excelRow++, 1, "Molecular function", $excelHeader); }
+        }
+        foreach my $go (sort {$goes{$a}{'term'} cmp $goes{$b}{'term'}} keys %goes) {
+            next unless $goes{$go}{'group'} eq $group;
+            next if (($goes{$go}{'term'} eq "cellular_component") or ($goes{$go}{'term'} eq "biological_process") or ($goes{$go}{'term'} eq "molecular_function"));
+            if ($outText) { print OUT "$go\t$goes{$go}{'group'}\t$goes{$go}{'term'}"; }
+            if ($outHTML) { print HTML "   <tr>\n    <td>$go</td>\n    <td>$goes{$go}{'term'}</td>\n"; }
+            if ($outExcel) {
+                $worksheet->write_string ($excelRow, 0, $go);
+                $worksheet->write_string ($excelRow, 1, $goes{$go}{'term'});
+            }
+            $j = 2;
+            foreach (@sets) {
+                if ($goes{$go}{$_}{'nGene'} and $goes{$go}{$_}{'nHGT'}) { $i = sprintf ("%.2g", $goes{$go}{$_}{'nHGT'}/$goes{$go}{$_}{'nGene'}); }
+                else { $i = 0; }
+                if ($outText) { print OUT "\t$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"; }
+                if ($outHTML) {
+                    print HTML "    <td title='$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}'";
+                    if ($i) {
+                        if ($i > 0 and $i < 0.1) { $t = "lightgreen"; }
+                        elsif ($i >= 0.1 and $i < 0.5) { $t = "khaki"; }
+                        elsif ($i >= 0.5) { $t = "lightpink"; }
+                        print HTML " style='background-color: $t;'>$i</td>\n";
+                    } else {
+                        print HTML ">$i</td>\n";
+                    }
+                }
+                if ($outExcel) { $worksheet->write_number ($excelRow, $j, $i); $worksheet->write_comment ($excelRow, $j++, "$goes{$go}{$_}{'nHGT'}\/$goes{$go}{$_}{'nGene'}"); }
+            }
+            if ($outText) { print OUT "\n"; }
+            if ($outHTML) { print HTML "   </tr>\n"; }
+            if ($outExcel) { $excelRow++; }
+        }
+    }
+    if ($outText) { close OUT; }
+    if ($outHTML) { print HTML "  </table>\n";}
+    print "Report by functional annotation generated.\n";
 }
 
 
 ## Generate by ortholog report ##
 
-if ($byOrthology){
-	for ($i=0; $i <=$#sets; $i++){
-		foreach my $accn (keys %{$results{$sets[$i]}}){
-			%h = %{$results{$sets[$i]}{$accn}};
-			if (exists $h{'cogid'}){
-				if (exists $h{'hgt'} and $h{'hgt'}){
-					$cogs[$h{'cogid'}]{'data'}[$i] .= "1:$accn ";
-				}else{
-					$cogs[$h{'cogid'}]{'data'}[$i] .= "0:$accn ";
-				}
-			}
-		}
-	}
-	if ($outText){
-		open OUT, ">$wkDir/result/orthology.txt";
-		print OUT "ID\tName\t".join ("\t", @sets)."\n";
-	}
-	if ($outHTML){
-		print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by gene orthology</h2></p>";
-		print HTML "  <p><font size='-1'>HGT-derived genes are summarized by orthologous groups (OGs).";
-		print HTML "<br>empty: gene not present, <font style='background-color: lightgrey'>0</font>: gene present but not predicted as HGT, <font style='background-color: lightgreen'>1</font>: gene present and predicted as HGT, <font style='background-color: khaki'>m/n</font> (n>=2): n paralogs are present, in which m are predicted as HGT.</font></p>\n";
-		if ($cogs[0]{'name'}){ $i = 250; }else{ $i = 50; }
-		print HTML "  <table border=1>\n   <tr>\n    <th width=$i>OG</th>\n";
-		print HTML "    <th width=120>$_</th>\n" for (@sets);
-		print HTML "   </tr>\n";
-	}
-	if ($outExcel){
-		$worksheet = $workbook->add_worksheet ('Orthology');
-		$worksheet->set_row (0, 24);
-		if ($cogs[0]{'name'}){ $i = 40; }else{ $i = 6; }
-		$worksheet->set_column (0, 0, $i);
-		$worksheet->set_column (1, scalar @sets, 7);
-		$worksheet->write (0, 0,  "HGT by gene orthology", $excelTitle);
-		$worksheet->write (1, 0, "OG", $excelHeader);
-		$worksheet->write (1, 1, \@sets, $excelHeader);
-		$excelRow = 2;
-	}
-	
-	my $allCOG; my $hgtCOG;
-		
-	for ($i=0; $i <=$#cogs; $i++){
-		@a = @{$cogs[$i]{'data'}};
-		$s = 0; foreach (@a){ if (/^1/){ $s = 1; last; } } next unless $s;
-		
-		for ($j=0; $j<=$#a; $j++){
-			next unless $a[$j];
-			@b = split (/\s/, $a[$j]);
-			$allCOG += scalar @b;
-			foreach (@b){ $hgtCOG ++ if /^1/; }
-		}
-		
-		if ($outText){
-			print OUT "$i\t$cogs[$i]{'name'}\t".join("\t",@a)."\n";
-		}
-		if ($outHTML){
-			if ($cogs[$i]{'name'}){ $s = $cogs[$i]{'name'}; }else{ $s = $i; }
-			print HTML "   <tr>\n    <td>$s</td>\n";
-			for ($j=0; $j<=$#a; $j++){
-				if ($a[$j]){
-					@b = split (/\s/, $a[$j]);
-					my $hgt = 0; my $first; $t = "";
-					foreach (@b){
-						/^([01]):(.+)$/;
-						$first = $2 unless $first;
-						$t .= "$2,";
-						$hgt ++ if $1 == 1;
-					}
-					print HTML "    <td style='background-color: ";
-					if ($#b){ print HTML "khaki"; }
-					elsif ($hgt){ print HTML "lightgreen"; }
-					else { print HTML "lightgrey"; }
-					print HTML ";' title='".join ("&#013;", split (/,/, $t))."'>";
-					print HTML "<a href='../search/$sets[$j]/$first.txt' >";
-					unless ($#b){ print HTML "$hgt</td>\n"; }
-					else{ print HTML "$hgt/".scalar(@b)."</td>\n"; }
-				}else{
-					print HTML "    <td></td>\n";
-				}
-			}
-			print HTML "   </tr>\n";
-		}
-		if ($outExcel){
-			if ($cogs[$i]{'name'}){ $s = $cogs[$i]{'name'}; }else{ $s = $i; }
-			$worksheet->write ($excelRow, 0, $s);
-			for ($j=0; $j<=$#a; $j++){
-				next unless $a[$j];
-				@b = split (/\s/, $a[$j]);
-				my $hgt = 0; my $first; $t = "";
-				foreach (@b){
-					/^([01]):(.+)$/;
-					$first = $2 unless $first;
-					$t .= "$2,";
-					$hgt ++ if $1 == 1;
-				}
-				if ($#b){
-					if ($hgt){ $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGreen); }
-					else { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGrey); }
-				}else{
-					if ($hgt){ $worksheet->write_url ($excelRow, $j+1, $2, 1, $excelGreen); }
-					else { $worksheet->write_url ($excelRow, $j+1, $2, 0, $excelGrey); }				
-				}
-				$worksheet->write_comment ($excelRow, $j+1, $t);
-			}
-			$excelRow ++;
-		}
-	}
-	print HTML "  </table>\n" if ($outHTML);
-	
-	if (%ORFans and 0){
-		if ($outText){ print OUT "#Singleton ORFans:\n"; }
-		if ($outHTML){ print HTML "  <br><hr>\n  <p></p>\n  <p><h3>Singleton ORFans</h2></p>\n"; }
-		foreach my $set (@sets){
-			if ($outText){ print OUT "$set: "; }
-			if ($outHTML){ print HTML "    <p><b>$set</b>: "; }
-			foreach my $gene (@{$ORFans{$set}}){
-				if ($outText){ print OUT "$gene "; }
-				if ($outHTML){ print HTML "$gene "; }				
-			}
-			if ($outText){ print OUT "\n"; }
-			if ($outHTML){ print HTML "</p>\n"; }
-		}
-	}
-	
-	close OUT if ($outText);
-	
-	print "Report by gene orthology generated.\n";
-	
-	print "  Positive rate = $allhgt / ".($allhgt-$hgtCOG+$allCOG)." = ". (sprintf("%.3f", $allhgt/($allhgt-$hgtCOG+$allCOG))).".\n";
-	
+if ($byOrthology) {
+    for ($i=0; $i <=$#sets; $i++) {
+        foreach my $accn (keys %{$results{$sets[$i]}}) {
+            %h = %{$results{$sets[$i]}{$accn}};
+            if (exists $h{'cogid'}) {
+                if (exists $h{'hgt'} and $h{'hgt'}) {
+                    $cogs[$h{'cogid'}]{'data'}[$i] .= "1:$accn ";
+                } else {
+                    $cogs[$h{'cogid'}]{'data'}[$i] .= "0:$accn ";
+                }
+            }
+        }
+    }
+    if ($outText) {
+        open OUT, ">$wkDir/result/orthology.txt";
+        print OUT "ID\tName\t".join ("\t", @sets)."\n";
+    }
+    if ($outHTML) {
+        print HTML "  <br><hr>\n  <p></p>\n  <p><h2>HGT by gene orthology</h2></p>";
+        print HTML "  <p><font size='-1'>HGT-derived genes are summarized by orthologous groups (OGs).";
+        print HTML "<br>empty: gene not present, <font style='background-color: lightgrey'>0</font>: gene present but not predicted as HGT, <font style='background-color: lightgreen'>1</font>: gene present and predicted as HGT, <font style='background-color: khaki'>m/n</font> (n>=2): n paralogs are present, in which m are predicted as HGT.</font></p>\n";
+        if ($cogs[0]{'name'}) { $i = 250; } else { $i = 50; }
+        print HTML "  <table border=1>\n   <tr>\n    <th width=$i>OG</th>\n";
+        print HTML "    <th width=120>$_</th>\n" for (@sets);
+        print HTML "   </tr>\n";
+    }
+    if ($outExcel) {
+        $worksheet = $workbook->add_worksheet ('Orthology');
+        $worksheet->set_row (0, 24);
+        if ($cogs[0]{'name'}) { $i = 40; } else { $i = 6; }
+        $worksheet->set_column (0, 0, $i);
+        $worksheet->set_column (1, scalar @sets, 7);
+        $worksheet->write (0, 0,  "HGT by gene orthology", $excelTitle);
+        $worksheet->write (1, 0, "OG", $excelHeader);
+        $worksheet->write (1, 1, \@sets, $excelHeader);
+        $excelRow = 2;
+    }
+
+    my $allCOG; my $hgtCOG;
+
+    for ($i=0; $i <=$#cogs; $i++) {
+        @a = @{$cogs[$i]{'data'}};
+        $s = 0; foreach (@a) { if (/^1/) { $s = 1; last; } } next unless $s;
+
+        for ($j=0; $j<=$#a; $j++) {
+            next unless $a[$j];
+            @b = split (/\s/, $a[$j]);
+            $allCOG += scalar @b;
+            foreach (@b) { $hgtCOG ++ if /^1/; }
+        }
+
+        if ($outText) {
+            print OUT "$i\t$cogs[$i]{'name'}\t".join("\t",@a)."\n";
+        }
+        if ($outHTML) {
+            if ($cogs[$i]{'name'}) { $s = $cogs[$i]{'name'}; } else { $s = $i; }
+            print HTML "   <tr>\n    <td>$s</td>\n";
+            for ($j=0; $j<=$#a; $j++) {
+                if ($a[$j]) {
+                    @b = split (/\s/, $a[$j]);
+                    my $hgt = 0; my $first; $t = "";
+                    foreach (@b) {
+                        /^([01]):(.+)$/;
+                        $first = $2 unless $first;
+                        $t .= "$2,";
+                        $hgt ++ if $1 == 1;
+                    }
+                    print HTML "    <td style='background-color: ";
+                    if ($#b) { print HTML "khaki"; }
+                    elsif ($hgt) { print HTML "lightgreen"; }
+                    else { print HTML "lightgrey"; }
+                    print HTML ";' title='".join ("&#013;", split (/,/, $t))."'>";
+                    print HTML "<a href='../search/$sets[$j]/$first.txt' >";
+                    unless ($#b) { print HTML "$hgt</td>\n"; }
+                    else{ print HTML "$hgt/".scalar(@b)."</td>\n"; }
+                } else {
+                    print HTML "    <td></td>\n";
+                }
+            }
+            print HTML "   </tr>\n";
+        }
+        if ($outExcel) {
+            if ($cogs[$i]{'name'}) { $s = $cogs[$i]{'name'}; } else { $s = $i; }
+            $worksheet->write ($excelRow, 0, $s);
+            for ($j=0; $j<=$#a; $j++) {
+                next unless $a[$j];
+                @b = split (/\s/, $a[$j]);
+                my $hgt = 0; my $first; $t = "";
+                foreach (@b) {
+                    /^([01]):(.+)$/;
+                    $first = $2 unless $first;
+                    $t .= "$2,";
+                    $hgt ++ if $1 == 1;
+                }
+                if ($#b) {
+                    if ($hgt) { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGreen); }
+                    else { $worksheet->write_url ($excelRow, $j+1, $2, "$hgt/".scalar(@b), $excelGrey); }
+                } else {
+                    if ($hgt) { $worksheet->write_url ($excelRow, $j+1, $2, 1, $excelGreen); }
+                    else { $worksheet->write_url ($excelRow, $j+1, $2, 0, $excelGrey); }
+                }
+                $worksheet->write_comment ($excelRow, $j+1, $t);
+            }
+            $excelRow ++;
+        }
+    }
+    print HTML "  </table>\n" if ($outHTML);
+
+    if (%ORFans and 0) {
+        if ($outText) { print OUT "#Singleton ORFans:\n"; }
+        if ($outHTML) { print HTML "  <br><hr>\n  <p></p>\n  <p><h3>Singleton ORFans</h2></p>\n"; }
+        foreach my $set (@sets) {
+            if ($outText) { print OUT "$set: "; }
+            if ($outHTML) { print HTML "    <p><b>$set</b>: "; }
+            foreach my $gene (@{$ORFans{$set}}) {
+                if ($outText) { print OUT "$gene "; }
+                if ($outHTML) { print HTML "$gene "; }
+            }
+            if ($outText) { print OUT "\n"; }
+            if ($outHTML) { print HTML "</p>\n"; }
+        }
+    }
+
+    close OUT if ($outText);
+
+    print "Report by gene orthology generated.\n";
+
+    print "  Positive rate = $allhgt / ".($allhgt-$hgtCOG+$allCOG)." = ". (sprintf("%.3f", $allhgt/($allhgt-$hgtCOG+$allCOG))).".\n";
+
 }
 
 
 
 ## Attach individual reports in spreadsheet ##
 
-if ($outExcel and $detailExcel){
-	foreach my $set (@sets){
-		$worksheet = $workbook->add_worksheet ($set);
-		$worksheet->set_row (0, 24);
-		$worksheet->set_column (0, 0, 14);
-		$worksheet->set_column (1, 1, 8);
-		$worksheet->set_column (2, 2, 32);
-		$worksheet->set_column (3, 9, 8);
-		$worksheet->set_column ($iMatch, $iMatch+$byDonor, 32) if $iMatch;
-		$worksheet->write (0, 0, $set, $excelTitle);
-		$worksheet->write (1, 0, ["Query","Length","Product","Hits","Self","Close","Distal","HGT"], $excelHeader);
-		$worksheet->write (1, $iLoss, "Loss", $excelHeader) if $iLoss;
-		$worksheet->write (1, $iPOE, "POE", $excelHeader) if $iPOE;
-		$worksheet->write (1, $iMatch, "Best distal match", $excelHeader) if $iMatch;
-		$worksheet->write (1, $iMatch+1, "Putative donor group", $excelHeader) if $byDonor;
-		$worksheet->write (1, $iMatch+$byDonor+1, "Function", $excelHeader) if $byFunction;
-		$excelRow = 1;
-		%h = %{$results{$set}};
-		foreach my $accn (sort keys %h){
-			$excelRow ++;
-			$worksheet->write_url ($excelRow, 0, "../search/$set/$accn.txt", $accn);
-			$worksheet->write_number ($excelRow, 1, $h{$accn}{'length'});
-			$worksheet->write ($excelRow, 2, $h{$accn}{'product'});
-			$worksheet->write_number ($excelRow, 3, $h{$accn}{'hits'});
-			next unless exists $h{$accn}{'self'};
-			$worksheet->write_number ($excelRow, 4, $h{$accn}{'self'});
-			$worksheet->write_number ($excelRow, 5, $h{$accn}{'close'});
-			$worksheet->write_number ($excelRow, 6, $h{$accn}{'distal'});
-			$worksheet->write_number ($excelRow, 7, $h{$accn}{'hgt'}) if $h{$accn}{'hgt'};
-			$worksheet->write_number ($excelRow, $iLoss, $h{$accn}{'loss'}) if ($iLoss and $h{$accn}{'loss'});
-			$worksheet->write_number ($excelRow, $iPOE, $h{$accn}{'poe'}) if ($iPOE and $h{$accn}{'poe'});
-			next unless ($iMatch and $h{$accn}{'match'});
-			$h{$accn}{'match'} =~ /^(\d+) \((.+)\)$/;
-			$worksheet->write ($excelRow, $iMatch, $2);
-			if ($byDonor and exists $h{$accn}{'group'}){
-				$worksheet->write ($excelRow, $iMatch+1, $h{$accn}{'group'});
-			}
-			if ($byFunction and exists $h{$accn}{'goes'}){
-				$s = $h{$accn}{'goes'};
-				$s =~ s/GO:\d+ \(.\) //g;
-				$worksheet->write ($excelRow, $iMatch+$byDonor+1, $s);
-			}
-			# $worksheet->write_comment ($excelRow, 10, $1);
-		}
-	}
+if ($outExcel and $detailExcel) {
+    foreach my $set (@sets) {
+        $worksheet = $workbook->add_worksheet ($set);
+        $worksheet->set_row (0, 24);
+        $worksheet->set_column (0, 0, 14);
+        $worksheet->set_column (1, 1, 8);
+        $worksheet->set_column (2, 2, 32);
+        $worksheet->set_column (3, 9, 8);
+        $worksheet->set_column ($iMatch, $iMatch+$byDonor, 32) if $iMatch;
+        $worksheet->write (0, 0, $set, $excelTitle);
+        $worksheet->write (1, 0, ["Query","Length","Product","Hits","Self","Close","Distal","HGT"], $excelHeader);
+        $worksheet->write (1, $iLoss, "Loss", $excelHeader) if $iLoss;
+        $worksheet->write (1, $iPOE, "POE", $excelHeader) if $iPOE;
+        $worksheet->write (1, $iMatch, "Best distal match", $excelHeader) if $iMatch;
+        $worksheet->write (1, $iMatch+1, "Putative donor group", $excelHeader) if $byDonor;
+        $worksheet->write (1, $iMatch+$byDonor+1, "Function", $excelHeader) if $byFunction;
+        $excelRow = 1;
+        %h = %{$results{$set}};
+        foreach my $accn (sort keys %h) {
+            $excelRow ++;
+            $worksheet->write_url ($excelRow, 0, "../search/$set/$accn.txt", $accn);
+            $worksheet->write_number ($excelRow, 1, $h{$accn}{'length'});
+            $worksheet->write ($excelRow, 2, $h{$accn}{'product'});
+            $worksheet->write_number ($excelRow, 3, $h{$accn}{'hits'});
+            next unless exists $h{$accn}{'self'};
+            $worksheet->write_number ($excelRow, 4, $h{$accn}{'self'});
+            $worksheet->write_number ($excelRow, 5, $h{$accn}{'close'});
+            $worksheet->write_number ($excelRow, 6, $h{$accn}{'distal'});
+            $worksheet->write_number ($excelRow, 7, $h{$accn}{'hgt'}) if $h{$accn}{'hgt'};
+            $worksheet->write_number ($excelRow, $iLoss, $h{$accn}{'loss'}) if ($iLoss and $h{$accn}{'loss'});
+            $worksheet->write_number ($excelRow, $iPOE, $h{$accn}{'poe'}) if ($iPOE and $h{$accn}{'poe'});
+            next unless ($iMatch and $h{$accn}{'match'});
+            $h{$accn}{'match'} =~ /^(\d+) \((.+)\)$/;
+            $worksheet->write ($excelRow, $iMatch, $2);
+            if ($byDonor and exists $h{$accn}{'group'}) {
+                $worksheet->write ($excelRow, $iMatch+1, $h{$accn}{'group'});
+            }
+            if ($byFunction and exists $h{$accn}{'goes'}) {
+                $s = $h{$accn}{'goes'};
+                $s =~ s/GO:\d+ \(.\) //g;
+                $worksheet->write ($excelRow, $iMatch+$byDonor+1, $s);
+            }
+            # $worksheet->write_comment ($excelRow, 10, $1);
+        }
+    }
 }
 
-if ($outHTML){
-	print HTML "\t</body>\n</html>\n";
-	close HTML;
+if ($outHTML) {
+    print HTML "\t</body>\n</html>\n";
+    close HTML;
 }
-if ($outExcel){
-	$worksheet = $workbook->sheets(0);
+if ($outExcel) {
+    $worksheet = $workbook->sheets(0);
     $worksheet->set_first_sheet();
     $worksheet->activate();
     $worksheet->set_selection(0, 0);
-	$workbook->close();
+    $workbook->close();
 }
 
 exit 0;
-
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
old mode 100755
new mode 100644
index 0a36c35..0c9c48e
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -47,86 +47,84 @@
 
 ## global variables ##
 
-my $wkDir = $ARGV[0];					# working directory
-
-my %ins = ();							# the master data structure of the whole procedure
-										# set => (
-										#	file (file)
-										#	taxid (str)
-										#	organism (str)
-										#	prefile (file)
-										#	done (boolean)
-										#	prots (array...)
-										# prot => (
-										#	name (accn or user-defined)
- 										#	gi, accn (only for GenBank-style sequence titles)
-										#	product (after the 1st whitespace)
-										#	seq (necessary for local searches, optional for remote BLAST)
-										#	done (boolean)
-										#	hits (array...)
-										# hit => (
-										#	sseqid, pident, evalue, bitscore, qstart, qend
-										#	sseq (only for BLAST)
-										#	name (accn or full name)
-
-my $nProt = 0;							# total number of proteins
-my $nDone = 0;							# number of proteins that have been searched already
+my $wkDir = $ARGV[0];                   # working directory
+
+my %ins = ();                           # the master data structure of the whole procedure
+                                        # set => (
+                                        #    file (file)
+                                        #    taxid (str)
+                                        #    organism (str)
+                                        #    prefile (file)
+                                        #    done (boolean)
+                                        #    prots (array...)
+                                        # prot => (
+                                        #    name (accn or user-defined)
+                                        #    title (after the 1st whitespace)
+                                        #    seq (necessary for local searches, optional for remote BLAST)
+                                        #    done (boolean)
+                                        #    hits (array...)
+                                        # hit => (
+                                        #    sseqid, pident, evalue, bitscore, qstart, qend
+                                        #    sseq (only for BLAST)
+                                        #    name (accn or full name)
+
+my $nProt = 0;                          # total number of proteins
+my $nDone = 0;                          # number of proteins that have been searched already
 
 my $retryFailed = 1;
 
-my %inSeqs = ();						# query protein sequences (if applicable)
-my %taxdumps = ();						# local taxonomy database: taxid => (name, parent, rank)
-my %prot2taxids = ();					# protein to TaxID dictionary (the protein name may be GI or accn)
-my %dbTaxa :shared = ();				# taxa.db
-my %dbRanks :shared = ();				# ranks.db
-my %badTaxids = ();						# TaxIDs that don't exist in the local taxonomy database
+my %inSeqs = ();                        # query protein sequences (if applicable)
+my %taxdumps = ();                      # local taxonomy database: taxid => (name, parent, rank)
+my %prot2taxids = ();                   # protein name to TaxID dictionary
+my %dbTaxa :shared = ();                # taxa.db
+my %dbRanks :shared = ();               # ranks.db
+my %badTaxids = ();                     # TaxIDs that don't exist in the local taxonomy database
 
 
 ## subroutines ##
 
-sub http_blast;							# parameters: query (accn. no.), set; return: 1 - succeeded, 0 - fail
-sub local_search;						# same as above
-sub self_align;							# search against itself
-sub get_taxonomy;						# paramter: array of TaxIDs
-sub stem_name;							# get stem file name
-sub seq_title;							# parse sequence title
-sub order_accns;						# reorder accession number
+sub http_blast;                          # parameters: query (accn. no.), set; return: 1 - succeeded, 0 - fail
+sub local_search;                        # same as above
+sub self_align;                          # search against itself
+sub get_taxonomy;                        # paramter: array of TaxIDs
+sub stem_name;                           # get stem file name
+sub order_accns;                         # reorder accession number
 
 
 ## program parameters ##
 
-my $interactive = 1;					# interactive or automatic mode
-my $searchTool = "blast";				# protein sequence similarity search tool
-my $preSearch = "";						# directory of pre-computed search results.
-my $selfTax = "";						# taxonomy of input protein sets
+my $interactive = 1;                     # interactive or automatic mode
+my $searchTool = "blast";                # protein sequence similarity search tool
+my $preSearch = "";                      # directory of pre-computed search results.
+my $selfTax = "";                        # taxonomy of input protein sets
 
 # databases
-my $protdb = "";						# protein database for homolgy search
-my $taxdump = "";						# directory of the NCBI taxonomy database (nodes.dmp and names.dmp)
-my $prot2taxid = "";					# protein name / GI to TaxID dictionary file
+my $protdb = "";                         # protein database for homolgy search
+my $taxdump = "";                        # directory of the NCBI taxonomy database (nodes.dmp and names.dmp)
+my $prot2taxid = "";                     # protein name to TaxID dictionary file
 
 # search cutoffs
-my $nHits = 500;						# number of hits to return
-my $maxHits = 0;						# maximum number of valid hits to preserve. if 0 then = nHits
-my $evalue = 1e-5;						# maximum E-value cutoff
-my $identity = 0;						# minimum percent identity cutoff
-my $coverage = 0;						# minimum query coverage cutoff
+my $nHits = 500;                         # number of hits to return
+my $maxHits = 0;                         # maximum number of valid hits to preserve. if 0 then = nHits
+my $evalue = 1e-5;                       # maximum E-value cutoff
+my $identity = 0;                        # minimum percent identity cutoff
+my $coverage = 0;                        # minimum query coverage cutoff
 
 # taxonomy filters
-my $mergeDuplicates = 1;				# ignore hits with same taxon names and bit scores
-my $taxonUCASE = 0;						# ignore taxon names that do not start with a capital letter
-my @ignoreTaxa = split (/,/, "unknown,uncultured,unidentified,unclassified,environmental,plasmid,vector,synthetic,phage");							
-										# ignore taxon names containing these words
-my $ignoreParalogs = 1;					# ignore potential paralogs (hits with same taxon names but different bit scores)
-my $ignoreSeqRepeats = 1;				# ignore repeated sequences (hits targetting different regions of the same protein)
-my $ignoreSubspecies = 0;				# ignore more than one subspecies from the same species
-my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
-										# record the TaxIDs on these ranks for each hit
+my $mergeDuplicates = 1;                 # ignore hits with same taxon names and bit scores
+my $taxonUCASE = 0;                      # ignore taxon names that do not start with a capital letter
+my @ignoreTaxa = ("unknown", "uncultured", "unidentified", "unclassified", "environmental", "plasmid", "vector", "synthetic", "phage");
+                                         # ignore taxon names containing these words
+my $ignoreParalogs = 1;                  # ignore potential paralogs (hits with same taxon names but different bit scores)
+my $ignoreSeqRepeats = 1;                # ignore repeated sequences (hits targetting different regions of the same protein)
+my $ignoreSubspecies = 0;                # ignore more than one subspecies from the same species
+my @ranks = ("species", "genus", "family", "order", "class", "phylum");
+                                         # record the TaxIDs on these ranks for each hit
 
 # search tool behavior
-my $threads = 0;						# multiple threads (0 for all CPU cores)
-my $queries = 0;						# multiple queries per run (0 for all sequences per sample)
-my $getAln = 0;							# retrieve aligned part of subject sequence (for BLAST only)
+my $threads = 0;                         # multiple threads (0 for all CPU cores)
+my $queries = 0;                         # multiple queries per run (0 for all sequences per sample)
+my $getAln = 0;                          # retrieve aligned part of subject sequence (for BLAST only)
 my $blastdbcmd = "blastdbcmd";
 my $blastp = "blastp";
 my $rapsearch = "rapsearch";
@@ -134,88 +132,90 @@
 my $diamond = "diamond";
 
 # remote BLAST behavior
-my $httpBlast = 0;						# call the NCBI server to perform BLAST and to retrieve sequence / taxonomy information
-my $httpDb = "nr";						# NCBI BLAST database
-my $requests = 1;						# number of requests for http BLAST (default: 1)
-my $retries = 5;						# maximum number of retries
-my $delay = 30;							# time (seconds) between two http requests
-my $timeout = 600;						# time (seconds) to give up waiting
-my $exUncultured = 1;					# exclude uncultured and environmental samples
-my @searchTaxids = ();					# search under the following taxon groups (taxids)
-my @ignoreTaxids = ();					# ignore organisms under the following taxids
-my $eqText = "";						# entrez query parameter
-my $taxBlast = 0;						# retrieve taxonomy report
-my $seqBlast = 0;						# retrieve hit sequences
-my $alnBlast = 0;						# retrieve multiple sequence alignment (conflicts seqBlast)
-my $blastServer = "http://blast.ncbi.nlm.nih.gov/Blast.cgi";
-my $blastAlignServer = "http://blast.ncbi.nlm.nih.gov/BlastAlign.cgi";
-my $eSearchServer = "http://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
-my $eFetchServer = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
-my $eSummaryServer = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi";
+my $httpBlast = 0;                       # call the NCBI server to perform BLAST and to retrieve sequence / taxonomy information
+my $httpDb = "nr";                       # NCBI BLAST database
+my $requests = 1;                        # number of requests for http BLAST (default: 1)
+my $retries = 5;                         # maximum number of retries
+my $delay = 30;                          # time (seconds) between two http requests
+my $timeout = 600;                       # time (seconds) to give up waiting
+my $exUncultured = 1;                    # exclude uncultured and environmental samples
+my @searchTaxids = ();                   # search under the following taxon groups (taxids)
+my @ignoreTaxids = ();                   # ignore organisms under the following taxids
+my $eqText = "";                         # entrez query parameter
+my $taxBlast = 0;                        # retrieve taxonomy report
+my $seqBlast = 0;                        # retrieve hit sequences
+my $alnBlast = 0;                        # retrieve multiple sequence alignment (conflicts seqBlast)
+
+# remote BLAST server URLs
+my $blastServer = "https://blast.ncbi.nlm.nih.gov/Blast.cgi";
+my $blastAlignServer = "https://blast.ncbi.nlm.nih.gov/BlastAlign.cgi";
+my $eSearchServer = "https://www.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi";
+my $eFetchServer = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi";
+my $eSummaryServer = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi";
 
 
 ## read configuration ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		$interactive = $1 if /^interactive=([01])$/;
-		$selfTax = $1 if /^selfTax=(.+)$/;
-		@ranks = split (/,/, $1) if /^ranks=(.+)$/;
-		$preSearch = $1 if /^preSearch=(.+)$/;
-		$preSearch =~ s/\/$//;
-		
-		$searchTool = $1 if /^searchTool=(.+)$/;
-		$threads = $1 if /^threads=(\d+)$/;
-		$queries = $1 if /^queries=(\d+)$/;
-		$requests = $1 if /^requests=(\d+)$/;
-		$getAln = $1 if /^getAln=([01])$/;
-
-		$blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
-		$blastp = $1 if /^blastp=(.+)$/;
-		$rapsearch = $1 if /^rapsearch=(.+)$/;
-		$prerapsearch = $1 if /^prerapsearch=(.+)$/;
-		$diamond = $1 if /^diamond=(.+)$/;
-
-		$protdb = $1 if /^protdb=(.+)$/;
-		$taxdump = $1 if /^taxdump=(.+)$/;
-		$taxdump =~ s/\/$//;
-		$prot2taxid = $1 if /^prot2taxid=(.+)$/;
-		
-		$evalue = $1 if /^evalue=(.+)$/;
-		$identity = $1 if /^identity=(.+)$/;
-		$coverage = $1 if /^coverage=(.+)$/;
-		$nHits = $1 if /^nHits=(\d+)$/;
-		$maxHits = $1 if /^maxHits=(\d+)$/;		
-
-		$taxonUCASE = $1 if /^taxonUCASE=([01])$/;
-		$mergeDuplicates = $1 if /^mergeDuplicates=([01])$/;
-		$ignoreParalogs = $1 if /^ignoreParalogs=([01])$/;
-		$ignoreSeqRepeats = $1 if /^ignoreSeqRepeats=([01])$/;
-		$ignoreSubspecies = $1 if /^ignoreSubspecies=([01])$/;
-		@ignoreTaxa = split(/,/, $1) if /^ignoreTaxa=(.+)$/;
-		@ignoreTaxa = () if /^ignoreTaxa=$/;
-		
-		$httpBlast = $1 if /^httpBlast=(\d)$/;
-		$httpDb = $1 if /^httpDb=(.+)$/;
-		$retries = $1 if /^retries=(\d+)$/;
-		$delay = $1 if /^delay=(\d+)$/;
-		$eqText = $1 if /^eqText=(.+)$/;
-		$exUncultured = $1 if /^exUncultured=([01])$/;
-		@searchTaxids = split(/,/, $1) if /^searchTaxids=(.+)$/;
-		@ignoreTaxids = split(/,/, $1) if /^ignoreTaxids=(.+)$/;
-		$seqBlast = $1 if /^seqBlast=([01])$/;		
-		$taxBlast = $1 if /^taxBlast=([01])$/;
-		$alnBlast = $1 if /^alnBlast=([01])$/;
-		
-		$blastServer = $1 if /^blastServer=(.+)$/;
-		$blastAlignServer = $1 if /^blastAlignServer=(.+)$/;
-		$eSearchServer = $1 if /^eSearchServer=(.+)$/;
-		$eFetchServer = $1 if /^eFetchServer=(.+)$/;
-		$eSummaryServer = $1 if /^eSummaryServer=(.+)$/;
-	}
-	close IN;
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        $interactive = $1 if /^interactive=([01])$/;
+        $selfTax = $1 if /^selfTax=(.+)$/;
+        @ranks = split (/,/, $1) if /^ranks=(.+)$/;
+        $preSearch = $1 if /^preSearch=(.+)$/;
+        $preSearch =~ s/\/$//;
+
+        $searchTool = $1 if /^searchTool=(.+)$/;
+        $threads = $1 if /^threads=(\d+)$/;
+        $queries = $1 if /^queries=(\d+)$/;
+        $requests = $1 if /^requests=(\d+)$/;
+        $getAln = $1 if /^getAln=([01])$/;
+
+        $blastdbcmd = $1 if /^blastdbcmd=(.+)$/;
+        $blastp = $1 if /^blastp=(.+)$/;
+        $rapsearch = $1 if /^rapsearch=(.+)$/;
+        $prerapsearch = $1 if /^prerapsearch=(.+)$/;
+        $diamond = $1 if /^diamond=(.+)$/;
+
+        $protdb = $1 if /^protdb=(.+)$/;
+        $taxdump = $1 if /^taxdump=(.+)$/;
+        $taxdump =~ s/\/$//;
+        $prot2taxid = $1 if /^prot2taxid=(.+)$/;
+
+        $evalue = $1 if /^evalue=(.+)$/;
+        $identity = $1 if /^identity=(.+)$/;
+        $coverage = $1 if /^coverage=(.+)$/;
+        $nHits = $1 if /^nHits=(\d+)$/;
+        $maxHits = $1 if /^maxHits=(\d+)$/;
+
+        $taxonUCASE = $1 if /^taxonUCASE=([01])$/;
+        $mergeDuplicates = $1 if /^mergeDuplicates=([01])$/;
+        $ignoreParalogs = $1 if /^ignoreParalogs=([01])$/;
+        $ignoreSeqRepeats = $1 if /^ignoreSeqRepeats=([01])$/;
+        $ignoreSubspecies = $1 if /^ignoreSubspecies=([01])$/;
+        @ignoreTaxa = split(/,/, $1) if /^ignoreTaxa=(.+)$/;
+        @ignoreTaxa = () if /^ignoreTaxa=$/;
+
+        $httpBlast = $1 if /^httpBlast=(\d)$/;
+        $httpDb = $1 if /^httpDb=(.+)$/;
+        $retries = $1 if /^retries=(\d+)$/;
+        $delay = $1 if /^delay=(\d+)$/;
+        $eqText = $1 if /^eqText=(.+)$/;
+        $exUncultured = $1 if /^exUncultured=([01])$/;
+        @searchTaxids = split(/,/, $1) if /^searchTaxids=(.+)$/;
+        @ignoreTaxids = split(/,/, $1) if /^ignoreTaxids=(.+)$/;
+        $seqBlast = $1 if /^seqBlast=([01])$/;
+        $taxBlast = $1 if /^taxBlast=([01])$/;
+        $alnBlast = $1 if /^alnBlast=([01])$/;
+
+        $blastServer = $1 if /^blastServer=(.+)$/;
+        $blastAlignServer = $1 if /^blastAlignServer=(.+)$/;
+        $eSearchServer = $1 if /^eSearchServer=(.+)$/;
+        $eFetchServer = $1 if /^eFetchServer=(.+)$/;
+        $eSummaryServer = $1 if /^eSummaryServer=(.+)$/;
+    }
+    close IN;
 }
 
 
@@ -225,55 +225,55 @@
 $identity *= 100 if ($identity and $identity < 1);
 $coverage *= 100 if ($coverage and $coverage < 1);
 
-if (lc($searchTool) eq "blast"){ $searchTool = "BLAST"; }
-elsif (lc($searchTool) eq "rapsearch"){ $searchTool = "RAPSearch2"; }
-elsif (lc($searchTool) eq "rapsearch2"){ $searchTool = "RAPSearch2"; }
-elsif (lc($searchTool) eq "diamond"){ $searchTool = "DIAMOND"; }
-elsif (lc($searchTool) eq "customized"){ $searchTool = "customized"; }
+if (lc($searchTool) eq "blast") { $searchTool = "BLAST"; }
+elsif (lc($searchTool) eq "rapsearch") { $searchTool = "RAPSearch2"; }
+elsif (lc($searchTool) eq "rapsearch2") { $searchTool = "RAPSearch2"; }
+elsif (lc($searchTool) eq "diamond") { $searchTool = "DIAMOND"; }
+elsif (lc($searchTool) eq "customized") { $searchTool = "customized"; }
 else{ die "Error: Invalid search tool: $searchTool.\n"; }
 
-if ($searchTool eq "customized" and not $preSearch){ die "Error: You must provide pre-computed search results for \"customized\" search tool.\n"; }
-if ($preSearch and not -d $preSearch){ die "Error: Invalid directory for pre-computed search results: $preSearch\n"; }
-
-if ($searchTool ne "BLAST"){ $httpBlast = 0; }
-if ($httpBlast and not $protdb){ $protdb = "nr"; }
-
-unless ($threads){ # attempt to get number of CPU cores
-	if (-e "/proc/cpuinfo"){
-		$threads = `grep -c ^processor /proc/cpuinfo`;
-	}elsif ($^O eq "darwin"){
-		$threads = `sysctl -n hw.ncpu`;
-	}elsif ($^O eq "MSWin32"){
-		$threads = `echo %NUMBER_OF_PROCESSORS%`;
-	}
-	$threads =~ s/\s+$//;
-	$threads = 0 unless $threads =~ /^\d+$/;
-	unless ($threads){
-		print "Cannot determine the number of CPUs. Do single threading.\n";
-		$threads = 1;
-	}
+if ($searchTool eq "customized" and not $preSearch) { die "Error: You must provide pre-computed search results for \"customized\" search tool.\n"; }
+if ($preSearch and not -d $preSearch) { die "Error: Invalid directory for pre-computed search results: $preSearch\n"; }
+
+if ($searchTool ne "BLAST") { $httpBlast = 0; }
+if ($httpBlast and not $protdb) { $protdb = "nr"; }
+
+unless ($threads) { # attempt to get number of CPU cores
+    if (-e "/proc/cpuinfo") {
+        $threads = `grep -c ^processor /proc/cpuinfo`;
+    } elsif ($^O eq "darwin") {
+        $threads = `sysctl -n hw.ncpu`;
+    } elsif ($^O eq "MSWin32") {
+        $threads = `echo %NUMBER_OF_PROCESSORS%`;
+    }
+    $threads =~ s/\s+$//;
+    $threads = 0 unless $threads =~ /^\d+$/;
+    unless ($threads) {
+        print "Cannot determine the number of CPUs. Do single threading.\n";
+        $threads = 1;
+    }
 }
 
-unless ($eqText){ # generate Entrez query text #
-	if ($exUncultured){
-		$eqText = "all [filter] NOT(environmental samples[filter] OR metagenomes[orgn])";
-	}
-	if (@searchTaxids){
-		for (my $i=0; $i<scalar(@searchTaxids); $i++){
-			next unless ($searchTaxids[$i] =~ /^\d+$/);
-			$eqText .= " OR " if $eqText;
-			$eqText .= "txid$searchTaxids[$i]\[orgn\]";
-		}
-	}
-	if (@ignoreTaxids){
-		$eqText .= " NOT (";
-		for (my $i=0; $i<scalar(@ignoreTaxids); $i++){
-			next unless $ignoreTaxids[$i] =~ /^\d+$/;
-			$eqText .= " OR " if $i;
-			$eqText .= "txid$ignoreTaxids[$i]\[orgn\]";
-		}
-		$eqText .= ")";
-	}
+unless ($eqText) { # generate Entrez query text #
+    if ($exUncultured) {
+        $eqText = "all [filter] NOT(environmental samples[filter] OR metagenomes[orgn])";
+    }
+    if (@searchTaxids) {
+        for (my $i=0; $i<scalar(@searchTaxids); $i++) {
+            next unless ($searchTaxids[$i] =~ /^\d+$/);
+            $eqText .= " OR " if $eqText;
+            $eqText .= "txid$searchTaxids[$i]\[orgn\]";
+        }
+    }
+    if (@ignoreTaxids) {
+        $eqText .= " NOT (";
+        for (my $i=0; $i<scalar(@ignoreTaxids); $i++) {
+            next unless $ignoreTaxids[$i] =~ /^\d+$/;
+            $eqText .= " OR " if $i;
+            $eqText .= "txid$ignoreTaxids[$i]\[orgn\]";
+        }
+        $eqText .= ")";
+    }
 }
 
 
@@ -282,38 +282,38 @@
 print "Reading input data...\n";
 die "\nError: The input/ folder is not found.\n" unless (-d "$wkDir/input");
 opendir (DIR, "$wkDir/input");
-foreach my $file (grep {!/^\./} readdir DIR){
-	next unless -s "$wkDir/input/$file";
-	$ins{stem_name($file)} = { 'file'=>$file, 'prots'=>[] }
+foreach my $file (grep {!/^\./} readdir DIR) {
+    next unless -s "$wkDir/input/$file";
+    $ins{stem_name($file)} = { 'file'=>$file, 'prots'=>[] }
 }
 closedir DIR;
 die "Error: No data are found in the input/ folder.\n" unless (%ins);
-foreach my $set (sort keys %ins){
-	$s = $ins{$set}{'file'};
-	my $intype = ""; # type of input file format
-	open IN, "<$wkDir/input/$s" or die "\nError: Failed to read input file $s.\n";
-	while(<IN>){
-		s/\s+$//; next unless $_; next if /^#/;
-		unless ($intype){
-			$intype = "list";
-			$intype = "fasta" if (/^>/);
-		}
-		if ($intype eq 'fasta'){
-			if (s/^>//){
-				@a = seq_title ($_);
-				push @{$ins{$set}{'prots'}}, { 'gi'=>$a[0], 'accn'=>$a[1], 'name'=>$a[2], 'product'=>$a[3], 'seq'=>'', 'hits'=>[], 'done'=>0 };
-			}else{
-				$ins{$set}{'prots'}[-1]{'seq'} .= $_; # append sequence
-			}
-		}elsif ($intype eq "list"){
-			@a = seq_title ($_);
-			push @{$ins{$set}{'prots'}}, { 'gi'=>$a[0], 'accn'=>$a[1], 'name'=>$a[2], 'product'=>$a[3], 'seq'=>'', 'hits'=>[], 'done'=>0 };
-		}
-	}
-	close IN;
-	my $i = scalar @{$ins{$set}{'prots'}};
-	die "Error: sample $set does not contain any protein entries.\n" unless $i;
-	print "  $set: $i proteins.\n";
+foreach my $set (sort keys %ins) {
+    $s = $ins{$set}{'file'};
+    my $intype = "";  # type of input file format
+    open IN, "<$wkDir/input/$s" or die "\nError: Failed to read input file $s.\n";
+    while (<IN>) {
+        s/\s+$//; next unless $_; next if /^#/;
+        $intype = /^>/ ? "fasta" : "list" unless $intype;
+        my %prot = ('name'=>'', 'title'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
+        if ($intype eq 'fasta') {
+            if (s/^>//) {
+                @a = split(/\s+/, $_, 2);
+                $prot{'name'} = $a[0];
+                $prot{'title'} = $a[1] if $#a;
+                push @{$ins{$set}{'prots'}}, {%prot};
+            } else {
+                $ins{$set}{'prots'}[-1]{'seq'} .= $_;  # append sequence
+            }
+        } elsif ($intype eq "list") {
+            $prot{'name'} = $_;
+            push @{$ins{$set}{'prots'}}, {%prot};
+        }
+    }
+    close IN;
+    my $n = scalar @{$ins{$set}{'prots'}};
+    die "Error: sample $set does not contain any protein entries.\n" unless $n;
+    print "  $set: $n proteins.\n";
 }
 $nProt += scalar @{$ins{$_}{'prots'}} for (keys %ins);
 print "Done. $nProt proteins from ".(scalar keys %ins)." set(s) to query.\n";
@@ -321,525 +321,518 @@
 
 ## read search results from previous runs ##
 
-if (-d "$wkDir/search"){
-	print "Reading search results from previous run(s)...\n";
-	foreach my $set (keys %ins){
-		next unless -d "$wkDir/search/$set";
-		my ($goodResults, $goodHits) = (0, 0);
-		my ($badResults, $badHits) = (0, 0);
-		foreach (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-			my $file = "$wkDir/search/$set/".$ins{$set}{'prots'}[$i]{'name'}.".txt";
-			if (-e $file){
-				if (-s $file){
-					my @out = ();
-					open IN, "<$file" or next;
-					while (<IN>){
-						s/\s+$//;
-						push (@out, $_);
-					}
-					close IN;
-					my $badHitsHere = 0;
-					my $k = 1; # status of report
-					foreach (@out){
-						next unless $_;
-						if ($k == 1 and (/^#NEXUS/)){ $k = 2; next; }
-						if ($k == 2 and /^BEGIN QUERY;/){ $k = 3; next; }
-						if ($k == 3 and /^END;$/){ $k = 4; next; }
-						if ($k == 4 and /^BEGIN ORGANISM;$/){ $k = 5; next; }
-						if ($k == 5 and /^END;$/){ $k = 6; last; }
-						if ($k == 5){
-							next if /^\[/;
-							next if /^;/;
-							my @a = split (/\t/);
-							if (@a and scalar(@a) < 6){
-								$_ = "[deleted]";
-								$badHitsHere ++;
-								$badHits ++;
-							}else{
-								$goodHits ++;
-							}
-						}
-					}
-					if ($k < 6){ # bad report
-						$badResults ++;
-						unlink $file;
-						print "  Warning: Incomplete search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
-					}else{
-						$ins{$set}{'prots'}[$i]{'done'} = 1;
-						$goodResults ++;
-						if ($badHitsHere){
-							open OUT, ">$file";
-							foreach (@out){
-								next if /^\[deleted\]/;
-								print OUT "$_\n";
-							}
-							close OUT;
-							print "  Warning: $badHitsHere invalid hits deleted from $set/".$ins{$set}{'prots'}[$i]{'name'}.".\n";
-						}
-					}
-				}else{
-					$badResults ++;
-					unlink $file;
-					print "  Empty search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
-				}
-			}
-		}
-		print "  $set: $goodHits valid hits for $goodResults proteins.\n";
-		$ins{$set}{'done'} = 1 if ($goodResults == scalar(@{$ins{$set}{'prots'}}));
-		$nDone += $goodResults;
-	}
-	print "Done. $nDone results found, remaining ".($nProt - $nDone)." proteins to search.\n";
+if (-d "$wkDir/search") {
+    print "Reading search results from previous run(s)...\n";
+    foreach my $set (keys %ins) {
+        next unless -d "$wkDir/search/$set";
+        my ($goodResults, $goodHits) = (0, 0);
+        my ($badResults, $badHits) = (0, 0);
+        foreach (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+            my $file = "$wkDir/search/$set/".$ins{$set}{'prots'}[$i]{'name'}.".txt";
+            if (-e $file) {
+                if (-s $file) {
+                    my @out = ();
+                    open IN, "<$file" or next;
+                    while (<IN>) {
+                        s/\s+$//;
+                        push (@out, $_);
+                    }
+                    close IN;
+                    my $badHitsHere = 0;
+                    my $k = 1;  # status of report
+                    foreach (@out) {
+                        next unless $_;
+                        if ($k == 1 and (/^#NEXUS/)) { $k = 2; next; }
+                        if ($k == 2 and /^BEGIN QUERY;/) { $k = 3; next; }
+                        if ($k == 3 and /^END;$/) { $k = 4; next; }
+                        if ($k == 4 and /^BEGIN ORGANISM;$/) { $k = 5; next; }
+                        if ($k == 5 and /^END;$/) { $k = 6; last; }
+                        if ($k == 5) {
+                            next if /^\[/;
+                            next if /^;/;
+                            my @a = split (/\t/);
+                            if (@a and scalar(@a) < 6) {
+                                $_ = "[deleted]";
+                                $badHitsHere ++;
+                                $badHits ++;
+                            } else {
+                                $goodHits ++;
+                            }
+                        }
+                    }
+                    if ($k < 6) { # bad report
+                        $badResults ++;
+                        unlink $file;
+                        print "  Warning: Incomplete search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
+                    } else {
+                        $ins{$set}{'prots'}[$i]{'done'} = 1;
+                        $goodResults ++;
+                        if ($badHitsHere) {
+                            open OUT, ">$file";
+                            foreach (@out) {
+                                next if /^\[deleted\]/;
+                                print OUT "$_\n";
+                            }
+                            close OUT;
+                            print "  Warning: $badHitsHere invalid hits deleted from $set/".$ins{$set}{'prots'}[$i]{'name'}.".\n";
+                        }
+                    }
+                } else {
+                    $badResults ++;
+                    unlink $file;
+                    print "  Empty search result: $set/".$ins{$set}{'prots'}[$i]{'name'}.". Deleted.\n";
+                }
+            }
+        }
+        print "  $set: $goodHits valid hits for $goodResults proteins.\n";
+        $ins{$set}{'done'} = 1 if ($goodResults == scalar(@{$ins{$set}{'prots'}}));
+        $nDone += $goodResults;
+    }
+    print "Done. $nDone results found, remaining ".($nProt - $nDone)." proteins to search.\n";
 }
 
-if ($nProt-$nDone <= 0){
-	print "Batch homology search completed. searcher.pl exits.\n";
-	print "You may proceed with HGT prediction by running analyzer.pl.\n\n";
-	exit 0;
+if ($nProt-$nDone <= 0) {
+    print "Batch homology search completed. searcher.pl exits.\n";
+    print "You may proceed with HGT prediction by running analyzer.pl.\n\n";
+    exit 0;
 }
 
 
 ## detect pre-computed search results ##
 # This function was inspired by Conor Meehan (cmeehan@itg.be) #
 
-if ($preSearch){
-	my $n = 0;
-	opendir (DIR, $preSearch);
-	foreach my $file (grep {!/^\./} readdir DIR){
-		next unless -s "$preSearch/$file";
-		$s = stem_name($file);
-		if (exists $ins{$s}){
-			$ins{$s}{'prefile'} = $file;
-			$n ++;
-		}
-	}
-	closedir DIR;
-	print "Pre-computed search results are found for $n protein set(s).\n";
+if ($preSearch) {
+    my $n = 0;
+    opendir (DIR, $preSearch);
+    foreach my $file (grep {!/^\./} readdir DIR) {
+        next unless -s "$preSearch/$file";
+        $s = stem_name($file);
+        if (exists $ins{$s}) {
+            $ins{$s}{'prefile'} = $file;
+            $n ++;
+        }
+    }
+    closedir DIR;
+    print "Pre-computed search results are found for $n protein set(s).\n";
 }
 
-unless ($httpBlast){
-
-	## read local taxonomy database ##
-	
-	unless ($protdb){ print "Warning: A protein database is required for local sequence homology search.\n"; }
-	unless ($taxdump){ die "Error: A taxonomy database is required for local sequence homology search.\n"; }
-	unless (-d $taxdump){ die "Error: Invalid taxonomy database directory: $taxdump.\n"; }
-	unless (-s "$taxdump/nodes.dmp" and -s "$taxdump/names.dmp"){ die "Error: Taxonomy database is not found under $taxdump.\n" ; }
-	print "Reading taxonomy database...";
-	open IN, "<$taxdump/nodes.dmp";
-	while (<IN>){
-		s/\s+$//;
-		@a = split (/\s+\|\s+/);
-		%h = ('parent', $a[1], 'rank', $a[2]);
-		$taxdumps{$a[0]} = {%h};
-	}
-	close IN;
-	open IN, "<$taxdump/names.dmp";
-	while (<IN>){
-		s/\s+$//;
-		next unless (/scientific name\s*\|$/);
-		@a = split (/\s+\|\s+/);
-		$taxdumps{$a[0]}{'name'} = $a[1] if (exists $taxdumps{$a[0]});
-	}
-	close IN;
-	print " done. ".scalar(keys %taxdumps)." records read.\n";
-
-	## read protein-to-TaxID dictionary ##
-	
-	if ($prot2taxid){
-		unless (-s $prot2taxid){ die "Error: Invalid protein-to-TaxID dictionary: $prot2taxid.\n"; }
-		print "Reading protein-to-TaxID dictionary...";
-		open IN, "<$prot2taxid";
-		while (<IN>){
-			s/\s+$//; next unless $_;
-			@a = split (/\s+/);
-			next unless $#a;
-			$prot2taxids{$a[0]} = $a[1];
-		}
-		close IN;
-		print " done. ".scalar(keys %prot2taxids)." records read.\n";
-	}elsif ($searchTool ne "BLAST"){
-		print "Warning: A protein-to-TaxID dictionary is not provided.\n" ;
-		if ($interactive){
-			print "Press Enter to proceed, or Ctrl+C to exit:";
-			$s = <STDIN>;
-		}
-	}
+unless ($httpBlast) {
+
+    ## read local taxonomy database ##
+
+    unless ($protdb) { print "Warning: A protein database is required for local sequence homology search.\n"; }
+    unless ($taxdump) { die "Error: A taxonomy database is required for local sequence homology search.\n"; }
+    unless (-d $taxdump) { die "Error: Invalid taxonomy database directory: $taxdump.\n"; }
+    unless (-s "$taxdump/nodes.dmp" and -s "$taxdump/names.dmp") { die "Error: Taxonomy database is not found under $taxdump.\n" ; }
+    print "Reading taxonomy database...";
+    open IN, "<$taxdump/nodes.dmp";
+    while (<IN>) {
+        s/\s+$//;
+        @a = split (/\s+\|\s+/);
+        %h = ('parent', $a[1], 'rank', $a[2]);
+        $taxdumps{$a[0]} = {%h};
+    }
+    close IN;
+    open IN, "<$taxdump/names.dmp";
+    while (<IN>) {
+        s/\s+$//;
+        next unless (/scientific name\s*\|$/);
+        @a = split (/\s+\|\s+/);
+        $taxdumps{$a[0]}{'name'} = $a[1] if (exists $taxdumps{$a[0]});
+    }
+    close IN;
+    print " done. ".scalar(keys %taxdumps)." records read.\n";
+
+    ## read protein-to-TaxID dictionary ##
+
+    if ($prot2taxid) {
+        unless (-s $prot2taxid) { die "Error: Invalid protein-to-TaxID dictionary: $prot2taxid.\n"; }
+        print "Reading protein-to-TaxID dictionary...";
+        open IN, "<$prot2taxid";
+        while (<IN>) {
+            s/\s+$//; next unless $_;
+            @a = split (/\s+/);
+            next unless $#a;
+            $prot2taxids{$a[0]} = $a[1];
+        }
+        close IN;
+        print " done. ".scalar(keys %prot2taxids)." records read.\n";
+    } elsif ($searchTool ne "BLAST") {
+        print "Warning: A protein-to-TaxID dictionary is not provided.\n" ;
+        if ($interactive) {
+            print "Press Enter to proceed, or Ctrl+C to exit:";
+            $s = <STDIN>;
+        }
+    }
 }
 
 
 ## read taxonomic information ##
 
-if (-d "$wkDir/taxonomy"){
-	print "Reading taxonomy records from previous run(s)...";
-	if (-s "$wkDir/taxonomy/taxa.db"){
-		open IN, "<$wkDir/taxonomy/taxa.db";
-		while (<IN>){
-			s/\s+$//; next if /^#/; next unless $_;
-			@a = split (/\t/);
-			next if exists $dbTaxa{$a[0]};
-			my %taxon :shared = ('organism'=>$a[1], 'lineage'=>$a[2]);
-			my $i = 3; $taxon{$_} = $a[$i++] for (@ranks);
-			$dbTaxa{$a[0]} = \%taxon;
-		}
-		close IN;
-	}
-	if (-s "$wkDir/taxonomy/ranks.db"){
-		open IN, "<$wkDir/taxonomy/ranks.db";
-		while (<IN>){
-			s/\s+$//; next if /^#/; next unless $_;
-			@a = split (/\t/);
-			next if exists $dbRanks{$a[0]};
-			$dbRanks{$a[0]} = $a[1];
-		}
-		close IN;
-	}
-	print " done. ".(scalar keys %dbTaxa)." taxa and ".(scalar keys %dbRanks)." ranks read.\n";
-}else{
-	mkdir "$wkDir/taxonomy";
+if (-d "$wkDir/taxonomy") {
+    print "Reading taxonomy records from previous run(s)...";
+    if (-s "$wkDir/taxonomy/taxa.db") {
+        open IN, "<$wkDir/taxonomy/taxa.db";
+        while (<IN>) {
+            s/\s+$//; next if /^#/; next unless $_;
+            @a = split (/\t/);
+            next if exists $dbTaxa{$a[0]};
+            my %taxon :shared = ('organism'=>$a[1], 'lineage'=>$a[2]);
+            my $i = 3; $taxon{$_} = $a[$i++] for (@ranks);
+            $dbTaxa{$a[0]} = \%taxon;
+        }
+        close IN;
+    }
+    if (-s "$wkDir/taxonomy/ranks.db") {
+        open IN, "<$wkDir/taxonomy/ranks.db";
+        while (<IN>) {
+            s/\s+$//; next if /^#/; next unless $_;
+            @a = split (/\t/);
+            next if exists $dbRanks{$a[0]};
+            $dbRanks{$a[0]} = $a[1];
+        }
+        close IN;
+    }
+    print " done. ".(scalar keys %dbTaxa)." taxa and ".(scalar keys %dbRanks)." ranks read.\n";
+} else {
+    mkdir "$wkDir/taxonomy";
 }
 
 
 ## collect 'self' information ##
 
-if (-s "$wkDir/taxonomy/self.info"){
-	# The file self.info is like: sample_name | TaxID | organism_name
-	my $nRead = 0;
-	open IN, "<$wkDir/taxonomy/self.info";
-	while (<IN>){
-		s/\s+$//; next if /^#/; next unless $_;
-		@a = split (/\t/);
-		if (exists $ins{$a[0]}){
-			$ins{$a[0]}{'taxid'} = $a[1];
-			$ins{$a[0]}{'organism'} = $a[2];
-		}
-	}
-	close IN;
+if (-s "$wkDir/taxonomy/self.info") {
+    # The file self.info is like: sample_name | TaxID | organism_name
+    my $nRead = 0;
+    open IN, "<$wkDir/taxonomy/self.info";
+    while (<IN>) {
+        s/\s+$//; next if /^#/; next unless $_;
+        @a = split (/\t/);
+        if (exists $ins{$a[0]}) {
+            $ins{$a[0]}{'taxid'} = $a[1];
+            $ins{$a[0]}{'organism'} = $a[2];
+        }
+    }
+    close IN;
 }
-if ($selfTax){ # TaxIDs of input protein sets
-	%h = ();
-	@a = split (/,/, $selfTax);
-	foreach (@a){
-		@b = split (/:/);
-		next unless $#b;
-		$h{$b[0]} = $b[1];
-	}
-	foreach my $set (keys %ins){
-		next if exists $ins{$set}{'taxid'};
-		next unless exists $h{$set};
-		@a = get_taxonomy (($h{$set}));
-		$ins{$set}{'taxid'} = $h{$set};
-		$ins{$set}{'organism'} = $a[0];
-		open OUT, ">>$wkDir/taxonomy/self.info";
-		print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
-		close OUT;
-	}
+if ($selfTax) { # TaxIDs of input protein sets
+    %h = ();
+    @a = split (/,/, $selfTax);
+    foreach (@a) {
+        @b = split (/:/);
+        next unless $#b;
+        $h{$b[0]} = $b[1];
+    }
+    foreach my $set (keys %ins) {
+        next if exists $ins{$set}{'taxid'};
+        next unless exists $h{$set};
+        @a = get_taxonomy (($h{$set}));
+        $ins{$set}{'taxid'} = $h{$set};
+        $ins{$set}{'organism'} = $a[0];
+        open OUT, ">>$wkDir/taxonomy/self.info";
+        print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
+        close OUT;
+    }
 }
-if ($interactive){
-	foreach my $set (keys %ins){
-		next if exists $ins{$set}{'taxid'};
-		print "Enter the TaxID of $set, or press Enter if you don't know:";
-		$s = <STDIN>; chomp $s; next unless $s;
-		@a = get_taxonomy (($s));
-		$ins{$set}{'taxid'} = $s;
-		$ins{$set}{'organism'} = $a[0];
-		open OUT, ">>$wkDir/taxonomy/self.info";
-		print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
-		close OUT;
-	}
+if ($interactive) {
+    foreach my $set (keys %ins) {
+        next if exists $ins{$set}{'taxid'};
+        print "Enter the TaxID of $set, or press Enter if you don't know:";
+        $s = <STDIN>; chomp $s; next unless $s;
+        @a = get_taxonomy (($s));
+        $ins{$set}{'taxid'} = $s;
+        $ins{$set}{'organism'} = $a[0];
+        open OUT, ">>$wkDir/taxonomy/self.info";
+        print OUT $set,"\t",$ins{$set}{'taxid'},"\t",$ins{$set}{'organism'},"\n";
+        close OUT;
+    }
 }
 print "Taxonomy of input protein sets:\n";
 my @missingTax = ();
-foreach my $set (sort keys %ins){
-	if (exists $ins{$set}{'taxid'}){
-		print "  $set: ", $ins{$set}{'organism'}, " (", $ins{$set}{'taxid'}, ")\n";
-	}else{
-		push (@missingTax, $set);
-	}
+foreach my $set (sort keys %ins) {
+    if (exists $ins{$set}{'taxid'}) {
+        print "  $set: ", $ins{$set}{'organism'}, " (", $ins{$set}{'taxid'}, ")\n";
+    } else {
+        push (@missingTax, $set);
+    }
 }
-if (@missingTax){
-	print "Attempting to identify taxonomy of ".(scalar @missingTax)." protein set(s) :\n";
-	print "  ", join (",", @missingTax), "\n";
-	foreach my $set (@missingTax){
-		my $taxid = "";
-		if (%prot2taxids){ # look up the dictionary
-			foreach my $prot (@{$ins{$set}{'prots'}}){
-				$taxid = '';
-				my ($gi, $accn, $name) = ($prot->{'gi'}, $prot->{'accn'}, $prot->{'name'});
-				if (exists $prot2taxids{$name} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$name}; }
-				elsif (exists $prot2taxids{$gi} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$gi}; }
-				elsif (exists $prot2taxids{$accn} and $prot2taxids{$name} !~ /,/){ $taxid = $prot2taxids{$accn}; }
-				next unless $taxid;
-				next if $taxid =~ /,/;
-				last;
-			}
-		}
-		unless ($taxid){ # look up the BLAST database
-			my ($gi, $accn, $name) = ($ins{$set}{'prots'}[0]{'gi'}, $ins{$set}{'prots'}[0]{'accn'}, $ins{$set}{'prots'}[0]{'name'});
-			if ($searchTool eq "BLAST" and not $httpBlast){
-				my $query = "";
-				
-				if ($accn){ $query = $accn; }
-				elsif ($gi){ $query = $gi; }
-				else { $query = $name; }
-				my @out = `$blastdbcmd -db $protdb -entry $query -outfmt \"%a %g %T %t\"`;
-
-				# The command is to query the BLAST database for one or more particular sequences.
-				# The four codes represent accn, gi, taxid, title
-				# The query may be gi or accn or "gi|###|ref|###" or "title".
-				# If the database does not contain TaxID (or if not -taxid_map), %T will be 0.
-				# when making database (using makeblastdb), one should do -parse_seqids to enable gi and accn search
-
-				my $found = 1;
-				foreach (@out){ if (/not found in BLAST database/){ $found = 0; last; } }
-				if ($found){
-					@a = split (/\s+/, $out[0]);
-					$taxid = $a[2] if (scalar(@a) > 3 and $a[2] and $a[2] =~ /^\d+$/);
-				}
-				unless ($taxid){
-					if ($gi){ # look up the NCBI server
-						my $iRetry = 0;
-						sleep 1;
-						while (1){
-							$s = get "$eSummaryServer?db=protein&id=$gi";
-							last if (defined $s) and ($s =~ /<eSummaryResult>/s);
-							die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
-							$iRetry ++; sleep $delay; next;
-						}
-						$taxid = $1 if ($s =~ /<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/);
-					}
-				}
-			}
-		}
-		if ($taxid){
-			@a = get_taxonomy(($taxid));
-			$ins{$set}{'taxid'} = $taxid;
-			$ins{$set}{'organism'} = $a[0];
-			open OUT, ">>$wkDir/taxonomy/self.info";
-			print OUT $set, "\t", $taxid, "\t", $a[0], "\n";
-			close OUT;
-			print OUT "  $set: $taxid ($a[0])\n";
-		}else{
-			die "Error: Cannot identify the taxonomy of $set.\n";
-		}
-	}
+if (@missingTax) {
+    print "Attempting to identify taxonomy of ".(scalar @missingTax)." protein set(s) :\n";
+    print "  ", join (",", @missingTax), "\n";
+    foreach my $set (@missingTax) {
+        my $taxid = "";
+        if (%prot2taxids) { # look up the dictionary
+            foreach my $prot (@{$ins{$set}{'prots'}}) {
+                $taxid = '';
+                my $name = $prot->{'name'};
+                if (exists $prot2taxids{$name} and $prot2taxids{$name} !~ /,/) { $taxid = $prot2taxids{$name}; }
+                next unless $taxid;
+                next if $taxid =~ /,/;
+                last;
+            }
+        }
+        unless ($taxid) { # look up the BLAST database
+            my $name = $ins{$set}{'prots'}[0]{'name'};
+            if ($searchTool eq "BLAST" and not $httpBlast) {
+                my $query = $name;
+                my @out = `$blastdbcmd -db $protdb -entry $query -outfmt \"%a %T %t\"`;
+
+                # The command is to query the BLAST database for one or more particular sequences.
+                # The four codes represent name (accession), taxid, title
+                # If the database does not contain TaxID (or if not -taxid_map), %T will be 0.
+                # when making database (using makeblastdb), one should do -parse_seqids to enable search by name (instead of sequence)
+
+                my $found = 1;
+                foreach (@out) { if (/not found in BLAST database/) { $found = 0; last; } }
+                if ($found) {
+                    @a = split (/\s+/, $out[0]);
+                    $taxid = $a[2] if (scalar(@a) > 3 and $a[2] and $a[2] =~ /^\d+$/);
+                }
+                unless ($taxid) {
+                    my $iRetry = 0;
+                    sleep 1;
+                    while (1) {
+                        $s = get "$eSummaryServer?db=protein&id=$name";
+                        last if (defined $s) and ($s =~ /<eSummaryResult>/s);
+                        die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
+                        $iRetry ++; sleep $delay; next;
+                    }
+                    $taxid = $1 if ($s =~ /<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/);
+                }
+            }
+        }
+        if ($taxid) {
+            @a = get_taxonomy(($taxid));
+            $ins{$set}{'taxid'} = $taxid;
+            $ins{$set}{'organism'} = $a[0];
+            open OUT, ">>$wkDir/taxonomy/self.info";
+            print OUT $set, "\t", $taxid, "\t", $a[0], "\n";
+            close OUT;
+            print OUT "  $set: $taxid ($a[0])\n";
+        } else {
+            die "Error: Cannot identify the taxonomy of $set.\n";
+        }
+    }
 }
 
 
 ## perform batch sequence similarity search ##
 
 mkdir "$wkDir/search" unless -d "$wkDir/search";
-if ($requests == 1 or not $httpBlast){
-	foreach my $set (sort keys %ins){
-		next if exists $ins{$set}{'done'};
-		print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
-		mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
-		@a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
-		open LOG, ">>$wkDir/search/$set.log";
-		print LOG "Program started at $s.\n";
-		print LOG "Number of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n";
-		close LOG;
-		
-		unless ($httpBlast){ # local search
-		
-			%h = ();
-			for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-				$h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
-			}
-			if (%h){
-				if ($searchTool ne "BLAST"){ die "Error: Cannot extract query sequences from a non-BLAST local database.\n"; }
-				open OUT, ">$wkDir/seqids.txt"; 
-				print OUT $_."\n" for (keys %h);
-				close OUT;
-				my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %s\"`;
-				unlink "$wkDir/seqids.txt";
-				if (join('', @out) =~ /not found in BLAST database/){ die "Error: Query sequences not found in $protdb.\n"; }
-				foreach (@out){
-					s/\s+$//; next unless $_;
-					next if /^Error/;
-					@a = split (/\s+/); next unless $#a;
-					$a[0] =~ s/\.\d+$//;
-					if (exists $h{$a[0]}){ $ins{$set}{'prots'}[$h{$a[0]}]{'seq'} = $a[1]; }
-				}
-			}
-			%h = ();
-			for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-				$h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
-			}
-			if (%h){ die "Error: One or more query sequences cannot be extracted from $protdb.\n"; }
-
-			# new feature: batch self search #
-			
-			if ($searchTool eq "RAPSearch2" or $searchTool eq "DIAMOND"){
-				%h = ();
-				open OUT, ">$wkDir/tmp.in";
-				for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-					$h{$ins{$set}{'prots'}[$i]{'name'}} = $i;
-					print OUT ">".$ins{$set}{'prots'}[$i]{'name'}."\n".$ins{$set}{'prots'}[$i]{'seq'}."\n";
-				}
-				close OUT;
-				if ($searchTool eq "RAPSearch2"){
-					`$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
-					`$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp -z $threads`;
-					unlink "$wkDir/tmp.in";
-					unlink "$wkDir/raptmp";
-					unlink "$wkDir/raptmp.info";
-					die "Error in running RAPSearch2. Please check." unless -s "$wkDir/tmp.m8";
-					open IN, "<$wkDir/tmp.m8";
-					while (<IN>){
-						s/\s+$//; next unless $_; next if /^#/;
-						@a = split (/\t/);
-						next if ($#a < 11);
-						next unless $a[0] eq $a[1];
-						next unless exists $h{$a[0]};
-						next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
-						$ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
-					}
-					close IN;
-					unlink "$wkDir/tmp.aln";
-					unlink "$wkDir/tmp.m8";
-				}elsif ($searchTool eq "DIAMOND"){
-					`$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
-					`$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
-					my @out = `$diamond view -a $wkDir/tmp.daa`;
-					unlink "$wkDir/tmp.in";
-					unlink "$wkDir/tmp.dmnd";
-					unlink "$wkDir/tmp.daa";
-					die "Error in running DIAMOND. Please check." unless @out;
-					foreach (@out){
-						s/\s+$//;
-						@a = split (/\t/);
-						next unless $#a == 11;
-						next unless $a[0] eq $a[1];
-						next unless exists $h{$a[0]};
-						next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
-						$ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
-					}
-				}
-			}
-		
-			# end of new feature #
-			
-			my @ids = ();
-			for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-				next if $ins{$set}{'prots'}[$i]{'done'};
-				push (@ids, $i);
-				next if exists $ins{$set}{'prefile'};
-				next unless $queries;
-				unless (scalar(@ids) % $queries){
-					local_search ($set, \@ids);
-					@ids = ();
-				}
-			}
-			local_search ($set, \@ids) if (@ids);
-			
-		}else{ # single-process http BLAST #
-		
-			for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-				my $query = $ins{$set}{'prots'}[$i]{'name'};
-				next if -s "$wkDir/search/$set/$query.txt";
-				my $return = 0;
-				print "  BLASTing $query... ";
-				my $iRetry = 0;
-				while (1){
-					$return = http_blast ($set, $i);
-					last if $return =~ /\/1$/;
-					print "failed.\n";
-					print "\n" and last if ($iRetry >= $retries);
-					print "  Retrying...";
-					$iRetry ++; sleep $delay;
-				}
-				open LOG, ">>$wkDir/search/$set.log";
-				if ($return =~ /\/(\d+)\/1$/){
-					print LOG "$query\t$1\n";
-					print "done. $1 hits.\n";
-				}
-				else{ print LOG "$query\tfailed\n"; }
-				close LOG;
-				sleep $delay;
-			}
-		}
-		@a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
-		open LOG, ">>$wkDir/search/$set.log";
-		print LOG "Program ended at $s.\n";
-		close LOG;
-		unlink "$wkDir/tmp.in";
-		print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
-	}
-	
-}else{ # multi-process http BLAST #
-
-	print "$requests http BLAST threads are running in parallel.\n";
-	
-	my %running = (); # accn -> 1
-	my %retry = (); # accn -> times of retry
-	my %rSets = (); # protein sets that have been started
-	my %failed = ();
-	
-	while ($nDone + (scalar keys %failed) < $nProt){
-		if (my $n = $requests-scalar(threads->list(threads::running))){
-			for (1..$n){
-				foreach my $set (keys %ins){
-					next if $ins{$set}{'done'};
-					unless (exists $rSets{$set}){
-						print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
-						$rSets{$set} = 1;
-						mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
-						@a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
-						open LOG, ">>$wkDir/search/$set.log"; print LOG "Program started at $s.\nNumber of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n"; close LOG;
-					}
-					my $started = 0;
-					for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++){
-						my $query = $ins{$set}{'prots'}[$i]{'name'};
-						next if -s "$wkDir/search/$set/$query.txt";
-						next if exists $running{$query};
-						next if exists $failed{"$set|$query"};
-						my $thr = threads->create(\&http_blast, $set, $i);
-						print "  BLAST of $query started.\n";
-						$running{$query} = 1;
-						$started = 1;
-						select (undef, undef, undef, 0.25); # this is to delay 0.25 seconds
-						last;
-					}
-					last if $started;
-					print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
-					$ins{$set}{'done'} = 1;
-					delete $rSets{$set};
-					@a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
-					open LOG, ">>$wkDir/search/$set.log"; print LOG "Program ended at $s.\n"; close LOG;
-				}
-			}
-		}
-		if (@a = threads->list(threads::joinable)){
-			foreach (@a){
-				my ($query, $set, $hits, $return) = split (/\//, $_->join());
-				delete $running{$query};
-				if ($return){
-					$nDone ++;
-					print "  BLAST of $query completed. $hits hits retrieved.\n";
-					open LOG, ">>$wkDir/search/$set.log";
-					print LOG "$query\t$hits\n";
-					close LOG;
-				}else{
-					$retry{$query} = 0 unless exists $retry{$query};
-					$retry{$query} ++;
-					if ($retry{$query} >= $retries){
-						delete $retry{$query};
-						$failed{"$set|$query"} = 1;
-						print "  BLAST of $query failed.\n";
-						open LOG, ">>$wkDir/search/$set.log"; print LOG "$query\tfailed\n"; close LOG;
-					}else{
-						print "  BLAST of $query failed. Scheduled to retry.\n";
-					}
-				}
-			}
-		}
-		sleep $delay;
-		if ($retryFailed and %failed and ($nDone + (scalar keys %failed) == $nProt)){
-			$retryFailed = 0;
-			%failed = ();
-			$ins{$_}{'done'} = 0 for (keys %ins);
-		}
-	}
+if ($requests == 1 or not $httpBlast) {
+    foreach my $set (sort keys %ins) {
+        next if exists $ins{$set}{'done'};
+        print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
+        mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
+        @a = localtime(time);
+        $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+        open LOG, ">>$wkDir/search/$set.log";
+        print LOG "Program started at $s.\n";
+        print LOG "Number of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n";
+        close LOG;
+
+        unless ($httpBlast) {  # local search
+
+            %h = ();
+            for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+                $h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
+            }
+            if (%h) {
+                die "Error: Cannot extract query sequences from a non-BLAST local database.\n" if $searchTool ne "BLAST";
+                open OUT, ">$wkDir/seqids.txt";
+                print OUT $_."\n" for (keys %h);
+                close OUT;
+                my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %s\"`;
+                unlink "$wkDir/seqids.txt";
+                die "Error: Query sequences not found in $protdb.\n" if join('', @out) =~ /not found in BLAST database/;
+                foreach (@out) {
+                    s/\s+$//; next unless $_;
+                    next if /^Error/;
+                    @a = split (/\s+/); next unless $#a;
+                    $a[0] =~ s/\.\d+$//;
+                    if (exists $h{$a[0]}) { $ins{$set}{'prots'}[$h{$a[0]}]{'seq'} = $a[1]; }
+                }
+            }
+            %h = ();
+            for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+                $h{$ins{$set}{'prots'}[$i]{'name'}} = $i unless $ins{$set}{'prots'}[$i]{'seq'};
+            }
+            if (%h) { die "Error: One or more query sequences cannot be extracted from $protdb.\n"; }
+
+            # new feature: batch self search #
+
+            if ($searchTool eq "RAPSearch2" or $searchTool eq "DIAMOND") {
+                %h = ();
+                open OUT, ">$wkDir/tmp.in";
+                for (my $i = 0; $i < scalar(@{$ins{$set}{'prots'}}); $i ++) {
+                    $h{$ins{$set}{'prots'}[$i]{'name'}} = $i;
+                    print OUT ">".$ins{$set}{'prots'}[$i]{'name'}."\n".$ins{$set}{'prots'}[$i]{'seq'}."\n";
+                }
+                close OUT;
+                if ($searchTool eq "RAPSearch2") {
+                    `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
+                    `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp -z $threads`;
+                    unlink "$wkDir/tmp.in";
+                    unlink "$wkDir/raptmp";
+                    unlink "$wkDir/raptmp.info";
+                    die "Error in running RAPSearch2. Please check." unless -s "$wkDir/tmp.m8";
+                    open IN, "<$wkDir/tmp.m8";
+                    while (<IN>) {
+                        s/\s+$//; next unless $_; next if /^#/;
+                        @a = split (/\t/);
+                        next if ($#a < 11);
+                        next unless $a[0] eq $a[1];
+                        next unless exists $h{$a[0]};
+                        next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
+                        $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
+                    }
+                    close IN;
+                    unlink "$wkDir/tmp.aln";
+                    unlink "$wkDir/tmp.m8";
+                } elsif ($searchTool eq "DIAMOND") {
+                    `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
+                    `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
+                    my @out = `$diamond view -a $wkDir/tmp.daa`;
+                    unlink "$wkDir/tmp.in";
+                    unlink "$wkDir/tmp.dmnd";
+                    unlink "$wkDir/tmp.daa";
+                    die "Error in running DIAMOND. Please check." unless @out;
+                    foreach (@out) {
+                        s/\s+$//;
+                        @a = split (/\t/);
+                        next unless $#a == 11;
+                        next unless $a[0] eq $a[1];
+                        next unless exists $h{$a[0]};
+                        next if exists $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'};
+                        $ins{$set}{'prots'}[$h{$a[0]}]{'selfalign'} = $a[10]."/".$a[11]."/".$a[2]."/".length($ins{$set}{'prots'}[$h{$a[0]}]{'seq'});
+                    }
+                }
+            }
+
+            # end of new feature #
+
+            my @ids = ();
+            for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++) {
+                next if $ins{$set}{'prots'}[$i]{'done'};
+                push (@ids, $i);
+                next if exists $ins{$set}{'prefile'};
+                next unless $queries;
+                unless (scalar(@ids) % $queries) {
+                    local_search ($set, \@ids);
+                    @ids = ();
+                }
+            }
+            local_search ($set, \@ids) if (@ids);
+
+        } else { # single-process http BLAST #
+
+            for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++) {
+                my $query = $ins{$set}{'prots'}[$i]{'name'};
+                next if -s "$wkDir/search/$set/$query.txt";
+                my $return = 0;
+                print "  BLASTing $query... ";
+                my $iRetry = 0;
+                while (1) {
+                    $return = http_blast ($set, $i);
+                    last if $return =~ /\/1$/;
+                    print "failed.\n";
+                    print "\n" and last if ($iRetry >= $retries);
+                    print "  Retrying...";
+                    $iRetry ++; sleep $delay;
+                }
+                open LOG, ">>$wkDir/search/$set.log";
+                if ($return =~ /\/(\d+)\/1$/) {
+                    print LOG "$query\t$1\n";
+                    print "done. $1 hits.\n";
+                } else {
+                    print LOG "$query\tfailed\n";
+                }
+                close LOG;
+                sleep $delay;
+            }
+        }
+        @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+        open LOG, ">>$wkDir/search/$set.log";
+        print LOG "Program ended at $s.\n";
+        close LOG;
+        unlink "$wkDir/tmp.in";
+        print "Batch homology search of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
+    }
+
+} else {  # multi-process http BLAST #
+
+    print "$requests http BLAST threads are running in parallel.\n";
+
+    my %running = ();  # name -> 1
+    my %retry = ();  # name -> times of retry
+    my %rSets = ();  # protein sets that have been started
+    my %failed = ();
+
+    while ($nDone + (scalar keys %failed) < $nProt) {
+        if (my $n = $requests-scalar(threads->list(threads::running))) {
+            for (1..$n) {
+                foreach my $set (keys %ins) {
+                    next if $ins{$set}{'done'};
+                    unless (exists $rSets{$set}) {
+                        print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) started.\n";
+                        $rSets{$set} = 1;
+                        mkdir "$wkDir/search/$set" unless -d "$wkDir/search/$set";
+                        @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+                        open LOG, ">>$wkDir/search/$set.log"; print LOG "Program started at $s.\nNumber of queries: ". (scalar @{$ins{$set}{'prots'}}) .".\n"; close LOG;
+                    }
+                    my $started = 0;
+                    for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++) {
+                        my $query = $ins{$set}{'prots'}[$i]{'name'};
+                        next if -s "$wkDir/search/$set/$query.txt";
+                        next if exists $running{$query};
+                        next if exists $failed{"$set|$query"};
+                        my $thr = threads->create(\&http_blast, $set, $i);
+                        print "  BLAST of $query started.\n";
+                        $running{$query} = 1;
+                        $started = 1;
+                        select (undef, undef, undef, 0.25); # this is to delay 0.25 seconds
+                        last;
+                    }
+                    last if $started;
+                    print "Batch BLAST of $set (".(scalar @{$ins{$set}{'prots'}})." queries) completed.\n";
+                    $ins{$set}{'done'} = 1;
+                    delete $rSets{$set};
+                    @a = localtime(time); $s = (++$a[4])."/$a[3]/".($a[5]+=1900)." $a[2]:$a[1]:$a[0]";
+                    open LOG, ">>$wkDir/search/$set.log"; print LOG "Program ended at $s.\n"; close LOG;
+                }
+            }
+        }
+        if (@a = threads->list(threads::joinable)) {
+            foreach (@a) {
+                my ($query, $set, $hits, $return) = split (/\//, $_->join());
+                delete $running{$query};
+                if ($return) {
+                    $nDone ++;
+                    print "  BLAST of $query completed. $hits hits retrieved.\n";
+                    open LOG, ">>$wkDir/search/$set.log";
+                    print LOG "$query\t$hits\n";
+                    close LOG;
+                } else {
+                    $retry{$query} = 0 unless exists $retry{$query};
+                    $retry{$query} ++;
+                    if ($retry{$query} >= $retries) {
+                        delete $retry{$query};
+                        $failed{"$set|$query"} = 1;
+                        print "  BLAST of $query failed.\n";
+                        open LOG, ">>$wkDir/search/$set.log"; print LOG "$query\tfailed\n"; close LOG;
+                    } else {
+                        print "  BLAST of $query failed. Scheduled to retry.\n";
+                    }
+                }
+            }
+        }
+        sleep $delay;
+        if ($retryFailed and %failed and ($nDone + (scalar keys %failed) == $nProt)) {
+            $retryFailed = 0;
+            %failed = ();
+            $ins{$_}{'done'} = 0 for (keys %ins);
+        }
+    }
 }
 print "Batch homology search completed. searcher.pl exits.\n";
 print "You may re-run searcher.pl to validate the results and finish incomplete searches.\n";
@@ -854,281 +847,260 @@
   # return: 0 (success) or 1 (fail)
 
 sub local_search {
-	my ($set, $refID) = @_;
-	my @ids = @$refID;
-	my %name2id = (); # protein name (accn) to index
-	my $outfile = $wkDir."/tmp.out"; # search result file name
-	if (exists $ins{$set}{'prefile'}){ # read pre-computed results
-		print "  Importing pre-computed search results of $set...";
-		$outfile = $preSearch."/".$ins{$set}{'prefile'};
-		$name2id{$ins{$set}{'prots'}[$_]{'name'}} = $_ for (@ids);
-	}else{ # de novo search
-		if (scalar(@ids) == scalar(@{$ins{$set}{'prots'}})){
-			print "  $searchTool"."ing all proteins of $set...";
-		}else{
-			if (scalar(@ids) <= 32){
-				@a = ();
-				push (@a, $ins{$set}{'prots'}[$_]{'name'}) for (@ids);
-				print "  $searchTool"."ing ".join(",", @a)."...";
-			}else{
-				@a = ();
-				for (my $i=0; $i<3; $i++){
-					push (@a, $ins{$set}{'prots'}[$ids[$i]]{'name'});
-				}
-				print "  $searchTool"."ing ".join(",", @a)."... (".scalar(@ids)." proteins)";
-			}
-		}
-		unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
-		open OUT, ">$wkDir/tmp.in";
-		foreach my $id (@ids){
-			print OUT ">".$ins{$set}{'prots'}[$id]{'name'}."\n".$ins{$set}{'prots'}[$id]{'seq'}."\n";
-			$name2id{$ins{$set}{'prots'}[$id]{'name'}} = $id;
-		}
-		close OUT;
-		if ($searchTool eq "BLAST"){
-			$s = "$blastp -query $wkDir/tmp.in -db $protdb -out $outfile";
-			$s .= " -num_threads $threads" if ($threads > 1);
-			$s .= " -evalue $evalue" if ($evalue);
-			$s .= " -max_target_seqs $nHits" if ($nHits);
-			$s .= " -outfmt \"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sseq\"";
-			`$s`;
-			# this is the standard tabular format, plus aligned part of subject sequence
-		}elsif ($searchTool eq "RAPSearch2"){
-			$s = "$rapsearch -q $wkDir/tmp.in -d $protdb -o $wkDir/tmp -t a -s f";
-			$s .= " -z $threads" if ($threads > 1);
-			$s .= " -e $evalue" if ($evalue);
-			$s .= " -v $nHits" if ($nHits);
-			`$s`;
-			$outfile = $wkDir."/tmp.m8";
-			unlink $wkDir."/tmp.aln";
-		}elsif ($searchTool eq "DIAMOND"){
-			$s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
-			$s .= " -e $evalue" if ($evalue);
-			$s .= " -k $nHits" if ($nHits);
-			`$s`;
-			$s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
-			`$s`;
-			unlink $wkDir."/tmp.daa";
-		}else{
-			unlink "$wkDir/tmp.in";
-			die "Error: Search tool not specified and pre-computed results not found for $set.\n";
-		}
-		unlink "$wkDir/tmp.in";
-	}
-	unless (-e $outfile){
-		print "Warning: Search result missing.\n";
-		return 1;
-	}
-	
-	# read search result file
-	my (%accn2gi, %gi2accn) = ((), ());
-	my %seqid2taxid = (); # need to look up the TaxIDs of these sequence IDs (GI, accn or name)
-	open IN, "<$outfile";
-	while (<IN>){
-		s/\s+$//; next unless $_; next if /^#/;
-		@a = split (/\t/);
-		next unless exists $name2id{$a[0]};
-		my $id = $name2id{$a[0]};
-		$a[1] =~ s/\s+$//; # RAPSearch2 adds ^M to subject ID. probably a bug.
-		next if ($evalue and $a[10] ne "*" and $evalue < $a[10]); # evalue cutoff
-		next if ($identity and $a[2] ne "*" and $identity > $a[2]); # % identity cutoff
-		next if ($coverage and $a[6] ne "*" and $a[7] ne "*" and $coverage > ($a[7]-$a[6]+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); # % coverage cutoff
-		%h = ('sseqid'=>$a[1], 'pident'=>$a[2], 'evalue'=>$a[10], 'bitscore'=>$a[11], 'qstart'=>$a[6], 'qend'=>$a[7], 'sseq'=>'', 'name'=>$a[1]);
-		$h{'sseq'} = $a[12] if ($#a >= 12 and $getAln);
-		if ($h{'sseqid'} =~ /^gi\|(\d+)\|.+\|([A-Z0-9_]+)\.\d+\|(.*)$/){
-			$h{'name'} = $2;
-			$accn2gi{$2} = $1 unless exists $accn2gi{$2};
-			$gi2accn{$1} = $2 unless exists $gi2accn{$1};
-			$seqid2taxid{$1} = '0' unless exists $seqid2taxid{$1};
-			$seqid2taxid{$2} = '0' unless exists $seqid2taxid{$2};
-		}else{
-			$seqid2taxid{$h{'name'}} = '0' unless exists $seqid2taxid{$h{'name'}};
-		}
-		push (@{$ins{$set}{'prots'}[$id]{'hits'}}, {%h});
-	}
-	close IN;
-	unlink $outfile unless exists $ins{$set}{'prefile'};
-	
-	# get TaxIDs for hits
-	if (%seqid2taxid){ 
-		my @seqids4db = (); # sequence IDs not in dictionary
-		
-		# look up in the dictionary
-		if (%prot2taxids){
-			foreach my $seqid (keys %seqid2taxid){
-				next if $seqid2taxid{$seqid};
-				if (exists $prot2taxids{$seqid}){
-					$seqid2taxid{$seqid} = $prot2taxids{$seqid};
-					if (exists $accn2gi{$seqid} and not $seqid2taxid{$accn2gi{$seqid}}){ $seqid2taxid{$accn2gi{$seqid}} = $prot2taxids{$seqid}; }
-					if (exists $gi2accn{$seqid} and not $seqid2taxid{$gi2accn{$seqid}}){ $seqid2taxid{$gi2accn{$seqid}} = $prot2taxids{$seqid}; }
-				}
-			}
-			foreach my $seqid (keys %seqid2taxid){
-				push (@seqids4db, $seqid) unless $seqid2taxid{$seqid};
-			}
-		}else{
-			@seqids4db = keys %seqid2taxid;
-		}
-		
-		# find the rest in database
-		if (@seqids4db and $searchTool eq "BLAST" and not $httpBlast){
-			open OUT, ">$wkDir/seqids.txt"; 
-			print OUT $_."\n" for (keys %seqid2taxid);
-			close OUT;
-			my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %g %T\"`;
-			unlink "$wkDir/seqids.txt";
-			foreach (@out){
-				s/\s+$//; next unless $_;
-				@a = split (/\s+/); next if $#a < 2;
-				next unless $a[2];
-				$a[0] =~ s/\.\d+$//;
-				if (exists $seqid2taxid{$a[0]} and not $seqid2taxid{$a[0]}){ $seqid2taxid{$a[0]} = $a[2]; }
-				elsif ($a[1] ne "N/A" and exists $seqid2taxid{$a[1]} and not $seqid2taxid{$a[1]}){ $seqid2taxid{$a[1]} = $a[2]; }
-			}
-		}
-	}
-	
-	# get complete taxonomy information
-	my %taxid2orgn = ();
-	foreach my $seqid (keys %seqid2taxid){
-		next unless $seqid2taxid{$seqid};
-		@a = split (/,/, $seqid2taxid{$seqid}); # one SeqID may correspond to multiple TaxIDs
-		foreach (@a){
-			next if exists $taxid2orgn{$_};
-			$taxid2orgn{$_} = '';
-		}
-	}
-	if (%taxid2orgn){
-		@a = keys %taxid2orgn;
-		@b = get_taxonomy (@a);
-		for (my $i=0; $i<scalar(@a); $i++){
-			$taxid2orgn{$a[$i]} = $b[$i];
-		}
-	}
-
-	# ignore invalid organism names
-	my @unwanted = ();
-	foreach my $taxid (keys %taxid2orgn){
-		my $organism = $taxid2orgn{$taxid};
-		if ($organism eq 'na'){ push (@unwanted, $taxid); }
-		elsif ($organism =~ /^Unresolved/){ push (@unwanted, $taxid); }
-		if ($taxonUCASE and ($organism !~ /^\[?[A-Z]/)){ # a valid organism name may start with "["
-			push (@unwanted, $taxid);
-		}
-		if (@ignoreTaxa){
-			foreach (@ignoreTaxa){
-				if ($organism =~ /$_/){ push (@unwanted, $taxid); last; }
-			}
-		}
-	}
-	delete $taxid2orgn{$_} for @unwanted;
-
-	# process hits for each query
-	foreach my $id (@ids){
-
-		# extract valid hits from search result #
-		my @hits;
-		my $isQueryIn = 0; # whether query is among subjects
-		my $name = $ins{$set}{'prots'}[$id]{'name'};
-
-		if (@{$ins{$set}{'prots'}[$id]{'hits'}}){
-			my %usedTaxids = ();
-			my %usedSpecies = ();
-			
-			# sort by bit score
-			my %id2score = ();
-			for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}[$id]{'hits'}}); $i++){
-				$id2score{$i} = $ins{$set}{'prots'}[$id]{'hits'}[$i]{'bitscore'};
-			}
-			foreach my $i (sort { $id2score{$b} <=> $id2score{$a} } keys %id2score){
-				%h = %{$ins{$set}{'prots'}[$id]{'hits'}[$i]};
-				my $taxids = '';
-				if (exists $seqid2taxid{$h{'name'}}){ $taxids = $seqid2taxid{$h{'name'}}; }
-				elsif (exists $accn2gi{$h{'name'}} and exists $seqid2taxid{$accn2gi{$h{'name'}}}){ $taxids = $seqid2taxid{$accn2gi{$h{'name'}}}; }
-				next unless $taxids;
-				foreach my $taxid (split (/,/, $taxids)){
-					next unless exists $taxid2orgn{$taxid};
-					my $organism = $taxid2orgn{$taxid};
-					if ($ins{$set}{'prots'}[$id]{'name'} eq $h{'name'}){ # self-align result is always retained
-						$isQueryIn = 1;
-					}else{ # remove redundancy from other hits
-						next if $taxid2orgn{$taxid} eq 'na';
-						next if exists $usedTaxids{$taxid};
-						if ($ignoreSubspecies){
-							next unless exists $dbTaxa{$taxid};
-							next unless exists $dbTaxa{$taxid}{'species'};
-							my $species = $dbTaxa{$taxid}{'species'};
-							next unless $species;
-							next if exists $usedSpecies{$species};
-							$usedSpecies{$species} = $dbRanks{$species};
-						}
-						$usedTaxids{$taxid} = 1;
-					}
-					my %hit = ('accn'=>$h{'name'}, 'expect'=>$h{'evalue'}, 'score'=>$h{'bitscore'}, 'identity'=>$h{'pident'}, 'coverage'=>'*', 'taxid'=>$taxid, 'organism'=>$organism, 'sequence'=>$h{'sseq'});
-					if ($h{'qstart'} ne "*" and $h{'qend'} ne "*"){ $hit{'coverage'} = sprintf("%.2f", ($h{'qend'}-$h{'qstart'}+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); }
-					push (@hits, {%hit});
-					last if (scalar @hits >= $nHits);
-				}
-			}
-		}
-
-		# perform self search, in case not targetted in previous steps #
-		unless ($isQueryIn){
-			if (exists $ins{$set}{'prots'}[$id]{'selfalign'}){
-				@a = split(/\//, $ins{$set}{'prots'}[$id]{'selfalign'});
-			}else{
-				@a = self_align ($name, $ins{$set}{'prots'}[$id]{'seq'});
-			}
-			if (@a != (0,0,0,0)){
-				my %hit = ('accn'=>$name, 'expect'=>$a[0], 'score'=>$a[1], 'identity'=>$a[2], 'coverage'=>'100.00', 'taxid'=>$ins{$set}{'taxid'}, 'organism'=>$ins{$set}{'organism'}, 'sequence'=>$ins{$set}{'prots'}[$id]{'seq'});
-				unshift (@hits, {%hit});
-				pop @hits if (scalar @hits > $nHits);
-			}
-		}
-
-		# output result
-		my ($ntax, $nchar) = (0, 0);
-		open (OUT, ">>$wkDir/search/$set/$name.txt");
-		print OUT "#NEXUS\nBEGIN QUERY;\n";
-		if ($ins{$set}{'prots'}[$id]{'accn'}){
-			print OUT "\tGI=".$ins{$set}{'prots'}[$id]{'gi'}.";\n\tAccession=".$ins{$set}{'prots'}[$id]{'accn'}.";\n";
-		}else{
-			print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
-		}
-		print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
-		if ($ins{$set}{'prots'}[$id]{'product'}){
-			print OUT "\tProduct=".$ins{$set}{'prots'}[$id]{'product'}.";\n";
-		}
-		print OUT "\tOrganism=".$ins{$set}{'organism'}.";\n";
-		print OUT "END;\n\n";
-		print OUT "BEGIN ORGANISM;\n";
-		print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
-		for (my $i=0; $i<scalar(@hits); $i++){
-			next if exists $hits[$i]{'delete'};
-			$ntax ++;
-			$nchar = length($hits[$i]{'sequence'}) if ($hits[$i]{'sequence'} and $nchar < length($hits[$i]{'sequence'}));
-			print OUT $hits[$i]{'accn'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'}."\t".$hits[$i]{'coverage'}."\n";
-			last if $i > $maxHits;
-		}
-		print OUT ";\nEND;\n\n";
-		if ($getAln){
-			print OUT "BEGIN DATA;\n";
-			print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
-			for (my $i=0; $i<scalar(@hits); $i++){
-				next if exists $hits[$i]{'delete'};
-				@a = split (/\//, $hits[$i]{'accn'});
-				print OUT $a[0]."\t".$hits[$i]{'sequence'}."\n";
-				last if $i > $maxHits;
-			}
-			print OUT ";\nEND;\n\n";
-		}
-		close OUT;
-		open LOG, ">>$wkDir/search/$set.log";
-		print LOG "$name\t$ntax\n";
-		close LOG;
-	}
-	print " done.\n";
-	return 0;
+    my ($set, $refID) = @_;
+    my @ids = @$refID;
+    my %name2id = (); # protein name to index
+    my $outfile = $wkDir."/tmp.out"; # search result file name
+    if (exists $ins{$set}{'prefile'}) { # read pre-computed results
+        print "  Importing pre-computed search results of $set...";
+        $outfile = $preSearch."/".$ins{$set}{'prefile'};
+        $name2id{$ins{$set}{'prots'}[$_]{'name'}} = $_ for (@ids);
+    } else { # de novo search
+        if (scalar(@ids) == scalar(@{$ins{$set}{'prots'}})) {
+            print "  $searchTool"."ing all proteins of $set...";
+        } else {
+            if (scalar(@ids) <= 32) {
+                @a = ();
+                push (@a, $ins{$set}{'prots'}[$_]{'name'}) for (@ids);
+                print "  $searchTool"."ing ".join(",", @a)."...";
+            } else {
+                @a = ();
+                for (my $i=0; $i<3; $i++) {
+                    push (@a, $ins{$set}{'prots'}[$ids[$i]]{'name'});
+                }
+                print "  $searchTool"."ing ".join(",", @a)."... (".scalar(@ids)." proteins)";
+            }
+        }
+        unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
+        open OUT, ">$wkDir/tmp.in";
+        foreach my $id (@ids) {
+            print OUT ">".$ins{$set}{'prots'}[$id]{'name'}."\n".$ins{$set}{'prots'}[$id]{'seq'}."\n";
+            $name2id{$ins{$set}{'prots'}[$id]{'name'}} = $id;
+        }
+        close OUT;
+        if ($searchTool eq "BLAST") {
+            $s = "$blastp -query $wkDir/tmp.in -db $protdb -out $outfile";
+            $s .= " -num_threads $threads" if ($threads > 1);
+            $s .= " -evalue $evalue" if ($evalue);
+            $s .= " -max_target_seqs $nHits" if ($nHits);
+            $s .= " -outfmt \"6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sseq\"";
+            `$s`;
+            # this is the standard tabular format, plus aligned part of subject sequence
+        } elsif ($searchTool eq "RAPSearch2") {
+            $s = "$rapsearch -q $wkDir/tmp.in -d $protdb -o $wkDir/tmp -t a -s f";
+            $s .= " -z $threads" if ($threads > 1);
+            $s .= " -e $evalue" if ($evalue);
+            $s .= " -v $nHits" if ($nHits);
+            `$s`;
+            $outfile = $wkDir."/tmp.m8";
+            unlink $wkDir."/tmp.aln";
+        } elsif ($searchTool eq "DIAMOND") {
+            $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
+            $s .= " -e $evalue" if ($evalue);
+            $s .= " -k $nHits" if ($nHits);
+            `$s`;
+            $s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
+            `$s`;
+            unlink $wkDir."/tmp.daa";
+        } else {
+            unlink "$wkDir/tmp.in";
+            die "Error: Search tool not specified and pre-computed results not found for $set.\n";
+        }
+        unlink "$wkDir/tmp.in";
+    }
+    unless (-e $outfile) {
+        print "Warning: Search result missing.\n";
+        return 1;
+    }
+
+    # read search result file
+    my %seqid2taxid = (); # need to look up the TaxIDs of these sequence IDs
+    open IN, "<$outfile";
+    while (<IN>) {
+        s/\s+$//; next unless $_; next if /^#/;
+        @a = split (/\t/);
+        next unless exists $name2id{$a[0]};
+        my $id = $name2id{$a[0]};
+        $a[1] =~ s/\s+$//; # RAPSearch2 adds ^M to subject ID. probably a bug.
+        next if ($evalue and $a[10] ne "*" and $evalue < $a[10]); # evalue cutoff
+        next if ($identity and $a[2] ne "*" and $identity > $a[2]); # % identity cutoff
+        next if ($coverage and $a[6] ne "*" and $a[7] ne "*" and $coverage > ($a[7]-$a[6]+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); # % coverage cutoff
+        %h = ('sseqid'=>$a[1], 'pident'=>$a[2], 'evalue'=>$a[10], 'bitscore'=>$a[11], 'qstart'=>$a[6], 'qend'=>$a[7], 'sseq'=>'', 'name'=>$a[1]);
+        $h{'sseq'} = $a[12] if ($#a >= 12 and $getAln);
+        $seqid2taxid{$h{'name'}} = '0' unless exists $seqid2taxid{$h{'name'}};
+        push (@{$ins{$set}{'prots'}[$id]{'hits'}}, {%h});
+    }
+    close IN;
+    unlink $outfile unless exists $ins{$set}{'prefile'};
+
+    # get TaxIDs for hits
+    if (%seqid2taxid) {
+        my @seqids4db = (); # sequence IDs not in dictionary
+
+        # look up in the dictionary
+        if (%prot2taxids) {
+            foreach my $seqid (keys %seqid2taxid) {
+                next if $seqid2taxid{$seqid};
+                if (exists $prot2taxids{$seqid}) {
+                    $seqid2taxid{$seqid} = $prot2taxids{$seqid};
+                }
+            }
+            foreach my $seqid (keys %seqid2taxid) {
+                push (@seqids4db, $seqid) unless $seqid2taxid{$seqid};
+            }
+        } else {
+            @seqids4db = keys %seqid2taxid;
+        }
+
+        # find the rest in database
+        if (@seqids4db and $searchTool eq "BLAST" and not $httpBlast) {
+            open OUT, ">$wkDir/seqids.txt";
+            print OUT $_."\n" for (keys %seqid2taxid);
+            close OUT;
+            my @out = `$blastdbcmd -dbtype=prot -db $protdb -entry_batch $wkDir/seqids.txt -outfmt \"%a %g %T\"`;
+            unlink "$wkDir/seqids.txt";
+            foreach (@out) {
+                s/\s+$//; next unless $_;
+                @a = split (/\s+/); next if $#a < 2;
+                next unless $a[2];
+                $a[0] =~ s/\.\d+$//;
+                if (exists $seqid2taxid{$a[0]} and not $seqid2taxid{$a[0]}) { $seqid2taxid{$a[0]} = $a[2]; }
+                elsif ($a[1] ne "N/A" and exists $seqid2taxid{$a[1]} and not $seqid2taxid{$a[1]}) { $seqid2taxid{$a[1]} = $a[2]; }
+            }
+        }
+    }
+
+    # get complete taxonomy information
+    my %taxid2orgn = ();
+    foreach my $seqid (keys %seqid2taxid) {
+        next unless $seqid2taxid{$seqid};
+        @a = split (/,/, $seqid2taxid{$seqid}); # one SeqID may correspond to multiple TaxIDs
+        foreach (@a) {
+            next if exists $taxid2orgn{$_};
+            $taxid2orgn{$_} = '';
+        }
+    }
+    if (%taxid2orgn) {
+        @a = keys %taxid2orgn;
+        @b = get_taxonomy (@a);
+        for (my $i=0; $i<scalar(@a); $i++) {
+            $taxid2orgn{$a[$i]} = $b[$i];
+        }
+    }
+
+    # ignore invalid organism names
+    my @unwanted = ();
+    foreach my $taxid (keys %taxid2orgn) {
+        my $organism = $taxid2orgn{$taxid};
+        if ($organism eq 'na') { push (@unwanted, $taxid); }
+        elsif ($organism =~ /^Unresolved/) { push (@unwanted, $taxid); }
+        if ($taxonUCASE and ($organism !~ /^\[?[A-Z]/)) { # a valid organism name may start with "["
+            push (@unwanted, $taxid);
+        }
+        if (@ignoreTaxa) {
+            foreach (@ignoreTaxa) {
+                if ($organism =~ /$_/) { push (@unwanted, $taxid); last; }
+            }
+        }
+    }
+    delete $taxid2orgn{$_} for @unwanted;
+
+    # process hits for each query
+    foreach my $id (@ids) {
+
+        # extract valid hits from search result #
+        my @hits;
+        my $isQueryIn = 0; # whether query is among subjects
+        my $name = $ins{$set}{'prots'}[$id]{'name'};
+
+        if (@{$ins{$set}{'prots'}[$id]{'hits'}}) {
+            my %usedTaxids = ();
+            my %usedSpecies = ();
+
+            # sort by bit score
+            my %id2score = ();
+            for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}[$id]{'hits'}}); $i++) {
+                $id2score{$i} = $ins{$set}{'prots'}[$id]{'hits'}[$i]{'bitscore'};
+            }
+            foreach my $i (sort { $id2score{$b} <=> $id2score{$a} } keys %id2score) {
+                %h = %{$ins{$set}{'prots'}[$id]{'hits'}[$i]};
+                next unless exists $seqid2taxid{$h{'name'}};
+                foreach my $taxid (split (/,/, $seqid2taxid{$h{'name'}})) {
+                    next unless exists $taxid2orgn{$taxid};
+                    my $organism = $taxid2orgn{$taxid};
+                    if ($ins{$set}{'prots'}[$id]{'name'} eq $h{'name'}) { # self-align result is always retained
+                        $isQueryIn = 1;
+                    } else { # remove redundancy from other hits
+                        next if $taxid2orgn{$taxid} eq 'na';
+                        next if exists $usedTaxids{$taxid};
+                        if ($ignoreSubspecies) {
+                            next unless exists $dbTaxa{$taxid};
+                            next unless exists $dbTaxa{$taxid}{'species'};
+                            my $species = $dbTaxa{$taxid}{'species'};
+                            next unless $species;
+                            next if exists $usedSpecies{$species};
+                            $usedSpecies{$species} = $dbRanks{$species};
+                        }
+                        $usedTaxids{$taxid} = 1;
+                    }
+                    my %hit = ('name'=>$h{'name'}, 'expect'=>$h{'evalue'}, 'score'=>$h{'bitscore'}, 'identity'=>$h{'pident'}, 'coverage'=>'*', 'taxid'=>$taxid, 'organism'=>$organism, 'sequence'=>$h{'sseq'});
+                    if ($h{'qstart'} ne "*" and $h{'qend'} ne "*") { $hit{'coverage'} = sprintf("%.2f", ($h{'qend'}-$h{'qstart'}+1)/length($ins{$set}{'prots'}[$id]{'seq'})*100); }
+                    push (@hits, {%hit});
+                    last if (scalar @hits >= $nHits);
+                }
+            }
+        }
+
+        # perform self search, in case not targetted in previous steps #
+        unless ($isQueryIn) {
+            if (exists $ins{$set}{'prots'}[$id]{'selfalign'}) {
+                @a = split(/\//, $ins{$set}{'prots'}[$id]{'selfalign'});
+            } else {
+                @a = self_align ($name, $ins{$set}{'prots'}[$id]{'seq'});
+            }
+            if (@a != (0,0,0,0)) {
+                my %hit = ('name'=>$name, 'expect'=>$a[0], 'score'=>$a[1], 'identity'=>$a[2], 'coverage'=>'100.00', 'taxid'=>$ins{$set}{'taxid'}, 'organism'=>$ins{$set}{'organism'}, 'sequence'=>$ins{$set}{'prots'}[$id]{'seq'});
+                unshift (@hits, {%hit});
+                pop @hits if (scalar @hits > $nHits);
+            }
+        }
+
+        # output result
+        my ($ntax, $nchar) = (0, 0);
+        open (OUT, ">>$wkDir/search/$set/$name.txt");
+        print OUT "#NEXUS\nBEGIN QUERY;\n";
+        print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
+        print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
+        print OUT "\tTitle=".$ins{$set}{'prots'}[$id]{'title'}.";\n";
+        print OUT "END;\n\n";
+        print OUT "BEGIN ORGANISM;\n";
+        print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
+        for (my $i=0; $i<scalar(@hits); $i++) {
+            next if exists $hits[$i]{'delete'};
+            $ntax ++;
+            $nchar = length($hits[$i]{'sequence'}) if ($hits[$i]{'sequence'} and $nchar < length($hits[$i]{'sequence'}));
+            print OUT $hits[$i]{'name'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'}."\t".$hits[$i]{'coverage'}."\n";
+            last if $i > $maxHits;
+        }
+        print OUT ";\nEND;\n\n";
+        if ($getAln) {
+            print OUT "BEGIN DATA;\n";
+            print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
+            for (my $i=0; $i<scalar(@hits); $i++) {
+                next if exists $hits[$i]{'delete'};
+                @a = split (/\//, $hits[$i]{'name'});
+                print OUT $a[0]."\t".$hits[$i]{'sequence'}."\n";
+                last if $i > $maxHits;
+            }
+            print OUT ";\nEND;\n\n";
+        }
+        close OUT;
+        open LOG, ">>$wkDir/search/$set.log";
+        print LOG "$name\t$ntax\n";
+        close LOG;
+    }
+    print " done.\n";
+    return 0;
 }
 
 
@@ -1140,446 +1112,441 @@ sub local_search {
   # note: the Perl thread join function has some problems, that why I used this inconvenient way.
 
 sub http_blast{
-	my ($set, $id) = ($_[0], $_[1]);
-
-	# generate query information
-	my %self = %{$ins{$set}{'prots'}[$id]}; # copy the whole record
-	$self{'length'} = 0; # need to know length
-	$self{'length'} = length($self{'seq'}) if $self{'seq'}; # ideally, sequence is available, otherwise see below
-	my $query = $self{'name'};
-
-	# send BLAST request #
-	my $isError = 0;
-	my $url = "$blastServer?CMD=Put&PROGRAM=blastp&DATABASE=$httpDb&FILTER=m S";
-	$url .= "&EXPECT=$evalue" if $evalue;
-	$url .= "&MAX_NUM_SEQ=$nHits" if ($nHits != 100);
-	$url .= "&EQ_TEXT=$eqText" if $eqText;
-	my $querykey = $self{'name'};
-	$querykey = $self{'seq'} if $self{'seq'}; # user-defined query sequence
-	$url .= "&QUERY=$querykey";
-	$s = get $url;
-	
-	# $url = substr($args,0,2048) if (length($args) > 2048); # In some situations, the max size of a URL is 2048 bytes
-	# @a = localtime(time); print "Post: $a[2]:$a[1]:$a[0], ";
-	my $starttime = time;
-	
-	my $rid;
-	# @a = localtime(time); print "Respond: $a[2]:$a[1]:$a[0], ";
-	if ($s =~ /^    RID = (.*$)/m){ $rid = $1; print "RID=$rid "; }
-	else{ return "$query/$set/0/0"; } ##########################
-	if ($s =~ /\s\((\d+) letters\)/){ $self{'length'} = $1; }
-	if ($s =~ /^    RTOE = (.*$)/m){
-		my $rtoe = $1;
-		# print "RTOE=$rtoe, ";
-		if ($rtoe and $rtoe =~ /^\d+$/){ sleep $rtoe; }else{ sleep $delay; }
-	}else{ return "$query/$set/0/0"; }
-	while (1){
-		sleep 1;
-		$s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
-		if ($s =~ /\s+Status=([A-Z]+)/m){
-			if ($1 eq "WAITING"){
-				if (time - $starttime > $timeout){
-					$isError = 1; last;
-				}else{
-					print ". "; sleep $delay; next;
-				}
-			}
-			elsif ($1 eq "FAILED"){ $isError = 1; last; }
-			elsif ($1 eq "UNKNOWN"){ $isError = 1; last; }
-			elsif ($1 eq "READY"){ @a = localtime(time); last; }
-			else{ $isError = 1; last; }
-		}else{
-			$isError = 1; last;
-		}
-		
-		# if ($s =~ /\s+Status=WAITING/m){ next; }
-		# if ($s =~ /\s+Status=FAILED/m){ $isError = 1; last; }
-		# if ($s =~ /\s+Status=UNKNOWN/m){ $isError = 1; last; }
-		# if ($s =~ /\s+Status=READY/m){
-		# 	if ($s =~ /\s+ThereAreHits=yes/m){ last; }
-		# 	else{ last; } # no hits;
-		# }
-	}
-	if ($isError){ return "$query/$set/0/0"; }
-	
-	# retrieve tabular report #
-	sleep 1;
-	$url = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
-	$url .= "&MAX_NUM_SEQ=$nHits&DESCRIPTIONS=$nHits" if ($nHits != 100);
-	$s = get $url;
-	if (($s !~ /# blastp/) or ($s !~ /# Query:\s/)){ return "$query/$set/0/0"; }
-	my %hits = ();
-	my $hitid = 0;
-	foreach (split(/\n/, $s)){
-		if (/^# Query/ and not $self{'seq'}){
-			s/>.*//;
-			@a = split (/\|/);
-			$self{'gi'} = $a[1];
-			$self{'accession'} = $a[3];
-			$a[4] =~ /^\s*(.+\S)\s*\[(.+)\]/;
-			$self{'product'} = $1;
-			$self{'organism'} = $2;
-		}
-		next if (/^#/);
-		@a = split (/\t/);
-		next if ($#a < 12);
-		next unless $a[12];
-		next if ($identity and $identity > $a[2]); # % identity cutoff
-		@b = split (/;/, $a[1]);
-		$hitid ++;
-		foreach (@b){
-			next unless (/\|$/);
-			@c = split (/\|/);
-			$c[3] =~ s/\.\d+$//;
-			my %hit = ('id', $hitid, 'accn', $c[3], 'expect', $a[11], 'score', $a[12], 'identity', $a[2], 'qlength', $a[8]-$a[7]+1);
-			$hits{$c[1]} = {%hit};
-		}
-	}
-
-	# retrieve taxonomy information #
-	my $i = 0; # count
-	@a = (); # all results
-	@b = (); # subset of GIs
-	foreach (keys %hits){
-		$i ++;
-		push (@b, $_);
-		if ($i == 190){ # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
-			sleep 1;
-			while (1){
-				$s = get $eSummaryServer."?db=protein&id=".join (",", @b);
-				last if (defined $s);
-				sleep $delay;
-			}
-			push (@a, $1) while ($s =~ s/<DocSum>(.+?)<\/DocSum>//s);
-			$i = 0; @b = ();
-		}
-	}
-	if (@b){
-		sleep 1;
-		while (1){
-			$s = get $eSummaryServer."?db=protein&id=".join (",", @b);
-			last if (defined $s);
-			sleep $delay;
-		}
-		push (@a, $1) while ($s =~ s/<DocSum>(.+?)<\/DocSum>//s);
-	}
-
-	foreach (@a){
-		/<Id>(\d+)<\/Id>/;
-		$s = $1;
-		/<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/;
-		$hits{$s}{'taxid'} = $1;
-		/<Item Name=\"Title\" Type=\"String\">.*\[([^\[\]]+?)\]<\/Item>/;
-		$hits{$s}{'organism'} = $1;
-		/<Item Name=\"Length\" Type=\"Integer\">(\d+)<\/Item>/;
-		$hits{$s}{'length'} = $1;
-	}
-
-	# discard hits whose taxonomy information is unidentified #
-	foreach (keys %hits){
-		delete $hits{$_} and next unless exists $hits{$_}{'taxid'};
-		delete $hits{$_} unless $hits{$_}{'taxid'};
-		delete $hits{$_} and next unless exists $hits{$_}{'organism'};
-		delete $hits{$_} unless $hits{$_}{'organism'};
-		delete $hits{$_} if ($hits{$_}{'organism'} =~ /^Unresolved/);
-		delete $hits{$_} if ($taxonUCASE and ($hits{$_}{'organism'} !~ /^[A-Z]/));
-	}
-
-	# discard hits whose organism names contain specified strings #
-	if (@ignoreTaxa){
-		foreach (keys %hits){
-			foreach $s (@ignoreTaxa){
-				if ($hits{$_}{'organism'} =~ /$s/){
-					delete $hits{$_};
-					last;
-				}
-			}
-		}
-	}
-
-	# merge identical proteins (defined by NCBI) from one organism #
-	$hitid = 0;
-	my ($taxid, $key) = (0, '');
-	foreach ( sort {$hits{$a}{'id'} <=> $hits{$b}{'id'}} keys %hits){
-		if ($hits{$_}{'id'} != $hitid){
-			$hitid = $hits{$_}{'id'};
-			$taxid = $hits{$_}{'taxid'};
-			$key = $_;
-		}else{
-			if ($hits{$_}{'taxid'} == $taxid){
-				$hits{$key}{'accn'} .= "/".$hits{$_}{'accn'};
-				delete $hits{$_};
-			}
-		}
-	}
-
-	# merge duplicated hits (proteins with same bit score from one organism) #
-	if ($mergeDuplicates){
-		my $score = 0;
-		($taxid, $key) = (0, '');
-		foreach ( sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'taxid'} <=> $hits{$b}{'taxid'})} keys %hits){
-			if (($hits{$_}{'score'} != $score) or ($hits{$_}{'taxid'} != $taxid)){
-				$score = $hits{$_}{'score'};
-				$taxid = $hits{$_}{'taxid'};
-				$key = $_;
-			}else{
-				$hits{$key}{'accn'} .= "/".$hits{$_}{'accn'};
-				delete $hits{$_};
-			}
-		}
-	}
-
-	# reorder accession numbers #
-	foreach (keys %hits){
-		$hits{$_}{'accn'} = order_accns ($hits{$_}{'accn'}, $query);
-	}
-
-	# retrieve taxonomy information from NCBI server #
-	$i = 0;
-	@a = ();
-	foreach (keys %hits){
-		next if exists $dbTaxa{$hits{$_}{'taxid'}};
-		push @a, $hits{$_}{'taxid'};
-		$i ++;
-		if ($i == 150){
-			sleep 1;
-			get_taxonomy @a;
-			$i = 0; @a = (); sleep $delay;
-		}
-	}
-	sleep 1;
-	get_taxonomy @a if @a;
-
-	# check whether BLAST result contains query itself #
-	my $isQueryIn = 0;
-	foreach (keys %hits){
-		@a = split (/\//, $hits{$_}{'accn'});
-		foreach $s (@a){ if ($query eq $s){ $isQueryIn = $hits{$_}{'id'}; last; }}
-		if ($isQueryIn){ $self{'length'} = $hits{$_}{'length'} unless $self{'length'}; last; }
-	}
-
-	# find out length of query sequence, in case not in previous steps #
-	unless ($self{'length'} or $self{'seq'}){
-		sleep 1;
-		while (1){
-			$s = get "$eSearchServer?db=protein&term=$query";
-			last if (defined $s);
-			sleep $delay;
-		}
-		$s =~ /<Id>(\d+)<\/Id>/;
-		sleep 1;
-		while (1){
-			$s = get "$eSummaryServer?db=protein&id=$1";
-			last if (defined $s);
-			sleep $delay;
-		}
-		$s =~ /<Item Name=\"Length\" Type=\"Integer\">(\d+)<\/Item>/;
-		$self{'length'} = $1;
-	}
-	next unless $self{'length'};
-	$hits{$_}{'coverage'} = sprintf("%.2f", $hits{$_}{'qlength'}/$self{'length'}*100) for (keys %hits);
-
-	# perform self search, in case not targetted in previous steps #
-	unless ($isQueryIn){
-		sleep 1;
-		@a = self_align ($query, '');
-		return "$query/$set/0/0" if (@a == (0,0,0,0));
-		my %hit = ('id', 0, 'accn', $query, 'expect', $a[0], 'score', $a[1], 'identity', $a[2], 'coverage', '100.00', 'taxid', $ins{$set}{'taxid'}, 'organism', $ins{$set}{'organism'});
-		if ($self{'length'}){ $hit{'length'} = $self{'length'}; }
-		elsif ($a[3]){ $hit{'length'} = $a[3]; }
-		else{ $hit{'length'} = 0; }
-		$hits{$query} = {%hit};
-	}
-
-	# in multiple hits from one species, only keep the hit with the highest bit score #
-	if ($ignoreSubspecies){
-		my %speciesDB = ();
-		foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
-			unless (exists $hits{$_}{'taxid'} and exists $dbTaxa{$hits{$_}{'taxid'}} and exists $dbTaxa{$hits{$_}{'taxid'}}{'species'} and $dbTaxa{$hits{$_}{'taxid'}}{'species'}){
-				delete $hits{$_}; next;
-			}
-			$s = $dbTaxa{$hits{$_}{'taxid'}}{'species'};
-			unless (exists $speciesDB{$s}){
-				$speciesDB{$s} = $dbRanks{$s};
-			}else{
-				next if ($hits{$_}{'taxid'} == $ins{$set}{'taxid'});
-				next if (($hits{$_}{'accn'} eq $query) or ($hits{$_}{'accn'} =~ /^$query\//) or ($hits{$_}{'accn'} =~ /\/$query$/) or ($hits{$_}{'accn'} =~ /\/$query\//));
-				delete $hits{$_};
-			}
-		}
-	}
-
-	# remove excessive hits #
-	if ($maxHits){
-		my $i = 0;
-		foreach (sort {$hits{$b}{'score'} <=> $hits{$a}{'score'}} keys %hits){
-			$i ++;
-			delete $hits{$_} if ($i > $maxHits);
-		}
-	}
-
-	# create output file #
-	open (OUT, ">$wkDir/search/$set/$query.txt");
-
-	print OUT "#NEXUS\nBEGIN QUERY;\n";
-	if ($self{'accn'}){
-		print OUT "\tGI=".$self{'gi'}.";\n\tAccession=".$self{'accn'}.";\n";
-	}else{
-		print OUT "\tName=".$self{'name'}.";\n";
-	}
-	print OUT "\tLength=".$self{'length'}.";\n";
-	if ($self{'product'}){
-		print OUT "\tProduct=".$self{'product'}.";\n";
-	}
-	print OUT "\tOrganism=".$ins{$set}{'organism'}.";\n";
-	print OUT "END;\n\n";
-
-	# retrieve taxonomy report (using TaxBLAST)
-	if ($taxBlast){
-		sleep 1;
-		$s = get "$blastServer?CMD=Get&FORMAT_TYPE=HTML&FORMAT_OBJECT=TaxBlast&RID=$rid&ALIGNMENTS=100000";
-		if (($s !~ /Tax BLAST Report/) or ($s !~ /Lineage Report/)){
-			print OUT "BEGIN ERROR;\nRetrieval of taxonomy report failed.\n;\nEND;\n\n";
-		}else{
-			my $reading = 0;
-			foreach (split(/\n/, $s)){
-				if ($_ eq "<B><A NAME=lineage>Lineage Report</A></B><BR>"){
-					print OUT "BEGIN LINEAGE;\n";
-					$reading = 1; next;
-				}
-				if (($_ eq "</FONT></PRE><HR>") and $reading){
-					print OUT ";\nEND;\n\n";
-					$reading = 0; next;
-				}
-				if ($reading){
-					s/\<.*?>//g;
-					s/( hit[ s] \[.*?\])(.*)$/$1/;
-					if (@ignoreTaxa){
-						$i = 0;
-						foreach $s (@ignoreTaxa){
-							if (/$s/){ $i = 1; last; }
-						}
-						next if $i;
-					}
-					print OUT "$_\n";
-					next;
-				}
-			}
-		}
-	}
-
-	# output hit table #
-
-	print OUT "BEGIN ORGANISM;\n";
-	print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
-	foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
-		print OUT $hits{$_}{'accn'}."\t".$hits{$_}{'organism'}."\t".$hits{$_}{'taxid'}."\t".$hits{$_}{'score'}."\t".$hits{$_}{'expect'}."\t".$hits{$_}{'identity'}."\t".$hits{$_}{'coverage'}."\n";
-	}
-	print OUT ";\nEND;\n\n";
-
-	# retrieve hit sequences #
-	if ($seqBlast){
-		my $i = 0; # count
-		my $allinfo = "";
-		@b = (); # subset of GIs
-		foreach (keys %hits){
-			$i ++;
-			push (@b, $_);
-			if ($i == 190){ # a URI should not exceed ~2000 characters, which is approximately 190 GIs.
-				sleep 1;
-				while (1){
-					$s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
-					last if (defined $s);
-					sleep $delay;
-				}
-				$allinfo .= $s;
-				$i = 0; @b = ();
-			}
-		}
-		if (@b){
-			sleep 1;
-			while (1){
-				$s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
-				last if (defined $s);
-				sleep $delay;
-			}
-			$allinfo .= $s;
-		}
-		$i = 0; # current GI
-		foreach (split (/\n/, $allinfo)){
-			next unless $_;
-			if (/^>gi\|(\d+)\|/){
-				$i = $1;
-				$hits{$i}{'sequence'} = "";
-			}else{
-				$hits{$i}{'sequence'} .= $_ if $i;
-			}
-		}
-	}
-
-	# retrieve multiple sequence alignment (conflicts seqBlast)
-	if ($alnBlast and not $seqBlast){
-		sleep 1;
-		$s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=FlatQueryAnchoredNoIdentities&FORMAT_TYPE=Text&RID=$rid&ALIGNMENTS=100000";
-		if (($s !~ /blastp/i) or ($s !~ /\nQuery=\s/)){
-			print OUT "BEGIN ERROR;\nRetrieval of multiple sequence alignment failed.\n;\nEND;\n\n";
-		}else{
-			@c = split(/\n/, $s);
-			my $reading = 0; # reading status
-			my $iBlock = -1; # block ID
-			my %seqs; # sequence alignment
-			foreach (@c){
-				if ($_ eq "ALIGNMENTS"){ $reading = 1; next; }
-				if ($reading and /^\s/){ $reading = 0; last; }
-				next unless $reading;
-				unless ($_){ $iBlock ++; next; } # Start a new block if empty line
-				@a = split(/\s+/,substr($_,0,19)); # id
-				#next if ($a[0] eq "Query");
-				$_ =~ /(\s\s\d*$)/;
-				$s = substr($_,19,length($_)-19-length($1)); # sequence
-				$s =~ s/\s/-/g;
-				$seqs{$a[0]} = "-"x($iBlock*60) unless (exists $seqs{$a[0]}); # add new sequence
-				$seqs{$a[0]} .= $s; # add new sequence
-			}
-			$i = 0;
-			foreach $s(keys %seqs){
-				$i = length($seqs{$s}) if (length($seqs{$s}) > $i);
-			}
-			foreach $s(keys %seqs){
-				$seqs{$s} .= "-"x($i - length($seqs{$s})) if (length($seqs{$s}) < $i);
-			}
-			foreach my $hit (keys %hits){
-				@a = split /\//, $hits{$hit}{'accn'};
-				foreach (@a){
-					if (exists $seqs{$_}){
-						$hits{$hit}{'sequence'} = $seqs{$_};
-						last;
-					}
-				}
-			}
-		}
-	}
-
-	# output sequences #
-	if ($seqBlast or $alnBlast){
-		my $ntax = 0; # number of sequences
-		my $nchar = 0; # maximum length of sequence
-		foreach (keys %hits){
-			next unless exists $hits{$_}{'sequence'};
-			$ntax ++;
-			$nchar = length ($hits{$_}{'sequence'}) if (length ($hits{$_}{'sequence'}) > $nchar);
-		}
-		print OUT "BEGIN DATA;\n";
-		print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
-		foreach (sort {($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits){
-			next unless exists $hits{$_}{'sequence'};
-			@a = split (/\//, $hits{$_}{'accn'});
-			print OUT $a[0]."\t".$hits{$_}{'sequence'}."\n";
-		}		
-		print OUT ";\nEND;\n\n";
-	}
-	close OUT;
-	return "$query/$set/".scalar(keys %hits)."/1";
+    my ($set, $id) = ($_[0], $_[1]);
+
+    # generate query information
+    my %self = %{$ins{$set}{'prots'}[$id]}; # copy the whole record
+    $self{'length'} = 0; # need to know length
+    $self{'length'} = length($self{'seq'}) if $self{'seq'}; # ideally, sequence is available, otherwise see below
+    my $query = $self{'name'};
+
+    # send BLAST request #
+    my $isError = 0;
+    my $url = "$blastServer?CMD=Put&PROGRAM=blastp&DATABASE=$httpDb&FILTER=m%20S";
+    $url .= "&EXPECT=$evalue" if $evalue;
+    $url .= "&MAX_NUM_SEQ=$nHits" if ($nHits != 100);
+    $url .= "&EQ_TEXT=$eqText" if $eqText;
+    my $querykey = $self{'name'};
+    $querykey = $self{'seq'} if $self{'seq'}; # user-defined query sequence
+    $url .= "&QUERY=$querykey";
+    $s = get $url;
+
+    # $url = substr($args,0,2048) if (length($args) > 2048); # In some situations, the max size of a URL is 2048 bytes
+    # @a = localtime(time); print "Post: $a[2]:$a[1]:$a[0], ";
+    my $starttime = time;
+
+    my $rid;
+    # @a = localtime(time); print "Respond: $a[2]:$a[1]:$a[0], ";
+    if ($s =~ /^    RID = (.*$)/m) { $rid = $1; print "RID=$rid "; }
+    else{ return "$query/$set/0/0"; } ##########################
+    if ($s =~ /\s\((\d+) letters\)/) { $self{'length'} = $1; }
+    if ($s =~ /^    RTOE = (.*$)/m) {
+        my $rtoe = $1;
+        # print "RTOE=$rtoe, ";
+        if ($rtoe and $rtoe =~ /^\d+$/) { sleep $rtoe; } else { sleep $delay; }
+    } else { return "$query/$set/0/0"; }
+    while (1) {
+        sleep 1;
+        $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+        if ($s =~ /\s+Status=([A-Z]+)/m) {
+            if ($1 eq "WAITING") {
+                if (time - $starttime > $timeout) {
+                    $isError = 1; last;
+                } else {
+                    print ". "; sleep $delay; next;
+                }
+            }
+            elsif ($1 eq "FAILED") { $isError = 1; last; }
+            elsif ($1 eq "UNKNOWN") { $isError = 1; last; }
+            elsif ($1 eq "READY") { @a = localtime(time); last; }
+            else{ $isError = 1; last; }
+        } else {
+            $isError = 1; last;
+        }
+
+        # if ($s =~ /\s+Status=WAITING/m) { next; }
+        # if ($s =~ /\s+Status=FAILED/m) { $isError = 1; last; }
+        # if ($s =~ /\s+Status=UNKNOWN/m) { $isError = 1; last; }
+        # if ($s =~ /\s+Status=READY/m) {
+        #     if ($s =~ /\s+ThereAreHits=yes/m) { last; }
+        #     else{ last; } # no hits;
+        # }
+    }
+    if ($isError) { return "$query/$set/0/0"; }
+
+    # retrieve tabular report #
+    sleep 1;
+    $url = "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+    $url .= "&MAX_NUM_SEQ=$nHits&DESCRIPTIONS=$nHits" if ($nHits != 100);
+    $s = get $url;
+    if (($s !~ /# blastp/) or ($s !~ /# Query:\s/)) { return "$query/$set/0/0"; }
+    my %hits = ();
+    my $hitIdx = 0;
+    foreach (split(/\n/, $s)) {
+        if (/^#/) {
+            if (s/^# Query: // and not $self{'seq'}) {
+                # s/>.*//;
+                @a = split(/\s+/, $_, 2);
+                $self{'title'} = $a[1] if $#a;
+            }
+        } else {
+            # fields: query id, subject ids, % identity, % positives, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
+            @a = split (/\t/);
+            next if ($#a < 12);
+            next if $identity and $identity > $a[2];  # % identity cutoff
+            $hitIdx ++;
+            foreach (split (/;/, $a[1])) {
+                next unless /\|$/;
+                # as of 2017, the NCBI BLAST server still prints sequence ids like "gi|123456789|ref|NP_123456.1|"
+                my $gi = (split(/\|/))[1];
+                my $name = (split(/\|/))[-1];
+                # length of aligned region of query sequence = q. end - q. start + 1
+                my $qlen = $a[8] - $a[7] + 1;
+                my %hit = ('id'=>$hitIdx, 'name'=>$name, 'expect'=>$a[11], 'score'=>$a[12], 'identity'=>$a[2], 'qlen'=>$qlen);
+                $hits{$gi} = {%hit};
+            }
+        }
+    }
+
+    # retrieve taxonomy information #
+    my $i = 0; # count
+    @a = (); # all results
+    @b = (); # subset of names
+    foreach (keys %hits) {
+        $i ++;
+        push (@b, $_);
+        if ($i == 190) { # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
+            sleep 1;
+            while (1) {
+                $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
+                last if (defined $s);
+                sleep $delay;
+            }
+            push (@a, $1) while ($s =~ s/<DocSum>(.+?)<\/DocSum>//s);
+            $i = 0; @b = ();
+        }
+    }
+    if (@b) {
+        sleep 1;
+        while (1) {
+            $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
+            last if (defined $s);
+            sleep $delay;
+        }
+        push (@a, $1) while ($s =~ s/<DocSum>(.+?)<\/DocSum>//s);
+    }
+
+    foreach (@a) {
+        # as of 2017, this Id is still the GI number
+        /<Id>(\d+)<\/Id>/;
+        $s = $1;
+        /<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/;
+        $hits{$s}{'taxid'} = $1;
+        /<Item Name=\"Title\" Type=\"String\">.*\[([^\[\]]+?)\]<\/Item>/;
+        $hits{$s}{'organism'} = $1;
+        /<Item Name=\"Length\" Type=\"Integer\">(\d+)<\/Item>/;
+        $hits{$s}{'length'} = $1;
+    }
+
+    # discard hits whose taxonomy information is unidentified #
+    foreach (keys %hits) {
+        delete $hits{$_} and next unless exists $hits{$_}{'taxid'};
+        delete $hits{$_} unless $hits{$_}{'taxid'};
+        delete $hits{$_} and next unless exists $hits{$_}{'organism'};
+        delete $hits{$_} unless $hits{$_}{'organism'};
+        delete $hits{$_} if ($hits{$_}{'organism'} =~ /^Unresolved/);
+        delete $hits{$_} if ($taxonUCASE and ($hits{$_}{'organism'} !~ /^[A-Z]/));
+    }
+
+    # discard hits whose organism names contain specified strings #
+    if (@ignoreTaxa) {
+        foreach (keys %hits) {
+            foreach $s (@ignoreTaxa) {
+                if ($hits{$_}{'organism'} =~ /$s/) {
+                    delete $hits{$_};
+                    last;
+                }
+            }
+        }
+    }
+
+    # merge identical proteins (defined by NCBI) from one organism #
+    $hitIdx = 0;
+    my ($taxid, $key) = (0, '');
+    foreach ( sort {$hits{$a}{'id'} <=> $hits{$b}{'id'}} keys %hits) {
+        if ($hits{$_}{'id'} != $hitIdx) {
+            $hitIdx = $hits{$_}{'id'};
+            $taxid = $hits{$_}{'taxid'};
+            $key = $_;
+        } else {
+            if ($hits{$_}{'taxid'} == $taxid) {
+                $hits{$key}{'name'} .= "/".$hits{$_}{'name'};
+                delete $hits{$_};
+            }
+        }
+    }
+
+    # merge duplicated hits (proteins with same bit score from one organism) #
+    if ($mergeDuplicates) {
+        my $score = 0;
+        ($taxid, $key) = (0, '');
+        foreach ( sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'taxid'} <=> $hits{$b}{'taxid'})} keys %hits) {
+            if (($hits{$_}{'score'} != $score) or ($hits{$_}{'taxid'} != $taxid)) {
+                $score = $hits{$_}{'score'};
+                $taxid = $hits{$_}{'taxid'};
+                $key = $_;
+            } else {
+                $hits{$key}{'name'} .= "/".$hits{$_}{'name'};
+                delete $hits{$_};
+            }
+        }
+    }
+
+    # reorder accession numbers #
+    foreach (keys %hits) {
+        $hits{$_}{'name'} = order_accns ($hits{$_}{'name'}, $query);
+    }
+
+    # retrieve taxonomy information from NCBI server #
+    $i = 0;
+    @a = ();
+    foreach (keys %hits) {
+        next if exists $dbTaxa{$hits{$_}{'taxid'}};
+        push @a, $hits{$_}{'taxid'};
+        $i ++;
+        if ($i == 150) {
+            sleep 1;
+            get_taxonomy @a;
+            $i = 0; @a = (); sleep $delay;
+        }
+    }
+    sleep 1;
+    get_taxonomy @a if @a;
+
+    # check whether BLAST result contains query itself #
+    my $isQueryIn = 0;
+    foreach (keys %hits) {
+        @a = split (/\//, $hits{$_}{'name'});
+        foreach $s (@a) { if ($query eq $s) { $isQueryIn = $hits{$_}{'id'}; last; }}
+        if ($isQueryIn) { $self{'length'} = $hits{$_}{'length'} unless $self{'length'}; last; }
+    }
+
+    # find out length of query sequence, in case not in previous steps #
+    unless ($self{'length'} or $self{'seq'}) {
+        sleep 1;
+        while (1) {
+            $s = get "$eSearchServer?db=protein&term=$query";
+            last if (defined $s);
+            sleep $delay;
+        }
+        # as of 2017, this Id is still the GI number
+        $s =~ /<Id>(\d+)<\/Id>/;
+        sleep 1;
+        while (1) {
+            $s = get "$eSummaryServer?db=protein&id=$1";
+            last if (defined $s);
+            sleep $delay;
+        }
+        $s =~ /<Item Name=\"Length\" Type=\"Integer\">(\d+)<\/Item>/;
+        $self{'length'} = $1;
+    }
+    next unless $self{'length'};
+    $hits{$_}{'coverage'} = sprintf("%.2f", $hits{$_}{'qlen'}/$self{'length'}*100) for (keys %hits);
+
+    # perform self search, in case not targetted in previous steps #
+    unless ($isQueryIn) {
+        sleep 1;
+        @a = self_align ($query, '');
+        return "$query/$set/0/0" if (@a == (0,0,0,0));
+        my %hit = ('id', 0, 'name', $query, 'expect', $a[0], 'score', $a[1], 'identity', $a[2], 'coverage', '100.00', 'taxid', $ins{$set}{'taxid'}, 'organism', $ins{$set}{'organism'});
+        if ($self{'length'}) { $hit{'length'} = $self{'length'}; }
+        elsif ($a[3]) { $hit{'length'} = $a[3]; }
+        else{ $hit{'length'} = 0; }
+        $hits{$query} = {%hit};
+    }
+
+    # in multiple hits from one species, only keep the hit with the highest bit score #
+    if ($ignoreSubspecies) {
+        my %speciesDB = ();
+        foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+            unless (exists $hits{$_}{'taxid'} and exists $dbTaxa{$hits{$_}{'taxid'}} and exists $dbTaxa{$hits{$_}{'taxid'}}{'species'} and $dbTaxa{$hits{$_}{'taxid'}}{'species'}) {
+                delete $hits{$_}; next;
+            }
+            $s = $dbTaxa{$hits{$_}{'taxid'}}{'species'};
+            unless (exists $speciesDB{$s}) {
+                $speciesDB{$s} = $dbRanks{$s};
+            } else {
+                next if ($hits{$_}{'taxid'} == $ins{$set}{'taxid'});
+                next if (($hits{$_}{'name'} eq $query) or ($hits{$_}{'name'} =~ /^$query\//) or ($hits{$_}{'name'} =~ /\/$query$/) or ($hits{$_}{'name'} =~ /\/$query\//));
+                delete $hits{$_};
+            }
+        }
+    }
+
+    # remove excessive hits #
+    if ($maxHits) {
+        my $i = 0;
+        foreach (sort {$hits{$b}{'score'} <=> $hits{$a}{'score'}} keys %hits) {
+            $i ++;
+            delete $hits{$_} if ($i > $maxHits);
+        }
+    }
+
+    # create output file #
+    open (OUT, ">$wkDir/search/$set/$query.txt");
+
+    print OUT "#NEXUS\nBEGIN QUERY;\n";
+    print OUT "\tName=".$self{'name'}.";\n";
+    print OUT "\tLength=".$self{'length'}.";\n";
+    print OUT "\tTitle=".$self{'title'}.";\n";
+    print OUT "END;\n\n";
+
+    # retrieve taxonomy report (using TaxBLAST)
+    if ($taxBlast) {
+        sleep 1;
+        $s = get "$blastServer?CMD=Get&FORMAT_TYPE=HTML&FORMAT_OBJECT=TaxBlast&RID=$rid&ALIGNMENTS=100000";
+        if (($s !~ /Tax BLAST Report/) or ($s !~ /Lineage Report/)) {
+            print OUT "BEGIN ERROR;\nRetrieval of taxonomy report failed.\n;\nEND;\n\n";
+        } else {
+            my $reading = 0;
+            foreach (split(/\n/, $s)) {
+                if ($_ eq "<B><A NAME=lineage>Lineage Report</A></B><BR>") {
+                    print OUT "BEGIN LINEAGE;\n";
+                    $reading = 1; next;
+                }
+                if (($_ eq "</FONT></PRE><HR>") and $reading) {
+                    print OUT ";\nEND;\n\n";
+                    $reading = 0; next;
+                }
+                if ($reading) {
+                    s/\<.*?>//g;
+                    s/( hit[ s] \[.*?\])(.*)$/$1/;
+                    if (@ignoreTaxa) {
+                        $i = 0;
+                        foreach $s (@ignoreTaxa) {
+                            if (/$s/) { $i = 1; last; }
+                        }
+                        next if $i;
+                    }
+                    print OUT "$_\n";
+                    next;
+                }
+            }
+        }
+    }
+
+    # output hit table #
+
+    print OUT "BEGIN ORGANISM;\n";
+    print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
+    foreach (sort {($hits{$b}{'score'} <=> $hits{$a}{'score'}) or ($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+        print OUT $hits{$_}{'name'}."\t".$hits{$_}{'organism'}."\t".$hits{$_}{'taxid'}."\t".$hits{$_}{'score'}."\t".$hits{$_}{'expect'}."\t".$hits{$_}{'identity'}."\t".$hits{$_}{'coverage'}."\n";
+    }
+    print OUT ";\nEND;\n\n";
+
+    # retrieve hit sequences #
+    if ($seqBlast) {
+        my $i = 0; # count
+        my $allinfo = "";
+        @b = (); # subset of GIs
+        foreach (keys %hits) {
+            $i ++;
+            push (@b, $_);
+            if ($i == 190) { # a URI should not exceed ~2000 characters, which is approximately 190 GIs.
+                sleep 1;
+                while (1) {
+                    $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
+                    last if (defined $s);
+                    sleep $delay;
+                }
+                $allinfo .= $s;
+                $i = 0; @b = ();
+            }
+        }
+        if (@b) {
+            sleep 1;
+            while (1) {
+                $s = get $eFetchServer."?db=protein&rettype=FASTA&id=".join (",", @b);
+                last if (defined $s);
+                sleep $delay;
+            }
+            $allinfo .= $s;
+        }
+        $i = 0; # current GI
+        foreach (split (/\n/, $allinfo)) {
+            next unless $_;
+            if (/^>gi\|(\d+)\|/) {
+                $i = $1;
+                $hits{$i}{'sequence'} = "";
+            } else {
+                $hits{$i}{'sequence'} .= $_ if $i;
+            }
+        }
+    }
+
+    # retrieve multiple sequence alignment (conflicts seqBlast)
+    if ($alnBlast and not $seqBlast) {
+        sleep 1;
+        $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=FlatQueryAnchoredNoIdentities&FORMAT_TYPE=Text&RID=$rid&ALIGNMENTS=100000";
+        if (($s !~ /blastp/i) or ($s !~ /\nQuery=\s/)) {
+            print OUT "BEGIN ERROR;\nRetrieval of multiple sequence alignment failed.\n;\nEND;\n\n";
+        } else {
+            @c = split(/\n/, $s);
+            my $reading = 0; # reading status
+            my $iBlock = -1; # block ID
+            my %seqs; # sequence alignment
+            foreach (@c) {
+                if ($_ eq "ALIGNMENTS") { $reading = 1; next; }
+                if ($reading and /^\s/) { $reading = 0; last; }
+                next unless $reading;
+                unless ($_) { $iBlock ++; next; } # Start a new block if empty line
+                @a = split(/\s+/,substr($_,0,19)); # id
+                #next if ($a[0] eq "Query");
+                $_ =~ /(\s\s\d*$)/;
+                $s = substr($_,19,length($_)-19-length($1)); # sequence
+                $s =~ s/\s/-/g;
+                $seqs{$a[0]} = "-"x($iBlock*60) unless (exists $seqs{$a[0]}); # add new sequence
+                $seqs{$a[0]} .= $s; # add new sequence
+            }
+            $i = 0;
+            foreach $s(keys %seqs) {
+                $i = length($seqs{$s}) if (length($seqs{$s}) > $i);
+            }
+            foreach $s(keys %seqs) {
+                $seqs{$s} .= "-"x($i - length($seqs{$s})) if (length($seqs{$s}) < $i);
+            }
+            foreach my $hit (keys %hits) {
+                @a = split /\//, $hits{$hit}{'name'};
+                foreach (@a) {
+                    if (exists $seqs{$_}) {
+                        $hits{$hit}{'sequence'} = $seqs{$_};
+                        last;
+                    }
+                }
+            }
+        }
+    }
+
+    # output sequences #
+    if ($seqBlast or $alnBlast) {
+        my $ntax = 0; # number of sequences
+        my $nchar = 0; # maximum length of sequence
+        foreach (keys %hits) {
+            next unless exists $hits{$_}{'sequence'};
+            $ntax ++;
+            $nchar = length ($hits{$_}{'sequence'}) if (length ($hits{$_}{'sequence'}) > $nchar);
+        }
+        print OUT "BEGIN DATA;\n";
+        print OUT "\tDIMENSIONS NTAX=$ntax NCHAR=$nchar;\n\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n\tMATRIX\n";
+        foreach (sort {($hits{$a}{'id'} <=> $hits{$b}{'id'})} keys %hits) {
+            next unless exists $hits{$_}{'sequence'};
+            @a = split (/\//, $hits{$_}{'name'});
+            print OUT $a[0]."\t".$hits{$_}{'sequence'}."\n";
+        }
+        print OUT ";\nEND;\n\n";
+    }
+    close OUT;
+    return "$query/$set/".scalar(keys %hits)."/1";
 }
 
 
@@ -1589,91 +1556,91 @@ sub http_blast{
   # return (0, 0, 0, 0) if failed
 
 sub self_align {
-	my ($name, $seq, $length) = ($_[0], $_[1], 0);
-	$length = length($seq) if $seq;
-	unless ($httpBlast){ # local mode
-		my $fail = 0;
-		my @result = ();
-		open OUT, ">$wkDir/tmp.in";
-		print OUT ">$name\n".$seq."\n";
-		close OUT;
-		if ($searchTool eq "BLAST"){
-			my @out = `$blastp -query $wkDir/tmp.in -subject $wkDir/tmp.in -outfmt \"6 evalue bitscore pident\"`;
-			unlink "$wkDir/tmp.in";
-			@a = split (/\t/, $out[0]);
-			if ($#a == 2){
-				$a[2] =~ s/\s+$//;
-				@result = ($a[0], $a[1], $a[2], $length);
-			}else{ $fail = 1; }
-		}elsif ($searchTool eq "RAPSearch2"){
-			`$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
-			`$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp`;
-			unlink "$wkDir/tmp.in";
-			unlink "$wkDir/raptmp";
-			unlink "$wkDir/raptmp.info";
-			if (-s "$wkDir/tmp.m8"){
-				open IN, "<$wkDir/tmp.m8";
-				while (<IN>){
-					s/\s+$//; next unless $_; next if /^#/;
-					@a = split (/\t/);
-					if ($#a < 11){ $fail = 1; last; }
-					@result = ($a[10], $a[11], $a[2], $length);
-					last;
-				}
-				close IN;
-				unlink "$wkDir/tmp.aln";
-				unlink "$wkDir/tmp.m8";
-			}else{ $fail = 1; }
-
-		}elsif ($searchTool eq "DIAMOND"){
-			`$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
-			`$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
-			my @out = `$diamond view -a $wkDir/tmp.daa`;
-			unlink "$wkDir/tmp.in";
-			unlink "$wkDir/tmp.dmnd";
-			unlink "$wkDir/tmp.daa";
-			if (@out){
-				@a = split (/\t/, $out[0]);
-				if ($#a == 11){
-					$a[11] =~ s/\s+$//;
-					@result = ($a[10], $a[11], $a[2], $length);
-				}else{ $fail = 1; }
-			}else{ $fail = 1; }
-		}else{ $fail = 1; }
-		if ($fail){
-			unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
-			return (0,0,0,0);
-		}else{ return @result; }
-
-	}else{ # http mode
-		my $isError = 0;
-		if ($seq){ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=".$seq."&SUBJECTS=".$seq; }
-		else{ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
-		return (0,0,0,0) unless (defined $s and $s =~ /^    RID = (.*$)/m);
-		my $rid = $1;
-		if ($s =~ /\s\((\d+) letters\)/ and not $length){ $length = $1; }
-		sleep 1;
-		while (1){
-			$s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
-			return (0,0,0,0) unless defined $s and $s =~ /\s+Status=(.+)/m;
-			if ($1 eq "WAITING"){ sleep $delay; next; }
-			if ($1 eq "FAILED" or $1 eq "UNKNOWN"){ $isError = 1; last; }
-			if ($1 eq "READY" and $s =~ /\s+ThereAreHits=yes/m){ last; }
-			$isError = 1; last;
-		}
-		return (0,0,0,0) if $isError;
-		sleep 1;
-		$s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
-		return (0,0,0,0) unless (defined $s and $s =~ /# blastp/ and $s =~ /# Query:\s/);
-		$s =~ /\<PRE\>(.*)\<\/PRE\>/s; $s = $1;
-		foreach (split(/\n/, $s)){
-			next unless $_;
-			next if (/^#/);
-			my @a = split (/\t/);
-			$a[12] =~ s/^\s+//;
-			return ($a[11], $a[12], $a[2], $length);
-		}
-	}
+    my ($name, $seq, $length) = ($_[0], $_[1], 0);
+    $length = length($seq) if $seq;
+    unless ($httpBlast) { # local mode
+        my $fail = 0;
+        my @result = ();
+        open OUT, ">$wkDir/tmp.in";
+        print OUT ">$name\n".$seq."\n";
+        close OUT;
+        if ($searchTool eq "BLAST") {
+            my @out = `$blastp -query $wkDir/tmp.in -subject $wkDir/tmp.in -outfmt \"6 evalue bitscore pident\"`;
+            unlink "$wkDir/tmp.in";
+            @a = split (/\t/, $out[0]);
+            if ($#a == 2) {
+                $a[2] =~ s/\s+$//;
+                @result = ($a[0], $a[1], $a[2], $length);
+            } else { $fail = 1; }
+        } elsif ($searchTool eq "RAPSearch2") {
+            `$prerapsearch -d $wkDir/tmp.in -n $wkDir/raptmp`;
+            `$rapsearch -q $wkDir/tmp.in -d $wkDir/raptmp -t a -s f -o $wkDir/tmp`;
+            unlink "$wkDir/tmp.in";
+            unlink "$wkDir/raptmp";
+            unlink "$wkDir/raptmp.info";
+            if (-s "$wkDir/tmp.m8") {
+                open IN, "<$wkDir/tmp.m8";
+                while (<IN>) {
+                    s/\s+$//; next unless $_; next if /^#/;
+                    @a = split (/\t/);
+                    if ($#a < 11) { $fail = 1; last; }
+                    @result = ($a[10], $a[11], $a[2], $length);
+                    last;
+                }
+                close IN;
+                unlink "$wkDir/tmp.aln";
+                unlink "$wkDir/tmp.m8";
+            } else { $fail = 1; }
+
+        } elsif ($searchTool eq "DIAMOND") {
+            `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
+            `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
+            my @out = `$diamond view -a $wkDir/tmp.daa`;
+            unlink "$wkDir/tmp.in";
+            unlink "$wkDir/tmp.dmnd";
+            unlink "$wkDir/tmp.daa";
+            if (@out) {
+                @a = split (/\t/, $out[0]);
+                if ($#a == 11) {
+                    $a[11] =~ s/\s+$//;
+                    @result = ($a[10], $a[11], $a[2], $length);
+                } else { $fail = 1; }
+            } else { $fail = 1; }
+        } else { $fail = 1; }
+        if ($fail) {
+            unlink "$wkDir/tmp.in" if -e "$wkDir/tmp.in";
+            return (0,0,0,0);
+        } else { return @result; }
+
+    } else { # http mode
+        my $isError = 0;
+        if ($seq) { $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=".$seq."&SUBJECTS=".$seq; }
+        else{ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
+        return (0,0,0,0) unless (defined $s and $s =~ /^    RID = (.*$)/m);
+        my $rid = $1;
+        if ($s =~ /\s\((\d+) letters\)/ and not $length) { $length = $1; }
+        sleep 1;
+        while (1) {
+            $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
+            return (0,0,0,0) unless defined $s and $s =~ /\s+Status=(.+)/m;
+            if ($1 eq "WAITING") { sleep $delay; next; }
+            if ($1 eq "FAILED" or $1 eq "UNKNOWN") { $isError = 1; last; }
+            if ($1 eq "READY" and $s =~ /\s+ThereAreHits=yes/m) { last; }
+            $isError = 1; last;
+        }
+        return (0,0,0,0) if $isError;
+        sleep 1;
+        $s = get "$blastServer?CMD=Get&ALIGNMENT_VIEW=Tabular&FORMAT_TYPE=Text&RID=$rid";
+        return (0,0,0,0) unless (defined $s and $s =~ /# blastp/ and $s =~ /# Query:\s/);
+        $s =~ /\<PRE\>(.*)\<\/PRE\>/s; $s = $1;
+        foreach (split(/\n/, $s)) {
+            next unless $_;
+            next if (/^#/);
+            my @a = split (/\t/);
+            $a[12] =~ s/^\s+//;
+            return ($a[11], $a[12], $a[2], $length);
+        }
+    }
 }
 
 
@@ -1683,181 +1650,156 @@ sub self_align {
   # other: write global variable %dbTaxa and %dbRanks, write file taxa.db and ranks.db
 
 sub get_taxonomy{
-	my @organisms = ();
-	my %taxa2w = (); # taxa to write
-	my %ranks2w = (); # ranks to write
-	my %hRanks = map { $_ => 1 } @ranks;
-
-	unless ($httpBlast){ # local taxonomy database
-		foreach my $taxid (@_){
-			if (exists $dbTaxa{$taxid}){
-				push (@organisms, $dbTaxa{$taxid}{'organism'});
-				next;
-			}
-			if (exists $taxdumps{$taxid}){
-				my $id = $taxid;
-				my $pid = $taxdumps{$taxid}{'parent'};
-				my $name = $taxdumps{$taxid}{'name'};
-				my $rank = $taxdumps{$taxid}{'rank'};
-				push (@organisms, $name);
-				my %taxon :shared = ('organism', $name);
-				$taxon{$_} = "" for (@ranks);
-				$taxon{$rank} = $id if ($rank and exists $hRanks{$rank});
-				while (1){
-					last unless ($pid and $pid != $id);
-					$name = $taxdumps{$pid}{'name'};
-					$rank = $taxdumps{$pid}{'rank'};
-					unless (exists $dbRanks{$pid}){
-						$dbRanks{$pid} = $name;
-						$ranks2w{$pid} = $name;
-					}
-					$taxon{'lineage'} .= "/$pid";
-					$taxon{$rank} = $pid if (exists $hRanks{$rank});
-					$id = $pid;
-					$pid = $taxdumps{$id}{'parent'};
-				}
-				$dbTaxa{$taxid} = \%taxon;
-				$taxa2w{$taxid} = {%taxon};
-			}else{
-				push (@organisms, "na");
-				unless (exists $badTaxids{$taxid}){
-					print " Warning: Invalid TaxID: $taxid.\n";
-					$badTaxids{$taxid} = 1;
-					open OUT, ">>$wkDir/taxonomy/invalid.taxids";
-					print OUT $taxid."\n";
-					close OUT;
-				}
-			}
-		}
-
-	}else{ # remote taxonomy database
-		my $iRetry = 0;
-		my @taxids2get = ();
-		my %taxid2organism = ();
-		foreach (@_){
-			if (exists $dbTaxa{$_}){ $taxid2organism{$_} = $dbTaxa{$_}{'organism'}; }
-			else{ push (@taxids2get, $_) }
-		}
-		sleep 1;
-		while (1){
-			$s = get "$eFetchServer?db=taxonomy&id=".join (",", @taxids2get);
-			last if (defined $s);
-			die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
-			$iRetry ++; sleep $delay; next;
-		}
-		if ($s =~ /<ERROR>ID list is empty/){
-			print "Warning: Invalid TaxIDs:", join (",", @taxids2get), ".\n";
-		}else{
-			$s =~ s/<TaxaSet>//;
-			while ($s =~ s/\n<Taxon>\s+<TaxId>(\d+)<\/TaxId>\s+<ScientificName>(.+?)<\/ScientificName>(.+?)\n<\/Taxon>//s){
-				my $id = $1; my $t = $3;
-				$taxid2organism{$id} = $2;
-				my %taxon :shared = ('organism', $2);
-				$taxon{$_} = "" for (@ranks);
-				while ($t =~ s/<Taxon>\s+<TaxId>(\d+)<\/TaxId>\s+<ScientificName>(.+?)<\/ScientificName>\s+<Rank>(.+?)<\/Rank>\s+<\/Taxon>//s){
-					unless (exists $dbRanks{$1}){
-						$dbRanks{$1} = $2;
-						$ranks2w{$1} = $2;
-					}
-					$taxon{'lineage'} .= "/$1";
-					$taxon{$3} = $1 if (exists $hRanks{$3});
-				}
-				if ($t =~ /<\/ParentTaxId>\s+<Rank>(\S+)<\/Rank>/){ # the taxon itself is a rank
-					foreach (@ranks){
-						next if $taxon{$_};
-						if ($1 eq $_){ $taxon{$_} = $id; last; }
-					}
-				}
-				$dbTaxa{$id} = \%taxon;
-				$taxa2w{$id} = {%taxon};
-			}
-		}
-		foreach (@_){
-			if (exists $taxid2organism{$_}){
-				push (@organisms, $taxid2organism{$_});
-			}else{
-				push (@organisms, "na");
-				print "Warning: Invalid TaxID: $_.\n";
-				unless (exists $badTaxids{$_}){
-					$badTaxids{$_} = 1;
-					open OUT, ">>$wkDir/taxonomy/invalid.taxids";
-					print OUT $_."\n";
-					close OUT;
-				}
-			}
-		}
-	}
-	if (%taxa2w){
-		open OUT, ">>$wkDir/taxonomy/taxa.db";
-		foreach my $id (keys %taxa2w){
-			print OUT $id."\t".$taxa2w{$id}{'organism'}."\t".$taxa2w{$id}{'lineage'};
-			print OUT "\t".$taxa2w{$id}{$_} for (@ranks);
-			print OUT "\n";
-		}
-		close OUT;
-	}
-	if (%ranks2w){
-		open OUT, ">>$wkDir/taxonomy/ranks.db";
-		foreach my $id (keys %ranks2w){
-			print OUT $id."\t".$ranks2w{$id}."\n";
-		}
-		close OUT;
-	}
-	return @organisms;
+    my @organisms = ();
+    my %taxa2w = (); # taxa to write
+    my %ranks2w = (); # ranks to write
+    my %hRanks = map { $_ => 1 } @ranks;
+
+    unless ($httpBlast) { # local taxonomy database
+        foreach my $taxid (@_) {
+            if (exists $dbTaxa{$taxid}) {
+                push (@organisms, $dbTaxa{$taxid}{'organism'});
+                next;
+            }
+            if (exists $taxdumps{$taxid}) {
+                my $id = $taxid;
+                my $pid = $taxdumps{$taxid}{'parent'};
+                my $name = $taxdumps{$taxid}{'name'};
+                my $rank = $taxdumps{$taxid}{'rank'};
+                push (@organisms, $name);
+                my %taxon :shared = ('organism', $name);
+                $taxon{$_} = "" for (@ranks);
+                $taxon{$rank} = $id if ($rank and exists $hRanks{$rank});
+                while (1) {
+                    last unless ($pid and $pid != $id);
+                    $name = $taxdumps{$pid}{'name'};
+                    $rank = $taxdumps{$pid}{'rank'};
+                    unless (exists $dbRanks{$pid}) {
+                        $dbRanks{$pid} = $name;
+                        $ranks2w{$pid} = $name;
+                    }
+                    $taxon{'lineage'} .= "/$pid";
+                    $taxon{$rank} = $pid if (exists $hRanks{$rank});
+                    $id = $pid;
+                    $pid = $taxdumps{$id}{'parent'};
+                }
+                $dbTaxa{$taxid} = \%taxon;
+                $taxa2w{$taxid} = {%taxon};
+            } else {
+                push (@organisms, "na");
+                unless (exists $badTaxids{$taxid}) {
+                    print " Warning: Invalid TaxID: $taxid.\n";
+                    $badTaxids{$taxid} = 1;
+                    open OUT, ">>$wkDir/taxonomy/invalid.taxids";
+                    print OUT $taxid."\n";
+                    close OUT;
+                }
+            }
+        }
+
+    } else { # remote taxonomy database
+        my $iRetry = 0;
+        my @taxids2get = ();
+        my %taxid2organism = ();
+        foreach (@_) {
+            if (exists $dbTaxa{$_}) { $taxid2organism{$_} = $dbTaxa{$_}{'organism'}; }
+            else{ push (@taxids2get, $_) }
+        }
+        sleep 1;
+        while (1) {
+            $s = get "$eFetchServer?db=taxonomy&id=".join (",", @taxids2get);
+            last if (defined $s);
+            die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
+            $iRetry ++; sleep $delay; next;
+        }
+        if ($s =~ /<ERROR>ID list is empty/) {
+            print "Warning: Invalid TaxIDs:", join (",", @taxids2get), ".\n";
+        } else {
+            $s =~ s/<TaxaSet>//;
+            while ($s =~ s/\n<Taxon>\s+<TaxId>(\d+)<\/TaxId>\s+<ScientificName>(.+?)<\/ScientificName>(.+?)\n<\/Taxon>//s) {
+                my $id = $1; my $t = $3;
+                $taxid2organism{$id} = $2;
+                my %taxon :shared = ('organism', $2);
+                $taxon{$_} = "" for (@ranks);
+                while ($t =~ s/<Taxon>\s+<TaxId>(\d+)<\/TaxId>\s+<ScientificName>(.+?)<\/ScientificName>\s+<Rank>(.+?)<\/Rank>\s+<\/Taxon>//s) {
+                    unless (exists $dbRanks{$1}) {
+                        $dbRanks{$1} = $2;
+                        $ranks2w{$1} = $2;
+                    }
+                    $taxon{'lineage'} .= "/$1";
+                    $taxon{$3} = $1 if (exists $hRanks{$3});
+                }
+                if ($t =~ /<\/ParentTaxId>\s+<Rank>(\S+)<\/Rank>/) { # the taxon itself is a rank
+                    foreach (@ranks) {
+                        next if $taxon{$_};
+                        if ($1 eq $_) { $taxon{$_} = $id; last; }
+                    }
+                }
+                $dbTaxa{$id} = \%taxon;
+                $taxa2w{$id} = {%taxon};
+            }
+        }
+        foreach (@_) {
+            if (exists $taxid2organism{$_}) {
+                push (@organisms, $taxid2organism{$_});
+            } else {
+                push (@organisms, "na");
+                print "Warning: Invalid TaxID: $_.\n";
+                unless (exists $badTaxids{$_}) {
+                    $badTaxids{$_} = 1;
+                    open OUT, ">>$wkDir/taxonomy/invalid.taxids";
+                    print OUT $_."\n";
+                    close OUT;
+                }
+            }
+        }
+    }
+    if (%taxa2w) {
+        open OUT, ">>$wkDir/taxonomy/taxa.db";
+        foreach my $id (keys %taxa2w) {
+            print OUT $id."\t".$taxa2w{$id}{'organism'}."\t".$taxa2w{$id}{'lineage'};
+            print OUT "\t".$taxa2w{$id}{$_} for (@ranks);
+            print OUT "\n";
+        }
+        close OUT;
+    }
+    if (%ranks2w) {
+        open OUT, ">>$wkDir/taxonomy/ranks.db";
+        foreach my $id (keys %ranks2w) {
+            print OUT $id."\t".$ranks2w{$id}."\n";
+        }
+        close OUT;
+    }
+    return @organisms;
 }
 
-## parse sequence title ##
-  # GenBank-style sequence titles have gi, accn, product and name (accn)
-  # plain titles have name and product
-
-sub seq_title {
-	my $title = $_[0];
-	my ($gi, $accn, $name, $product) = ('', '', '', '');
-	if ($title =~ /^gi\|(\d+)\|.+\|([A-Z0-9_]+)\.\d+\|(.*)$/){ # GenBank-style title
-		($gi, $accn, $name) = ($1, $2, $2);
-		if ($3){
-			$product = $3;
-			$product = $1 if ($product =~ /^(.+)\[/);
-			$product =~ s/^\s+|\s+$//g;
-		}
-	}else{ # user-defined name
-		$title =~ s/^\s+|\s+$//g;
-		if ($title =~ /^(\S+)\s+(.+)$/){
-			($name, $product) = ($1, $2);
-		}else{
-			$name = $title;
-		}
-	}
-	return ($gi, $accn, $name, $product);
-}
 
 # reorder accession numbers #
 sub order_accns {
-	# order: NP_ > XP_ = YP_ = ZP_ = AP_ > everything else
-	return $_[0] unless $_[0] =~ /\//;
-	my @accns0 = split (/\//, $_[0]);
-	my @accns1 =  ();
-	foreach (@accns0){
-		if (/_/){ unshift (@accns1, $_); }
-		else{ push (@accns1, $_); }
-	}
-	@accns0 = ();
-	foreach (@accns1){
-		if (/^NP_/){ unshift (@accns0, $_); }
-		else{ push (@accns0, $_); }
-	}
-	@accns1 = ();
-	foreach (@accns0){
-		if ($#_ and /^$_[1]$/){ unshift (@accns1, $_); }
-		else{ push (@accns1, $_); }
-	}
-	return join ("/", @accns1);
+    # order: NP_ > ?P_ > everything else
+    return $_[0] unless $_[0] =~ /\//;
+    my @accns0 = split (/\//, $_[0]);
+    my @accns1 =  ();
+    foreach (@accns0) {
+        if (/_/) { unshift (@accns1, $_); }
+        else{ push (@accns1, $_); }
+    }
+    @accns0 = ();
+    foreach (@accns1) {
+        if (/^NP_/) { unshift (@accns0, $_); }
+        else{ push (@accns0, $_); }
+    }
+    @accns1 = ();
+    foreach (@accns0) {
+        if ($#_ and /^$_[1]$/) { unshift (@accns1, $_); }
+        else{ push (@accns1, $_); }
+    }
+    return join ("/", @accns1);
 }
 
 # get stem file name #
 sub stem_name {
-	my $stem = $_[0];
-	$stem =~ s/\.[^\.]+$//;
-	return $stem;
+    my $stem = $_[0];
+    $stem =~ s/\.[^\.]+$//;
+    return $stem;
 }
-
diff --git a/scripts/treer.pl b/scripts/treer.pl
old mode 100755
new mode 100644
index e317985..d7f6f34
--- a/scripts/treer.pl
+++ b/scripts/treer.pl
@@ -23,30 +23,30 @@
 
 my $i; my $j; my $n; my $s; my $t; my @a; my @b; my @c; my %h;
 
-my @sets;											# protein sets (genomes)
-my @hits;											# blast hits, each element is a hash, including:
-														# accn, accns, organism, group, taxid, score, distance, ignore
+my @sets;                                            # protein sets (genomes)
+my @hits;                                            # blast hits, each element is a hash, including:
+                                                        # accn, accns, organism, group, taxid, score, distance, ignore
 
-my %self;											# query information
-my %seqs;											# accn -> sequence
-my %names;											# accn -> organism name
-my $nSeq;											# number of hits with sequences
-my $nChar;											# number of aa per sequence in alignment
-my $tree;											# phylogenetic tree with annotations
+my %self;                                            # query information
+my %seqs;                                            # accn -> sequence
+my %names;                                            # accn -> organism name
+my $nSeq;                                            # number of hits with sequences
+my $nChar;                                            # number of aa per sequence in alignment
+my $tree;                                            # phylogenetic tree with annotations
 
 
 ## program parameters ##
 
-my $wkDir = $ARGV[0];								# working directory
+my $wkDir = $ARGV[0];                                # working directory
 
-my $minHits = 3;									# minimal number of valid hits for phylogenetic analysis
+my $minHits = 3;                                    # minimal number of valid hits for phylogenetic analysis
 
-my $trimSeq = 0;									# trim unreliable regions from sequence alignment (options: 0, gblocks)
-my $realign = 0;									# realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
-my $buildTree = 0;									# Build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
-my $bsTree = 0;										# perform bootstrap for designated times.
-my $distance = 0;									# use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
-my $aaModel = "WAG";								# protein substitution model for RAxML
+my $trimSeq = 0;                                    # trim unreliable regions from sequence alignment (options: 0, gblocks)
+my $realign = 0;                                    # realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
+my $buildTree = 0;                                    # Build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
+my $bsTree = 0;                                        # perform bootstrap for designated times.
+my $distance = 0;                                    # use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
+my $aaModel = "WAG";                                # protein substitution model for RAxML
 
 my $gblocks = "Gblocks";
 my $clustalw = "clustalw";
@@ -64,26 +64,26 @@
 
 ## read configurations ##
 
-if (-e "$wkDir/config.txt"){
-	open IN, "<$wkDir/config.txt";
-	my $readMonitor = 0; my $readMiddle = 0;
-	while (<IN>){
-		s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
-		$minHits = $1 if /^minHits=(\d+)$/;
-		$trimSeq = $1 if /^trimSeq=(.+)$/;
-		$realign = $1 if /^realign=(.+)$/;
-		$buildTree = $1 if /^buildTree=(.+)$/;
-		$bsTree = $1 if /^bsTree=(.+)$/;
-		$distance = $1 if /^distance=(.+)$/;
-		$aaModel = $1 if /^aaModel=(.+)$/;
-		$gblocks = $1 if /^gblocks=(.+)$/;
-		$clustalw = $1 if /^clustalw=(.+)$/;
-		$mafft = $1 if /^mafft=(.+)$/;
-		$raxml = $1 if /^raxml=(.+)$/;
-		$phyml = $1 if /^phyml=(.+)$/;
-		$fasttree = $1 if /^fasttree=(.+)$/;
-	}
-	close IN;
+if (-e "$wkDir/config.txt") {
+    open IN, "<$wkDir/config.txt";
+    my $readMonitor = 0; my $readMiddle = 0;
+    while (<IN>) {
+        s/#.*$//; s/\s+$//g; s/^\s+//g; next unless $_;
+        $minHits = $1 if /^minHits=(\d+)$/;
+        $trimSeq = $1 if /^trimSeq=(.+)$/;
+        $realign = $1 if /^realign=(.+)$/;
+        $buildTree = $1 if /^buildTree=(.+)$/;
+        $bsTree = $1 if /^bsTree=(.+)$/;
+        $distance = $1 if /^distance=(.+)$/;
+        $aaModel = $1 if /^aaModel=(.+)$/;
+        $gblocks = $1 if /^gblocks=(.+)$/;
+        $clustalw = $1 if /^clustalw=(.+)$/;
+        $mafft = $1 if /^mafft=(.+)$/;
+        $raxml = $1 if /^raxml=(.+)$/;
+        $phyml = $1 if /^phyml=(.+)$/;
+        $fasttree = $1 if /^fasttree=(.+)$/;
+    }
+    close IN;
 }
 
 
@@ -94,9 +94,9 @@
 @a = readdir(DIR);
 close DIR;
 
-foreach (@a){
-	next if (/^\./);
-	push @sets, $_ if -d "$wkDir/search/$_";
+foreach (@a) {
+    next if (/^\./);
+    push @sets, $_ if -d "$wkDir/search/$_";
 }
 print "done. ";
 die "No protein sets detected.\n" unless @sets;
@@ -104,348 +104,348 @@
 
 print "\n0-------------25-------------50------------75------------100%\n";
 
-foreach my $set (@sets){
-	my $dir = "$wkDir/search/$set";
-	opendir (DIR, "$dir") or next;
-	my @files = grep (/\.txt$/, readdir(DIR));
-	close DIR;
-	next unless @files;
-	
-	my $iProtein = 0;
-	my $iProgress = 0;
-	my $nProtein = $#files+1;
-	print "$set has $nProtein proteins. Analyzing...\n";
-	
-	foreach my $file (@files){
-		$iProtein ++;
-		@hits = (); %seqs = (); %self = (); $nSeq = 0; $nChar = 0; $tree = ""; %names = ();
-		open IN, "<$dir/$file" or next;
-		my $reading = 0;
-		my ($hasCoverage, $hasDistance) = (0, 0);
-		while (<IN>) {
-			s/\s+$//;
-			if (/^BEGIN QUERY/){ $reading = "query"; next; }
-			if (/^BEGIN ORGANISM/){ $reading = "organism"; next; }
-			if (/^BEGIN DATA/){ $reading = "data"; next; }
-			if (/^END;/){ $reading = 0; next; }
-			if ($reading eq "query"){
-				$self{'accn'} = $1 if /^\tName=(.+);$/;
-				$self{'length'} = $1 if /^\tLength=(.+);$/;
-				$self{'product'} = $1 if /^\tProduct=(.+);$/;
-				if (/^\tAccession=(.+);$/){ $self{'accn'} = $1; $self{'accn'} =~ s/\.[\d]+$//; }
-			}
-			if ($reading eq "organism"){ # read organisms
-				last unless exists $self{'accn'};
-				next if /^;/;
-				if (/^\[/){
-					$hasCoverage = 1 if /Coverage/;
-					$hasDistance = 1 if /Distance/;
-					next;
-				}
-				@a = split (/\t/);
-				my %hit = ();
-				$hit{'organism'} = $a[1];
-				$hit{'taxid'} = $a[2];
-				$hit{'score'} = $a[3];
-				$hit{'expect'} = $a[4];
-				$hit{'identity'} = $a[5];
-				$hit{'coverage'} = $a[6] if $hasCoverage;
-				$hit{'accns'} = $a[0];
-				@a = split(/\//, $a[0]);
-				$hit{'accn'} = $a[0];
-				push @hits, {%hit};
-				unless (exists $self{'taxid'}){ # identify self
-					foreach (@a){
-						if ($self{'accn'} eq $_){
-							$self{'id'} = $#hits;
-							$self{'taxid'} = $hit{'taxid'};
-							$self{'score'} = $hit{'score'};
-							$self{'expect'} = $hit{'expect'};
-							$self{'identity'} = $hit{'identity'};
-							$self{'coverage'} = $hit{'coverage'} if exists $hit{'coverage'};
-							$self{'organism'} = $hit{'organism'};
-							last;
-						}
-					}
-				}
-			}
-			if ($reading eq "data"){ # read sequences
-				next if /^;/;
-				next if /^\t/;
-				@a = split (/\t/);
-				$seqs{$a[0]} = $a[1];
-				$nSeq ++;
-				$nChar = length($a[1]) unless $nChar;
-			}
-		}
-		close IN;
-		print "BLAST report of $file is incomplete.\n" and next unless exists $self{'accn'};
-		next if @hits < $minHits;
-		next if keys %seqs < $minHits;
-
-		## realign with ClustalW ##
-		
-		if ($realign eq "clustalw"){
-			next if write_fasta ("$dir/temp") < 2;
-			system "$clustalw -infile=$dir/temp -quicktree -output=fasta -quiet > $dir/buffer";
-			die "Execution of ClustalW failed. Please check.\n" unless -s "$dir/temp.fasta";
-			read_fasta "$dir/temp.fasta";
-			unlink "$dir/temp", "$dir/temp.fasta", "$dir/temp.dnd", "$dir/buffer";
-		}
-
-		## trim alignment with GBlocks ##
-		
-		if ($trimSeq eq "gblocks"){
-			write_fasta "$dir/temp";
-			system "$gblocks $dir/temp -b2=".(int($nSeq/2)+1)." -b3=3 -b4=6 -b5=a > $dir/buffer";
-			die "Execution of Gblocks failed on $file. Please check.\n" unless -s "$dir/temp-gb";
-			read_fasta "$dir/temp-gb";
-			foreach (keys %seqs){
-				$seqs{$_} =~ s/\s//g;
-				delete $seqs{$_} unless ($seqs{$_} =~ /[a-zA-Z]/);
-			}
-			unlink "$dir/temp", "$dir/temp-gb", "$dir/temp-gb.htm";
-		}
-
-		## create translation table for tree annotation ##
-		
-		if ($buildTree){
-			for ($i=0; $i<=$#hits; $i++){
-				$s = $hits[$i]{'organism'};
-				$s =~ s/[^a-zA-Z0-9,\.\-]/ /g;
-				$names{$hits[$i]{'accn'}} = $s;
-			}
-		}
-
-		## build phylogenetic tree and compute distance matrix with ClustalW (Neighbor-Joining) ##
-	
-		if ($buildTree eq "clustalw"){
-			if (write_fasta("$dir/temp")>= 3){
-				$s = "$clustalw -infile=$dir/temp -outputtree=dist -quiet";
-				if ($bsTree){ $s .= " -bootstrap=$bsTree -seed=12345 -bootlabels=node"; }else{ $s .= " -tree"; }
-				$s .= " > $dir/buffer";
-				system $s;
-				die "Execution of ClustalW failed. Please check.\n" unless (-s "$dir/temp.ph" or "$dir/temp.phb");
-				if ($bsTree){ open IN, "<$dir/temp.phb"; }else{ open IN, "<$dir/temp.ph"; }
-				while (<IN>){ s/\s+$//; $tree .= $_; }
-				close IN;
-				$tree =~ s/TRICHOTOMY//i;
-				$tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
-				if ($distance){
-					$s = "";
-					open IN, "<$dir/temp.dst";
-					$_ = <IN>; $s = <IN>; $s =~ s/\s+$//;
-					while (<IN>){
-						last unless (/^\s/);
-						s/\s+$//; s/^\s+//;
-						$s .= " $_";
-					}
-					close IN;
-					@a = split (/\s+/, $s);
-					shift @a;
-					for ($i=0; $i<=$#hits; $i++){
-						next if exists $hits[$i]{'ignore'};
-						next unless exists $seqs{$hits[$i]{'accn'}};
-						$hits[$i]{'distance'} = shift (@a);
-					}
-				}
-				unlink "$dir/temp", "$dir/temp.ph", "$dir/temp.phb","$dir/temp.dst", "$dir/buffer";
-			}
-		}
-
-		## realign sequences, build phylogenetic tree and compute distance matrix with MAFFT (Neighbor-Joining) ##
-
-		if ($realign eq "mafft" or $buildTree eq "mafft"){
-			if (write_fasta("$dir/temp")>=3){
-				system "$mafft --retree 1 --treeout --distout --quiet $dir/temp > $dir/buffer";
-				die "Execution of MAFFT failed. Please check.\n" unless -s "$dir/buffer";
-				read_fasta "$dir/buffer" if ($realign eq "mafft");
-				if ($buildTree eq "mafft"){
-					open IN, "<$dir/temp.tree";
-					while (<IN>){ s/\s+$//; $tree .= $_; }
-					close IN;
-					$tree =~ s/\s+$/;/;
-					$tree =~ s/_+:/:/g;
-					$tree =~ s/([,\(])\d{1,4}_/$1/g;
-					$tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
-					$tree .= ";" unless $tree =~ /;$/;
-					if ($distance){
-						my %mafftIDs = ();
-						open IN, "<$dir/temp.hat2";
-						$_ = <IN>; $_ = <IN>; $_ = <IN>;
-						@a = ();
-						while (<IN>){
-							s/\s+$//; s/^\s+//; next if (/\. =/);
-							@a = (@a, split (/\s/));
-							last if @a >= $nSeq;
-						}
-						close IN;
-						$j = 0;
-						$hits[0]{'distance'} = "0.000";
-						for ($i=1; $i<=$#hits; $i++){
-							next if exists $hits[$i]{'ignore'};
-							next unless exists $seqs{$hits[$i]{'accn'}};
-							$hits[$i]{'distance'} = $a[$j++];
-						}
-					}
-				}
-				unlink "$dir/temp", "$dir/buffer", "$dir/temp.tree", "$dir/temp.hat2";
-			}
-		}
-
-		## build phylogenetic tree with PhyML (Maximum Likelihood) ##
-
-		if ($buildTree eq "phyml"){
-			if (write_phylip("$dir/temp")>=3){
-				$s = "$phyml -i $dir/temp -d aa --quiet";
-				$s .= "-b $bsTree" if $bsTree;
-				system $s;
-				open IN, "<$dir/temp_phyml_tree.txt"; $tree = <IN>; close IN;
-				$tree =~ s/\s+$//;
-				$tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
-				$tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
-				unlink "$dir/temp", "$dir/temp_phyml_stats.txt", "$dir/temp_phyml_tree.txt";
-				unlink "$dir/temp_phyml_boot_stats.txt", "$dir/temp_phyml_boot_trees.txt" if $bsTree;
-			}
-		}
-
-		## build phylogenetic tree with FastTree (Maximum Likelihood) and perform SH test ##
-
-		if ($buildTree eq "fasttree"){
-			if (write_fasta("$dir/temp")>=3){
-				$s = "$fasttree < $dir/temp > fasttree.tmp -quiet";
-				$s .= " -wag" if $aaModel eq "WAG";
-				system $s;
-				open IN, "<fasttree.tmp"; $tree = <IN>; close IN;
-				$tree =~ s/\s+$//;
-				$tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
-				unlink "$dir/temp", "fasttree.tmp";
-			}
-		}
-
-		## build phylogenetic tree and compute distance matrix with RAxML (Maximum Likelihood) ##
-
-		if ($buildTree eq "raxml"){
-			open OUT, ">$dir/temp";
-			foreach (keys %seqs){
-				$i = length ($seqs{$_});
-				last;
-			}
-			print OUT " $nSeq $i\n";
-			for ($i=0; $i<=$#hits; $i++){
-				next if exists $hits[$i]{'ignore'};
-				next unless exists $seqs{$hits[$i]{'accn'}};
-				print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
-			}
-			close OUT;
-			$s = "$raxml -m PROTGAMMA$aaModel -p 12345 -s $dir/temp -n tmp";
-			$s .= " -f a -x 12345 -# $bsTree" if $bsTree;
-			$s .= " > RAxML_screen.tmp";
-			system $s;
-			die "Execution of RAxML failed. Please check.\n" unless ((-s "RAxML_result.tmp") or (-s "RAxML_bipartitions.tmp"));
-			if ($bsTree){ open IN, "RAxML_bipartitions.tmp"; }
-			else { open IN, "RAxML_result.tmp"; }
-			$tree = <IN>; close IN;
-			$tree =~ s/\s+$//;
-			$tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
-			$tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
-			if ($distance){
-				$s = "$raxml -f x -m PROTGAMMA$aaModel -s $dir/temp -n tmp2";
-				if ($bsTree){ $s .= " -t RAxML_bipartitions.tmp"; }
-				else { open IN, " -t RAxML_result.tmp"; }
-				$s .= " > RAxML_screen.tmp";
-				system $s;
-				open IN, "<RAxML_distances.tmp2";
-				@a = ();
-				while (<IN>){
-					s/\s+$//; next unless $_;
-					next unless (/^t0/);
-					@b = split (/\s+/);
-					push @a, $b[2];
-				}
-				close IN;
-				$hits[0]{'distance'} = "0.000000";
-				for ($i=1; $i<=$#hits; $i++){
-					next if exists $hits[$i]{'ignore'};
-					next unless exists $seqs{$hits[$i]{'accn'}};
-					$hits[$i]{'distance'} = shift (@a);
-				}
-				unlink "RAxML_info.tmp2", "RAxML_distances.tmp2";
-			}
-			unlink "$dir/temp", "$dir/temp.reduced";
-			unlink "RAxML_bestTree.tmp", "RAxML_info.tmp", "RAxML_log.tmp", "RAxML_parsimonyTree.tmp", "RAxML_result.tmp", "RAxML_screen.tmp";
-			unlink "RAxML_bootstrap.tmp", "RAxML_bipartitions.tmp", "RAxML_bipartitionsBranchLabels.tmp" if $bsTree;
-		}
-
-		## print realigned sequences ##
-		
-		if ($realign or $trimSeq or $distance){
-			foreach (keys %seqs){
-				$nChar = length ($seqs{$_});
-				last;
-			}
-			$s = "";
-			$reading = 1;
-			open IN, "<$dir/$file" or next;
-			while (<IN>){
-				if (/^BEGIN TREE/ and $buildTree){ $reading = 0; next; }
-				if ($reading) { $s .= $_; }
-				if (/^END;/ and not $reading) { $reading = 1; next; }
-				if (/^BEGIN ORGANISM/ and $distance){
-					$reading = 0;
-					for ($i=0; $i<=$#hits; $i++){
-						$s .= $hits[$i]{'accns'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'};
-						$s .= "\t".$hits[$i]{'coverage'} if exists $hits[$i]{'coverage'};
-						if (exists $hits[$i]{'ignore'}){
-							$s .= "\tx\n";
-						}else{
-							if ($distance and exists $hits[$i]{'distance'}){
-								$s .= "\t".$hits[$i]{'distance'}."\n";
-							}else{
-								$s .= "\t\n";
-							}
-						}
-					}
-					$s .= ";\nEND;\n";
-				}
-				if (/^BEGIN DATA/){
-					$reading = 0;
-					$s .= "\tDIMENSIONS NTAX=$nSeq NCHAR=$nChar;\n";
-					$s .= "\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n";
-					$s .= "\tMATRIX\n";
-					for ($i=0; $i<=$#hits; $i++){
-						next if exists $hits[$i]{'ignore'};
-						next unless exists $seqs{$hits[$i]{'accn'}};
-						$s .= $hits[$i]{'accn'}."\t".$seqs{$hits[$i]{'accn'}}."\n";
-					}
-					$s .= ";\nEND;\n";
-				}
-			}
-			close IN;
-			open OUT, ">$dir/$file" or next;
-			print OUT $s;
-			close OUT;
-		}
-
-		## print tree ##
-		
-		if ($buildTree){
-			open OUT, ">>$dir/$file" or next;
-			print OUT "BEGIN TREES;\n\tTREE 1 = $tree\nEND;\n";
-			close OUT;
-		}
-		
-		## show progress ##
-		
-		if ($iProtein/$nProtein >= $iProgress/60){ 
-			print ".";
-			$iProgress++;
-		}
-	}
-	
-	unlink "RAxML_screen.tmp" if -e "RAxML_screen.tmp";
-	
-	print " done.\n";
+foreach my $set (@sets) {
+    my $dir = "$wkDir/search/$set";
+    opendir (DIR, "$dir") or next;
+    my @files = grep (/\.txt$/, readdir(DIR));
+    close DIR;
+    next unless @files;
+
+    my $iProtein = 0;
+    my $iProgress = 0;
+    my $nProtein = $#files+1;
+    print "$set has $nProtein proteins. Analyzing...\n";
+
+    foreach my $file (@files) {
+        $iProtein ++;
+        @hits = (); %seqs = (); %self = (); $nSeq = 0; $nChar = 0; $tree = ""; %names = ();
+        open IN, "<$dir/$file" or next;
+        my $reading = 0;
+        my ($hasCoverage, $hasDistance) = (0, 0);
+        while (<IN>) {
+            s/\s+$//;
+            if (/^BEGIN QUERY/) { $reading = "query"; next; }
+            if (/^BEGIN ORGANISM/) { $reading = "organism"; next; }
+            if (/^BEGIN DATA/) { $reading = "data"; next; }
+            if (/^END;/) { $reading = 0; next; }
+            if ($reading eq "query") {
+                $self{'accn'} = $1 if /^\tName=(.+);$/;
+                $self{'length'} = $1 if /^\tLength=(.+);$/;
+                $self{'product'} = $1 if /^\tProduct=(.+);$/;
+                if (/^\tAccession=(.+);$/) { $self{'accn'} = $1; $self{'accn'} =~ s/\.[\d]+$//; }
+            }
+            if ($reading eq "organism") { # read organisms
+                last unless exists $self{'accn'};
+                next if /^;/;
+                if (/^\[/) {
+                    $hasCoverage = 1 if /Coverage/;
+                    $hasDistance = 1 if /Distance/;
+                    next;
+                }
+                @a = split (/\t/);
+                my %hit = ();
+                $hit{'organism'} = $a[1];
+                $hit{'taxid'} = $a[2];
+                $hit{'score'} = $a[3];
+                $hit{'expect'} = $a[4];
+                $hit{'identity'} = $a[5];
+                $hit{'coverage'} = $a[6] if $hasCoverage;
+                $hit{'accns'} = $a[0];
+                @a = split(/\//, $a[0]);
+                $hit{'accn'} = $a[0];
+                push @hits, {%hit};
+                unless (exists $self{'taxid'}) { # identify self
+                    foreach (@a) {
+                        if ($self{'accn'} eq $_) {
+                            $self{'id'} = $#hits;
+                            $self{'taxid'} = $hit{'taxid'};
+                            $self{'score'} = $hit{'score'};
+                            $self{'expect'} = $hit{'expect'};
+                            $self{'identity'} = $hit{'identity'};
+                            $self{'coverage'} = $hit{'coverage'} if exists $hit{'coverage'};
+                            $self{'organism'} = $hit{'organism'};
+                            last;
+                        }
+                    }
+                }
+            }
+            if ($reading eq "data") { # read sequences
+                next if /^;/;
+                next if /^\t/;
+                @a = split (/\t/);
+                $seqs{$a[0]} = $a[1];
+                $nSeq ++;
+                $nChar = length($a[1]) unless $nChar;
+            }
+        }
+        close IN;
+        print "BLAST report of $file is incomplete.\n" and next unless exists $self{'accn'};
+        next if @hits < $minHits;
+        next if keys %seqs < $minHits;
+
+        ## realign with ClustalW ##
+
+        if ($realign eq "clustalw") {
+            next if write_fasta ("$dir/temp") < 2;
+            system "$clustalw -infile=$dir/temp -quicktree -output=fasta -quiet > $dir/buffer";
+            die "Execution of ClustalW failed. Please check.\n" unless -s "$dir/temp.fasta";
+            read_fasta "$dir/temp.fasta";
+            unlink "$dir/temp", "$dir/temp.fasta", "$dir/temp.dnd", "$dir/buffer";
+        }
+
+        ## trim alignment with GBlocks ##
+
+        if ($trimSeq eq "gblocks") {
+            write_fasta "$dir/temp";
+            system "$gblocks $dir/temp -b2=".(int($nSeq/2)+1)." -b3=3 -b4=6 -b5=a > $dir/buffer";
+            die "Execution of Gblocks failed on $file. Please check.\n" unless -s "$dir/temp-gb";
+            read_fasta "$dir/temp-gb";
+            foreach (keys %seqs) {
+                $seqs{$_} =~ s/\s//g;
+                delete $seqs{$_} unless ($seqs{$_} =~ /[a-zA-Z]/);
+            }
+            unlink "$dir/temp", "$dir/temp-gb", "$dir/temp-gb.htm";
+        }
+
+        ## create translation table for tree annotation ##
+
+        if ($buildTree) {
+            for ($i=0; $i<=$#hits; $i++) {
+                $s = $hits[$i]{'organism'};
+                $s =~ s/[^a-zA-Z0-9,\.\-]/ /g;
+                $names{$hits[$i]{'accn'}} = $s;
+            }
+        }
+
+        ## build phylogenetic tree and compute distance matrix with ClustalW (Neighbor-Joining) ##
+
+        if ($buildTree eq "clustalw") {
+            if (write_fasta("$dir/temp")>= 3) {
+                $s = "$clustalw -infile=$dir/temp -outputtree=dist -quiet";
+                if ($bsTree) { $s .= " -bootstrap=$bsTree -seed=12345 -bootlabels=node"; } else { $s .= " -tree"; }
+                $s .= " > $dir/buffer";
+                system $s;
+                die "Execution of ClustalW failed. Please check.\n" unless (-s "$dir/temp.ph" or "$dir/temp.phb");
+                if ($bsTree) { open IN, "<$dir/temp.phb"; } else { open IN, "<$dir/temp.ph"; }
+                while (<IN>) { s/\s+$//; $tree .= $_; }
+                close IN;
+                $tree =~ s/TRICHOTOMY//i;
+                $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+                if ($distance) {
+                    $s = "";
+                    open IN, "<$dir/temp.dst";
+                    $_ = <IN>; $s = <IN>; $s =~ s/\s+$//;
+                    while (<IN>) {
+                        last unless (/^\s/);
+                        s/\s+$//; s/^\s+//;
+                        $s .= " $_";
+                    }
+                    close IN;
+                    @a = split (/\s+/, $s);
+                    shift @a;
+                    for ($i=0; $i<=$#hits; $i++) {
+                        next if exists $hits[$i]{'ignore'};
+                        next unless exists $seqs{$hits[$i]{'accn'}};
+                        $hits[$i]{'distance'} = shift (@a);
+                    }
+                }
+                unlink "$dir/temp", "$dir/temp.ph", "$dir/temp.phb","$dir/temp.dst", "$dir/buffer";
+            }
+        }
+
+        ## realign sequences, build phylogenetic tree and compute distance matrix with MAFFT (Neighbor-Joining) ##
+
+        if ($realign eq "mafft" or $buildTree eq "mafft") {
+            if (write_fasta("$dir/temp")>=3) {
+                system "$mafft --retree 1 --treeout --distout --quiet $dir/temp > $dir/buffer";
+                die "Execution of MAFFT failed. Please check.\n" unless -s "$dir/buffer";
+                read_fasta "$dir/buffer" if ($realign eq "mafft");
+                if ($buildTree eq "mafft") {
+                    open IN, "<$dir/temp.tree";
+                    while (<IN>) { s/\s+$//; $tree .= $_; }
+                    close IN;
+                    $tree =~ s/\s+$/;/;
+                    $tree =~ s/_+:/:/g;
+                    $tree =~ s/([,\(])\d{1,4}_/$1/g;
+                    $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+                    $tree .= ";" unless $tree =~ /;$/;
+                    if ($distance) {
+                        my %mafftIDs = ();
+                        open IN, "<$dir/temp.hat2";
+                        $_ = <IN>; $_ = <IN>; $_ = <IN>;
+                        @a = ();
+                        while (<IN>) {
+                            s/\s+$//; s/^\s+//; next if (/\. =/);
+                            @a = (@a, split (/\s/));
+                            last if @a >= $nSeq;
+                        }
+                        close IN;
+                        $j = 0;
+                        $hits[0]{'distance'} = "0.000";
+                        for ($i=1; $i<=$#hits; $i++) {
+                            next if exists $hits[$i]{'ignore'};
+                            next unless exists $seqs{$hits[$i]{'accn'}};
+                            $hits[$i]{'distance'} = $a[$j++];
+                        }
+                    }
+                }
+                unlink "$dir/temp", "$dir/buffer", "$dir/temp.tree", "$dir/temp.hat2";
+            }
+        }
+
+        ## build phylogenetic tree with PhyML (Maximum Likelihood) ##
+
+        if ($buildTree eq "phyml") {
+            if (write_phylip("$dir/temp")>=3) {
+                $s = "$phyml -i $dir/temp -d aa --quiet";
+                $s .= "-b $bsTree" if $bsTree;
+                system $s;
+                open IN, "<$dir/temp_phyml_tree.txt"; $tree = <IN>; close IN;
+                $tree =~ s/\s+$//;
+                $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
+                $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+                unlink "$dir/temp", "$dir/temp_phyml_stats.txt", "$dir/temp_phyml_tree.txt";
+                unlink "$dir/temp_phyml_boot_stats.txt", "$dir/temp_phyml_boot_trees.txt" if $bsTree;
+            }
+        }
+
+        ## build phylogenetic tree with FastTree (Maximum Likelihood) and perform SH test ##
+
+        if ($buildTree eq "fasttree") {
+            if (write_fasta("$dir/temp")>=3) {
+                $s = "$fasttree < $dir/temp > fasttree.tmp -quiet";
+                $s .= " -wag" if $aaModel eq "WAG";
+                system $s;
+                open IN, "<fasttree.tmp"; $tree = <IN>; close IN;
+                $tree =~ s/\s+$//;
+                $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+                unlink "$dir/temp", "fasttree.tmp";
+            }
+        }
+
+        ## build phylogenetic tree and compute distance matrix with RAxML (Maximum Likelihood) ##
+
+        if ($buildTree eq "raxml") {
+            open OUT, ">$dir/temp";
+            foreach (keys %seqs) {
+                $i = length ($seqs{$_});
+                last;
+            }
+            print OUT " $nSeq $i\n";
+            for ($i=0; $i<=$#hits; $i++) {
+                next if exists $hits[$i]{'ignore'};
+                next unless exists $seqs{$hits[$i]{'accn'}};
+                print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
+            }
+            close OUT;
+            $s = "$raxml -m PROTGAMMA$aaModel -p 12345 -s $dir/temp -n tmp";
+            $s .= " -f a -x 12345 -# $bsTree" if $bsTree;
+            $s .= " > RAxML_screen.tmp";
+            system $s;
+            die "Execution of RAxML failed. Please check.\n" unless ((-s "RAxML_result.tmp") or (-s "RAxML_bipartitions.tmp"));
+            if ($bsTree) { open IN, "RAxML_bipartitions.tmp"; }
+            else { open IN, "RAxML_result.tmp"; }
+            $tree = <IN>; close IN;
+            $tree =~ s/\s+$//;
+            $tree =~ s/([,\(])t(\d+):/$1$hits[$2]{'accn'}:/g;
+            $tree =~ s/([,\(])$_:/$1'$names{$_}':/ for (keys %names);
+            if ($distance) {
+                $s = "$raxml -f x -m PROTGAMMA$aaModel -s $dir/temp -n tmp2";
+                if ($bsTree) { $s .= " -t RAxML_bipartitions.tmp"; }
+                else { open IN, " -t RAxML_result.tmp"; }
+                $s .= " > RAxML_screen.tmp";
+                system $s;
+                open IN, "<RAxML_distances.tmp2";
+                @a = ();
+                while (<IN>) {
+                    s/\s+$//; next unless $_;
+                    next unless (/^t0/);
+                    @b = split (/\s+/);
+                    push @a, $b[2];
+                }
+                close IN;
+                $hits[0]{'distance'} = "0.000000";
+                for ($i=1; $i<=$#hits; $i++) {
+                    next if exists $hits[$i]{'ignore'};
+                    next unless exists $seqs{$hits[$i]{'accn'}};
+                    $hits[$i]{'distance'} = shift (@a);
+                }
+                unlink "RAxML_info.tmp2", "RAxML_distances.tmp2";
+            }
+            unlink "$dir/temp", "$dir/temp.reduced";
+            unlink "RAxML_bestTree.tmp", "RAxML_info.tmp", "RAxML_log.tmp", "RAxML_parsimonyTree.tmp", "RAxML_result.tmp", "RAxML_screen.tmp";
+            unlink "RAxML_bootstrap.tmp", "RAxML_bipartitions.tmp", "RAxML_bipartitionsBranchLabels.tmp" if $bsTree;
+        }
+
+        ## print realigned sequences ##
+
+        if ($realign or $trimSeq or $distance) {
+            foreach (keys %seqs) {
+                $nChar = length ($seqs{$_});
+                last;
+            }
+            $s = "";
+            $reading = 1;
+            open IN, "<$dir/$file" or next;
+            while (<IN>) {
+                if (/^BEGIN TREE/ and $buildTree) { $reading = 0; next; }
+                if ($reading) { $s .= $_; }
+                if (/^END;/ and not $reading) { $reading = 1; next; }
+                if (/^BEGIN ORGANISM/ and $distance) {
+                    $reading = 0;
+                    for ($i=0; $i<=$#hits; $i++) {
+                        $s .= $hits[$i]{'accns'}."\t".$hits[$i]{'organism'}."\t".$hits[$i]{'taxid'}."\t".$hits[$i]{'score'}."\t".$hits[$i]{'expect'}."\t".$hits[$i]{'identity'};
+                        $s .= "\t".$hits[$i]{'coverage'} if exists $hits[$i]{'coverage'};
+                        if (exists $hits[$i]{'ignore'}) {
+                            $s .= "\tx\n";
+                        } else {
+                            if ($distance and exists $hits[$i]{'distance'}) {
+                                $s .= "\t".$hits[$i]{'distance'}."\n";
+                            } else {
+                                $s .= "\t\n";
+                            }
+                        }
+                    }
+                    $s .= ";\nEND;\n";
+                }
+                if (/^BEGIN DATA/) {
+                    $reading = 0;
+                    $s .= "\tDIMENSIONS NTAX=$nSeq NCHAR=$nChar;\n";
+                    $s .= "\tFORMAT DATATYPE=PROTEIN MISSING=? GAP=-;\n";
+                    $s .= "\tMATRIX\n";
+                    for ($i=0; $i<=$#hits; $i++) {
+                        next if exists $hits[$i]{'ignore'};
+                        next unless exists $seqs{$hits[$i]{'accn'}};
+                        $s .= $hits[$i]{'accn'}."\t".$seqs{$hits[$i]{'accn'}}."\n";
+                    }
+                    $s .= ";\nEND;\n";
+                }
+            }
+            close IN;
+            open OUT, ">$dir/$file" or next;
+            print OUT $s;
+            close OUT;
+        }
+
+        ## print tree ##
+
+        if ($buildTree) {
+            open OUT, ">>$dir/$file" or next;
+            print OUT "BEGIN TREES;\n\tTREE 1 = $tree\nEND;\n";
+            close OUT;
+        }
+
+        ## show progress ##
+
+        if ($iProtein/$nProtein >= $iProgress/60) {
+            print ".";
+            $iProgress++;
+        }
+    }
+
+    unlink "RAxML_screen.tmp" if -e "RAxML_screen.tmp";
+
+    print " done.\n";
 }
 print "Execution of treer completed.\n";
 exit 0;
@@ -453,51 +453,50 @@
 
 ## public subroutines ##
 
-sub write_fasta ($){
-	my $count = 0;
-	open OUT, ">$_[0]";
-	for ($i=0; $i<=$#hits; $i++){
-		next if exists $hits[$i]{'ignore'};
-		next unless exists $seqs{$hits[$i]{'accn'}};
-		print OUT ">".$hits[$i]{'accn'}."\n".$seqs{$hits[$i]{'accn'}}."\n";
-		$count ++;
-	}
-	close OUT;
-	return $count;
+sub write_fasta ($) {
+    my $count = 0;
+    open OUT, ">$_[0]";
+    for ($i=0; $i<=$#hits; $i++) {
+        next if exists $hits[$i]{'ignore'};
+        next unless exists $seqs{$hits[$i]{'accn'}};
+        print OUT ">".$hits[$i]{'accn'}."\n".$seqs{$hits[$i]{'accn'}}."\n";
+        $count ++;
+    }
+    close OUT;
+    return $count;
 }
 
-sub write_phylip ($){
-	my $count = 0;
-	open OUT, ">$_[0]";
-	foreach (keys %seqs){
-		$i = length ($seqs{$_});
-		last;
-	}
-	print OUT " $nSeq $i\n";
-	for ($i=0; $i<=$#hits; $i++){
-		next if exists $hits[$i]{'ignore'};
-		next unless exists $seqs{$hits[$i]{'accn'}};
-		print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
-		$count ++;
-	}
-	close OUT;
-	return $count;
+sub write_phylip ($) {
+    my $count = 0;
+    open OUT, ">$_[0]";
+    foreach (keys %seqs) {
+        $i = length ($seqs{$_});
+        last;
+    }
+    print OUT " $nSeq $i\n";
+    for ($i=0; $i<=$#hits; $i++) {
+        next if exists $hits[$i]{'ignore'};
+        next unless exists $seqs{$hits[$i]{'accn'}};
+        print OUT "t$i".(" " x (13-length("t$i"))).$seqs{$hits[$i]{'accn'}}."\n";
+        $count ++;
+    }
+    close OUT;
+    return $count;
 }
 
 
-sub read_fasta ($){
-	open IN, "<$_[0]";
-	my $name = "";
-	while (<IN>){
-		s/\s+$//;
-		next unless $_;
-		if (s/^>//){
-			$name = $_;
-			$seqs{$name} = "";
-		}else{
-			$seqs{$name} .= $_;
-		}
-	}
-	close IN;
+sub read_fasta ($) {
+    open IN, "<$_[0]";
+    my $name = "";
+    while (<IN>) {
+        s/\s+$//;
+        next unless $_;
+        if (s/^>//) {
+            $name = $_;
+            $seqs{$name} = "";
+        } else {
+            $seqs{$name} .= $_;
+        }
+    }
+    close IN;
 }
-

From d369dbd57fec04aa9f73e8bd8b99eb357f75f5cb Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 00:35:16 -0700
Subject: [PATCH 2/9] changed diamond support to version 0.8+

---
 scripts/searcher.pl | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index 0c9c48e..eee435f 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -691,11 +691,9 @@
                     unlink "$wkDir/tmp.m8";
                 } elsif ($searchTool eq "DIAMOND") {
                     `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
-                    `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
-                    my @out = `$diamond view -a $wkDir/tmp.daa`;
+                    my @out = `$diamond blastp -p $threads -q $wkDir/tmp.in -d $wkDir/tmp.dmnd`;
                     unlink "$wkDir/tmp.in";
                     unlink "$wkDir/tmp.dmnd";
-                    unlink "$wkDir/tmp.daa";
                     die "Error in running DIAMOND. Please check." unless @out;
                     foreach (@out) {
                         s/\s+$//;
@@ -895,13 +893,10 @@ sub local_search {
             $outfile = $wkDir."/tmp.m8";
             unlink $wkDir."/tmp.aln";
         } elsif ($searchTool eq "DIAMOND") {
-            $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -a $wkDir/tmp -t $wkDir";
+            $s = "$diamond blastp -p $threads -q $wkDir/tmp.in -d $protdb -o $outfile";
             $s .= " -e $evalue" if ($evalue);
             $s .= " -k $nHits" if ($nHits);
             `$s`;
-            $s = "$diamond view -a $wkDir/tmp.daa -o $outfile";
-            `$s`;
-            unlink $wkDir."/tmp.daa";
         } else {
             unlink "$wkDir/tmp.in";
             die "Error: Search tool not specified and pre-computed results not found for $set.\n";
@@ -1594,11 +1589,9 @@ sub self_align {
 
         } elsif ($searchTool eq "DIAMOND") {
             `$diamond makedb --in $wkDir/tmp.in -d $wkDir/tmp`;
-            `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp -a $wkDir/tmp -t $wkDir`;
-            my @out = `$diamond view -a $wkDir/tmp.daa`;
+            my @out = `$diamond blastp -p 1 -q $wkDir/tmp.in -d $wkDir/tmp.dmnd`;
             unlink "$wkDir/tmp.in";
             unlink "$wkDir/tmp.dmnd";
-            unlink "$wkDir/tmp.daa";
             if (@out) {
                 @a = split (/\t/, $out[0]);
                 if ($#a == 11) {

From b16c36b507b61ea2bdc556ecbb00b8b8e8fbd540 Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 01:02:55 -0700
Subject: [PATCH 3/9] removed GI parser from analyzer

---
 scripts/analyzer.pl |  4 +---
 scripts/searcher.pl | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/scripts/analyzer.pl b/scripts/analyzer.pl
index 8e41bfc..2137ce1 100644
--- a/scripts/analyzer.pl
+++ b/scripts/analyzer.pl
@@ -478,11 +478,9 @@
             if (/^END;/) { $reading = 0; next; }
             if ($reading eq "query") { # read query (self)
                 $result{'accn'} = $1 if /^\tName=(.+);$/;
-                $result{'gi'} = $1 if /^\tGI=(\d+);$/;
                 $result{'length'} = $1 if /^\tLength=(.+);$/;
                 $result{'product'} = $1 if /^\tProduct=(.+)\s*;$/;
-                $result{'organism'} = $1 if /^\tOrganism=(.+)\s*;$/;
-                if (/^\tAccession=(.+);$/) { $result{'accn'} = $1; $result{'accn'} =~ s/\.[\d]+$//; }
+                $result{'accn'} = $1 if /^\tAccession=(.+);$/;
             }
             if ($reading eq "organism") { # read organisms
                 next if /^;/;
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index eee435f..a163358 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -59,7 +59,7 @@
                                         #    prots (array...)
                                         # prot => (
                                         #    name (accn or user-defined)
-                                        #    title (after the 1st whitespace)
+                                        #    product (after 1st whitespace, excluding [organism name])
                                         #    seq (necessary for local searches, optional for remote BLAST)
                                         #    done (boolean)
                                         #    hits (array...)
@@ -87,6 +87,7 @@
 sub local_search;                        # same as above
 sub self_align;                          # search against itself
 sub get_taxonomy;                        # paramter: array of TaxIDs
+sub get_product;                         # get product from protein title
 sub stem_name;                           # get stem file name
 sub order_accns;                         # reorder accession number
 
@@ -295,12 +296,12 @@
     while (<IN>) {
         s/\s+$//; next unless $_; next if /^#/;
         $intype = /^>/ ? "fasta" : "list" unless $intype;
-        my %prot = ('name'=>'', 'title'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
+        my %prot = ('name'=>'', 'product'=>'', 'seq'=>'', 'hits'=>[], 'done'=>0);
         if ($intype eq 'fasta') {
             if (s/^>//) {
                 @a = split(/\s+/, $_, 2);
                 $prot{'name'} = $a[0];
-                $prot{'title'} = $a[1] if $#a;
+                $prot{'product'} = get_product($a[1]) if $#a;
                 push @{$ins{$set}{'prots'}}, {%prot};
             } else {
                 $ins{$set}{'prots'}[-1]{'seq'} .= $_;  # append sequence
@@ -1066,7 +1067,7 @@ sub local_search {
         print OUT "#NEXUS\nBEGIN QUERY;\n";
         print OUT "\tName=".$ins{$set}{'prots'}[$id]{'name'}.";\n";
         print OUT "\tLength=".length($ins{$set}{'prots'}[$id]{'seq'}).";\n";
-        print OUT "\tTitle=".$ins{$set}{'prots'}[$id]{'title'}.";\n";
+        print OUT "\tProduct=".$ins{$set}{'prots'}[$id]{'product'}.";\n";
         print OUT "END;\n\n";
         print OUT "BEGIN ORGANISM;\n";
         print OUT "[Accession\tOrganism\tTaxID\tBit-score\tE-value\t\%Identity\t\%Coverage]\n";
@@ -1180,9 +1181,8 @@ sub http_blast{
     foreach (split(/\n/, $s)) {
         if (/^#/) {
             if (s/^# Query: // and not $self{'seq'}) {
-                # s/>.*//;
                 @a = split(/\s+/, $_, 2);
-                $self{'title'} = $a[1] if $#a;
+                $self{'product'} = get_product($a[1]) if $#a;
             }
         } else {
             # fields: query id, subject ids, % identity, % positives, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
@@ -1393,7 +1393,7 @@ sub http_blast{
     print OUT "#NEXUS\nBEGIN QUERY;\n";
     print OUT "\tName=".$self{'name'}.";\n";
     print OUT "\tLength=".$self{'length'}.";\n";
-    print OUT "\tTitle=".$self{'title'}.";\n";
+    print OUT "\tProduct=".$self{'product'}.";\n";
     print OUT "END;\n\n";
 
     # retrieve taxonomy report (using TaxBLAST)
@@ -1790,6 +1790,13 @@ sub order_accns {
     return join ("/", @accns1);
 }
 
+# get product from protein title
+sub get_product {
+    my $title = $_[0];
+    $title =~ s/^\s+|\s+$//g;
+    $title =~ s/\[.+\]$//;
+}
+
 # get stem file name #
 sub stem_name {
     my $stem = $_[0];

From ca103d0d2be0cd69f7eae7ce9b518344d2b9bb2f Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 01:04:56 -0700
Subject: [PATCH 4/9] changed version number

---
 ChangeLog  |  6 +++---
 GUI.html   | 37 ++++++++++++++++++-------------------
 LICENSE    |  4 ++--
 README.md  |  2 +-
 config.txt |  9 ++++-----
 5 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index f390080..e25693f 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,12 @@
 ChangeLog of HGTector
 
-== Version 0.2.2 (8/22/2017) ==
+== Version 0.2.2 (8/23/2017) ==
 
 New features:
-- Adopt the new NCBI format (accession instead of GI as sequence identifier)
+- Adopt new NCBI standard (accession instead of GI as sequence identifier)
 
 Bug fixes:
-- Reformatted all Perl scripts using Perltidy.
+- Reformatted all Perl scripts.
 - Replaced 'http' in NCBI URLs with 'https'.
 
 
diff --git a/GUI.html b/GUI.html
index a9fcdf0..508f8cf 100755
--- a/GUI.html
+++ b/GUI.html
@@ -1,10 +1,10 @@
 <!DOCTYPE HTML>
 <html>
 	<!--
-		This web page is a graphical user interface (GUI) of HGTector v0.2.1
+		This web page is a graphical user interface (GUI) of HGTector v0.2.2
 	-->
 	<head>
-		<title>HGTector v0.2.1 GUI</title>
+		<title>HGTector v0.2.2 GUI</title>
 		<style type="text/css" media=screen>
 			body{
 				color:#3D3C3A;
@@ -148,7 +148,7 @@
 				if (!document.getElementById('ignoreParalogs').checked){ out.value += 'ignoreParalogs=0\n' };
 				if (!document.getElementById('ignoreSeqRepeats').checked){ out.value += 'ignoreSeqRepeats=0\n' };
 				if (document.getElementById('ignoreSubspecies').checked){ out.value += 'ignoreSubspecies=1\n' };
-				
+
 				if (document.getElementsByName('grouping')[0].checked){
 					val = document.getElementById('selfRank').value; if (val != ""){ out.value += 'selfRank='+val+'\n'; }
 				}
@@ -165,7 +165,7 @@
 					if (document.getElementById('plot3D').checked){ out.value += 'plot3D=1\n' };
 					val = document.getElementById('plotRef').value; if (val != ""){ out.value += 'plotRef='+val+'\n'; }
 				}
-				
+
 				if (document.getElementsByName('cutoffing')[0].checked){
 					if (!document.getElementById('exOutlier').disabled){ out.value += 'exOutlier='+document.getElementById('exOutlier').value+'\n'; }
 					if (!document.getElementById('dipTest').disabled && document.getElementById('dipTest').checked){
@@ -213,7 +213,7 @@
 				val = document.getElementById('minSize').value; if (val != "0"){ out.value += 'minSize='+val+'\n'; }
 				if (document.getElementById('selfLow').checked){ out.value += 'selfLow=1\n'; }
 				if (document.getElementById('BBH0').checked){ out.value += 'BBH='+document.getElementById('BBH12').value+'\n'; }
-				
+
 				val = document.getElementById('title').value; if (val != ""){ out.value += 'title='+val+'\n'; }
 				val = document.getElementById('deHypo').value; if (val != 0){ out.value += 'deHypo='+val+'\n'; }
 				if (!document.getElementById('byDonor').checked){
@@ -238,7 +238,7 @@
 				if (!document.getElementById('outText').checked){ out.value += 'outText=0\n'; }
 				if (!document.getElementById('outHTML').checked){ out.value += 'outHTML=0\n'; }
 				if (document.getElementById('outExcel').checked){ out.value += 'outExcel=1\n'; }
-				
+
 				val = document.getElementById('clustalw').value; if (val != "" && val != "clustalw"){ out.value += 'clustalw='+val+'\n'; }
 				val = document.getElementById('mafft').value; if (val != "" && val != "mafft"){ out.value += 'mafft='+val+'\n'; }
 				val = document.getElementById('gblocks').value; if (val != "" && val != "Gblocks"){ out.value += 'gblocks='+val+'\n'; }
@@ -251,21 +251,21 @@
 					val = document.getElementById('bsTree').value; if (val != ""){ out.value += 'bsTree='+val+'\n'; }
 					if (document.getElementById('distance').checked){ out.value += 'distance=1\n' };
 				};
-				
+
 				if (!out.value){ out.value = "# You didn't change any default settings."; }
 			}
 		</script>
 	</head>
-	
+
 	<body style="width:800px; margin-left:auto; margin-right:auto;">
-		<p><h3>HGTector v0.2.1 Configuration File Generator</h3></p>
+		<p><h3>HGTector v0.2.2 Configuration File Generator</h3></p>
 		<p>Qiyun Zhu, Katharina Dittmar</p>
 		<hr>
 		<p>Welcome! This page will guide you through the process of building an effective configuration file for HGTector.</p>
 		<p>Program mode: <select id="interactive"><option value=0>automatic</option><option value=1 selected="selected">interactive</option></select></p>
 		<hr>
 		<p></p>
-		
+
 		<!-- Input data -->
 		<p class="header"><input type="button" id="btnInput" value="Show" onclick="show_hide_section('Input')"> Input data</p>
 		<div id="divInput" style="display:none">
@@ -334,7 +334,7 @@
 					<option value="RAPSearch2">RAPSearch2</option>
 					<option value="DIAMOND">DIAMOND</option>
 					<option value="customized">customized</option>
-				</select> &nbsp; 
+				</select> &nbsp;
 				<span id="RemoteBlast">
 					<input type="checkbox" id="chkRemote" onchange="
 					show_hide_subsection('Remote');
@@ -532,7 +532,7 @@
 						<option value=3>modified Z-score</option>
 					 </select> method. [<a href="javascript:show_hide_tooltip('Outlier')">?</a>]</p>
 					<p class="tooltip" id="tipOutlier" style="display:none">These outliers are genes with unexpected high number of hits. They may be caused by database redundancy. Excluding them prior to clustering may help to suppress false results.</p>
-					<p><input id="dipTest" type="checkbox" onchange="document.getElementById('isDipSig').disabled=!this.checked"> Test for non-unimodality (requires R package diptest). [<a href="javascript:show_hide_tooltip('Dip')">?</a>] 
+					<p><input id="dipTest" type="checkbox" onchange="document.getElementById('isDipSig').disabled=!this.checked"> Test for non-unimodality (requires R package diptest). [<a href="javascript:show_hide_tooltip('Dip')">?</a>]
 					<input id="isDipSig" type="checkbox" onchange="document.getElementById('dipSig').disabled=!this.checked" disabled> P-value threshold: <input type="text" id="dipSig" size=4 value="0.05" disabled> [<a href="javascript:show_hide_tooltip('DipSig')">?</a>]</p>
 					<p class="tooltip" id="tipDip" style="display:none">Perform Hartigan's dip test using R package <b>diptest</b> to assess the significance of non-unimodality of weight distribution. The smaller the p-value is, the more comfortable it is to split the population into two or more clusters.</p>
 					<p class="tooltip" id="tipDipSig" style="display:none">If p-value >= this threshold (meaning the non-unimodality is insignificant), the program will consider that there is inadequate statistical support for dividing the population into two clusters. In such circumstance, the global cutoff will be used.</p>
@@ -556,7 +556,7 @@
 							<option value=1 selected>horizontal midpoint between 1st peak and 1st pit</option>
 							<option value=2>horizontal quantile from 1st pit toward 1st peak, as:</option>
 							<option value=3>vertical quantile from 1st pit toward 1st peak, as:</option>
-						 </select> 
+						 </select>
 						 <input type="text" id="qKCO" size=4 value="0.5" disabled></p>
 					</div>
 					<p>Specify a global cutoff (quantile) if the population cannot be partitioned statistically: <input type="text" id="globalCO2" size=4 value="0.25"></p>
@@ -632,7 +632,7 @@
 				</div>
 				<p><input type="radio" name="orthologing" onclick="change_orthologing()"> Let the program infer gene orthology using its built-in function.</p>
 				<div id="divInOrthologing" class="subsection" style="display:none">
-					<p>Rule: two genes are defined as (co-)orthologous if they: 
+					<p>Rule: two genes are defined as (co-)orthologous if they:
 					<select id="pairRule">
 						<option value=1>one is a hit of the other</option>
 						<option value=2>one is the best hit of the other</option>
@@ -679,19 +679,19 @@
 			<div class="subsection" id="divTreer" style="display:none">
 				<p><b>Treer</b> is an extra step that stands alone from the standard HGTector pipeline. It calls external programs to build phylogenetic trees based on protein sequences obtained from homology search. Resulting trees will be appended to each search result file (*.bla), as a standard NEXUS Tree block. These files may be directly displayed in external tree viewers (such as <a href="http://tree.bio.ed.ac.uk/software/figtree/">FigTree</a>) for visual examination of evolutionary patterns, or subject to downstream phylogenetic analyses.</p>
 				<p>Warning: Treer is an experimental module. Its behavior is highly unpredictable.</p>
-				<p><input type="checkbox" onchange="document.getElementById('realign').disabled=!this.checked"> Realign sequences, using: 
+				<p><input type="checkbox" onchange="document.getElementById('realign').disabled=!this.checked"> Realign sequences, using:
 					<select id="realign" disabled=true">
 						<option value='ClustalW'>ClustalW</option>
 						<option value='MAFFT'>MAFFT</option>
 						<option value='Muscle'>Muscle</option>
 					</select>
 				</p>
-				<p><input type="checkbox" onchange="document.getElementById('trimSeq').disabled=!this.checked"> Trim unreliable regions from alignment, using: 
+				<p><input type="checkbox" onchange="document.getElementById('trimSeq').disabled=!this.checked"> Trim unreliable regions from alignment, using:
 					<select id="trimSeq" disabled=true">
 						<option value='Gblocks'>Gblocks</option>
 					</select>
 				</p>
-				<p><input type="checkbox" id="chkTreeOption" onchange="document.getElementById('buildTree').disabled=!this.checked;show_hide_subsection('TreeOption')"> Build phylogenetic tree, using: 
+				<p><input type="checkbox" id="chkTreeOption" onchange="document.getElementById('buildTree').disabled=!this.checked;show_hide_subsection('TreeOption')"> Build phylogenetic tree, using:
 					<select id="buildTree" disabled=true">
 						<option value='clustalw'>ClustalW (Neighbor-Joining)</option>
 						<option value='mafft'>MAFFT (Neighbor-Joining)</option>
@@ -742,7 +742,7 @@
 				<p><input type="checkbox" id="plot3D"> Use R package <b>rgl</b> to creates interactive 3D scatter plots to visualize the weight populations.</p>
 			</div>
 		</div>
-		
+
 		<!-- Generate config.txt -->
 		<hr>
 		<p><input type="button" id="btnOK" value="Generate configuration file!" onclick="generate_config()"><input type="button" id="btnMiscShow" value="misc features" style="float:right" onclick="document.getElementById('divMisc').style.display='block';document.getElementById('btnMiscShow').style.display='none';"></p>
@@ -768,4 +768,3 @@
 		<p><br><br><br><br><br><br><br><br></p>
 	</body>
 </html>
-
diff --git a/LICENSE b/LICENSE
index 932f55e..fc86ad5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2013-2016, Qiyun Zhu and Katharina Dittmar
+Copyright (c) 2013-2017, Qiyun Zhu and Katharina Dittmar
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
@@ -6,4 +6,4 @@ Redistribution and use in source and binary forms, with or without modification,
     Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
     Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index cd3e853..09040b1 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ HGTector
 
 A computational pipeline for genome-wide detection of putative horizontal gene transfer (HGT) events based on sequence homology search hit distribution statistics
 
-Latest release: [0.2.1](https://github.com/DittmarLab/HGTector/archive/0.2.1.zip)
+Latest release: [0.2.1](https://github.com/DittmarLab/HGTector/archive/0.2.2.zip)
 
 Last update: 2017-1-28
 
diff --git a/config.txt b/config.txt
index cef945e..293fb57 100755
--- a/config.txt
+++ b/config.txt
@@ -1,6 +1,6 @@
 
 #######################################################
-#        HGTector v0.2.1 configuration file           #
+#        HGTector v0.2.2 configuration file           #
 #######################################################
 
 # This is a template of the configuration file (config.txt). It contains all parameters
@@ -31,7 +31,7 @@ interactive=1
   preSearch=
 
   ## Search tool ##
-  
+
     # Choice of BLAST, RAPSearch2, DIAMOND and customized
     searchTool=blast
 
@@ -188,7 +188,7 @@ interactive=1
       # selfRank=family       # family level (Enterobacteriaceae)
       # selfRank=0            # the lowest rank that includes all input genomes (Escherichia)
       # selfRank=1            # the second lowest rank (Enterobacteriaceae)
-    
+
     # Option 2: manually specify TaxID(s) for each group
       # TaxIDs can be looked-up in NCBI taxonomy database. They don't have to be limited to
       # taxonomic ranks defined by 'ranks'.
@@ -396,7 +396,7 @@ interactive=1
   outText=1
   outHTML=1
   outExcel=0
-  
+
   # Attach detailed outputs in Excel spreadsheet
   detailExcel=1
 
@@ -449,4 +449,3 @@ interactive=1
     fasttree=FastTree
     raxml=raxmlHPC
     phyml=phyml
-

From 133dcbef850cd0d5f1273084ec9682268fcfd157 Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 01:11:34 -0700
Subject: [PATCH 5/9] fixed some indents

---
 scripts/analyzer.pl   |  68 ++--
 scripts/bbh.pl        |  19 +-
 scripts/databaser.py  | 711 +++++++++++++++++++++---------------------
 scripts/orthologer.pl |  30 +-
 scripts/reporter.pl   |  31 +-
 scripts/treer.pl      |  34 +-
 6 files changed, 446 insertions(+), 447 deletions(-)

diff --git a/scripts/analyzer.pl b/scripts/analyzer.pl
index 2137ce1..d616f72 100644
--- a/scripts/analyzer.pl
+++ b/scripts/analyzer.pl
@@ -50,85 +50,85 @@
 
 ## global variables ##
 
-my @sets;                                        # proteins sets to analyze
-my %lv = ();                                            # taxonomic grouping scenario
+my @sets;                                       # proteins sets to analyze
+my %lv = ();                                    # taxonomic grouping scenario
 my %selves = ();                                # taxonomic information of self group
 my %lvList = ();                                # list of taxids of each level
 
 my %taxadb = ();                                # taxa.db
-my %ranksdb = ();                                # ranks.db
-my %selfinfo = ();                                # self.info
+my %ranksdb = ();                               # ranks.db
+my %selfinfo = ();                              # self.info
 
-my %proteins = ();                                # accn -> name (identified by COG)
+my %proteins = ();                              # accn -> name (identified by COG)
 
-my @files;                                        # search report files for each protein set
+my @files;                                      # search report files for each protein set
 
 ## the master table storing everything. It's an array of hashes. Each row is a record.
 my %results = ();
 
 ## the phyletic pattern of the whole genome, aka "fingerprint"
-my %fpN = ();                                    # number of hits per protein
+my %fpN = ();                                   # number of hits per protein
 
 ## program parameters ##
 
-my $wkDir = $ARGV[0];                            # working directory
+my $wkDir = $ARGV[0];                           # working directory
 my $interactive = 1;                            # interactive or automatic mode
 
 my $minHits = 0;                                # minimal number of hits a valid search result should contain
 my $maxHits = 0;                                # maximal number of hits to retain for one protein, 0 means infinite
 my $minSize = 0;                                # minimal size (aa) of a valid protein (0 means infinite)
-my $evalue = 1e-5;                                # E-value cutoff
-my $identity = 0;                                # percent identity cutoff
-my $coverage = 0;                                # query coverage cutoff
+my $evalue = 1e-5;                              # E-value cutoff
+my $identity = 0;                               # percent identity cutoff
+my $coverage = 0;                               # query coverage cutoff
 
 # algorithm
-my $selfRank = 0;                                # taxonomic rank(s) on which the program analyzes
-my $normalize = 1;                                # use relative bit score (bit score of subject / bit score of query)
-my $unite = 1;                                    # hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
+my $selfRank = 0;                               # taxonomic rank(s) on which the program analyzes
+my $normalize = 1;                              # use relative bit score (bit score of subject / bit score of query)
+my $unite = 1;                                  # hit pattern (0: each genome has own pattern, 1: one pattern for all genomes)
 
 my $useDistance = 0;                            # use phylogenetic distance instead of bit scores
-my $useWeight = 1;                                # use weight (sum of scores) instead of number of hits
+my $useWeight = 1;                              # use weight (sum of scores) instead of number of hits
 
 # fingerprints
-my $outRaw = 1;                                    # output raw number/weight data
-my $outFp = 1;                                    # output fingerprint
+my $outRaw = 1;                                 # output raw number/weight data
+my $outFp = 1;                                  # output fingerprint
 my $graphFp = 0;                                # graph fingerprint (requires R)
 
-my $plotRef = "";                                # file name of reference set of genes
+my $plotRef = "";                               # file name of reference set of genes
 
 my $boxPlot = 1;                                # box plot
-my $histogram = 1;                                # histogram
+my $histogram = 1;                              # histogram
 my $densityPlot = 1;                            # density plot
 my $scatterPlot = 1;                            # scatter plot
-my $plot3D = 0;                                    # 3-way scatter plot
+my $plot3D = 0;                                 # 3-way scatter plot
 
 # cutoffs
-my $howCO = 4;                                    # how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
-                                                    # cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
-                                                    # 5: hierarchical clustering)
+my $howCO = 4;                                  # how to determine cutoffs (0: user-defined global cutoff (%), 1: user-defined individual
+                                                # cutoffs, 2: wait for user input, 3: histogram, 4: kernel density estimation,
+                                                # 5: hierarchical clustering)
 my $globalCO = 0.25;                            # arbitrary global cutoff (%)
-my ($selfCO, $closeCO, $distalCO) = (0, 0, 0);    # user-defined cutoffs for individual groups
+my ($selfCO, $closeCO, $distalCO) = (0, 0, 0);  # user-defined cutoffs for individual groups
 
-my $exOutlier = 0;                                # exclude outliers, hits distant from other hits of the same group
+my $exOutlier = 0;                              # exclude outliers, hits distant from other hits of the same group
 
-my $nBin = 20;                                    # number of bins in histogram
-my $plotHist = 0;                                # plot histogram on screen
+my $nBin = 20;                                  # number of bins in histogram
+my $plotHist = 0;                               # plot histogram on screen
 
 my $toolKDE = 0;                                # computational tool for kernel density estimation
 my $bwF = 1;                                    # bandwidth selection factor
 my $plotKDE = 0;                                # plot density function on screen
 my $toolExtrema = 0;                            # computational tool for identifying local extrema of density function (0: Perl code, 1: R package "pastecs")
-my $whichPeak = 0;                                # definition of "typical" and "atypical" regions
-my $modKCO = 1;                                    # location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
-my $qKCO = 0.5;                                    # horizontal/vertical quantile from 1st pit toward 1st peak
+my $whichPeak = 0;                              # definition of "typical" and "atypical" regions
+my $modKCO = 1;                                 # location of cutoff (0: 1st pit, 1: midpoint of x-coordinates between 1st peak and 1st pit, 2/3: quantile
+my $qKCO = 0.5;                                 # horizontal/vertical quantile from 1st pit toward 1st peak
 
 my $dipTest = 0;                                # perform non-unimodality test (Hartigan's dip test) and report p-value
-my $dipSig = 0;                                    # use global cutoff if dip test's result is not significant
+my $dipSig = 0;                                 # use global cutoff if dip test's result is not significant
 
 my $selfLow = 0;                                # HGT-derived genes must have low self weight (an optional criterion)
 
 my $BBH = 0;                                    # use conventional best match method instead
-my $loss = 0;                                    # also report gene loss events
+my $loss = 0;                                   # also report gene loss events
 my $POE = 0;                                    # also report POE
 
 my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
@@ -139,7 +139,7 @@
 my @inSets = ();
 my @exSets = ();
 
-my $R;                                            # Statistics::R instance
+my $R;                                          # Statistics::R instance
 
 ## read configurations ##
 
@@ -153,7 +153,7 @@
         $minSize = $1 if /^minSize=(\d+)$/;
         $evalue = $1 if /^evalue=(.+)$/;
         $identity = $1 if /^identity=(.+)$/;
-        $identity = $1 if /^percIdent=(.+)$/; # backward compatibility
+        $identity = $1 if /^percIdent=(.+)$/;  # backward compatibility
         $coverage = $1 if /^coverage=(.+)$/;
 
         $selfRank = $1 if /^selfRank=(.+)$/;
diff --git a/scripts/bbh.pl b/scripts/bbh.pl
index 24a416d..ecc4555 100644
--- a/scripts/bbh.pl
+++ b/scripts/bbh.pl
@@ -25,17 +25,17 @@
 
 ## global variables ##
 
-my @queries = ();                                    # list of query proteins
-my %done = ();                                        # queries that is already done.
+my @queries = ();                   # list of query proteins
+my %done = ();                      # queries that is already done.
 
-my $query;                                            # current query protein (accession number)
-my $taxid;                                            # target organism
-my $subject;                                        # subject protein (accession number)
-my $eqText;                                            # entrez query text
-my $isBBH;                                            # whether a hit is BBH
+my $query;                          # current query protein (accession number)
+my $taxid;                          # target organism
+my $subject;                        # subject protein (accession number)
+my $eqText;                         # entrez query text
+my $isBBH;                          # whether a hit is BBH
 
-my $iRetry = 0;                                        # current number of retries
-my $isError = 0;                                    # if error
+my $iRetry = 0;                     # current number of retries
+my $isError = 0;                    # if error
 
 use LWP::UserAgent;
 use HTTP::Request::Common qw(POST);
@@ -212,7 +212,6 @@ ()
         foreach (@out) {
             if (/hits? found/) { $read = 1; next; }
             next unless $read;
-#            print;
             @a = split (/\t/);
             if ($a[1] =~ $subject) { $isBBH = 1; }
             last;
diff --git a/scripts/databaser.py b/scripts/databaser.py
index f77c565..55489eb 100644
--- a/scripts/databaser.py
+++ b/scripts/databaser.py
@@ -9,64 +9,64 @@
 import os, sys, re, time, datetime, subprocess, urllib, ftplib, gzip, tarfile
 
 print('\n'.join((
-	'',
-	'Databaser: generate taxonomically balanced, non-redundant proteome database.',
-	'Author: Qiyun Zhu (qzhu@jcvi.org)',
-	'Affiliation: J. Craig Venter Institute',
-	'Last updated: Jan 28, 2017',
-	''
-	)))
+    '',
+    'Databaser: generate taxonomically balanced, non-redundant proteome database.',
+    'Author: Qiyun Zhu (qzhu@jcvi.org)',
+    'Affiliation: J. Craig Venter Institute',
+    'Last updated: Jan 28, 2017',
+    ''
+    )))
 
 if len(sys.argv) == 1:
-	print('\n'.join((
-	'Usage:',
-	'  python scripts/databaser.py [-format=none|blast] [-range=all|microbe|etc] [-represent=no|auto|filename] [-subsample=0|1|2...] [-out=dbname]',
-	'Note:',
-	'  By default, the script will download NCBI RefSeq proteomic data of microbial organisms, keep one proteome per species, plus all NCBI-defined representative proteomes.',
-	'  Use "-help" to display details of program parameters.'
-	)))
-	input('Press Enter to continue or Ctrl+C to exit...')
+    print('\n'.join((
+    'Usage:',
+    '  python scripts/databaser.py [-format=none|blast] [-range=all|microbe|etc] [-represent=no|auto|filename] [-subsample=0|1|2...] [-out=dbname]',
+    'Note:',
+    '  By default, the script will download NCBI RefSeq proteomic data of microbial organisms, keep one proteome per species, plus all NCBI-defined representative proteomes.',
+    '  Use "-help" to display details of program parameters.'
+    )))
+    input('Press Enter to continue or Ctrl+C to exit...')
 elif sys.argv[1] == '-h' or sys.argv[1].endswith('-help'):
-	print('\n'.join((
-	'Details of parameters:',
-	'  -format=<none, blast (default)>',
-	'    Format of the output database. If "none", only a master fasta file will be generated, based on which one may manually create customized databases.',
-	'  -range=<taxonomic categories>',
-	'    The NCBI RefSeq genome database has the following categories: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral.',
-	'    Type one or more desired categories, separated by comma (,).',
-	'    Type "all" for all categories, or "microbe" for archaea, bacteria, fungi, and protozoa (default).',
-	'  -represent=<no, auto (default) or filename>',
-	'    By default, the script will download a list of NCBI-defined representative genomes, and include all of them in the resulting database.',
-	'    One may also provide a user-defined genome list (format: one taxid per line), or switch off this function',
-	'  -subsample=<0, 1 (default), 2, 3...>',
-	'    By default, the script will keep up to one proteome per species.',
-	'    Change the number for more proteomes, or "0" for all (no subsampling).',
-	'  -out=<database name>',
-	'    Please use a simple string without spaces or symbols. The default name is "stdb".',
-	'',
-	'Examples:',
-	'  Default behavior:',
-	'    python script/databaser.py -format=blast -range=microbe -subsample=1 -represent=auto',
-	'  Download microbial eukaryotic proteomes:',
-	'    python script/databaser.py -format=none -range=protozoa,fungi -subsample=0 -represent=no -out="MyEukarie"',
-	''
-	)))
-	sys.exit()
+    print('\n'.join((
+    'Details of parameters:',
+    '  -format=<none, blast (default)>',
+    '    Format of the output database. If "none", only a master fasta file will be generated, based on which one may manually create customized databases.',
+    '  -range=<taxonomic categories>',
+    '    The NCBI RefSeq genome database has the following categories: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral.',
+    '    Type one or more desired categories, separated by comma (,).',
+    '    Type "all" for all categories, or "microbe" for archaea, bacteria, fungi, and protozoa (default).',
+    '  -represent=<no, auto (default) or filename>',
+    '    By default, the script will download a list of NCBI-defined representative genomes, and include all of them in the resulting database.',
+    '    One may also provide a user-defined genome list (format: one taxid per line), or switch off this function',
+    '  -subsample=<0, 1 (default), 2, 3...>',
+    '    By default, the script will keep up to one proteome per species.',
+    '    Change the number for more proteomes, or "0" for all (no subsampling).',
+    '  -out=<database name>',
+    '    Please use a simple string without spaces or symbols. The default name is "stdb".',
+    '',
+    'Examples:',
+    '  Default behavior:',
+    '    python script/databaser.py -format=blast -range=microbe -subsample=1 -represent=auto',
+    '  Download microbial eukaryotic proteomes:',
+    '    python script/databaser.py -format=none -range=protozoa,fungi -subsample=0 -represent=no -out="MyEukarie"',
+    ''
+    )))
+    sys.exit()
 
 today = datetime.date.today()
 with open('date.txt', 'w') as fout:
-	fout.write(str(today))
+    fout.write(str(today))
 
 (retries, delay) = (5, 10)
 (format, rng, repre, subsp, outname) = ('blast', 'microbe', 'auto', 1, 'stdb')
 for argv in sys.argv[1:]:
-	l = argv.split('=')
-	if len(l) < 2: continue
-	if l[0] == '-format': format = l[1]
-	elif l[0] == '-range': rng = l[1]
-	elif l[0] == '-represent': repre = l[1]
-	elif l[0] == '-subsample': subsp = int(l[1])
-	elif l[0] == '-out': outname = l[1]
+    l = argv.split('=')
+    if len(l) < 2: continue
+    if l[0] == '-format': format = l[1]
+    elif l[0] == '-range': rng = l[1]
+    elif l[0] == '-represent': repre = l[1]
+    elif l[0] == '-subsample': subsp = int(l[1])
+    elif l[0] == '-out': outname = l[1]
 
 print('Connecting to the NCBI FTP server...', end=' ')
 sys.stdout.flush()
@@ -77,13 +77,13 @@
 print('Downloading the NCBI taxonomy database...', end=' ')
 sys.stdout.flush()
 if os.path.isdir('taxdump'):
-	if os.path.isfile('taxdump/names.dmp'): os.remove('taxdump/names.dmp')
-	if os.path.isfile('taxdump/nodes.dmp'): os.remove('taxdump/nodes.dmp')
+    if os.path.isfile('taxdump/names.dmp'): os.remove('taxdump/names.dmp')
+    if os.path.isfile('taxdump/nodes.dmp'): os.remove('taxdump/nodes.dmp')
 else:
-	os.makedirs('taxdump')
+    os.makedirs('taxdump')
 ftp.cwd('/pub/taxonomy')
 with open('taxdump.tar.gz', 'wb') as fout:
-	ftp.retrbinary('RETR ' + 'taxdump.tar.gz', fout.write, 1024)
+    ftp.retrbinary('RETR ' + 'taxdump.tar.gz', fout.write, 1024)
 tar = tarfile.open('taxdump.tar.gz', 'r:gz')
 tar.extract('names.dmp', 'taxdump')
 tar.extract('nodes.dmp', 'taxdump')
@@ -94,170 +94,170 @@
 name2id = {}
 nname = {} # number of names per TaxID
 if repre == 'auto' or subsp:
-	print('Reading the NCBI taxonomy database...', end=' ')
-	sys.stdout.flush()
-	fin = open('taxdump/names.dmp', 'r')
-	for line in fin:
-		l = re.split('\s+\|\s+', line)
-		if l[0] in nname: nname[l[0]] += 1
-		else: nname[l[0]] = 1
-		if re.search('scientific name\s*\|', line):
-			if l[1] not in name2id:
-				name2id[l[1]] = l[0]
-	fin.close()
-	print('done.')
-	print(' ', str(len(nname)), 'taxa read.')
+    print('Reading the NCBI taxonomy database...', end=' ')
+    sys.stdout.flush()
+    fin = open('taxdump/names.dmp', 'r')
+    for line in fin:
+        l = re.split('\s+\|\s+', line)
+        if l[0] in nname: nname[l[0]] += 1
+        else: nname[l[0]] = 1
+        if re.search('scientific name\s*\|', line):
+            if l[1] not in name2id:
+                name2id[l[1]] = l[0]
+    fin.close()
+    print('done.')
+    print(' ', str(len(nname)), 'taxa read.')
 
 represents = {}
 if repre == 'auto':
-	print('Downloading the NCBI representative genome list...', end=' ')
-	sys.stdout.flush()
-	if os.path.exists('representative_genomes.txt'): os.remove('representative_genomes.txt')
-	try:
-		urlretrieve('http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on', 'representative_genomes.txt')
-		print('done.')
-	except:
-		print('failed.')
-	if os.path.exists('representative_genomes.txt'):
-		print('Reading the NCBI representative genome list...', end=' ')
-		sys.stdout.flush()
-		fin = open('representative_genomes.txt', 'r')
-		for line in fin:
-			if line[0] == '#': continue
-			line = line.rstrip('\r\n')
-			if not line: continue
-			name = line.split('\t')[2]
-			if name in name2id:
-				represents[name2id[name]] = 1
-		fin.close()
-		print('done.')
-		print(' ', str(len(represents)), ' genomes read.')
+    print('Downloading the NCBI representative genome list...', end=' ')
+    sys.stdout.flush()
+    if os.path.exists('representative_genomes.txt'): os.remove('representative_genomes.txt')
+    try:
+        urlretrieve('http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on', 'representative_genomes.txt')
+        print('done.')
+    except:
+        print('failed.')
+    if os.path.exists('representative_genomes.txt'):
+        print('Reading the NCBI representative genome list...', end=' ')
+        sys.stdout.flush()
+        fin = open('representative_genomes.txt', 'r')
+        for line in fin:
+            if line[0] == '#': continue
+            line = line.rstrip('\r\n')
+            if not line: continue
+            name = line.split('\t')[2]
+            if name in name2id:
+                represents[name2id[name]] = 1
+        fin.close()
+        print('done.')
+        print(' ', str(len(represents)), ' genomes read.')
 elif repre != 'no' and os.path.exists(repre):
-	print('Reading user-defined representative genome list...', end=' ')
-	sys.stdout.flush()
-	fin = open(repre, 'r')
-	for line in fin:
-		if line[0] == '#': continue
-		line = line.rstrip('\r\n')
-		if not line: continue
-		l = line.split('\t')
-		represents[l[0]] = 1
-	fin.close()
-	print(' ', str(len(represents)), ' genomes read.')
+    print('Reading user-defined representative genome list...', end=' ')
+    sys.stdout.flush()
+    fin = open(repre, 'r')
+    for line in fin:
+        if line[0] == '#': continue
+        line = line.rstrip('\r\n')
+        if not line: continue
+        l = line.split('\t')
+        represents[l[0]] = 1
+    fin.close()
+    print(' ', str(len(represents)), ' genomes read.')
 
 print('Reading RefSeq genome list...', end=' ')
 sys.stdout.flush()
 gcfs = {}
 downfiles = {}
 if rng == 'all':
-	downfiles['assembly_summary_refseq.txt'] = 'assembly_summary_refseq.txt'
+    downfiles['assembly_summary_refseq.txt'] = 'assembly_summary_refseq.txt'
 elif rng == 'microbe':
-	for i in ('archaea', 'bacteria', 'fungi', 'protozoa'):
-		downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
+    for i in ('archaea', 'bacteria', 'fungi', 'protozoa'):
+        downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
 else:
-	for i in rng.split(','):
-		downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
+    for i in rng.split(','):
+        downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
 for downfile in downfiles:
-	if os.path.isfile(downfile): os.remove(downfile)
-	if '/' in downfiles[downfile]: ftp.cwd('/genomes/refseq/' + downfiles[downfile].split('/')[0])
-	else: ftp.cwd('/genomes/refseq')
-	with open(downfile, 'wb') as fout:
-		if '/' in downfiles[downfile]: ftp.retrbinary('RETR ' + downfiles[downfile].split('/')[1], fout.write, 1024)
-		else: ftp.retrbinary('RETR ' + downfiles[downfile], fout.write, 1024)
-	fin = open(downfile, 'r')
-	for line in fin:
-		line = line.rstrip('\r\n')
-		if line[0] == '#': continue
-		l = line.split('\t')
-		if l[10] == 'latest':
-			gcfs[l[0]] = {'wgs_master': l[3], 'taxid': l[5], 'species_taxid': l[6], 'organism_name': l[7], 'assembly_level': l[11], 'seqs': 0, 'length': 0}
-	fin.close()
-	os.remove(downfile)
+    if os.path.isfile(downfile): os.remove(downfile)
+    if '/' in downfiles[downfile]: ftp.cwd('/genomes/refseq/' + downfiles[downfile].split('/')[0])
+    else: ftp.cwd('/genomes/refseq')
+    with open(downfile, 'wb') as fout:
+        if '/' in downfiles[downfile]: ftp.retrbinary('RETR ' + downfiles[downfile].split('/')[1], fout.write, 1024)
+        else: ftp.retrbinary('RETR ' + downfiles[downfile], fout.write, 1024)
+    fin = open(downfile, 'r')
+    for line in fin:
+        line = line.rstrip('\r\n')
+        if line[0] == '#': continue
+        l = line.split('\t')
+        if l[10] == 'latest':
+            gcfs[l[0]] = {'wgs_master': l[3], 'taxid': l[5], 'species_taxid': l[6], 'organism_name': l[7], 'assembly_level': l[11], 'seqs': 0, 'length': 0}
+    fin.close()
+    os.remove(downfile)
 print('done.')
 print(' ', str(len(gcfs)), 'genomes read.')
 
 if subsp:
-	print('Subsampling genomes...', end=' ')
-	sys.stdout.flush()
-	sp2gcfs = {}
-	for gcf in gcfs:
-		sp = gcfs[gcf]['species_taxid']
-		if sp in sp2gcfs: sp2gcfs[sp] += ',' + gcf
-		else: sp2gcfs[sp] = gcf
-	for sp in sp2gcfs:
-		l = sp2gcfs[sp].split(',')
-		if len(l) <= subsp:
-			for gcf in l:
-				gcfs[gcf]['in'] = 1
-		else:
-			n = 0
-			if represents: # get representative organisms as priority
-				repid2gcfs = {} # one representative TaxID may correspond to multiple GCFs
-				for gcf in l:
-					id = gcfs[gcf]['taxid']
-					if id in represents:
-						if id in repid2gcfs: repid2gcfs[id] += ',' + gcf
-						else: repid2gcfs[id] = gcf
-				for id in repid2gcfs:
-					if ',' in repid2gcfs[id]:
-						l = repid2gcfs[id].split(',')
-						thisgcf = ''
-						for gcf in l:
-							if gcfs[gcf]['assembly_level'] == 'Complete Genome' or gcfs[gcf]['assembly_level'] == 'Chromosome':
-								thisgcf = gcf
-								break
-						if not thisgcf:
-							for gcf in l:
-								if gcfs[gcf]['assembly_level'] == 'Scaffold':
-									thisgcf = gcf
-									break
-							if not thisgcf:
-								thisgcf = l[0]
-						gcfs[thisgcf]['in'] = 1
-					else:
-						gcfs[repid2gcfs[id]]['in'] = 1
-					n += 1
-			if n >= subsp: continue
-			names = {} # GCF => number of names
-			for gcf in l:
-				if gcfs[gcf]['assembly_level'] == 'Complete Genome' or gcfs[gcf]['assembly_level'] == 'Chromosome':
-					id = gcfs[gcf]['taxid']
-					if id in nname: names[gcf] = nname[id]
-					else: names[gcf] = 0
-			if names:
-				for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
-					gcfs[gcf]['in'] = 1
-					n += 1
-					if n >= subsp: break
-			if n >= subsp: continue
-			names = {}
-			for gcf in l:
-				if gcfs[gcf]['assembly_level'] == 'Scaffold':
-					id = gcfs[gcf]['taxid']
-					if id in nname: names[gcf] = nname[id]
-					else: names[gcf] = 0
-			if names:
-				for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
-					gcfs[gcf]['in'] = 1
-					n += 1
-					if n >= subsp: break
-			if n >= subsp: continue
-			names = {}
-			for gcf in l:
-				if gcfs[gcf]['assembly_level'] == 'Contig':
-					id = gcfs[gcf]['taxid']
-					if id in nname: names[gcf] = nname[id]
-					else: names[gcf] = 0
-			if names:
-				for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
-					gcfs[gcf]['in'] = 1
-					n += 1
-					if n >= subsp: break
-	n = 0
-	for gcf in gcfs:
-		if 'in' in gcfs[gcf]: n += 1
-	print('done.')
-	print(' ', str(n), 'genomes retained.')
+    print('Subsampling genomes...', end=' ')
+    sys.stdout.flush()
+    sp2gcfs = {}
+    for gcf in gcfs:
+        sp = gcfs[gcf]['species_taxid']
+        if sp in sp2gcfs: sp2gcfs[sp] += ',' + gcf
+        else: sp2gcfs[sp] = gcf
+    for sp in sp2gcfs:
+        l = sp2gcfs[sp].split(',')
+        if len(l) <= subsp:
+            for gcf in l:
+                gcfs[gcf]['in'] = 1
+        else:
+            n = 0
+            if represents: # get representative organisms as priority
+                repid2gcfs = {} # one representative TaxID may correspond to multiple GCFs
+                for gcf in l:
+                    id = gcfs[gcf]['taxid']
+                    if id in represents:
+                        if id in repid2gcfs: repid2gcfs[id] += ',' + gcf
+                        else: repid2gcfs[id] = gcf
+                for id in repid2gcfs:
+                    if ',' in repid2gcfs[id]:
+                        l = repid2gcfs[id].split(',')
+                        thisgcf = ''
+                        for gcf in l:
+                            if gcfs[gcf]['assembly_level'] == 'Complete Genome' or gcfs[gcf]['assembly_level'] == 'Chromosome':
+                                thisgcf = gcf
+                                break
+                        if not thisgcf:
+                            for gcf in l:
+                                if gcfs[gcf]['assembly_level'] == 'Scaffold':
+                                    thisgcf = gcf
+                                    break
+                            if not thisgcf:
+                                thisgcf = l[0]
+                        gcfs[thisgcf]['in'] = 1
+                    else:
+                        gcfs[repid2gcfs[id]]['in'] = 1
+                    n += 1
+            if n >= subsp: continue
+            names = {} # GCF => number of names
+            for gcf in l:
+                if gcfs[gcf]['assembly_level'] == 'Complete Genome' or gcfs[gcf]['assembly_level'] == 'Chromosome':
+                    id = gcfs[gcf]['taxid']
+                    if id in nname: names[gcf] = nname[id]
+                    else: names[gcf] = 0
+            if names:
+                for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
+                    gcfs[gcf]['in'] = 1
+                    n += 1
+                    if n >= subsp: break
+            if n >= subsp: continue
+            names = {}
+            for gcf in l:
+                if gcfs[gcf]['assembly_level'] == 'Scaffold':
+                    id = gcfs[gcf]['taxid']
+                    if id in nname: names[gcf] = nname[id]
+                    else: names[gcf] = 0
+            if names:
+                for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
+                    gcfs[gcf]['in'] = 1
+                    n += 1
+                    if n >= subsp: break
+            if n >= subsp: continue
+            names = {}
+            for gcf in l:
+                if gcfs[gcf]['assembly_level'] == 'Contig':
+                    id = gcfs[gcf]['taxid']
+                    if id in nname: names[gcf] = nname[id]
+                    else: names[gcf] = 0
+            if names:
+                for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
+                    gcfs[gcf]['in'] = 1
+                    n += 1
+                    if n >= subsp: break
+    n = 0
+    for gcf in gcfs:
+        if 'in' in gcfs[gcf]: n += 1
+    print('done.')
+    print(' ', str(n), 'genomes retained.')
 
 print('Downloading non-redundant proteomic data from NCBI RefSeq...')
 sys.stdout.flush()
@@ -266,172 +266,171 @@
 (nodata, haserror) = ([], [])
 print(' ', end=' ')
 with open('sampled_genomes.txt', 'w') as fout:
-	fout.write('#' + '\t'.join(('assembly_accession', 'wgs_master', 'taxid', 'species_taxid', 'organism_name', 'assembly_level', 'proteins', 'residues')) + '\n')
+    fout.write('#' + '\t'.join(('assembly_accession', 'wgs_master', 'taxid', 'species_taxid', 'organism_name', 'assembly_level', 'proteins', 'residues')) + '\n')
 for gcf in gcfs:
-	if subsp and 'in' not in gcfs[gcf]: continue
-	dir = '/'.join((gcf[4:7], gcf[7:10], gcf[10:13]))
-	(found, err550) = (0, 0)
-	for i in range(retries):
-		try:
-			ftp.cwd('/genomes/all/GCF/' + dir)
-			found = 1
-		except ftplib.error_perm as resp:
-			if str(resp).split()[0] == '550': err550 = 1
-		except ftplib.error_temp as resp:
-			time.sleep(delay)
-			continue
-		else: pass
-		break
-	if not found:
-		if err550: nodata.append(gcf)
-		else: haserror.append(gcf)
-		continue
-	(files, err550) = ([], 0)
-	for i in range(retries):
-		try:
-			files = ftp.nlst()
-		except ftplib.error_perm as resp:
-			if str(resp).split()[0] == '550': err550 = 1
-		except ftplib.error_temp as resp:
-			time.sleep(delay)
-			continue
-		else: pass
-		break
-	if not files:
-		if err550: nodata.append(gcf)
-		else: haserror.append(gcf)
-		continue
-	assembly = ''
-	for file in files:
-		if file.startswith(gcf):
-			assembly = file
-			break
-	if not assembly:
-		nodata.append(gcf)
-		continue
-	(found, err550) = (0, 0)
-	for i in range(retries):
-		try:
-			ftp.cwd('/genomes/all/GCF/' + dir + '/' + assembly)
-			found = 1
-		except ftplib.error_perm as resp:
-			if str(resp).split()[0] == '550': err550 = 1
-		except ftplib.error_temp as resp:
-			time.sleep(delay)
-			continue
-		else: pass
-		break
-	if not found:
-		if err550: nodata.append(gcf)
-		else: haserror.append(gcf)
-		continue
-	(files, err550) = ([], 0)
-	for i in range(retries):
-		try:
-			files = ftp.nlst()
-		except ftplib.error_perm as resp:
-			if str(resp).split()[0] == '550': err550 = 1
-		except ftplib.error_temp as resp:
-			time.sleep(delay)
-			continue
-		else: pass
-		break
-	if not files:
-		if err550: nodata.append(gcf)
-		else: haserror.append(gcf)
-		continue
-	downfile = assembly + '_protein.gpff.gz'
-	if not downfile in files:
-		nodata.append(gcf)
-		continue
-	written = 0
-	for i in range(retries):
-		try:
-			with open(downfile, 'wb') as fout:
-				ftp.retrbinary('RETR ' + downfile, fout.write, 1024)
-			written = 1
-		except ftplib.error_temp as resp:
-			time.sleep(delay)
-			continue
-		else: pass
-		break
-	if not written:
-		haserror.append(gcf)
-		continue
-	with open(outname + '.faa', 'a') as fout:
-		try: fin = gzip.open(downfile, mode='rt')
-		except TypeError: fin = gzip.open(downfile, 'r')
-		taxid = gcfs[gcf]['taxid']
-		(accn, gi, name, seq) = ('', '', '', '')
-		for line in fin:
-			line = line.rstrip('\r\n')
-			if line.startswith('DEFINITION  '):
-				name = line[12:].rstrip('.')
-			elif line.startswith('VERSION     '):
-				l = line[12:].split(' ')
-				(accn, gi) = (l[0], l[-1].split(':')[-1])
-			elif len(line) > 10 and re.search(r'^\s+\d+\s$', line[:10]):
-				seq += line[10:].replace(' ', '').upper()
-			elif line == '//':
-				if gi in gi2taxids:
-					gi2taxids[gi] += ',' + taxid
-					if gi not in sharedgis: sharedgis[gi] = '1'
-				else:
-					gi2taxids[gi] = taxid
-					gilist.append(gi)
-					fout.write('>gi|' + gi + '|ref|' + accn + '| ' + name + '\n' + seq +'\n')
-				gcfs[gcf]['seqs'] += 1
-				gcfs[gcf]['length'] += len(seq)
-				(accn, gi, name, seq) = ('', '', '', '')
-		fin.close()
-	nseq += gcfs[gcf]['seqs']
-	naa += gcfs[gcf]['length']
-	with open('sampled_genomes.txt', 'a') as fout:
-		fout.write('\t'.join((gcf, gcfs[gcf]['wgs_master'], taxid, gcfs[gcf]['species_taxid'], gcfs[gcf]['organism_name'], gcfs[gcf]['assembly_level'], str(gcfs[gcf]['seqs']), str(gcfs[gcf]['length']))) + '\n')
-	os.remove(downfile)
-	ngcf += 1
-	if not ngcf % 25:
-		print(str(ngcf), end=' ')
-		sys.stdout.flush()
+    if subsp and 'in' not in gcfs[gcf]: continue
+    dir = '/'.join((gcf[4:7], gcf[7:10], gcf[10:13]))
+    (found, err550) = (0, 0)
+    for i in range(retries):
+        try:
+            ftp.cwd('/genomes/all/GCF/' + dir)
+            found = 1
+        except ftplib.error_perm as resp:
+            if str(resp).split()[0] == '550': err550 = 1
+        except ftplib.error_temp as resp:
+            time.sleep(delay)
+            continue
+        else: pass
+        break
+    if not found:
+        if err550: nodata.append(gcf)
+        else: haserror.append(gcf)
+        continue
+    (files, err550) = ([], 0)
+    for i in range(retries):
+        try:
+            files = ftp.nlst()
+        except ftplib.error_perm as resp:
+            if str(resp).split()[0] == '550': err550 = 1
+        except ftplib.error_temp as resp:
+            time.sleep(delay)
+            continue
+        else: pass
+        break
+    if not files:
+        if err550: nodata.append(gcf)
+        else: haserror.append(gcf)
+        continue
+    assembly = ''
+    for file in files:
+        if file.startswith(gcf):
+            assembly = file
+            break
+    if not assembly:
+        nodata.append(gcf)
+        continue
+    (found, err550) = (0, 0)
+    for i in range(retries):
+        try:
+            ftp.cwd('/genomes/all/GCF/' + dir + '/' + assembly)
+            found = 1
+        except ftplib.error_perm as resp:
+            if str(resp).split()[0] == '550': err550 = 1
+        except ftplib.error_temp as resp:
+            time.sleep(delay)
+            continue
+        else: pass
+        break
+    if not found:
+        if err550: nodata.append(gcf)
+        else: haserror.append(gcf)
+        continue
+    (files, err550) = ([], 0)
+    for i in range(retries):
+        try:
+            files = ftp.nlst()
+        except ftplib.error_perm as resp:
+            if str(resp).split()[0] == '550': err550 = 1
+        except ftplib.error_temp as resp:
+            time.sleep(delay)
+            continue
+        else: pass
+        break
+    if not files:
+        if err550: nodata.append(gcf)
+        else: haserror.append(gcf)
+        continue
+    downfile = assembly + '_protein.gpff.gz'
+    if not downfile in files:
+        nodata.append(gcf)
+        continue
+    written = 0
+    for i in range(retries):
+        try:
+            with open(downfile, 'wb') as fout:
+                ftp.retrbinary('RETR ' + downfile, fout.write, 1024)
+            written = 1
+        except ftplib.error_temp as resp:
+            time.sleep(delay)
+            continue
+        else: pass
+        break
+    if not written:
+        haserror.append(gcf)
+        continue
+    with open(outname + '.faa', 'a') as fout:
+        try: fin = gzip.open(downfile, mode='rt')
+        except TypeError: fin = gzip.open(downfile, 'r')
+        taxid = gcfs[gcf]['taxid']
+        (accn, gi, name, seq) = ('', '', '', '')
+        for line in fin:
+            line = line.rstrip('\r\n')
+            if line.startswith('DEFINITION  '):
+                name = line[12:].rstrip('.')
+            elif line.startswith('VERSION     '):
+                l = line[12:].split(' ')
+                (accn, gi) = (l[0], l[-1].split(':')[-1])
+            elif len(line) > 10 and re.search(r'^\s+\d+\s$', line[:10]):
+                seq += line[10:].replace(' ', '').upper()
+            elif line == '//':
+                if gi in gi2taxids:
+                    gi2taxids[gi] += ',' + taxid
+                    if gi not in sharedgis: sharedgis[gi] = '1'
+                else:
+                    gi2taxids[gi] = taxid
+                    gilist.append(gi)
+                    fout.write('>gi|' + gi + '|ref|' + accn + '| ' + name + '\n' + seq +'\n')
+                gcfs[gcf]['seqs'] += 1
+                gcfs[gcf]['length'] += len(seq)
+                (accn, gi, name, seq) = ('', '', '', '')
+        fin.close()
+    nseq += gcfs[gcf]['seqs']
+    naa += gcfs[gcf]['length']
+    with open('sampled_genomes.txt', 'a') as fout:
+        fout.write('\t'.join((gcf, gcfs[gcf]['wgs_master'], taxid, gcfs[gcf]['species_taxid'], gcfs[gcf]['organism_name'], gcfs[gcf]['assembly_level'], str(gcfs[gcf]['seqs']), str(gcfs[gcf]['length']))) + '\n')
+    os.remove(downfile)
+    ngcf += 1
+    if not ngcf % 25:
+        print(str(ngcf), end=' ')
+        sys.stdout.flush()
 print('done.')
 
 print(' ', str(ngcf), 'genomes,', str(nseq), 'protein sequences, and', str(naa), 'residues downloaded.')
 if nodata:
-	print(' ', len(nodata), 'genomes have no data.')
-	with open('genomes_without_data.txt', 'w') as fout:
-		for i in nodata: fout.write(i + '\n')
+    print(' ', len(nodata), 'genomes have no data.')
+    with open('genomes_without_data.txt', 'w') as fout:
+        for i in nodata: fout.write(i + '\n')
 if haserror:
-	print(' ', len(haserror), 'genomes cannot be downloaded.')
-	with open('cannot_download.txt', 'w') as fout:
-		for i in haserror: fout.write(i + '\n')
+    print(' ', len(haserror), 'genomes cannot be downloaded.')
+    with open('cannot_download.txt', 'w') as fout:
+        for i in haserror: fout.write(i + '\n')
 with open('gi2taxid.txt', 'w') as fout:
-	for gi in gilist:
-		fout.write(gi + '\t' + gi2taxids[gi] + '\n')
+    for gi in gilist:
+        fout.write(gi + '\t' + gi2taxids[gi] + '\n')
 if sharedgis:
-	print(' ', str(len(gi2taxids)), 'proteins extracted, including', str(len(sharedgis)), 'proteins shared by multiple organisms.')
+    print(' ', str(len(gi2taxids)), 'proteins extracted, including', str(len(sharedgis)), 'proteins shared by multiple organisms.')
 else:
-	print(' ', str(len(gi2taxids)), 'proteins extracted.')
+    print(' ', str(len(gi2taxids)), 'proteins extracted.')
 
 if format == 'blast':
-	print('Building BLAST database...')
-	sys.stdout.flush()
-	if os.path.isdir('blast'):
-		filelist = os.listdir('blast')
-		if filelist:
-			for filename in filelist:
-				os.remove('blast/' + filename)
-	else:
-		os.makedirs('blast')
-	p = subprocess.Popen('makeblastdb -in ' + outname + '.faa -parse_seqids -dbtype prot -out blast/' + outname + ' -title "Auto-generated standard BLAST database ' + str(today) + '"', stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
-	out = p.stdout.read()
-	if 'command not found' in out:
-		print('failed. Check if ncbi-blast+ has been correctly installed.')
-	else:
-		for line in out.split('\n'):
-			if not line: continue
-			if line[0] == '\t': line = '  ' + line[1:]
-			print('  ' + line)
-	p.stdout.close()
-	print('done.')
+    print('Building BLAST database...')
+    sys.stdout.flush()
+    if os.path.isdir('blast'):
+        filelist = os.listdir('blast')
+        if filelist:
+            for filename in filelist:
+                os.remove('blast/' + filename)
+    else:
+        os.makedirs('blast')
+    p = subprocess.Popen('makeblastdb -in ' + outname + '.faa -parse_seqids -dbtype prot -out blast/' + outname + ' -title "Auto-generated standard BLAST database ' + str(today) + '"', stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    out = p.stdout.read()
+    if 'command not found' in out:
+        print('failed. Check if ncbi-blast+ has been correctly installed.')
+    else:
+        for line in out.split('\n'):
+            if not line: continue
+            if line[0] == '\t': line = '  ' + line[1:]
+            print('  ' + line)
+    p.stdout.close()
+    print('done.')
 print('Task completed.')
-
diff --git a/scripts/orthologer.pl b/scripts/orthologer.pl
index 0f90675..8447c57 100644
--- a/scripts/orthologer.pl
+++ b/scripts/orthologer.pl
@@ -26,25 +26,25 @@
 
 ## Program parameters ##
 
-my $wkDir = $ARGV[0];                                # working directory
-my @sets = ();                                        # list of protein sets
+my $wkDir = $ARGV[0];               # working directory
+my @sets = ();                      # list of protein sets
 
-my %taxa = ();                                        # set -> taxid
-my %proteins = ();                                    # all input proteins (accn -> id)
+my %taxa = ();                      # set -> taxid
+my %proteins = ();                  # all input proteins (accn -> id)
 
-my @cogs = ();                                        # all COGs. array of hashes, including:
-                                                        # accns -> accn1/accn2/accn3...
-                                                        # names -> name1    name2    name3...
-                                                        # nProtein -> number of proteins
-                                                        # nSet -> number of genomes
-                                                        # name -> best name
-my %names = ();                                        # id -> names
-my %accns = ();                                        # id -> accns
-my $cogid = 1;                                        # current id
+my @cogs = ();                      # all COGs. array of hashes, including:
+                                    # accns -> accn1/accn2/accn3...
+                                    # names -> name1    name2    name3...
+                                    # nProtein -> number of proteins
+                                    # nSet -> number of genomes
+                                    # name -> best name
+my %names = ();                     # id -> names
+my %accns = ();                     # id -> accns
+my $cogid = 1;                      # current id
 
-my $evalue = 1e-5;                                    # maximum E-value threshold
+my $evalue = 1e-5;                  # maximum E-value threshold
 
-my $pairRule = 3;                                    # pairing rule
+my $pairRule = 3;                   # pairing rule
 
 ## Read configuration ##
 
diff --git a/scripts/reporter.pl b/scripts/reporter.pl
index 2ce63ec..b4e9ed4 100644
--- a/scripts/reporter.pl
+++ b/scripts/reporter.pl
@@ -31,50 +31,51 @@
 my $wkDir = $ARGV[0];
 my $interactive = 1;
 
-my $deHypo = 0;                    # ignore hypothetical proteins
+my $deHypo = 0;                     # ignore hypothetical proteins
 
 my $byDonor = 1;                    # summarize HGT events by organism
 my @ranks = ('species', 'genus', 'family', 'order', 'class', 'phylum');
-my $sumRank = "order";            # on this rank
+my $sumRank = "order";              # on this rank
 my $defOnly = 1;                    # ignore those without this rank defined
 
-my $byFunction = 0;                # summarize HGT by function
+my $byFunction = 0;                 # summarize HGT by function
 my $dirFunction;                    # directory containing functional annotations
 
 my $byOrthology = 0;                # generate report by ortholog
 my $smOrthology;                    # file containing scheme of gene orthology
 my $nameOGs = 1;                    # Name COGs if necessary
 my $exORFan = 1;                    # Exclude ORFans
-my @cogs;                            # clusters of orthologs (COGs) [ID] -> name short (long), predicted?
+my @cogs;                           # clusters of orthologs (COGs) [ID] -> name short (long), predicted?
 my %cogids = ();                    # clusters of orthologs (COGs): accn -> ID
-my $isCOG = 0;                        # whether use COG
+my $isCOG = 0;                      # whether use COG
 my %ORFans = ();                    # ORFans set -> (array)
 
 my $outText = 1;                    # generate report in plain text
 my $outHTML = 1;                    # generate report in web page (HTML)
-my $outExcel = 0;                    # generate report in Excel spreadsheet
+my $outExcel = 0;                   # generate report in Excel spreadsheet
 my $detailExcel = 1;                # attach detailed output in Excel workbook
 
 
 ## global variables ##
 
-my $title;                            # title of the analysis
-my @sets;                            # protein sets (genomes)
+my $title;                          # title of the analysis
+my @sets;                           # protein sets (genomes)
 
-my %results = ();                    # master variable of results: $results{$set}{$accn}{$match}
+my %results = ();                   # master variable of results: $results{$set}{$accn}{$match}
 
 my %totals = ();                    # total number of genes per genome
 my %organisms = ();
 
 my %taxadb = ();                    # taxa.db
-my %ranksdb = ();                    # ranks.db
+my %ranksdb = ();                   # ranks.db
 
-my $workbook;                        # Excel workbook
-my $worksheet;                        # Excel worksheet
-my $excelRow;                        # active row number
-my $excelTitle;                    # Excel title format
+my $workbook;                       # Excel workbook
+my $worksheet;                      # Excel worksheet
+my $excelRow;                       # active row number
+my $excelTitle;                     # Excel title format
 my $excelHeader;                    # Excel header format
-my ($excelGrey, $excelGreen, $excelYellow, $excelRed);                # Excel data formats
+my ($excelGrey, $excelGreen, $excelYellow, $excelRed);
+                                    # Excel data formats
 
 my $iLoss = 0; my $iPOE = 0; my $iMatch = 0;
 
diff --git a/scripts/treer.pl b/scripts/treer.pl
index d7f6f34..76f09de 100644
--- a/scripts/treer.pl
+++ b/scripts/treer.pl
@@ -23,30 +23,30 @@
 
 my $i; my $j; my $n; my $s; my $t; my @a; my @b; my @c; my %h;
 
-my @sets;                                            # protein sets (genomes)
-my @hits;                                            # blast hits, each element is a hash, including:
-                                                        # accn, accns, organism, group, taxid, score, distance, ignore
+my @sets;                           # protein sets (genomes)
+my @hits;                           # blast hits, each element is a hash, including:
+                                    # accn, accns, organism, group, taxid, score, distance, ignore
 
-my %self;                                            # query information
-my %seqs;                                            # accn -> sequence
-my %names;                                            # accn -> organism name
-my $nSeq;                                            # number of hits with sequences
-my $nChar;                                            # number of aa per sequence in alignment
-my $tree;                                            # phylogenetic tree with annotations
+my %self;                           # query information
+my %seqs;                           # accn -> sequence
+my %names;                          # accn -> organism name
+my $nSeq;                           # number of hits with sequences
+my $nChar;                          # number of aa per sequence in alignment
+my $tree;                           # phylogenetic tree with annotations
 
 
 ## program parameters ##
 
-my $wkDir = $ARGV[0];                                # working directory
+my $wkDir = $ARGV[0];               # working directory
 
-my $minHits = 3;                                    # minimal number of valid hits for phylogenetic analysis
+my $minHits = 3;                    # minimal number of valid hits for phylogenetic analysis
 
-my $trimSeq = 0;                                    # trim unreliable regions from sequence alignment (options: 0, gblocks)
-my $realign = 0;                                    # realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
-my $buildTree = 0;                                    # Build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
-my $bsTree = 0;                                        # perform bootstrap for designated times.
-my $distance = 0;                                    # use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
-my $aaModel = "WAG";                                # protein substitution model for RAxML
+my $trimSeq = 0;                    # trim unreliable regions from sequence alignment (options: 0, gblocks)
+my $realign = 0;                    # realign sequences (options: 0, clustalw, mafft, muscle, clustalo)
+my $buildTree = 0;                  # build phylogenetic tree using specified program (options: 0, clustalw, mafft, phyml, raxml)
+my $bsTree = 0;                     # perform bootstrap for designated times.
+my $distance = 0;                   # use distance matrix instead of BLAST bit scores to rank hits (options: 0, clustalw, mafft, raxml)
+my $aaModel = "WAG";                # protein substitution model for RAxML
 
 my $gblocks = "Gblocks";
 my $clustalw = "clustalw";

From b7e00bfb112a4f5b5264247486da49ea6ac0ab4f Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 12:55:14 -0700
Subject: [PATCH 6/9] added installer, modified multiple

---
 HGTector.pl          |  35 +++-
 scripts/databaser.py | 450 +++++++++++++++++++++++++++++--------------
 scripts/installer.sh |  48 +++++
 scripts/searcher.pl  |  65 +++----
 4 files changed, 406 insertions(+), 192 deletions(-)
 create mode 100644 scripts/installer.sh

diff --git a/HGTector.pl b/HGTector.pl
index f2dfceb..08e8076 100644
--- a/HGTector.pl
+++ b/HGTector.pl
@@ -24,7 +24,7 @@
   4) Execute:
      perl HGTector.pl /path/to/wkdir
 
-Note: Please browse GUI.html, a comprehensive and interactive web page that helps you to understand the whole pipeline and to generate proper configurations for your analyses.
+Please take a look at GUI.html, an interactive web interface that helps you to understand the whole pipeline and to generate configuration files for your analyses.
 
 ";
 
@@ -38,13 +38,16 @@
 if (not @ARGV or $ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
     if (-d $db) {
         print $usage;
+
     } else {
-        print "\nWelcome! This seems to be the first time you run HGTector on this device.\n";
+        print "\nWelcome! This seems to be the first time you run HGTector on this computer.\n";
         print "I will guide you through the basics of the program.\n";
+
         print "\nWould you like to test on a small sample dataset (yes/NO)? ";
         my $s = <STDIN>; chomp $s;
         if ($s =~ /^yes$/i or $s =~ /^y$/i) {
-            print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\nProceed (yes) or opt out (NO): ";
+            print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\n";
+            print "Proceed (yes) or opt out (NO): ";
             $s = <STDIN>; chomp $s;
             if ($s =~ /^yes$/i or $s =~ /^y$/i) {
                 pipeline;
@@ -52,12 +55,27 @@
         }
         print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
         print "\n";
-        print "Now I will assist you to create necessary databases for future analyses.\nProceed (YES/no)? ";
+
+        print "Now I will assist you to install optional dependencies.\n";
+        print "HGTector by default works out-of-the-box, without relying on any third-party dependencies, whereas several optional modules will enhance HGTector's functionality.\n";
+        print "These modules include Perl modules Statistics::R and Spreadsheet::WriteExcel, and R packages pastecs and diptest (you will need R first).\n";
+        print "Proceed (yes/NO)? ";
+        $s = <STDIN>; chomp $s;
+        if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+            if ($^O eq "MSWin32") {
+                print "You are using the Windows system, in which the installation script cannot be executed. You will need to manually install the dependencies.\n";
+            } else {
+                system "bash $scripts/installer.sh";
+            }
+        }
+        print "\n";
+
+        print "Now I will assist you to generate a reference database.\n";
+        print "Unless you plan to use HGTector in remote mode, a local reference database is recommended for consistant and controllable analyses.\n";
+        print "Proceed (YES/no)? ";
         $s = <STDIN>; chomp $s;
         if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i) {
-            print "HGTector needs three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for effective analyses. Please read about the details of choice of databases in the GUI.\n";
-            print "Since release 0.2.0, a Python script databaser.py is implemented to automate the database construction process.\n";
-            die "Error: databaser.py is not found in the scripts/ subdirectory.\n" unless -e "$scripts/databaser.py";
+            print "HGTector requires three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for optimal results. Please read about the details of choice of databases in the GUI.\n";
             print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
             $s = <STDIN>; chomp $s; $s =~ s/\/$//;
             $db = $s if $s;
@@ -96,7 +114,7 @@
                 print "# You still need to build a searchable database based on $db/$out.faa";
             }
             print "taxdump=$db/taxdump\n";
-            print "prot2taxid=$db/gi2taxid.txt\n";
+            print "prot2taxid=$db/prot2taxid.txt\n";
             print "\n";
         }
         print "You may start to use HGTector to analyze your data.\n";
@@ -108,6 +126,7 @@
 }
 exit 0;
 
+
 sub pipeline {
     print "\nValidating task...\n";
     die "Error: Invalid working directory $wkDir.\n" unless -d $wkDir;
diff --git a/scripts/databaser.py b/scripts/databaser.py
index 55489eb..3ee1cb6 100644
--- a/scripts/databaser.py
+++ b/scripts/databaser.py
@@ -1,55 +1,65 @@
 #!/usr/local/bin/python
 
 from __future__ import print_function
-try: input = raw_input
-except NameError: pass
-try: from urllib.request import urlretrieve
-except ImportError: from urllib import urlretrieve
+try:
+    input = raw_input
+except NameError:
+    pass
+try:
+    from urllib.request import urlretrieve
+except ImportError:
+    from urllib import urlretrieve
 
-import os, sys, re, time, datetime, subprocess, urllib, ftplib, gzip, tarfile
+import os
+import sys
+import re
+import time
+import datetime
+import subprocess
+import urllib
+import ftplib
+import gzip
+import tarfile
 
 print('\n'.join((
     '',
-    'Databaser: generate taxonomically balanced, non-redundant proteome database.',
-    'Author: Qiyun Zhu (qzhu@jcvi.org)',
-    'Affiliation: J. Craig Venter Institute',
-    'Last updated: Jan 28, 2017',
+    '-> Databaser: generate taxonomically balanced, non-redundant genome and proteome database. <-',
     ''
-    )))
+)))
 
 if len(sys.argv) == 1:
     print('\n'.join((
-    'Usage:',
-    '  python scripts/databaser.py [-format=none|blast] [-range=all|microbe|etc] [-represent=no|auto|filename] [-subsample=0|1|2...] [-out=dbname]',
-    'Note:',
-    '  By default, the script will download NCBI RefSeq proteomic data of microbial organisms, keep one proteome per species, plus all NCBI-defined representative proteomes.',
-    '  Use "-help" to display details of program parameters.'
+        'Usage:',
+        '  python scripts/databaser.py [-format=none|blast] [-range=all|microbe|etc] [-represent=no|auto|filename] [-subsample=0|1|2...] [-out=dbname]',
+        'Note:',
+        '  By default, the script will download NCBI RefSeq genomic data of microbial organisms, keep one genome per species, plus all NCBI-defined representative genomes.',
+        '  Use "-help" to display details of program parameters.'
     )))
     input('Press Enter to continue or Ctrl+C to exit...')
 elif sys.argv[1] == '-h' or sys.argv[1].endswith('-help'):
     print('\n'.join((
-    'Details of parameters:',
-    '  -format=<none, blast (default)>',
-    '    Format of the output database. If "none", only a master fasta file will be generated, based on which one may manually create customized databases.',
-    '  -range=<taxonomic categories>',
-    '    The NCBI RefSeq genome database has the following categories: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral.',
-    '    Type one or more desired categories, separated by comma (,).',
-    '    Type "all" for all categories, or "microbe" for archaea, bacteria, fungi, and protozoa (default).',
-    '  -represent=<no, auto (default) or filename>',
-    '    By default, the script will download a list of NCBI-defined representative genomes, and include all of them in the resulting database.',
-    '    One may also provide a user-defined genome list (format: one taxid per line), or switch off this function',
-    '  -subsample=<0, 1 (default), 2, 3...>',
-    '    By default, the script will keep up to one proteome per species.',
-    '    Change the number for more proteomes, or "0" for all (no subsampling).',
-    '  -out=<database name>',
-    '    Please use a simple string without spaces or symbols. The default name is "stdb".',
-    '',
-    'Examples:',
-    '  Default behavior:',
-    '    python script/databaser.py -format=blast -range=microbe -subsample=1 -represent=auto',
-    '  Download microbial eukaryotic proteomes:',
-    '    python script/databaser.py -format=none -range=protozoa,fungi -subsample=0 -represent=no -out="MyEukarie"',
-    ''
+        'Details of parameters:',
+        '  -format=<none, blast (default)>',
+        '    Format of the output database. If "none", only a master fasta file will be generated, based on which one may manually create customized databases.',
+        '  -range=<taxonomic categories>',
+        '    The NCBI RefSeq genome database has the following categories: archaea, bacteria, fungi, invertebrate, plant, protozoa, vertebrate_mammalian, vertebrate_other, and viral.',
+        '    Type one or more desired categories, separated by comma (,).',
+        '    Type "all" for all categories, or "microbe" for archaea, bacteria, fungi, and protozoa (default).',
+        '  -represent=<no, auto (default) or filename>',
+        '    By default, the script will download a list of NCBI-defined representative genomes, and include all of them in the resulting database.',
+        '    One may also provide a user-defined genome list (format: one taxid per line), or switch off this function',
+        '  -subsample=<0, 1 (default), 2, 3...>',
+        '    By default, the script will keep up to one genome per species.',
+        '    Change the number for more genomes, or "0" for all (no subsampling).',
+        '  -out=<database name>',
+        '    Please use a simple string without spaces or symbols. The default name is "stdb".',
+        '',
+        'Examples:',
+        '  Default behavior:',
+        '    python script/databaser.py -format=blast -range=microbe -subsample=1 -represent=auto',
+        '  Download microbial eukaryotic genomes:',
+        '    python script/databaser.py -format=none -range=protozoa,fungi -subsample=0 -represent=no -out="MyEukarie"',
+        ''
     )))
     sys.exit()
 
@@ -61,12 +71,18 @@
 (format, rng, repre, subsp, outname) = ('blast', 'microbe', 'auto', 1, 'stdb')
 for argv in sys.argv[1:]:
     l = argv.split('=')
-    if len(l) < 2: continue
-    if l[0] == '-format': format = l[1]
-    elif l[0] == '-range': rng = l[1]
-    elif l[0] == '-represent': repre = l[1]
-    elif l[0] == '-subsample': subsp = int(l[1])
-    elif l[0] == '-out': outname = l[1]
+    if len(l) < 2:
+        continue
+    if l[0] == '-format':
+        format = l[1]
+    elif l[0] == '-range':
+        rng = l[1]
+    elif l[0] == '-represent':
+        repre = l[1]
+    elif l[0] == '-subsample':
+        subsp = int(l[1])
+    elif l[0] == '-out':
+        outname = l[1]
 
 print('Connecting to the NCBI FTP server...', end=' ')
 sys.stdout.flush()
@@ -77,8 +93,10 @@
 print('Downloading the NCBI taxonomy database...', end=' ')
 sys.stdout.flush()
 if os.path.isdir('taxdump'):
-    if os.path.isfile('taxdump/names.dmp'): os.remove('taxdump/names.dmp')
-    if os.path.isfile('taxdump/nodes.dmp'): os.remove('taxdump/nodes.dmp')
+    if os.path.isfile('taxdump/names.dmp'):
+        os.remove('taxdump/names.dmp')
+    if os.path.isfile('taxdump/nodes.dmp'):
+        os.remove('taxdump/nodes.dmp')
 else:
     os.makedirs('taxdump')
 ftp.cwd('/pub/taxonomy')
@@ -92,15 +110,17 @@
 print('done.')
 
 name2id = {}
-nname = {} # number of names per TaxID
+nname = {}  # number of names per TaxID
 if repre == 'auto' or subsp:
     print('Reading the NCBI taxonomy database...', end=' ')
     sys.stdout.flush()
     fin = open('taxdump/names.dmp', 'r')
     for line in fin:
         l = re.split('\s+\|\s+', line)
-        if l[0] in nname: nname[l[0]] += 1
-        else: nname[l[0]] = 1
+        if l[0] in nname:
+            nname[l[0]] += 1
+        else:
+            nname[l[0]] = 1
         if re.search('scientific name\s*\|', line):
             if l[1] not in name2id:
                 name2id[l[1]] = l[0]
@@ -112,9 +132,11 @@
 if repre == 'auto':
     print('Downloading the NCBI representative genome list...', end=' ')
     sys.stdout.flush()
-    if os.path.exists('representative_genomes.txt'): os.remove('representative_genomes.txt')
+    if os.path.exists('representative_genomes.txt'):
+        os.remove('representative_genomes.txt')
     try:
-        urlretrieve('http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on', 'representative_genomes.txt')
+        urlretrieve('http://www.ncbi.nlm.nih.gov/genomes/Genome2BE/genome2srv.cgi?action=refgenomes&download=on',
+                    'representative_genomes.txt')
         print('done.')
     except:
         print('failed.')
@@ -123,9 +145,11 @@
         sys.stdout.flush()
         fin = open('representative_genomes.txt', 'r')
         for line in fin:
-            if line[0] == '#': continue
+            if line[0] == '#':
+                continue
             line = line.rstrip('\r\n')
-            if not line: continue
+            if not line:
+                continue
             name = line.split('\t')[2]
             if name in name2id:
                 represents[name2id[name]] = 1
@@ -137,9 +161,11 @@
     sys.stdout.flush()
     fin = open(repre, 'r')
     for line in fin:
-        if line[0] == '#': continue
+        if line[0] == '#':
+            continue
         line = line.rstrip('\r\n')
-        if not line: continue
+        if not line:
+            continue
         l = line.split('\t')
         represents[l[0]] = 1
     fin.close()
@@ -153,24 +179,34 @@
     downfiles['assembly_summary_refseq.txt'] = 'assembly_summary_refseq.txt'
 elif rng == 'microbe':
     for i in ('archaea', 'bacteria', 'fungi', 'protozoa'):
-        downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
+        downfiles['assembly_summary_' + i + '.txt'] = i + \
+            '/assembly_summary.txt'
 else:
     for i in rng.split(','):
-        downfiles['assembly_summary_' + i + '.txt'] = i + '/assembly_summary.txt'
+        downfiles['assembly_summary_' + i + '.txt'] = i + \
+            '/assembly_summary.txt'
 for downfile in downfiles:
-    if os.path.isfile(downfile): os.remove(downfile)
-    if '/' in downfiles[downfile]: ftp.cwd('/genomes/refseq/' + downfiles[downfile].split('/')[0])
-    else: ftp.cwd('/genomes/refseq')
+    if os.path.isfile(downfile):
+        os.remove(downfile)
+    if '/' in downfiles[downfile]:
+        ftp.cwd('/genomes/refseq/' + downfiles[downfile].split('/')[0])
+    else:
+        ftp.cwd('/genomes/refseq')
     with open(downfile, 'wb') as fout:
-        if '/' in downfiles[downfile]: ftp.retrbinary('RETR ' + downfiles[downfile].split('/')[1], fout.write, 1024)
-        else: ftp.retrbinary('RETR ' + downfiles[downfile], fout.write, 1024)
+        if '/' in downfiles[downfile]:
+            ftp.retrbinary(
+                'RETR ' + downfiles[downfile].split('/')[1], fout.write, 1024)
+        else:
+            ftp.retrbinary('RETR ' + downfiles[downfile], fout.write, 1024)
     fin = open(downfile, 'r')
     for line in fin:
         line = line.rstrip('\r\n')
-        if line[0] == '#': continue
+        if line[0] == '#':
+            continue
         l = line.split('\t')
         if l[10] == 'latest':
-            gcfs[l[0]] = {'wgs_master': l[3], 'taxid': l[5], 'species_taxid': l[6], 'organism_name': l[7], 'assembly_level': l[11], 'seqs': 0, 'length': 0}
+            gcfs[l[0]] = {'wgs_master': l[3], 'taxid': l[5], 'species_taxid': l[6], 'organism_name': l[7],
+                          'assembly_level': l[11], 'dnas': 0, 'nucleotides': 0, 'proteins': 0, 'residues': 0}
     fin.close()
     os.remove(downfile)
 print('done.')
@@ -182,8 +218,10 @@
     sp2gcfs = {}
     for gcf in gcfs:
         sp = gcfs[gcf]['species_taxid']
-        if sp in sp2gcfs: sp2gcfs[sp] += ',' + gcf
-        else: sp2gcfs[sp] = gcf
+        if sp in sp2gcfs:
+            sp2gcfs[sp] += ',' + gcf
+        else:
+            sp2gcfs[sp] = gcf
     for sp in sp2gcfs:
         l = sp2gcfs[sp].split(',')
         if len(l) <= subsp:
@@ -191,13 +229,15 @@
                 gcfs[gcf]['in'] = 1
         else:
             n = 0
-            if represents: # get representative organisms as priority
-                repid2gcfs = {} # one representative TaxID may correspond to multiple GCFs
+            if represents:  # get representative organisms as priority
+                repid2gcfs = {}  # one representative TaxID may correspond to multiple GCFs
                 for gcf in l:
                     id = gcfs[gcf]['taxid']
                     if id in represents:
-                        if id in repid2gcfs: repid2gcfs[id] += ',' + gcf
-                        else: repid2gcfs[id] = gcf
+                        if id in repid2gcfs:
+                            repid2gcfs[id] += ',' + gcf
+                        else:
+                            repid2gcfs[id] = gcf
                 for id in repid2gcfs:
                     if ',' in repid2gcfs[id]:
                         l = repid2gcfs[id].split(',')
@@ -217,58 +257,75 @@
                     else:
                         gcfs[repid2gcfs[id]]['in'] = 1
                     n += 1
-            if n >= subsp: continue
-            names = {} # GCF => number of names
+            if n >= subsp:
+                continue
+            names = {}  # GCF => number of names
             for gcf in l:
                 if gcfs[gcf]['assembly_level'] == 'Complete Genome' or gcfs[gcf]['assembly_level'] == 'Chromosome':
                     id = gcfs[gcf]['taxid']
-                    if id in nname: names[gcf] = nname[id]
-                    else: names[gcf] = 0
+                    if id in nname:
+                        names[gcf] = nname[id]
+                    else:
+                        names[gcf] = 0
             if names:
                 for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
                     gcfs[gcf]['in'] = 1
                     n += 1
-                    if n >= subsp: break
-            if n >= subsp: continue
+                    if n >= subsp:
+                        break
+            if n >= subsp:
+                continue
             names = {}
             for gcf in l:
                 if gcfs[gcf]['assembly_level'] == 'Scaffold':
                     id = gcfs[gcf]['taxid']
-                    if id in nname: names[gcf] = nname[id]
-                    else: names[gcf] = 0
+                    if id in nname:
+                        names[gcf] = nname[id]
+                    else:
+                        names[gcf] = 0
             if names:
                 for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
                     gcfs[gcf]['in'] = 1
                     n += 1
-                    if n >= subsp: break
-            if n >= subsp: continue
+                    if n >= subsp:
+                        break
+            if n >= subsp:
+                continue
             names = {}
             for gcf in l:
                 if gcfs[gcf]['assembly_level'] == 'Contig':
                     id = gcfs[gcf]['taxid']
-                    if id in nname: names[gcf] = nname[id]
-                    else: names[gcf] = 0
+                    if id in nname:
+                        names[gcf] = nname[id]
+                    else:
+                        names[gcf] = 0
             if names:
                 for (gcf, number) in sorted(names.items(), reverse=True, key=lambda x: x[1]):
                     gcfs[gcf]['in'] = 1
                     n += 1
-                    if n >= subsp: break
+                    if n >= subsp:
+                        break
     n = 0
     for gcf in gcfs:
-        if 'in' in gcfs[gcf]: n += 1
+        if 'in' in gcfs[gcf]:
+            n += 1
     print('done.')
     print(' ', str(n), 'genomes retained.')
 
-print('Downloading non-redundant proteomic data from NCBI RefSeq...')
+print('Downloading non-redundant genomic data from NCBI RefSeq...')
 sys.stdout.flush()
-(gilist, gi2taxids, sharedgis) = ([], {}, {})
-(ngcf, nseq, naa) = (0, 0, 0)
+(dnalist, dna2taxids, shared_dnas) = ([], {}, {})
+(protlist, prot2taxids, shared_prots) = ([], {}, {})
+(ngcf, ndna, nnt, nprot, naa) = (0, 0, 0, 0, 0)
 (nodata, haserror) = ([], [])
+os.makedirs('genbank')
 print(' ', end=' ')
 with open('sampled_genomes.txt', 'w') as fout:
-    fout.write('#' + '\t'.join(('assembly_accession', 'wgs_master', 'taxid', 'species_taxid', 'organism_name', 'assembly_level', 'proteins', 'residues')) + '\n')
+    fout.write('#' + '\t'.join(('assembly_accession', 'wgs_master', 'taxid', 'species_taxid',
+                                'organism_name', 'assembly_level', 'dnas', 'nucleotides', 'proteins', 'residues')) + '\n')
 for gcf in gcfs:
-    if subsp and 'in' not in gcfs[gcf]: continue
+    if subsp and 'in' not in gcfs[gcf]:
+        continue
     dir = '/'.join((gcf[4:7], gcf[7:10], gcf[10:13]))
     (found, err550) = (0, 0)
     for i in range(retries):
@@ -276,30 +333,38 @@
             ftp.cwd('/genomes/all/GCF/' + dir)
             found = 1
         except ftplib.error_perm as resp:
-            if str(resp).split()[0] == '550': err550 = 1
+            if str(resp).split()[0] == '550':
+                err550 = 1
         except ftplib.error_temp as resp:
             time.sleep(delay)
             continue
-        else: pass
+        else:
+            pass
         break
     if not found:
-        if err550: nodata.append(gcf)
-        else: haserror.append(gcf)
+        if err550:
+            nodata.append(gcf)
+        else:
+            haserror.append(gcf)
         continue
     (files, err550) = ([], 0)
     for i in range(retries):
         try:
             files = ftp.nlst()
         except ftplib.error_perm as resp:
-            if str(resp).split()[0] == '550': err550 = 1
+            if str(resp).split()[0] == '550':
+                err550 = 1
         except ftplib.error_temp as resp:
             time.sleep(delay)
             continue
-        else: pass
+        else:
+            pass
         break
     if not files:
-        if err550: nodata.append(gcf)
-        else: haserror.append(gcf)
+        if err550:
+            nodata.append(gcf)
+        else:
+            haserror.append(gcf)
         continue
     assembly = ''
     for file in files:
@@ -315,32 +380,40 @@
             ftp.cwd('/genomes/all/GCF/' + dir + '/' + assembly)
             found = 1
         except ftplib.error_perm as resp:
-            if str(resp).split()[0] == '550': err550 = 1
+            if str(resp).split()[0] == '550':
+                err550 = 1
         except ftplib.error_temp as resp:
             time.sleep(delay)
             continue
-        else: pass
+        else:
+            pass
         break
     if not found:
-        if err550: nodata.append(gcf)
-        else: haserror.append(gcf)
+        if err550:
+            nodata.append(gcf)
+        else:
+            haserror.append(gcf)
         continue
     (files, err550) = ([], 0)
     for i in range(retries):
         try:
             files = ftp.nlst()
         except ftplib.error_perm as resp:
-            if str(resp).split()[0] == '550': err550 = 1
+            if str(resp).split()[0] == '550':
+                err550 = 1
         except ftplib.error_temp as resp:
             time.sleep(delay)
             continue
-        else: pass
+        else:
+            pass
         break
     if not files:
-        if err550: nodata.append(gcf)
-        else: haserror.append(gcf)
+        if err550:
+            nodata.append(gcf)
+        else:
+            haserror.append(gcf)
         continue
-    downfile = assembly + '_protein.gpff.gz'
+    downfile = assembly + '_genomic.gbff.gz'
     if not downfile in files:
         nodata.append(gcf)
         continue
@@ -353,64 +426,144 @@
         except ftplib.error_temp as resp:
             time.sleep(delay)
             continue
-        else: pass
+        else:
+            pass
         break
     if not written:
         haserror.append(gcf)
         continue
-    with open(outname + '.faa', 'a') as fout:
-        try: fin = gzip.open(downfile, mode='rt')
-        except TypeError: fin = gzip.open(downfile, 'r')
-        taxid = gcfs[gcf]['taxid']
-        (accn, gi, name, seq) = ('', '', '', '')
-        for line in fin:
-            line = line.rstrip('\r\n')
-            if line.startswith('DEFINITION  '):
-                name = line[12:].rstrip('.')
-            elif line.startswith('VERSION     '):
-                l = line[12:].split(' ')
-                (accn, gi) = (l[0], l[-1].split(':')[-1])
-            elif len(line) > 10 and re.search(r'^\s+\d+\s$', line[:10]):
-                seq += line[10:].replace(' ', '').upper()
-            elif line == '//':
-                if gi in gi2taxids:
-                    gi2taxids[gi] += ',' + taxid
-                    if gi not in sharedgis: sharedgis[gi] = '1'
+    faa = open(outname + '.faa', 'a')
+    fna = open(outname + '.fna', 'a')
+    try:
+        fin = gzip.open(downfile, mode='rt')
+    except TypeError:
+        fin = gzip.open(downfile, 'r')
+    taxid = gcfs[gcf]['taxid']
+    (dna, dna_name, dna_seq) = ('', '', '')
+    (prot, prot_name, prot_seq) = ('', '', '')
+    reading = ''
+    for line in fin:
+        line = line.rstrip('\r\n')
+        if line.startswith('DEFINITION  '):
+            dna_name = line[12:].strip()
+            reading = 'dna_name'
+        elif line.startswith('VERSION     '):
+            dna = line[12:].split(' ')[0]
+        elif len(line) > 10 and re.search(r'^\s+\d+\s$', line[:10]):
+            dna_seq += line[10:].replace(' ', '').upper()
+        elif line == '//':
+            if dna in dna2taxids:
+                dna2taxids[dna] += ',' + taxid
+                if dna not in shared_dnas:
+                    shared_dnas[dna] = '1'
+            else:
+                dna2taxids[dna] = taxid
+                dnalist.append(dna)
+                fna.write('>' + dna + ' ' + dna_name.rstrip('.') +
+                          '\n' + dna_seq + '\n')
+            gcfs[gcf]['dnas'] += 1
+            gcfs[gcf]['nucleotides'] += len(dna_seq)
+            (dna, dna_name, dna_seq) = ('', '', '')
+            (prot, prot_name, prot_seq) = ('', '', '')
+            reading = ''
+        elif "/protein_id=" in line:
+            prot = line[33:].strip('"')
+        elif "/product=" in line:
+            prot_name = line[30:].strip('"')
+            if not line.endswith('"'):
+                reading = 'prot_name'
+        elif "/translation=" in line:
+            prot_seq = line[34:].strip('"')
+            if line.endswith('"'):
+                if prot in prot2taxids:
+                    prot2taxids[prot].add(taxid)
+                    if prot not in shared_prots:
+                        shared_prots[prot] = '1'
                 else:
-                    gi2taxids[gi] = taxid
-                    gilist.append(gi)
-                    fout.write('>gi|' + gi + '|ref|' + accn + '| ' + name + '\n' + seq +'\n')
-                gcfs[gcf]['seqs'] += 1
-                gcfs[gcf]['length'] += len(seq)
-                (accn, gi, name, seq) = ('', '', '', '')
-        fin.close()
-    nseq += gcfs[gcf]['seqs']
-    naa += gcfs[gcf]['length']
+                    prot2taxids[prot] = set()
+                    prot2taxids[prot].add(taxid)
+                    protlist.append(prot)
+                    faa.write('>' + prot + ' ' + prot_name +
+                              '\n' + prot_seq + '\n')
+                gcfs[gcf]['proteins'] += 1
+                gcfs[gcf]['residues'] += len(prot_seq)
+                (prot, prot_name, prot_seq) = ('', '', '')
+                reading = ''
+            else:
+                reading = 'prot_seq'
+        elif reading == 'dna_name':
+            if line.startswith('            '):
+                if not dna_name.endswith('-'):
+                    dna_name += ' '
+                dna_name += line[12:].strip()
+            else:
+                reading = ''
+        elif reading == 'prot_name':
+            prot_name += line[21:].strip('"')
+        elif reading == 'prot_seq':
+            prot_seq += line[21:].strip('"')
+            if line.endswith('"'):
+                if prot in prot2taxids:
+                    prot2taxids[prot].add(taxid)
+                    if prot not in shared_prots:
+                        shared_prots[prot] = '1'
+                else:
+                    prot2taxids[prot] = set()
+                    prot2taxids[prot].add(taxid)
+                    protlist.append(prot)
+                    faa.write('>' + prot + ' ' + prot_name +
+                              '\n' + prot_seq + '\n')
+                gcfs[gcf]['proteins'] += 1
+                gcfs[gcf]['residues'] += len(prot_seq)
+                (prot, prot_name, prot_seq) = ('', '', '')
+                reading = ''
+    fin.close()
+    faa.close()
+    fna.close()
+    ndna += gcfs[gcf]['dnas']
+    nnt += gcfs[gcf]['nucleotides']
+    nprot += gcfs[gcf]['proteins']
+    naa += gcfs[gcf]['residues']
     with open('sampled_genomes.txt', 'a') as fout:
-        fout.write('\t'.join((gcf, gcfs[gcf]['wgs_master'], taxid, gcfs[gcf]['species_taxid'], gcfs[gcf]['organism_name'], gcfs[gcf]['assembly_level'], str(gcfs[gcf]['seqs']), str(gcfs[gcf]['length']))) + '\n')
-    os.remove(downfile)
+        fout.write('\t'.join((gcf, gcfs[gcf]['wgs_master'], taxid, gcfs[gcf]['species_taxid'], gcfs[gcf]['organism_name'], gcfs[gcf]['assembly_level'], str(
+            gcfs[gcf]['dnas']), str(gcfs[gcf]['nucleotides']), str(gcfs[gcf]['proteins']), str(gcfs[gcf]['residues']))) + '\n')
+    os.rename(downfile, 'genbank/' + downfile)
     ngcf += 1
     if not ngcf % 25:
         print(str(ngcf), end=' ')
         sys.stdout.flush()
 print('done.')
 
-print(' ', str(ngcf), 'genomes,', str(nseq), 'protein sequences, and', str(naa), 'residues downloaded.')
+print(' %s genomes, %s DNA sequences (%s bp), and %s protein sequences (%s aa) downloaded.' %
+      (str(ngcf), str(ndna), str(nnt), str(nprot), str(naa)))
 if nodata:
     print(' ', len(nodata), 'genomes have no data.')
     with open('genomes_without_data.txt', 'w') as fout:
-        for i in nodata: fout.write(i + '\n')
+        for i in nodata:
+            fout.write(i + '\n')
 if haserror:
     print(' ', len(haserror), 'genomes cannot be downloaded.')
     with open('cannot_download.txt', 'w') as fout:
-        for i in haserror: fout.write(i + '\n')
-with open('gi2taxid.txt', 'w') as fout:
-    for gi in gilist:
-        fout.write(gi + '\t' + gi2taxids[gi] + '\n')
-if sharedgis:
-    print(' ', str(len(gi2taxids)), 'proteins extracted, including', str(len(sharedgis)), 'proteins shared by multiple organisms.')
+        for i in haserror:
+            fout.write(i + '\n')
+with open('dna2taxid.txt', 'w') as fout:
+    for dna in dnalist:
+        fout.write(
+            dna + '\t' + ','.join(sorted(dna2taxids[dna], key=int)) + '\n')
+with open('prot2taxid.txt', 'w') as fout:
+    for prot in protlist:
+        fout.write(prot + '\t' +
+                   ','.join(sorted(prot2taxids[prot], key=int)) + '\n')
+if shared_dnas:
+    print(' ', str(len(dna2taxids)), 'DNAs extracted, including',
+          str(len(shared_dnas)), 'DNAs shared by multiple organisms.')
+else:
+    print(' ', str(len(dna2taxids)), 'DNAs extracted.')
+if shared_prots:
+    print(' ', str(len(prot2taxids)), 'proteins extracted, including',
+          str(len(shared_prots)), 'proteins shared by multiple organisms.')
 else:
-    print(' ', str(len(gi2taxids)), 'proteins extracted.')
+    print(' ', str(len(prot2taxids)), 'proteins extracted.')
 
 if format == 'blast':
     print('Building BLAST database...')
@@ -422,14 +575,17 @@
                 os.remove('blast/' + filename)
     else:
         os.makedirs('blast')
-    p = subprocess.Popen('makeblastdb -in ' + outname + '.faa -parse_seqids -dbtype prot -out blast/' + outname + ' -title "Auto-generated standard BLAST database ' + str(today) + '"', stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
+    p = subprocess.Popen('makeblastdb -in ' + outname + '.faa -parse_seqids -dbtype prot -out blast/' + outname +
+                         ' -title "Auto-generated standard BLAST database ' + str(today) + '"', stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
     out = p.stdout.read()
     if 'command not found' in out:
         print('failed. Check if ncbi-blast+ has been correctly installed.')
     else:
         for line in out.split('\n'):
-            if not line: continue
-            if line[0] == '\t': line = '  ' + line[1:]
+            if not line:
+                continue
+            if line[0] == '\t':
+                line = '  ' + line[1:]
             print('  ' + line)
     p.stdout.close()
     print('done.')
diff --git a/scripts/installer.sh b/scripts/installer.sh
new file mode 100644
index 0000000..e9529e9
--- /dev/null
+++ b/scripts/installer.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Installation script for HGTector
+
+set -eu
+
+if ! hash R 2>/dev/null
+then
+    echo "R is not detected. You may either install R manually and run this script again, or simply omit it."
+    exit 1
+fi
+
+export PERL_MM_USE_DEFAULT=1
+
+if ! perl -MApp::cpanminus -e 1 2>/dev/null
+then
+    echo "Installing cpanminus, for easy installation of Perl modules..."
+    perl -MCPAN -e 'install App::cpanminus'
+    echo done
+fi
+
+if ! perl -MStatistics::R -e 1 2>/dev/null
+then
+    echo "Installing Perl module Statistics::R, for crosstalking between Perl and R..."
+    cpanm Statistics::R
+    echo done
+fi
+
+if ! perl -MSpreadsheet::WriteExcel -e 1 2>/dev/null
+then
+    echo "Installing Perl module Spreadsheet::WriteExcel, for generating Excel spreadsheets..."
+    cpanm Spreadsheet::WriteExcel
+    echo done
+fi
+
+if [[ $(R -q -e "require('pastecs')" 2>&1) == *"there is no package called"* ]]
+then
+    echo "Installing R package pastecs, for calculating local extrema..."
+    R -e "install.packages('pastecs', repos='https://cran.rstudio.com/')"
+    echo done
+fi
+
+if [[ $(R -q -e "require('diptest')" 2>&1) == *"there is no package called"* ]]
+then
+    echo "Installing R package diptest, for performing Hartigan's dip test for unimodality..."
+    R -e "install.packages('diptest', repos='https://cran.rstudio.com/')"
+    echo done
+fi
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index a163358..6113400 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -42,7 +42,7 @@
 
 ## public variables ##
 
-my $s; my @a; my @b; my @c; my %h;
+my $s; my @a; my @b; my %h;
 
 
 ## global variables ##
@@ -231,7 +231,7 @@
 elsif (lc($searchTool) eq "rapsearch2") { $searchTool = "RAPSearch2"; }
 elsif (lc($searchTool) eq "diamond") { $searchTool = "DIAMOND"; }
 elsif (lc($searchTool) eq "customized") { $searchTool = "customized"; }
-else{ die "Error: Invalid search tool: $searchTool.\n"; }
+else { die "Error: Invalid search tool: $searchTool.\n"; }
 
 if ($searchTool eq "customized" and not $preSearch) { die "Error: You must provide pre-computed search results for \"customized\" search tool.\n"; }
 if ($preSearch and not -d $preSearch) { die "Error: Invalid directory for pre-computed search results: $preSearch\n"; }
@@ -400,7 +400,7 @@
 
 
 ## detect pre-computed search results ##
-# This function was inspired by Conor Meehan (cmeehan@itg.be) #
+# this function was inspired by Conor Meehan (cmeehan@itg.be) #
 
 if ($preSearch) {
     my $n = 0;
@@ -516,7 +516,7 @@
     }
     close IN;
 }
-if ($selfTax) { # TaxIDs of input protein sets
+if ($selfTax) {  # TaxIDs of input protein sets
     %h = ();
     @a = split (/,/, $selfTax);
     foreach (@a) {
@@ -554,7 +554,7 @@
     if (exists $ins{$set}{'taxid'}) {
         print "  $set: ", $ins{$set}{'organism'}, " (", $ins{$set}{'taxid'}, ")\n";
     } else {
-        push (@missingTax, $set);
+        push @missingTax, $set;
     }
 }
 if (@missingTax) {
@@ -562,7 +562,7 @@
     print "  ", join (",", @missingTax), "\n";
     foreach my $set (@missingTax) {
         my $taxid = "";
-        if (%prot2taxids) { # look up the dictionary
+        if (%prot2taxids) {  # look up the dictionary
             foreach my $prot (@{$ins{$set}{'prots'}}) {
                 $taxid = '';
                 my $name = $prot->{'name'};
@@ -572,7 +572,7 @@
                 last;
             }
         }
-        unless ($taxid) { # look up the BLAST database
+        unless ($taxid) {  # look up the BLAST database
             my $name = $ins{$set}{'prots'}[0]{'name'};
             if ($searchTool eq "BLAST" and not $httpBlast) {
                 my $query = $name;
@@ -598,7 +598,7 @@
                         die "\nFailed to retrieve taxonomic information from NCBI.\n" if ($iRetry >= $retries);
                         $iRetry ++; sleep $delay; next;
                     }
-                    $taxid = $1 if ($s =~ /<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/);
+                    $taxid = $1 if $s =~ /<Item Name=\"TaxId\" Type=\"Integer\">(\d+?)<\/Item>/;
                 }
             }
         }
@@ -723,7 +723,7 @@
             }
             local_search ($set, \@ids) if (@ids);
 
-        } else { # single-process http BLAST #
+        } else {  # single-process http BLAST #
 
             for (my $i=0; $i<scalar(@{$ins{$set}{'prots'}}); $i++) {
                 my $query = $ins{$set}{'prots'}[$i]{'name'};
@@ -1105,7 +1105,7 @@ sub local_search {
   # return: (query, set, number_of_hits, whether_successful (0 - failed, 1- successful)) separated by "/"
   # refer to: ftp://ftp.ncbi.nlm.nih.gov/blast/documents/web_blast.pl
   # and: http://www.ncbi.nlm.nih.gov/BLAST/Doc/urlapi.html
-  # note: the Perl thread join function has some problems, that why I used this inconvenient way.
+  # note: the Perl thread join function has some problems, that's why I used this inconvenient way.
 
 sub http_blast{
     my ($set, $id) = ($_[0], $_[1]);
@@ -1116,7 +1116,7 @@ sub http_blast{
     $self{'length'} = length($self{'seq'}) if $self{'seq'}; # ideally, sequence is available, otherwise see below
     my $query = $self{'name'};
 
-    # send BLAST request #
+    # send BLAST request
     my $isError = 0;
     my $url = "$blastServer?CMD=Put&PROGRAM=blastp&DATABASE=$httpDb&FILTER=m%20S";
     $url .= "&EXPECT=$evalue" if $evalue;
@@ -1134,15 +1134,14 @@ sub http_blast{
     my $rid;
     # @a = localtime(time); print "Respond: $a[2]:$a[1]:$a[0], ";
     if ($s =~ /^    RID = (.*$)/m) { $rid = $1; print "RID=$rid "; }
-    else{ return "$query/$set/0/0"; } ##########################
+    else { return "$query/$set/0/0"; } ##########################
     if ($s =~ /\s\((\d+) letters\)/) { $self{'length'} = $1; }
     if ($s =~ /^    RTOE = (.*$)/m) {
         my $rtoe = $1;
-        # print "RTOE=$rtoe, ";
-        if ($rtoe and $rtoe =~ /^\d+$/) { sleep $rtoe; } else { sleep $delay; }
+        if ($rtoe and $rtoe =~ /^\d+$/) { print "yah"; sleep $rtoe; } else { sleep $delay; }
     } else { return "$query/$set/0/0"; }
     while (1) {
-        sleep 1;
+        sleep 5;
         $s = get "$blastServer?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=$rid";
         if ($s =~ /\s+Status=([A-Z]+)/m) {
             if ($1 eq "WAITING") {
@@ -1155,18 +1154,10 @@ sub http_blast{
             elsif ($1 eq "FAILED") { $isError = 1; last; }
             elsif ($1 eq "UNKNOWN") { $isError = 1; last; }
             elsif ($1 eq "READY") { @a = localtime(time); last; }
-            else{ $isError = 1; last; }
+            else { $isError = 1; last; }
         } else {
             $isError = 1; last;
         }
-
-        # if ($s =~ /\s+Status=WAITING/m) { next; }
-        # if ($s =~ /\s+Status=FAILED/m) { $isError = 1; last; }
-        # if ($s =~ /\s+Status=UNKNOWN/m) { $isError = 1; last; }
-        # if ($s =~ /\s+Status=READY/m) {
-        #     if ($s =~ /\s+ThereAreHits=yes/m) { last; }
-        #     else{ last; } # no hits;
-        # }
     }
     if ($isError) { return "$query/$set/0/0"; }
 
@@ -1204,13 +1195,13 @@ sub http_blast{
     }
 
     # retrieve taxonomy information #
-    my $i = 0; # count
-    @a = (); # all results
-    @b = (); # subset of names
+    my $i = 0;  # count
+    @a = ();  # all results
+    @b = ();  # subset of names
     foreach (keys %hits) {
         $i ++;
         push (@b, $_);
-        if ($i == 190) { # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
+        if ($i == 190) {  # in some situations, a URI should not exceed ~2000 characters, which is approximately 190 GIs.
             sleep 1;
             while (1) {
                 $s = get $eSummaryServer."?db=protein&id=".join (",", @b);
@@ -1356,7 +1347,7 @@ sub http_blast{
         my %hit = ('id', 0, 'name', $query, 'expect', $a[0], 'score', $a[1], 'identity', $a[2], 'coverage', '100.00', 'taxid', $ins{$set}{'taxid'}, 'organism', $ins{$set}{'organism'});
         if ($self{'length'}) { $hit{'length'} = $self{'length'}; }
         elsif ($a[3]) { $hit{'length'} = $a[3]; }
-        else{ $hit{'length'} = 0; }
+        else { $hit{'length'} = 0; }
         $hits{$query} = {%hit};
     }
 
@@ -1486,11 +1477,10 @@ sub http_blast{
         if (($s !~ /blastp/i) or ($s !~ /\nQuery=\s/)) {
             print OUT "BEGIN ERROR;\nRetrieval of multiple sequence alignment failed.\n;\nEND;\n\n";
         } else {
-            @c = split(/\n/, $s);
             my $reading = 0; # reading status
             my $iBlock = -1; # block ID
             my %seqs; # sequence alignment
-            foreach (@c) {
+            foreach (split(/\n/, $s)) {
                 if ($_ eq "ALIGNMENTS") { $reading = 1; next; }
                 if ($reading and /^\s/) { $reading = 0; last; }
                 next unless $reading;
@@ -1608,7 +1598,7 @@ sub self_align {
     } else { # http mode
         my $isError = 0;
         if ($seq) { $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=".$seq."&SUBJECTS=".$seq; }
-        else{ $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
+        else { $s = get "$blastAlignServer?CMD=Put&PROGRAM=blastp&DATABASE=$protdb&QUERY=$name&SUBJECTS=$name"; };
         return (0,0,0,0) unless (defined $s and $s =~ /^    RID = (.*$)/m);
         my $rid = $1;
         if ($s =~ /\s\((\d+) letters\)/ and not $length) { $length = $1; }
@@ -1696,7 +1686,7 @@ sub get_taxonomy{
         my %taxid2organism = ();
         foreach (@_) {
             if (exists $dbTaxa{$_}) { $taxid2organism{$_} = $dbTaxa{$_}{'organism'}; }
-            else{ push (@taxids2get, $_) }
+            else { push (@taxids2get, $_) }
         }
         sleep 1;
         while (1) {
@@ -1775,17 +1765,17 @@ sub order_accns {
     my @accns1 =  ();
     foreach (@accns0) {
         if (/_/) { unshift (@accns1, $_); }
-        else{ push (@accns1, $_); }
+        else { push (@accns1, $_); }
     }
     @accns0 = ();
     foreach (@accns1) {
         if (/^NP_/) { unshift (@accns0, $_); }
-        else{ push (@accns0, $_); }
+        else { push (@accns0, $_); }
     }
     @accns1 = ();
     foreach (@accns0) {
         if ($#_ and /^$_[1]$/) { unshift (@accns1, $_); }
-        else{ push (@accns1, $_); }
+        else { push (@accns1, $_); }
     }
     return join ("/", @accns1);
 }
@@ -1794,7 +1784,8 @@ sub order_accns {
 sub get_product {
     my $title = $_[0];
     $title =~ s/^\s+|\s+$//g;
-    $title =~ s/\[.+\]$//;
+    $title =~ s/\s+\[.+\]$//;
+    return $title;
 }
 
 # get stem file name #

From 9abe99c825d62889c038af50f9e3db728c4cc3b0 Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 15:35:24 -0700
Subject: [PATCH 7/9] trivial fixes

---
 HGTector.pl         | 12 ++++++++----
 scripts/searcher.pl |  2 +-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/HGTector.pl b/HGTector.pl
index 08e8076..8f4eae7 100644
--- a/HGTector.pl
+++ b/HGTector.pl
@@ -40,20 +40,24 @@
         print $usage;
 
     } else {
-        print "\nWelcome! This seems to be the first time you run HGTector on this computer.\n";
+        print "\n";
+        print "Welcome! This seems to be the first time you run HGTector on this computer.\n";
         print "I will guide you through the basics of the program.\n";
 
-        print "\nWould you like to test on a small sample dataset (yes/NO)? ";
+        print "\n";
+        print "Would you like to test on a small sample dataset (yes/NO)? ";
         my $s = <STDIN>; chomp $s;
         if ($s =~ /^yes$/i or $s =~ /^y$/i) {
-            print "\nWarning: HGTector will connect the NCBI BLAST server to perform sequence similarity search and identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process is slow and the server may hang in some situations.\n";
+            print "\n";
+            print "HGTector will connect the NCBI BLAST server to perform sequence similarity search and to identify taxonomy of hits. You don't need any local search tool, database or computing power. However, please be prepared that this process may take some time depending on how busy the server is.\n";
             print "Proceed (yes) or opt out (NO): ";
             $s = <STDIN>; chomp $s;
             if ($s =~ /^yes$/i or $s =~ /^y$/i) {
+                print "\n";
                 pipeline;
             }
         }
-        print "The sample dataset and its corresponding configuration files, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
+        print "The sample dataset and its configuration file, plus an archive of sample output can be found in the /sample subdirectory. You may take a look and get a general idea of how the input / output files are organized in an HGTector analysis.\n";
         print "\n";
 
         print "Now I will assist you to install optional dependencies.\n";
diff --git a/scripts/searcher.pl b/scripts/searcher.pl
index 6113400..40f7959 100644
--- a/scripts/searcher.pl
+++ b/scripts/searcher.pl
@@ -1138,7 +1138,7 @@ sub http_blast{
     if ($s =~ /\s\((\d+) letters\)/) { $self{'length'} = $1; }
     if ($s =~ /^    RTOE = (.*$)/m) {
         my $rtoe = $1;
-        if ($rtoe and $rtoe =~ /^\d+$/) { print "yah"; sleep $rtoe; } else { sleep $delay; }
+        if ($rtoe and $rtoe =~ /^\d+$/) { sleep $rtoe; } else { sleep $delay; }
     } else { return "$query/$set/0/0"; }
     while (1) {
         sleep 5;

From 97c33890be91625e49eb9d94911933d2796e67f5 Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 15:51:52 -0700
Subject: [PATCH 8/9] updated changelog

---
 ChangeLog | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index e25693f..945feb6 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -3,11 +3,16 @@ ChangeLog of HGTector
 == Version 0.2.2 (8/23/2017) ==
 
 New features:
-- Adopt new NCBI standard (accession instead of GI as sequence identifier)
+- Added an installation script "installer.sh".
+- Adopted new NCBI standard, i.e., using accession instead of GI as sequence identifier.
+- Modified the mechanism of databaser.py. Now it downloads both DNA and protein sequences.
+- Replaced 'http' with 'https' in NCBI server URLs.
+- Adopted the command-line interface of DIAMOND 0.8+.
 
 Bug fixes:
-- Reformatted all Perl scripts.
-- Replaced 'http' in NCBI URLs with 'https'.
+- Reformatted all Perl and Python scripts.
+- Reworded multiple screen prompts.
+- Multiple trivial bug fixes.
 
 
 == Incremental update (1/28/2017) ==

From 460adaaf9d222e7fafe8ff7a483ae688ea0a2abe Mon Sep 17 00:00:00 2001
From: qiyunzhu <qiyunzhu@gmail.com>
Date: Wed, 23 Aug 2017 20:49:08 -0700
Subject: [PATCH 9/9] added pre-built database

---
 ChangeLog   |  1 +
 HGTector.pl | 12 ++++++++----
 README.md   | 16 +++++-----------
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 945feb6..3ec4dc4 100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -3,6 +3,7 @@ ChangeLog of HGTector
 == Version 0.2.2 (8/23/2017) ==
 
 New features:
+- Provided a pre-built standard database.
 - Added an installation script "installer.sh".
 - Adopted new NCBI standard, i.e., using accession instead of GI as sequence identifier.
 - Modified the mechanism of databaser.py. Now it downloads both DNA and protein sequences.
diff --git a/HGTector.pl b/HGTector.pl
index 8f4eae7..0d0bca7 100644
--- a/HGTector.pl
+++ b/HGTector.pl
@@ -74,12 +74,16 @@
         }
         print "\n";
 
-        print "Now I will assist you to generate a reference database.\n";
-        print "Unless you plan to use HGTector in remote mode, a local reference database is recommended for consistant and controllable analyses.\n";
+        print "Now I will assist you to prepare a reference database.\n";
+        print "Unless you plan to always use HGTector in remote mode, a local reference database is recommended for consistant and controllable analyses.\n";
+        print "The database will have three components: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary.\n";
+        print "Using proper databases is important for optimal results. Please consider reading about the details of choice of databases in the GUI.\n";
+        print "\n";
+
+        print "A pre-built database is available (see README.md for download link). Alternatively, I will build an up-to-date one for you now.\n";
         print "Proceed (YES/no)? ";
         $s = <STDIN>; chomp $s;
         if (not $s or $s =~ /^yes$/i or $s =~ /^y$/i) {
-            print "HGTector requires three databases: a protein sequence database, a taxonomy database, and a protein-to-taxonomy dictionary. Using proper databases is important for optimal results. Please read about the details of choice of databases in the GUI.\n";
             print "\nPlease specify a location to store the databases (default: db/ in the program directory): ";
             $s = <STDIN>; chomp $s; $s =~ s/\/$//;
             $db = $s if $s;
@@ -110,7 +114,7 @@
             }
             my $i = system "python $scripts/databaser.py -format=$format -range=$range -represent=$represent -subsample=$subsample -out=$out";
             chdir $cwd;
-            print "\nThere appear to be some problems in database building.\n" unless -s "$db/$out.faa";
+            print "\nThere appears to be some problems in database building.\n" unless -s "$db/$out.faa";
             print "\nNow that databases are created. Your may link HGTector to them in future analyses, by setting the following parameters in config.txt:\n";
             if ($format eq "blast") {
                 print "protdb=$db/blast/$out\n";
diff --git a/README.md b/README.md
index 09040b1..6f42785 100644
--- a/README.md
+++ b/README.md
@@ -1,23 +1,17 @@
 HGTector
-==========
-
-**Note: Instead of downloading the release, please git clone the master branch, which contains an important update in match with the new NCBI FTP site structure.**
-
-**Alternatively, please download a pre-constructed database from [here](https://u21438014.dl.dropboxusercontent.com/u/21438014/HGTector/stdb_20170128.tar.xz).**
+========
 
 A computational pipeline for genome-wide detection of putative horizontal gene transfer (HGT) events based on sequence homology search hit distribution statistics
 
-Latest release: [0.2.1](https://github.com/DittmarLab/HGTector/archive/0.2.2.zip)
-
-Last update: 2017-1-28
-
 Authors: Qiyun Zhu (<qiyunzhu@gmail.com>), Katharina Dittmar (<katharinad@gmail.com>)
 
 Affiliation: Department of Biological Sciences, University at Buffalo, State University of New York, Buffalo, USA
 
 License: [BSD 2-clause](http://opensource.org/licenses/BSD-2-Clause).
 
-An article detailing the method and its performance on simulated and real genomic datasets can be found [here](http://www.biomedcentral.com/1471-2164/15/717):
+Download the latest release: [0.2.2](https://github.com/DittmarLab/HGTector/archive/0.2.2.zip), updated on 2017-8-23; and a standard [database](https://u21438014.dl.dropboxusercontent.com/u/21438014/HGTector/stdb_20170630.tar), updated on 2017-6-30.
+
+An article detailing the method and its performance on simulated and real genomic datasets can be found [here](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-15-717):
 
 Zhu Q, Kosoy M, Dittmar K. HGTector: an automated method facilitating genome-wide discovery of putative horizontal gene transfers. *BMC Genomics*. 2014. 15:717.
 
@@ -25,4 +19,4 @@ Please visit our lab's [website](http://katharina-dittmar.squarespace.com/) for
 
 Usage: Simply execute **perl HGTector.pl**, or, open **GUI.html** in a web browser to see a step-by-step wizard.
 
-Download [HGTector 0.2.1](https://github.com/DittmarLab/HGTector/archive/0.2.1.zip).
+Download [HGTector 0.2.2](https://github.com/DittmarLab/HGTector/archive/0.2.2.zip).

GO	Term	$_
Cellular component
Biological process
Molecular function
$go	$goes{$go}{'term'}	0 and $i < 0.1){ $t = "lightgreen"; } - elsif ($i >= 0.1 and $i < 0.5) { $t = "khaki"; } - elsif ($i >= 0.5) { $t = "lightpink"; } - print HTML " style='background-color: $t;'>$i