From d6b41d609652ff57bc2902b4c1a27b630f5469cc Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 25 Oct 2016 20:11:07 -0700 Subject: [PATCH 01/11] Minor bug fixes and print statement formatting -fixed bug in utility.check_database when no db specified -formatted error messages -fixed print statements in snps.py and genes.py --- midas/build/build_db.py | 2 +- midas/run/genes.py | 2 +- midas/run/snps.py | 2 +- midas/utility.py | 16 +++++------ scripts/build_midas_db.py | 11 ++++---- scripts/call_consensus.py | 20 ++++++------- scripts/compare_genes.py | 2 +- scripts/merge_midas.py | 14 ++++----- scripts/run_midas.py | 58 +++++++++++++++++++------------------- scripts/snp_diversity.py | 28 +++++++++--------- scripts/strain_tracking.py | 12 ++++---- 11 files changed, 84 insertions(+), 83 deletions(-) diff --git a/midas/build/build_db.py b/midas/build/build_db.py index 8cd764b..afdb537 100644 --- a/midas/build/build_db.py +++ b/midas/build/build_db.py @@ -161,7 +161,7 @@ def build_marker_db(args, genomes, species): command = "%s index %s/marker_genes/phyeco.fa " % (args['hs-blastn'], args['outdir']) process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) utility.check_exit_code(process, command) - print("4. Copying mapping cutoffs file") + print("4. Writing mapping cutoffs file") build_mapping_cutoffs(args) def build_mapping_cutoffs(args): diff --git a/midas/run/genes.py b/midas/run/genes.py index 46820bd..9fc4dd7 100644 --- a/midas/run/genes.py +++ b/midas/run/genes.py @@ -67,8 +67,8 @@ def pangenome_align(args): args['log'].write('command: '+command+'\n') process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Check for errors - print(" finished aligning") utility.check_exit_code(process, command) + print(" finished aligning") print(" checking bamfile integrity") utility.check_bamfile(args, bampath) diff --git a/midas/run/snps.py b/midas/run/snps.py index 66ecf06..4f9851e 100644 --- a/midas/run/snps.py +++ b/midas/run/snps.py @@ -55,9 +55,9 @@ def genome_align(args): args['log'].write('command: '+command+'\n') process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Check for errors + utility.check_exit_code(process, command) print(" finished aligning") print(" checking bamfile integrity") - utility.check_exit_code(process, command) utility.check_bamfile(args, bam_path) def pileup(args): diff --git a/midas/utility.py b/midas/utility.py index 4bc0983..5ca2b99 100644 --- a/midas/utility.py +++ b/midas/utility.py @@ -90,10 +90,10 @@ def add_executables(args): args['samtools'] = '/'.join([main_dir, 'bin', platform.system(), 'samtools']) for arg in ['hs-blastn', 'stream_seqs', 'bowtie2-build', 'bowtie2', 'samtools', 'stream_bam']: if not os.path.isfile(args[arg]): - sys.exit("File not found: %s" % args[arg]) + sys.exit("\nError: File not found: %s\n" % args[arg]) for arg in ['hs-blastn', 'bowtie2-build', 'bowtie2', 'samtools']: if not is_executable(args[arg]): - sys.exit("File not executable: %s" % args[arg]) + sys.exit("\nError: File not executable: %s\n" % args[arg]) def is_executable(f): """ Check if file is executable by all """ @@ -106,7 +106,7 @@ def auto_detect_file_type(inpath): for line in infile: if line[0] == '>': return 'fasta' elif line[0] == '@': return 'fastq' - else: sys.exit("Filetype [fasta, fastq] of %s could not be recognized" % inpath) + else: sys.exit("\nError: Filetype [fasta, fastq] of %s could not be recognized\n" % inpath) infile.close() def 
check_compression(inpath): @@ -117,13 +117,13 @@ def check_compression(inpath): next(file) file.close() except: - sys.exit("\nError: File extension '%s' does not match expected compression" % ext) + sys.exit("\nError: File extension '%s' does not match expected compression\n" % ext) def check_database(args): - if 'db' is None: + if args['db'] is None: error = "\nError: No reference database specified\n" - error = "Use the flag -d to specify a database,\n" - error = "or set the MIDAS_DB environmental variable: export MIDAS_DB=/path/to/midas/db\n" + error += "Use the flag -d to specify a database,\n" + error += "or set the MIDAS_DB environmental variable: export MIDAS_DB=/path/to/midas/db\n" sys.exit(error) if not os.path.isdir(args['db']): error = "\nError: Specified reference database does not exist: %s\n" % args['db'] @@ -179,7 +179,7 @@ def check_exit_code(process, command): """ Capture stdout, stderr. Check unix exit code and exit if non-zero """ out, err = process.communicate() if process.returncode != 0: - err_message = "\nError encountered executing:\n%s\n\nError message:\n%s" % (command, err) + err_message = "\nError encountered executing:\n%s\n\nError message:\n%s\n" % (command, err) sys.exit(err_message) def check_bamfile(args, bampath): diff --git a/scripts/build_midas_db.py b/scripts/build_midas_db.py index 817bfe3..4fc9e6c 100755 --- a/scripts/build_midas_db.py +++ b/scripts/build_midas_db.py @@ -16,7 +16,7 @@ def fetch_arguments(): description=""" Description: This script will allow you to build your own custom MIDAS database -Usage: build_midas_db.py [options] +Usage: build_midas_db.py indir mapfile outdir [options] """) parser.add_argument('indir', type=str, help="""Path to directory of input genomes @@ -64,15 +64,14 @@ def check_args(args): if not os.path.isdir(args['outdir']): os.mkdir(args['outdir']) if not os.path.isdir(args['indir']): - sys.exit("\nError: could not locate directory specified by --genomes: %s" % args['indir']) + sys.exit("\nError: could not locate directory specified by --genomes: %s\n" % args['indir']) if not os.path.isfile(args['mapfile']): - sys.exit("\nError: could not locate file specified by --mapping: %s" % args['mapfile']) + sys.exit("\nError: could not locate file specified by --mapping: %s\n" % args['mapfile']) for program in ['hmmsearch', 'usearch']: if not utility.which(program): - error = "" - error += "\nError: program '%s' not found in your PATH" % program - error += "\nMake sure that you've installed the program and added it's location to your PATH" + error = "\nError: program '%s' not found in your PATH" % program + error += "\nMake sure that you've installed the program and added its location to your PATH\n" sys.exit(error) diff --git a/scripts/call_consensus.py b/scripts/call_consensus.py index 2694620..473f696 100755 --- a/scripts/call_consensus.py +++ b/scripts/call_consensus.py @@ -105,25 +105,25 @@ def print_args(args): def check_args(args): if not os.path.isdir(args['indir']): - sys.exit("Specified input directory '%s' does not exist" % args['indir']) + sys.exit("\nError: Specified input directory '%s' does not exist\n" % args['indir']) if args['site_depth'] < 2: - sys.exit("\nError: --site_depth must be >=2 to calculate nucleotide variation") + sys.exit("\nError: --site_depth must be >=2 to calculate nucleotide variation\n") if args['max_sites'] < 1: - sys.exit("\nError: --max_sites must be >= 1 to calculate nucleotide variation") + sys.exit("\nError: --max_sites must be >= 1 to calculate nucleotide variation\n") if 
args['max_samples'] < 1: - sys.exit("\nError: --max_samples must be >= 1 to calculate nucleotide variation") + sys.exit("\nError: --max_samples must be >= 1 to calculate nucleotide variation\n") if args['site_ratio'] < 0: - sys.exit("\nError: --site_ratio cannot be a negative number") + sys.exit("\nError: --site_ratio cannot be a negative number\n") if args['site_depth'] < 0: - sys.exit("\nError: --site_depth cannot be a negative number") + sys.exit("\nError: --site_depth cannot be a negative number\n") if args['sample_depth'] < 0: - sys.exit("\nError: --sample_depth cannot be a negative number") + sys.exit("\nError: --sample_depth cannot be a negative number\n") if not 0 <= args['site_maf'] <= 1: - sys.exit("\nError: --site_maf must be between 0 and 1") + sys.exit("\nError: --site_maf must be between 0 and 1\n") if not 0 <= args['site_prev'] <= 1: - sys.exit("\nError: --site_prev must be between 0 and 1") + sys.exit("\nError: --site_prev must be between 0 and 1\n") if not 0 <= args['fract_cov'] <= 1: - sys.exit("\nError: --fract_cov must be between 0 and 1") + sys.exit("\nError: --fract_cov must be between 0 and 1\n") def init_sequences(samples): for sample in samples.values(): diff --git a/scripts/compare_genes.py b/scripts/compare_genes.py index 810f9e1..3cc0080 100755 --- a/scripts/compare_genes.py +++ b/scripts/compare_genes.py @@ -61,7 +61,7 @@ def init_paths(args): if os.path.isfile(inpath): paths[ext] = inpath else: - sys.exit('Input file does not exist: %s' % inpath) + sys.exit("\nError: Input file does not exist: %s\n" % inpath) return paths def compute_euclidian(df, s1, s2): diff --git a/scripts/merge_midas.py b/scripts/merge_midas.py index 3b87c08..72bd2c0 100755 --- a/scripts/merge_midas.py +++ b/scripts/merge_midas.py @@ -21,7 +21,7 @@ def get_program(): print('\tsnps\t merge single nucleotide variants of species across samples') quit() elif sys.argv[1] not in ['species', 'genes', 'snps']: - sys.exit("Unrecognized command: '%s'" % sys.argv[1]) + sys.exit("\nError: Unrecognized command: '%s'\n" % sys.argv[1]) quit() else: return sys.argv[1] @@ -35,7 +35,7 @@ def get_arguments(program): elif program == 'snps': args = snps_arguments() else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) return args def species_arguments(): @@ -229,13 +229,13 @@ def check_arguments(program, args): check_input(args) utility.check_database(args) else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) if platform.system() not in ['Linux', 'Darwin']: - sys.exit("Operating system '%s' not supported" % system()) + sys.exit("\nError: Operating system '%s' not supported\n" % platform.system()) def check_input(args): args['indirs'] = [] error = "\nError: specified input %s does not exist: %s\n" if args['intype'] == 'dir': if not os.path.isdir(args['input']): sys.exit(error % (args['intype'], os.path.abspath(args['input']))) @@ -264,7 +264,7 @@ def print_arguments(program, args): elif program == 'snps': print_snps_arguments(args) else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) def print_species_arguments(args): print ("===========Parameters===========") @@ -335,7 +335,7 @@ def run_program(program, args): from midas.merge import merge_snps merge_snps.run_pipeline(args) else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized 
program: '%s'\n" % program) if __name__ == '__main__': program = get_program() diff --git a/scripts/run_midas.py b/scripts/run_midas.py index f0848e7..377c628 100755 --- a/scripts/run_midas.py +++ b/scripts/run_midas.py @@ -21,7 +21,7 @@ def get_program(): print('\tsnps\t identify single nucleotide variants in abundant species') quit() elif sys.argv[1] not in ['species', 'genes', 'snps']: - sys.exit("Error: Unrecognized command: '%s'" % sys.argv[1]) + sys.exit("\nError: Unrecognized command: '%s'\n" % sys.argv[1]) quit() else: return sys.argv[1] @@ -40,7 +40,7 @@ def get_arguments(program): elif program == 'snps': args = snp_arguments() else: - sys.exit("Error: Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) utility.add_executables(args) return args @@ -53,9 +53,9 @@ def check_arguments(program, args): elif program == 'snps': check_snps(args) else: - sys.exit("Error: Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) if platform.system() not in ['Linux', 'Darwin']: - sys.exit("Error: Operating system '%s' not supported" % system()) + sys.exit("\nError: Operating system '%s' not supported\n" % system()) def print_arguments(program, args): """ Run program specified by user (species, genes, or snps) """ @@ -66,7 +66,7 @@ def print_arguments(program, args): elif program == 'snps': print_snp_arguments(args) else: - sys.exit("Error: Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) def run_program(program, args): """ Run program specified by user (species, genes, or snps) """ @@ -80,7 +80,7 @@ def run_program(program, args): from midas.run import snps snps.run_pipeline(args) else: - sys.exit("Error: Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) def species_arguments(): parser = argparse.ArgumentParser( @@ -167,17 +167,17 @@ def check_species(args): os.makedirs('%s/species' % args['outdir']) # check word size if args['word_size'] < 12: - sys.exit("\nError: Invalid word size: %s. Must be greater than or equal to 12" % args['word_size']) + sys.exit("\nError: Invalid word size: %s. Must be greater than or equal to 12\n" % args['word_size']) # check mapping identity if args['mapid'] and (args['mapid'] < 0 or args['mapid'] > 100): - sys.exit("\nError: Invalid mapping identity: %s. Must be between 0 and 100" % args['mapid']) + sys.exit("\nError: Invalid mapping identity: %s. Must be between 0 and 100\n" % args['mapid']) # check alignment coverage if args['aln_cov'] < 0 or args['aln_cov'] > 1: - sys.exit("\nError: Invalid alignment coverage: %s. Must be between 0 and 1" % args['aln_cov']) + sys.exit("\nError: Invalid alignment coverage: %s. 
Must be between 0 and 1\n" % args['aln_cov']) # check that m1 (and m2) exist for arg in ['m1', 'm2']: if args[arg] and not os.path.isfile(args[arg]): - sys.exit("\nError: Input file does not exist: '%s'" % args[arg]) + sys.exit("\nError: Input file does not exist: '%s'\n" % args[arg]) # check that extention matches compression if args['m1']: utility.check_compression(args['m1']) if args['m2']: utility.check_compression(args['m2']) @@ -442,7 +442,7 @@ def check_selected_species(args): path = '%s/rep_genomes/%s' % (args['db'], species_id) error = "\nError: Could not locate species representative genome: %s\n" % path if not os.path.isdir(path): - sys.exit(error) + sys.exit(error) def check_genes(args): """ Check validity of command line arguments """ @@ -469,39 +469,39 @@ def check_genes(args): if (args['species_topn'] or args['species_cov']) and args['build_db']: sys.exit("\nError: Could not find species abundance profile: %s\n\ To specify species with --species_topn or --species_cov you must have run: run_midas.py species\n\ -Alternatively, you can manually specify one or more species using --species_id" % profile) +Alternatively, you can manually specify one or more species using --species_id\n" % profile) # no database but --align specified if (args['align'] and not args['build_db'] and not os.path.isfile('%s/genes/temp/pangenomes.fa' % args['outdir'])): error = "\nError: You've specified --align, but no database has been built" - error += "\nTry running with --build_db" + error += "\nTry running with --build_db\n" sys.exit(error) # no bamfile but --cov specified if (args['cov'] and not args['align'] and not os.path.isfile('%s/genes/temp/pangenome.bam' % args['outdir'])): error = "\nError: You've specified --call_genes, but no alignments were found" - error += "\nTry running with --align" + error += "\nTry running with --align\n" sys.exit(error) # no reads if args['align'] and not args['m1']: - sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ") + sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ\n") # check input file paths for arg in ['m1', 'm2']: if args[arg] and not os.path.isfile(args[arg]): - sys.exit("\nError: Input file does not exist: '%s'" % args[arg]) + sys.exit("\nError: Input file does not exist: '%s'\n" % args[arg]) # check compression if args['m1']: utility.check_compression(args['m1']) if args['m2']: utility.check_compression(args['m2']) # input options if args['m2'] and not args['m1']: - sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads") + sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads\n") # sanity check input values if args['mapid'] < 1 or args['mapid'] > 100: - sys.exit("\nError: MAPID must be between 1 and 100") + sys.exit("\nError: MAPID must be between 1 and 100\n") if args['aln_cov'] < 0 or args['aln_cov'] > 1: - sys.exit("\nError: ALN_COV must be between 0 and 1") + sys.exit("\nError: ALN_COV must be between 0 and 1\n") def check_snps(args): """ Check validity of command line arguments """ @@ -528,13 +528,13 @@ def check_snps(args): if (args['species_topn'] or args['species_cov']) and args['build_db']: sys.exit("\nError: Could not find species abundance profile: %s\n\ To specify species with --species_topn or --species_cov you must have run: run_midas.py species\n\ -Alternatively, you can manually specify one or more species using --species_id" % profile) +Alternatively, you can manually specify one or more species using --species_id\n" % profile) # no database but 
--align specified if (args['align'] and not args['build_db'] and not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir'])): error = "\nError: You've specified --align, but no database has been built" - error += "\nTry running with --build_db" + error += "\nTry running with --build_db\n" sys.exit(error) # no bamfile but --call specified if (args['call'] and not args['align'] and not os.path.isfile('%s/snps/temp/genomes.bam' % args['outdir']) ): error = "\nError: You've specified --call_snps, but no alignments were found" - error += "\nTry running with --align" + error += "\nTry running with --align\n" sys.exit(error) # no genomes but --call specified if (args['call'] and not args['build_db'] and not os.path.isfile('%s/snps/temp/genomes.fa' % args['outdir']) ): error = "\nError: You've specified --call_snps, but no genome database was found" - error += "\nTry running with --build_db" + error += "\nTry running with --build_db\n" sys.exit(error) # no reads if args['align'] and not args['m1']: - sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ") + sys.exit("\nError: To align reads, you must specify path to input FASTA/FASTQ\n") # check input file paths for arg in ['m1', 'm2']: if args[arg] and not os.path.isfile(args[arg]): - sys.exit("\nError: Input file does not exist: '%s'" % args[arg]) + sys.exit("\nError: Input file does not exist: '%s'\n" % args[arg]) # check compression if args['m1']: utility.check_compression(args['m1']) if args['m2']: utility.check_compression(args['m2']) # input options if args['m2'] and not args['m1']: - sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads") + sys.exit("\nError: Must specify -1 and -2 if aligning paired end reads\n") # sanity check input values if args['mapid'] < 1 or args['mapid'] > 100: - sys.exit("\nError: MAPQ must be between 1 and 100") + sys.exit("\nError: MAPID must be between 1 and 100\n") if args['mapq'] < 0 or args['mapq'] > 100: - sys.exit("\nError: MAPQ must be between 0 and 100") + sys.exit("\nError: MAPQ must be between 0 and 100\n") if args['baseq'] < 0 or args['baseq'] > 100: - sys.exit("\nError: BASEQ must be between 0 and 100") + sys.exit("\nError: BASEQ must be between 0 and 100\n") def write_readme(program, args): outfile = open('%s/%s/README' % (args['outdir'], program), 'w') diff --git a/scripts/snp_diversity.py b/scripts/snp_diversity.py index 8ae381d..d8a91ea 100755 --- a/scripts/snp_diversity.py +++ b/scripts/snp_diversity.py @@ -141,31 +141,31 @@ def print_args(args): def check_args(args): if not os.path.isdir(args['indir']): - sys.exit("Specified input directory '%s' does not exist" % args['indir']) + sys.exit("\nError: Specified input directory '%s' does not exist\n" % args['indir']) if args['site_depth'] < 2: - sys.exit("\nError: --site_depth must be >=2 to calculate nucleotide variation") + sys.exit("\nError: --site_depth must be >=2 to calculate nucleotide variation\n") if args['max_sites'] < 1: - sys.exit("\nError: --max_sites must be >= 1 to calculate nucleotide variation") + sys.exit("\nError: --max_sites must be >= 1 to calculate nucleotide variation\n") if args['max_samples'] < 1: - sys.exit("\nError: --max_samples must be >= 1 to calculate nucleotide variation") + sys.exit("\nError: --max_samples must be >= 1 to calculate nucleotide variation\n") if args['site_ratio'] < 0: - sys.exit("\nError: --site_ratio cannot be a negative number") + sys.exit("\nError: --site_ratio cannot be a negative number\n") if 
args['site_depth'] < 0: - sys.exit("\nError: --site_depth cannot be a negative number") + sys.exit("\nError: --site_depth cannot be a negative number\n") if args['sample_depth'] < 0: - sys.exit("\nError: --sample_depth cannot be a negative number") + sys.exit("\nError: --sample_depth cannot be a negative number\n") if not 0 <= args['site_maf'] <= 1: - sys.exit("\nError: --site_maf must be between 0 and 1") + sys.exit("\nError: --site_maf must be between 0 and 1\n") if not 0 <= args['site_prev'] <= 1: - sys.exit("\nError: --site_prev must be between 0 and 1") + sys.exit("\nError: --site_prev must be between 0 and 1\n") if not 0 <= args['fract_cov'] <= 1: - sys.exit("\nError: --fract_cov must be between 0 and 1") + sys.exit("\nError: --fract_cov must be between 0 and 1\n") if args['rand_reads'] > args['site_depth'] and not args['replace_reads']: - sys.exit("\nError: --rand_reads cannot exceed --site_depth when --replace_reads=False") + sys.exit("\nError: --rand_reads cannot exceed --site_depth when --replace_reads=False\n") if args['rand_sites'] and (args['rand_sites'] < 0 or args['rand_sites'] > 1): - sys.exit("\nError: --rand_sites must be between 0 and 1") + sys.exit("\nError: --rand_sites must be between 0 and 1\n") if 'NC' in args['site_type'] and args['genomic_type'] == 'per-gene': - sys.exit("\nError: --site_type cannot be NC if --genomic_type is per-gene") + sys.exit("\nError: --site_type cannot be NC if --genomic_type is per-gene\n") # set min # of rand reads class Diversity: @@ -297,7 +297,7 @@ def resample_samples(samples): """ Randomly select high-quality samples, set pass_qc=False for others """ hq_indexes = [index for index, sample in enumerate(samples) if sample.pass_qc] if args['rand_samples'] > len(hq_indexes): - sys.exit("\nError: --rand_samples cannot exceed the number of high-quality samples") + sys.exit("\nError: --rand_samples cannot exceed the number of high-quality samples\n") else: hq_indexes = np.random.choice(hq_indexes, args['rand_samples'], replace=False) for index, sample in enumerate(samples): diff --git a/scripts/strain_tracking.py b/scripts/strain_tracking.py index 56e3e0d..6eeba60 100755 --- a/scripts/strain_tracking.py +++ b/scripts/strain_tracking.py @@ -20,7 +20,7 @@ def get_program(): print('\ttrack_markers track rare SNPs between samples and determine transmission') quit() elif sys.argv[1] not in ['id_markers', 'track_markers']: - sys.exit("Unrecognized command: '%s'" % sys.argv[1]) + sys.exit("\nError: Unrecognized command: '%s'\n" % sys.argv[1]) quit() else: return sys.argv[1] @@ -32,7 +32,7 @@ def get_arguments(program): elif program == 'track_markers': args = track_arguments() else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) return args def id_arguments(): @@ -130,8 +130,10 @@ def track_arguments(): useful for quick tests""") args = vars(parser.parse_args()) - if not os.path.isdir(args['indir']): sys.exit("Specified input directory '%s' does not exist" % args['indir']) - if not os.path.isfile(args['markers']): sys.exit("Specified input file '%s' does not exist" % args['markers']) + if not os.path.isdir(args['indir']): + sys.exit("\nError: Specified input directory '%s' does not exist\n" % args['indir']) + if not os.path.isfile(args['markers']): + sys.exit("\nError: Specified input file '%s' does not exist\n" % args['markers']) return args def run_program(program, args): @@ -142,7 +144,7 @@ elif program == 'track_markers': 
track_strains.track_markers(args) else: - sys.exit("Unrecognized program: '%s'" % program) + sys.exit("\nError: Unrecognized program: '%s'\n" % program) if __name__ == '__main__': program = get_program() From 37b5c7d293b493063a9b1da7d153c5b3027e110b Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:27:36 -0700 Subject: [PATCH 02/11] Updated build_midas_db -added class MarkerGenes -added feature to iteratively cluster genes -genes first clustered at 99% identity -centroids from 99% identity clusters are clustered at 95, 90, 85, 80, and 75% identity -marker genes now identified from rep_genome as described in GR -removed alignment coverage criterion for identifying marker genes -removed option to run UCLUST in batches; this may lead to out-of-memory errors in the future -added additional checks for genome.features file --- docs/build_db.md | 23 +- midas/build/build_db.py | 595 ++++++++++++++++++++------------------ scripts/build_midas_db.py | 46 ++- 3 files changed, 342 insertions(+), 322 deletions(-) diff --git a/docs/build_db.md b/docs/build_db.md index 1daf03d..aa0f8d1 100644 --- a/docs/build_db.md +++ b/docs/build_db.md @@ -6,11 +6,11 @@ This script will allow you to build a MIDAS database using your own genomes or m This script uses user-defined species and user-supplied genomes to build database files for running MIDAS. This will allow you to estimate the abundance of these species in a shotgun metagenome and quantify gene content and SNPs for species with sufficient data. -First, a pan-genome is built for each species. A pan-genome is defined as the set of non-redundant genes across all genomes for each species. The program USEARCH (http://www.drive5.com/usearch) is used for clustering genes. The default clustering identity is 95% +First, a pan-genome is built for each species. A pan-genome is defined as the set of non-redundant genes across all genomes for each species. The program USEARCH (http://www.drive5.com/usearch) is first used to cluster genes at 99% identity and identify a centroid gene sequence from each cluster. These "centroids" are used for recruiting metagenomic reads. This is done to reduce the number of genes searched while maintaining maximum mapping sensitivity. Next, USEARCH is used to cluster the centroids from 99% identity gene clusters further at 95, 90, 85, 80, and 75 percent identity. After mapping reads to 99% identity centroids with 'run_midas.py genes', reads can be optionally aggregated into gene clusters at any of the lower clustering thresholds with 'merge_midas.py genes'. -Next, a representative-genome database is built. Representative genomes are used for mapping reads and calling SNPs. The user needs to define which genome they want to use as the representative for each species. +Next, a representative-genome database is built. Representative genomes are used for mapping reads and calling SNPs. The user simply needs to define which genome they want to use as the representative for each species. -Finally, a marker genes database is built. Marker genes are defined as universal, single-copy gene families. These are genes that occur once per genome and in all genomes (of bacteria). MIDAS uses a set of 15 of such gene families. These are a subset of the PhyEco gene families described here: http://dx.doi.org/10.1371/journal.pone.0077033. To identify these genes, HMMER (http://hmmer.org) is used to scan each species' pan-genome. Once identified, a HS-BLASTN (http://dx.doi.org/10.1093/nar/gkv784) database is built for mapping short reads. 
+Finally, a marker genes database is built. Marker genes are defined as universal, single-copy gene families. These are genes that occur once per genome and in all genomes (of bacteria). MIDAS uses a set of 15 such gene families. These are a subset of the PhyEco gene families described here: http://dx.doi.org/10.1371/journal.pone.0077033. To identify these genes, HMMER (http://hmmer.org) is used to scan genes from the representative genome. Once identified, a HS-BLASTN (http://dx.doi.org/10.1093/nar/gkv784) database is built for mapping short reads. ## Requirements As with all scripts, MIDAS and its dependencies will need to be installed. @@ -39,11 +39,14 @@ Path to directory of genomes. Each subdirectory should be named with a genome id \.fna Genomic DNA sequence in FASTA format +\.faa +Protein sequences in FASTA format + \.ffn Gene DNA sequences in FASTA format \.features -File specificy genomic coordinates of genes. +File specifying genomic coordinates of genes. This file should be tab-delimited with a header and 6 fields: * scaffold_id (CHAR) * gene_id (CHAR) * start (INT) * end (INT) * strand ('+' or '-') - +* gene_type ('CDS' or 'RNA') + mapfile PATH Path to mapping file that specifies which genomes belong to the same species. The file should be a tab-delimited file with a header and 3 fields: * genome_id (CHAR): corresponds to subdirectory within INDIR * species_id (CHAR): species identifier for genome_id * rep_genome (0 or 1): indicator if genome_id should be used for SNP calling outdir PATH Directory to store MIDAS database --threads INT Number of threads to use ---pid FLOAT -Percent identity threshold for clustering genes (default=0.95) - ---iter_size INT -Maximum number of genomes to process at one time to prevent exceeding USEARCH's 4G memory limit (default=500). -If the number of genomes per species exceeds this parameter, USEARCH will be run in batches of <iter_size>. After all batches have completed, the resulting centroids from each batch are combined and clustered one final time. - --max_species INT Maximum number of species to process from input (default=use all). Useful for quick tests
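The per-gene mappings that make the read aggregation described above possible are written, per species, to gene_info.txt by the code in the build_db.py changes below: every gene records its centroid at each clustering cutoff. As a rough illustration of rolling read counts from 99% identity centroids up to 95% identity clusters, here is a minimal sketch; the counts_99 dict and file path are hypothetical, and the actual logic in merge_midas.py may differ:

```python
import csv
from collections import defaultdict

def aggregate_to_95(gene_info_path, counts_99):
    """Sum read counts of 99% identity centroids into their 95% identity clusters."""
    counts_95 = defaultdict(float)
    with open(gene_info_path) as f:
        for row in csv.DictReader(f, delimiter='\t'):
            # rows where the gene is its own 99% centroid carry the 99->95 mapping
            if row['gene_id'] == row['centroid_99']:
                counts_95[row['centroid_95']] += counts_99.get(row['gene_id'], 0.0)
    return counts_95

# hypothetical counts from reads mapped against centroids.ffn
print(aggregate_to_95('gene_info.txt', {'gene_a': 12.0, 'gene_b': 3.0}))
```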
diff --git a/midas/build/build_db.py b/midas/build/build_db.py index afdb537..968f7e5 100644 --- a/midas/build/build_db.py +++ b/midas/build/build_db.py @@ -17,27 +17,195 @@ def __init__(self, id): class Genome: def __init__(self, id, dir): self.id=id - self.dir='%s/%s'%(dir,id) - self.files = self.init_files() + self.dir='%s/%s' % (dir, id) + self.files = {} + self.init_files() + self.is_rep = None def init_files(self): - files = {} if not os.path.isdir(self.dir): sys.exit("\nError: genome directory '%s' does not exist" % (self.dir)) - for type in ['fna', 'ffn', 'features']: - for ext in ['', '.gz', '.bz2']: - inpath = '%s/%s.%s%s' % (self.dir, self.id, type, ext) - if os.path.isfile(inpath): - files[type] = inpath - if type not in files: + for type in ['fna', 'ffn', 'faa', 'features']: + inpath = '%s/%s.%s' % (self.dir, self.id, type) + if os.path.isfile(inpath): + self.files[type] = inpath + else: error = "" - error += "\nError: could not locate input file '%s/%s.%s(.gz|.bz2)'\n" % (self.dir, self.id, type) + error += "\nError: could not locate input file '%s/%s.%s'\n" % (self.dir, self.id, type) error += "\nYour genome should contain the following files:\n" error += " %s/%s.fna (FASTA of genome sequence)\n" % (self.dir, self.id) error += " %s/%s.ffn (FASTA of gene sequences)\n" % (self.dir, self.id) + error += " %s/%s.faa (FASTA of protein sequences)\n" % (self.dir, self.id) + error += " %s/%s.features (Genomic coordinates of genes)\n" % (self.dir, self.id) sys.exit(error) - return files + file = open(self.files['features']) + header = next(file).rstrip('\n').split('\t') + for field in ['gene_id', 'scaffold_id', 'start', 'end', 'strand', 'gene_type']: + if field not in header: + sys.exit("\nError: missing required field '%s' in file '%s'\n" % (field, self.files['features'])) + file.close() + +class Gene: + def __init__(self, id): + """ Instantiate Gene """ + self.id = id + self.genome_id = None + self.seq = None + self.length = None + self.cluster_id = {} + self.centroid_id = {} + +class Pangenome: + def __init__(self, sp, outdir, ext): + """ Instantiate Pangenome """ + self.dir = '%s/pan_genomes/%s' % (outdir, sp.id) + self.tmp = '%s/temp' % self.dir + self.species = sp + self.genomes = sp.genomes.values() + self.stats = {} + self.stats['genomes'] = len(self.genomes) + self.count_genes = 0 + try: os.makedirs(self.tmp) + except: pass + + def store_genes(self): + """ Store genes from all genomes """ + self.genes = {} + self.stats['genes'] = 0 + for genome in self.genomes: + for rec in Bio.SeqIO.parse(genome.files['ffn'], 'fasta'): + if str(rec.seq) == '' or str(rec.id) in ['', '|']: + continue + else: + gene = Gene(rec.id) + gene.genome_id = genome.id + gene.seq = str(rec.seq).upper() + gene.length = len(gene.seq) + self.genes[gene.id] = gene + self.stats['genes'] += 1 + + def write_readme(self): + """ Write readme.txt with description and statistics for pan-genome files """ + file = utility.iopen('%s/readme.txt' % self.dir, 'w') + file.write(""" Description and statistics for pan-genome files Summary Statistics ############ Genomes: %(genomes)s Genes: %(genes)s Gene clusters (99%% identity): %(centroids_99)s Gene clusters (95%% identity): %(centroids_95)s Gene clusters (90%% identity): %(centroids_90)s Gene clusters (85%% identity): %(centroids_85)s Gene clusters (80%% identity): %(centroids_80)s Gene clusters (75%% identity): %(centroids_75)s Output files 
+############ +genes.ffn + all genes from specified genomes + +centroids.ffn + gene sequences from 99%% identity gene clusters + used for recruiting metagenomic reads + +gene_info.txt + information for all genes from genes.ffn + the fields centroid_{99,95,90,85,80,75} indicate mappings between gene_id and gene clusters """ % self.stats) + file.close() + + def write_genes(self): + """ Concatenate all genes from pangenome into sequence file """ + file = utility.iopen('%s/genes.ffn' % self.dir, 'w') + for gene in self.genes.values(): + file.write('>%s\n%s\n' % (gene.id, gene.seq)) + file.close() + + def cluster_genes(self, threads): + """ Cluster genes at 99% ID, then cluster the centroids at lower %ID cutoffs """ + self.uclust( + genes='%s/genes.ffn' % self.dir, + pid=0.99, + centroids='%s/centroids.99.ffn' % self.tmp, + clusters='%s/uclust.99.txt' % self.tmp, + threads=threads) + self.store_gene_info(pid=99) + shutil.copy('%s/centroids.99.ffn' % self.tmp, '%s/centroids.ffn' % self.dir) + for pid in [95, 90, 85, 80, 75]: + self.uclust( + genes='%s/centroids.99.ffn' % self.tmp, + pid=pid/100.0, + centroids='%s/centroids.%s.ffn' % (self.tmp, pid), + clusters='%s/uclust.%s.txt' % (self.tmp, pid), + threads=threads) + self.store_gene_info(pid) + self.store_cluster_membership() + + def store_gene_info(self, pid): + """ Parse UCLUST file and store mapping of gene_id to centroid_id at given %ID cutoff """ + self.stats['centroids_%s' % pid] = 0 + for r in self.parse_uclust('%s/uclust.%s.txt' % (self.tmp, pid)): + if r['type'] == 'H': + self.genes[r['gene_id']].cluster_id[pid] = r['cluster_id'] + self.genes[r['gene_id']].centroid_id[pid] = r['centroid_id'] + elif r['type'] == 'S': + self.genes[r['gene_id']].cluster_id[pid] = r['cluster_id'] + self.genes[r['gene_id']].centroid_id[pid] = r['gene_id'] + self.stats['centroids_%s' % pid] += 1 + else: + continue + + def store_cluster_membership(self): + """ Map gene to 99% ID centroids at each clustering %ID cutoff """ + for gene in self.genes.values(): + gene.centroid_99 = gene.centroid_id[99] + gene.centroid_95 = self.genes[gene.centroid_99].centroid_id[95] + gene.centroid_90 = self.genes[gene.centroid_99].centroid_id[90] + gene.centroid_85 = self.genes[gene.centroid_99].centroid_id[85] + gene.centroid_80 = self.genes[gene.centroid_99].centroid_id[80] + gene.centroid_75 = self.genes[gene.centroid_99].centroid_id[75] + + def write_gene_info(self): + """ Record gene info in gene_info.txt """ + file = utility.iopen('%s/gene_info.txt' % self.dir, 'w') + header = ['gene_id', 'genome_id', 'gene_length', 'centroid_99', 'centroid_95', 'centroid_90', 'centroid_85', 'centroid_80', 'centroid_75'] + file.write('\t'.join(header)+'\n') + for gene_id in sorted(self.genes.keys()): + g = self.genes[gene_id] + values = [g.id, g.genome_id, g.length, g.centroid_99, g.centroid_95, g.centroid_90, g.centroid_85, g.centroid_80, g.centroid_75] + file.write('\t'.join([str(_) for _ in values])+'\n') + file.close() + + def clean_up(self): + """ Remove temporary files """ + shutil.rmtree('%s/temp' % self.dir) + + def parse_uclust(self, inpath): + """ Yield formatted records from UCLUST output file """ + # centroids are type == 'S' + # non-centroids are type == 'H' + # clusters are type == 'C' + fields = ['type', 'cluster_id', 'size', 'pid', 'strand', 'skip1', 'skip2', 'skip3', 'gene_id', 'centroid_id'] + with utility.iopen(inpath) as infile: + for index, line in enumerate(infile): + values = line.rstrip('\n').split('\t') + record = dict([(f,v) for f,v in zip(fields, values)]) + yield 
record + + def uclust(self, genes, pid, centroids, clusters, threads): + """ Run UCLUST from shell with specified arguments """ + command = "usearch " + command += "-cluster_fast %s " % genes + command += "-id %s " % pid + command += "-centroids %s " % centroids + command += "-uc %s " % clusters + command += "-threads %s " % threads + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + utility.check_exit_code(process, command) def parse_mapping_file(args): infile = utility.iopen(args['mapfile']) @@ -69,6 +237,9 @@ def read_species(args): sp.genomes[genome_id] = Genome(genome_id, args['indir']) if record['rep_genome'] == '1': sp.rep_genome = genome_id + sp.genomes[genome_id].is_rep = True + else: + sp.genomes[genome_id].is_rep = False # store species if len(species) < args['max_species']: species[species_id] = sp @@ -84,112 +255,9 @@ def read_genomes(species): genomes = sum([sp.genomes.values() for sp in species], []) return genomes -def parse_hmmsearch(p_in): - """ Parse HMMER domblout files. Return data-type formatted dictionary """ - f_in = utility.iopen(p_in) - for line in f_in: - if line[0] == '#': continue - x = line.rstrip().split() - query = x[0] - target = x[3] - evalue = float(x[12]) - qcov = (int(x[20]) - int(x[19]) + 1)/float(x[2]) - tcov = (int(x[16]) - int(x[15]) + 1)/float(x[5]) - yield {'query':query, 'target':target, 'evalue':evalue, 'qcov':qcov, 'tcov':tcov, 'qlen':int(x[2]), 'tlen':int(x[5])} - -def find_hits(args, species, max_evalue, min_cov): - inpath = "%s/marker_genes/temp/%s.hmmsearch" % (args['outdir'], species.id) - hits = {} - for r in parse_hmmsearch(inpath): - if r['evalue'] > max_evalue: - continue - elif min(r['qcov'], r['tcov']) < min_cov: - continue - if r['target'] not in hits: - hits[r['target']] = r - elif r['evalue'] < hits[r['target']]['evalue']: - hits[r['target']] = r - return hits.values() - -def hmmsearch(args, species): - command = "hmmsearch --noali --cpu %s " % args['threads'] - command += "--domtblout %s/marker_genes/temp/%s.hmmsearch " % (args['outdir'], species.id) - command += "%s/%s " % (os.path.dirname(__file__), 'phyeco.hmm') - command += "%s/pan_genomes/%s/centroids.faa > /dev/null" % (args['outdir'], species.id) - return command - -def hsblastn_index(fasta): - command = "hs-blastn index %s " % fasta - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={'PATH':sys.path}) - utility.check_exit_code(process, command) - -def parse_fasta(p_in): - """ Return lookup of seq_id to sequence for PATRIC genes """ - seqs = {} - infile = utility.iopen(p_in) - for r in Bio.SeqIO.parse(infile, "fasta"): - seqs[r.id] = str(r.seq).upper() - infile.close() - return seqs - -def build_fasta_db(args, species): - seqfile = utility.iopen('%s/marker_genes/phyeco.fa' % args['outdir'], 'w') - mapfile = utility.iopen('%s/marker_genes/phyeco.map' % args['outdir'], 'w') - mapfile.write('\t'.join(['species_id', 'gene_id', 'gene_length', 'marker_id'])+'\n') # add back genome id - for index, sp in enumerate(species): - path = '%s/pan_genomes/%s/centroids.ffn' % (args['outdir'], sp.id) - fna = parse_fasta(path) - for h in find_hits(args, sp, max_evalue=1e-5, min_cov=0.70): - gene = fna[h['query']].upper() - mapfile.write('%s\t%s\t%s\t%s\n' % (sp.id, h['query'], len(gene), h['target'])) - seqfile.write('>'+h['query']+'\n'+gene+'\n') - seqfile.close() - mapfile.close() - -def build_marker_db(args, genomes, species): - outdir = '%s/marker_genes/temp' % args['outdir'] - if not 
os.path.isdir(outdir): os.makedirs(outdir) - print("1. Searching marker gene HMMs vs pangenomes") - for sp in species: - print(" species: %s" % sp.id) - command = hmmsearch(args, sp) - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - utility.check_exit_code(process, command) - print("2. Building FASTA of homologs") - build_fasta_db(args, species) - print("3. Building HS-BLASTN database of homologs") - command = "%s index %s/marker_genes/phyeco.fa " % (args['hs-blastn'], args['outdir']) - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - utility.check_exit_code(process, command) - print("4. Writing mapping cutoffs file") - build_mapping_cutoffs(args) - -def build_mapping_cutoffs(args): - cutoffs = { - 'B000032':95.50, - 'B000039':94.75, - 'B000041':98.00, - 'B000062':97.25, - 'B000063':96.00, - 'B000065':98.00, - 'B000071':95.25, - 'B000079':98.00, - 'B000080':95.25, - 'B000081':97.00, - 'B000082':95.25, - 'B000086':96.75, - 'B000096':96.75, - 'B000103':95.25, - 'B000114':94.50, - } - outfile = open('%s/marker_genes/phyeco.mapping_cutoffs' % args['outdir'], 'w') - for marker_id, cutoff in cutoffs.items(): - outfile.write(marker_id+'\t'+str(cutoff)+'\n') - outfile.close() - def build_repgenome_db(args, genomes, species): for sp in species: - print("species: %s" % sp.id) + print("%s" % sp.id) outdir = '%s/rep_genomes/%s' % (args['outdir'], sp.id) if not os.path.isdir(outdir): os.makedirs(outdir) for ext in ['fna', 'features']: @@ -199,167 +267,18 @@ def build_repgenome_db(args, genomes, species): def build_pangenome_db(args, species): for sp in species: - print("species: %s" % sp.id) - pangenome = Pangenome(sp, args['outdir'], args['iter_size'], args['compress']) - pangenome.cluster_genes(args['pid'], args['threads']) - pangenome.translate() - pangenome.record_info() - pangenome.clean_up() - -class Gene: - def __init__(self, id): - self.id = id - -class GeneCluster: - def __init__(self, id): - self.id = id - -class Pangenome: - def __init__(self, sp, outdir, iter_size, ext): - self.dir = '%s/pan_genomes/%s' % (outdir, sp.id) - self.species = sp - self.genomes = sp.genomes.values() - self.ngenomes = len(self.genomes) - self.iter_size = iter_size - self.niters = 1 + (self.ngenomes - 1)/self.iter_size - - def batch_seqs(self, index, outdir, genomes): - seqfile = utility.iopen('%s/genes.ffn' % outdir, 'w') - mapfile = utility.iopen('%s/gene_to_genome.txt' % outdir, 'w') - start = index * self.iter_size - stop = start + self.iter_size - for genome in self.genomes[start:stop]: - for rec in Bio.SeqIO.parse(genome.files['ffn'], 'fasta'): - if str(rec.seq) == '' or str(rec.id) in ['', '|']: - continue - else: - seqfile.write('>%s\n%s\n' % (rec.id, str(rec.seq).upper())) - mapfile.write('%s\t%s\n' % (rec.id, genome.id)) - seqfile.close() - mapfile.close() - - def translate(self): - infile = utility.iopen('%s/centroids.ffn' % self.dir) - outfile = utility.iopen('%s/centroids.faa' % self.dir, 'w') - for r in Bio.SeqIO.parse(infile, 'fasta'): - if len(r.seq) % 3 == 0: - outfile.write('>'+r.id+'\n'+str(r.seq.translate()).rstrip('*')+'\n') - infile.close() - outfile.close() - - def cluster_genes(self, pid, threads): - print(" 1. 
Clustering genes") - # split genes into batches and cluster - for index in range(self.niters): - print(" iteration %s" % index) - outdir = '%s/temp/iter_%s' % (self.dir, index) if self.niters > 1 else '%s/temp' % self.dir - if not os.path.isdir(outdir): os.makedirs(outdir) - self.batch_seqs(index, outdir, self.genomes) - self.uclust('%s/genes.ffn' % outdir, pid, '%s/centroids.ffn' % outdir, '%s/uclust.txt' % outdir, threads) - # if split into > 1 batch, combine batches and recluster - if self.niters > 1: - print(" 2. Combining centroids across clustering iterations") - seqfile = utility.iopen('%s/temp/genes.ffn' % self.dir, 'w') - mapfile = utility.iopen('%s/temp/gene_to_genome.txt' % self.dir, 'w') - for i in range(self.niters): - indir = '%s/temp/iter_%s' % (self.dir, i) - for line in utility.iopen('%s/centroids.ffn' % indir): seqfile.write(line) - for line in utility.iopen('%s/gene_to_genome.txt' % indir): mapfile.write(line) - seqfile.close() - mapfile.close() - print(" 3. re-clustering combined centroids") - self.uclust('%s/temp/genes.ffn' % self.dir, pid, '%s/temp/centroids.ffn' % self.dir, '%s/temp/uclust.txt' % self.dir, threads) - # move centroids to final dest - shutil.move('%s/temp/centroids.ffn' % self.dir, '%s/centroids.ffn' % self.dir) - - def map_gene_to_genome(self): - genes = {} - for line in utility.iopen('%s/temp/gene_to_genome.txt' % self.dir): - gene_id, genome_id = line.rstrip('\n').split('\t') - genes[gene_id] = Gene(gene_id) - genes[gene_id].genome_id = genome_id - return genes - - def store_gene_info(self): - genes = self.map_gene_to_genome() - for r in self.parse_uclust('%s/temp/uclust.txt' % self.dir): - if r['type'] == 'H': - genes[r['gene_id']].cluster_id = r['cluster_id'] - genes[r['gene_id']].centroid = '0' - genes[r['gene_id']].length = int(r['size']) - elif r['type'] == 'S': - genes[r['gene_id']].cluster_id = r['cluster_id'] - genes[r['gene_id']].centroid = '1' - genes[r['gene_id']].length = int(r['size']) - return genes - - def store_cluster_info(self): - clusters = {} - for r in self.parse_uclust('%s/temp/uclust.txt' % self.dir): - if r['type'] == 'C': - clusters[r['cluster_id']] = GeneCluster(r['cluster_id']) - clusters[r['cluster_id']].centroid_id = r['gene_id'] - clusters[r['cluster_id']].size = int(r['size']) - return clusters - - def update_info(self): - if self.niters == 1: return - for index in range(self.niters): - for r in self.parse_uclust('%s/temp/iter_%s/uclust.txt' % (self.dir, index)): - if r['type'] == 'H': - gene = self.genes[r['gene_id']] - cluster_id = self.genes[r['centroid_id']].cluster_id - gene.cluster_id = cluster_id - gene.centroid = '0' - gene.length = r['size'] - self.clusters[cluster_id].size += 1 - - def write_cluster_info(self): - outfile = utility.iopen('%s/cluster_info.txt' % self.dir, 'w') - outfile.write('\t'.join(['cluster_id', 'size', 'centroid'])+'\n') - for cluster_id in sorted(self.clusters.keys()): - cluster = self.clusters[cluster_id] - outfile.write('\t'.join([cluster.id, str(cluster.size), cluster.centroid_id])+'\n') - outfile.close() - - def write_gene_info(self): - outfile = utility.iopen('%s/gene_info.txt' % self.dir, 'w') - outfile.write('\t'.join(['gene_id', 'genome_id', 'cluster_id', 'centroid', 'length'])+'\n') - for gene_id in sorted(self.genes.keys()): - gene = self.genes[gene_id] - outfile.write('\t'.join([gene.id, gene.genome_id, gene.cluster_id, gene.centroid, str(gene.length)])+'\n') - - def record_info(self): - self.genes = self.store_gene_info() - self.clusters = self.store_cluster_info() - 
self.update_info() - self.write_cluster_info() - self.write_gene_info() - - def clean_up(self): - shutil.rmtree('%s/temp' % self.dir) - - def parse_uclust(self, inpath): - """ Yield formatted records from UCLUST output file """ - # centroids are type == 'S' - # non-centroids are type == 'H' - # clusters are type == 'C' - fields = ['type', 'cluster_id', 'size', 'pid', 'strand', 'skip1', 'skip2', 'skip3', 'gene_id', 'centroid_id'] - with utility.iopen(inpath) as infile: - for index, line in enumerate(infile): - values = line.rstrip('\n').split('\t') - record = dict([(f,v) for f,v in zip(fields, values)]) - yield record - - def uclust(self, genes, pid, centroids, clusters, threads): - command = "usearch " - command += "-cluster_fast %s " % genes - command += "-id %s " % pid - command += "-centroids %s " % centroids - command += "-uc %s " % clusters - command += "-threads %s " % threads - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - utility.check_exit_code(process, command) + print("%s" % sp.id) + p = Pangenome(sp, outdir=args['outdir'], ext=args['compress']) + print(" catting genes") + p.store_genes() + p.write_genes() + print(" clustering genes") + p.cluster_genes(threads=args['threads']) + print(" writing gene info") + p.write_gene_info() + print(" removing temporary files") + p.clean_up() + p.write_readme() def write_species_info(args, species): outfile = utility.iopen('%s/species_info.txt' % args['outdir'], 'w') @@ -382,23 +301,135 @@ def compress(outdir): outfile.close() os.remove(inpath) +def build_marker_db(args, genomes, species): + marker_genes = MarkerGenes(args['outdir']) + print(" searching marker gene HMMs vs pangenomes") + for sp in species: + for genome in sp.genomes.values(): + marker_genes.hmmsearch( + inpath=genome.files['faa'], + outpath='%s/%s.hmmsearch' % (marker_genes.tmp, genome.id), + threads=args['threads']) + fna = marker_genes.parse_fasta(genome.files['ffn']) + for h in marker_genes.find_hits( + inpath='%s/%s.hmmsearch' % (marker_genes.tmp, genome.id), + max_evalue=1e-5, + min_cov=0.00): + gene = fna[h['query']].upper() + info = [sp.id, genome.id, h['query'], len(gene), h['target']] + marker_genes.info.write('\t'.join([str(_) for _ in info])+'\n') + if genome.is_rep: + marker_genes.fasta.write('>'+h['query']+'\n'+gene+'\n') + marker_genes.info.close() + marker_genes.fasta.close() + print(" building blast database") + marker_genes.build_hsblastn_db(hsblastn=args['hs-blastn']) + print(" writing mapping cutoffs file") + marker_genes.build_mapping_cutoffs() + print(" removing temporary files") + shutil.rmtree(marker_genes.tmp) + +class MarkerGenes: + def __init__(self, dir): + self.dir = '%s/marker_genes' % dir + self.tmp = '%s/temp' % self.dir + if not os.path.isdir(self.tmp): os.makedirs(self.tmp) + self.fasta = open('%s/phyeco.fa' % self.dir, 'w') + self.info = open('%s/phyeco.map' % self.dir, 'w') + self.header = ['species_id', 'genome_id', 'gene_id', 'gene_length', 'marker_id'] + self.info.write('\t'.join(self.header)+'\n') + + def hmmsearch(self, inpath, outpath, threads): + command = "hmmsearch --noali --cpu %s " % threads + command += "--domtblout %s " % outpath + command += "%s/%s " % (os.path.dirname(__file__), 'phyeco.hmm') + command += "%s > /dev/null" % inpath + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + utility.check_exit_code(process, command) + + def parse_hmmsearch(self, p_in): + """ Parse HMMER domtblout files. 
Return data-type formatted dictionary """ + f_in = utility.iopen(p_in) + for line in f_in: + if line[0] == '#': continue + x = line.rstrip().split() + query = x[0] + target = x[3] + evalue = float(x[12]) + qcov = (int(x[20]) - int(x[19]) + 1)/float(x[2]) + tcov = (int(x[16]) - int(x[15]) + 1)/float(x[5]) + yield {'query':query, 'target':target, 'evalue':evalue, 'qcov':qcov, 'tcov':tcov, 'qlen':int(x[2]), 'tlen':int(x[5])} + + def find_hits(self, inpath, max_evalue, min_cov): + hits = {} + for r in self.parse_hmmsearch(inpath): + if r['evalue'] > max_evalue: + continue + elif min(r['qcov'], r['tcov']) < min_cov: + continue + if r['target'] not in hits: + hits[r['target']] = r + elif r['evalue'] < hits[r['target']]['evalue']: + hits[r['target']] = r + return hits.values() + + def hsblastn_index(self, fasta): + command = "hs-blastn index %s " % fasta + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={'PATH':sys.path}) + utility.check_exit_code(process, command) + + def parse_fasta(self, p_in): + """ Return lookup of seq_id to sequence """ + seqs = {} + infile = utility.iopen(p_in) + for r in Bio.SeqIO.parse(infile, "fasta"): + seqs[r.id] = str(r.seq).upper() + infile.close() + return seqs + + def build_hsblastn_db(self, hsblastn): + command = "%s index " % hsblastn + command += " %s/phyeco.fa " % self.dir + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + utility.check_exit_code(process, command) + + def build_mapping_cutoffs(self): + cutoffs = { + 'B000032':95.50, + 'B000039':94.75, + 'B000041':98.00, + 'B000062':97.25, + 'B000063':96.00, + 'B000065':98.00, + 'B000071':95.25, + 'B000079':98.00, + 'B000080':95.25, + 'B000081':97.00, + 'B000082':95.25, + 'B000086':96.75, + 'B000096':96.75, + 'B000103':95.25, + 'B000114':94.50, + } + outfile = open('%s/phyeco.mapping_cutoffs' % self.dir, 'w') + for marker_id, cutoff in cutoffs.items(): + outfile.write(marker_id+'\t'+str(cutoff)+'\n') + outfile.close() + def run_pipeline(args): + print("Reading species info") species = read_species(args) genomes = read_genomes(species) - write_species_info(args, species) - print("Building pangenome database") - print("=====================") + write_species_info(args, species) + print("\nBuilding pangenome database") build_pangenome_db(args, species) print("\nBuilding representative genome database") - print("=====================") build_repgenome_db(args, genomes, species) print("\nBuilding marker genes database") - print("=====================") build_marker_db(args, genomes, species) print("")
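For reference, the parse_uclust method above reads USEARCH's tab-delimited .uc output, in which the first field is a record type: 'S' marks a cluster seed (the centroid), 'H' marks a member sequence assigned to a seed, and 'C' is a per-cluster summary row. A small self-contained illustration of that parsing; the records below are made up:

```python
fields = ['type', 'cluster_id', 'size', 'pid', 'strand',
          'skip1', 'skip2', 'skip3', 'gene_id', 'centroid_id']

records = [
    "S\t0\t1245\t*\t*\t*\t*\t*\tgene_a\t*",          # gene_a seeds cluster 0
    "H\t0\t1245\t99.2\t+\t*\t*\t*\tgene_b\tgene_a",  # gene_b assigned to gene_a
    "C\t0\t2\t*\t*\t*\t*\t*\tgene_a\t*",             # summary: cluster 0 has 2 members
]

for line in records:
    r = dict(zip(fields, line.split('\t')))
    if r['type'] == 'S':
        print('%s is the centroid of cluster %s' % (r['gene_id'], r['cluster_id']))
    elif r['type'] == 'H':
        print('%s belongs to centroid %s' % (r['gene_id'], r['centroid_id']))
```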
diff --git a/scripts/build_midas_db.py b/scripts/build_midas_db.py index 4fc9e6c..ffc67a0 100755 --- a/scripts/build_midas_db.py +++ b/scripts/build_midas_db.py @@ -21,60 +21,52 @@ def fetch_arguments(): parser.add_argument('indir', type=str, help="""Path to directory of input genomes Each subdirectory should be named according to a genome_id -Each subdirectory should contain the files: - .fna: Genomic DNA sequence in FASTA format - .ffn: Gene DNA sequences in FASTA format - .features: Genomic coordinates of genes. Tab-delimited with a header fields: - * scaffold_id (CHAR) - * gene_id (CHAR) - * start (INT) - * end (INT) - * strand ('+' or '-') +Each subdirectory should contain (replace genome_id): + genome_id.fna: Genomic DNA sequence in FASTA format + genome_id.ffn: Gene DNA sequences in FASTA format + genome_id.faa: Protein sequences in FASTA format + genome_id.features: Genomic coordinates of genes. + sorted by: scaffold_id, start + tab-delimited with a header and fields: + gene_id (CHAR) + scaffold_id (CHAR) + start (INT) + end (INT) + strand (+/-) + gene_type (CDS/RNA) """) parser.add_argument('mapfile', type=str, - help=""" -Path to mapping file that specifies which genomes belonging to the same species. + help="""Path to mapping file that specifies which genomes belong to the same species. The file should be a tab-delimited file with a header and 3 fields: - * genome_id (CHAR): corresponds to subdirectory within INDIR - * species_id (CHAR): species identifier for genome_id - * rep_genome (0 or 1): indicator if genome_id should be used for SNP calling + genome_id (CHAR): corresponds to subdirectory within INDIR + species_id (CHAR): species identifier for genome_id + rep_genome (0 or 1): indicator if genome_id should be used for SNP calling """) parser.add_argument('outdir', type=str, help="Directory to store MIDAS database") parser.add_argument('--threads', type=str, metavar='INT', default=1, - help="Number of threads to use") - parser.add_argument('--pid', type=float, metavar='FLOAT', default=0.95, - help="Percent identity threshold for defining non-redundant genes (0.95)") - parser.add_argument('--iter_size', type=int, default=500, metavar='INT', - help="""Maximum number of genomes to process with at one time -to prevent exceeding USEARCH's 4G memory limit (500)""") + help="Number of threads to use (1)") + parser.add_argument('--compress', action='store_true', default=False, + help="Compress output files with gzip (False)") parser.add_argument('--max_species', type=int, default=float('inf'), metavar='INT', help="Maximum number of species to process from input (use all).\nUseful for quick tests") parser.add_argument('--max_genomes', type=int, default=float('inf'), metavar='INT', help="Maximum number of genomes to process per species (use all).\nUseful for quick tests") - parser.add_argument('--compress', action='store_true', default=False, - help="Compress output files with gzip") - args = vars(parser.parse_args()) - return args def check_args(args): - if not os.path.isdir(args['outdir']): os.mkdir(args['outdir']) if not os.path.isdir(args['indir']): sys.exit("\nError: could not locate directory specified by --genomes: %s\n" % args['indir']) if not os.path.isfile(args['mapfile']): sys.exit("\nError: could not locate file specified by --mapping: %s\n" % args['mapfile']) - for program in ['hmmsearch', 'usearch']: if not utility.which(program): error = "\nError: program '%s' not found in your PATH" % program error += "\nMake sure that you've installed the program and added its location to your PATH\n" sys.exit(error) - if __name__ == "__main__": args = fetch_arguments() utility.add_executables(args) From aa56f14c0ac2982a4399c9a6fbdd1856f9bcca1b Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:32:46 -0700 Subject: [PATCH 03/11] Fixed bug computing read-depth of marker genes --- midas/run/species.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/midas/run/species.py b/midas/run/species.py index 9c12f44..ad00744 100644 --- a/midas/run/species.py +++ b/midas/run/species.py @@ -4,7 +4,7 @@ # Copyright (C) 2015 Stephen Nayfach # Freely distributed under the GNU General Public License (GPLv3) -import sys, os, subprocess +import sys, os, subprocess, Bio.SeqIO from time import time from midas import utility from operator import itemgetter @@ -17,10 +17,13 @@ def read_annotations(args): return info def read_marker_info(args): + """ Read info for 
marker genes from phyeco.fa """ info = {} - inpath = '%s/marker_genes/phyeco.map' % args['db'] - for r in utility.parse_file(inpath): - info[r['gene_id']] = r + for seq in Bio.SeqIO.parse('%s/marker_genes/phyeco.fa' % args['db'], 'fasta'): + info[seq.id] = None + for r in utility.parse_file('%s/marker_genes/phyeco.map' % args['db']): + if r['gene_id'] in info: + info[r['gene_id']] = r return info def map_reads_hsblast(args): From 95d117f2f885a53d058f0b40f7f0240702884026 Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:34:58 -0700 Subject: [PATCH 04/11] Fixed bug when parsing site_id --- midas/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/midas/parse.py b/midas/parse.py index f8a0b21..103c228 100644 --- a/midas/parse.py +++ b/midas/parse.py @@ -69,7 +69,7 @@ def __init__(self, species, samples): def site_id(self): """ parse info from site_id """ self.id = self.info['site_id'] - self.ref_id, self.ref_pos, self.ref_allele = self.id.rsplit('|',2) + self.ref_id, self.ref_pos, self.ref_allele = self.id.rsplit('|', 2) def fetch_row(self, species): """ fetch next row from matrices """ From 9845d942c4f1e807eee7fdccb83c3ba0b5e7e082 Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:37:37 -0700 Subject: [PATCH 05/11] updated docs --- docs/install.md | 2 +- docs/merge_cnvs.md | 17 ++++++++++++----- docs/ref_db.md | 23 +++++++++++++---------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/docs/install.md b/docs/install.md index 1cf6123..c045073 100644 --- a/docs/install.md +++ b/docs/install.md @@ -20,7 +20,7 @@ Or, download the latest source code: https://github.com/snayfach/MIDAS/archive/master.zip And unpack the project: `unzip MIDAS-master.zip` -Or, download a stable releases of the source code: +Or, download a stable release of the source code: [https://github.com/snayfach/MIDAS/releases] (https://github.com/snayfach/MIDAS/releases) ### Install dependencies diff --git a/docs/merge_cnvs.md b/docs/merge_cnvs.md index 56be17f..067d6d0 100644 --- a/docs/merge_cnvs.md +++ b/docs/merge_cnvs.md @@ -34,6 +34,10 @@ Sample filters (select subset of samples from INPUT): --max_samples INT Maximum number of samples to process. useful for testing (use all) Presence/Absence: + --cluster_pid {75,80,85,90,95,99} + In the database, pan-genomes are defined at 6 different % identity clustering cutoffs + CLUSTER_PID allows you to quantify gene content for any of these sets of gene clusters + By default, gene content is reported for genes clustered at 95% identity (95) --min_copy FLOAT Genes >= MIN_COPY are classified as present Genes < MIN_COPY are classified as absent (0.35) ``` @@ -41,19 +45,22 @@ Presence/Absence: ## Examples Examples: -1) Merge results for all species. Provide list of paths to sample directories: +1) Merge results for all species. 
Provide list of paths to sample directories:
 `merge_midas.py genes /path/to/outdir -i sample_1,sample_2 -t list`

-2) Merge results for one species:
+2) Merge results for one species:
 `merge_midas.py genes /path/to/outdir --species_id Bacteroides_vulgatus_57955 -i sample_1,sample_2 -t list`

-3) Exclude low-coverage samples in output matrix:
+3) Quantify gene clusters at 90% identity:
+`merge_midas.py genes /path/to/outdir -i /path/to/samples -t dir --cluster_pid 90`
+
+4) Exclude low-coverage samples in output matrix:
 `merge_midas.py genes /path/to/outdir -i /path/to/samples -t dir --sample_depth 5.0`

-4) Use lenient threshold for determining gene presence-absence:
+5) Use lenient threshold for determining gene presence-absence:
 `merge_midas.py genes /path/to/outdir -i /path/to/samples -t dir --min_copy 0.1`

-5) Run a quick test:
+6) Run a quick test:
 `merge_midas.py genes /path/to/outdir -i /path/to/samples -t dir --max_species 1 --max_samples 10`

diff --git a/docs/ref_db.md b/docs/ref_db.md
index 8638761..6f0cab4 100644
--- a/docs/ref_db.md
+++ b/docs/ref_db.md
@@ -4,27 +4,27 @@ Description of how the MIDAS database was constructed, how the species groups co
 ## Install reference database

 #### Step 1. download default database
-Download from your browser:
-[http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz](http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz)
+Download the latest version from your browser:
+[http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz](http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz)

 Or, download from the command line:
-on Unix: `wget http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz`
-on OSX: `curl http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz > midas_db_v1.1.tar.gz`
+on Unix: `wget http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz`
+on OSX: `curl http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz > midas_db_v1.2.tar.gz`

 * This may take several minutes to several hours, depending on your internet speed
 * The entire database requires 32 GB of free space to download and decompress
 * Once decompressed, it takes 16G of free space
 * See midas_db_v1.2/README for more information

 #### Step 2. unpack tarball
-`tar -zxvf midas_db_v1.1.tar.gz`
+`tar -zxvf midas_db_v1.2.tar.gz`
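The unpack step can also be scripted from Python if preferred; this is only a convenience sketch, equivalent to the `tar -zxvf` command above and assuming the tarball sits in the working directory:

```
# Sketch: unpack the database tarball from Python (equivalent to tar -zxvf)
import tarfile
with tarfile.open('midas_db_v1.2.tar.gz', 'r:gz') as tar:
    tar.extractall('.')  # creates the midas_db_v1.2/ directory
```

 #### Step 3. create MIDAS_DB environmental variable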
 The MIDAS_DB variable tells MIDAS where the reference database is located:
-`export MIDAS_DB=midas_db_v1.1`
+`export MIDAS_DB=midas_db_v1.2`

 Alternatively, you can manually specify the database location when you run MIDAS:
-ex: `run_midas.py species outdir -d midas_db_v1.1 [options]`
+ex: `run_midas.py species outdir -d midas_db_v1.2 [options]`

 ## Build custom database
 Alternatively, you can build a custom database with your own genome sequences [read more] (build_db.md)
@@ -53,8 +53,11 @@ Each genome-cluster was annotated according to the consensus (i.e., most common)

 **Pan-genome**
-* The set of non-redundant genes (95% DNA identity) across all genomes within species
-* Metagenomic reads are mapped to pan-genome database to determine the gene-content of strains in a sample
+* The set of non-redundant genes (99% DNA identity) across all genomes within each species
+* Mapping between 99% identity gene clusters and gene clusters at lower % identity clustering thresholds (95, 90, 85, 80, and 75)
+* Metagenomic reads are initially mapped to centroid gene sequences from 99% gene families
+* Reads can be easily aggregated into gene clusters at lower % identity clustering thresholds
+* Mapped reads are used to determine the gene content of strains in a sample
 * Gene clustering was performed with USEARCH

 ### Database coverage across biomes

From e44140de142f4bba91fe57a935046c396e1f8421 Mon Sep 17 00:00:00 2001
From: snayfach
Date: Tue, 1 Nov 2016 12:38:53 -0700
Subject: [PATCH 06/11] Switch to classes, reporting # of mapped reads

-added Species and Genes classes
-reporting # of mapped reads (total and per gene)
-simplified functions in run_pipeline
---
 midas/run/genes.py | 285 ++++++++++++++++++++++++---------------------
 1 file changed, 150 insertions(+), 135 deletions(-)

diff --git a/midas/run/genes.py b/midas/run/genes.py
index 9fc4dd7..2f784ad 100644
--- a/midas/run/genes.py
+++ b/midas/run/genes.py
@@ -4,11 +4,84 @@
 # Copyright (C) 2015 Stephen Nayfach
 # Freely distributed under the GNU General Public License (GPLv3)

-import sys, os, subprocess, gzip
+import sys, os, subprocess, gzip, csv, Bio.SeqIO, numpy as np
+from collections import defaultdict
 from time import time
 from midas import utility
 from midas.run import stream_bam

+class Species:
+	""" Base class for species """
+	def __init__(self, id):
+		self.id = id
+		self.paths = {}
+		self.genes = []
+		self.pangenome_size = 0
+		self.reads = 0
+		self.bases = 0.0
+		self.depth = 0.0
+		self.markers = defaultdict(float)
+
+	def init_ref_db(self, ref_db):
+		""" Set paths to input files """
+		self.dir = '%s/pan_genomes/%s' % (ref_db, self.id)
+		for ext in ['', '.gz']:
+			for file in ['centroids.ffn', 'cluster_info.txt', 'gene_info.txt']:
+				inpath = '%s/%s%s' % (self.dir, file, ext)
+				if os.path.isfile(inpath):
+					self.paths[file] = inpath
+
+def initialize_species(args):
+	""" Initialize Species objects """
+	species = {}
+	splist = '%s/genes/species.txt' % args['outdir']
+	if args['build_db']:
+		from midas.run.species import select_species
+		with open(splist, 'w') as outfile:
+			for id in select_species(args):
+				species[id] = Species(id)
+				outfile.write(id+'\n')
+	elif os.path.isfile(splist):
+		for line in open(splist):
+			species[line.rstrip()] = Species(line.rstrip())
+	for sp in species.values():
+		sp.init_ref_db(args['db'])
+	return species
+
+class Gene:
+	""" Base class for gene """
+	def __init__(self, id):
+		self.id = id
+		self.reads = 0
+		self.bases = 0.0
+		self.depth = 0.0
+		self.length = 0
+		self.copies = 0.0
+
self.marker_id = None + +def initialize_genes(args, species): + """ Initialize Gene objects """ + genes = {} + # fetch gene_id, species_id, gene length + for sp in species.values(): + path = sp.paths['centroids.ffn'] + file = utility.iopen(path) + for seq in Bio.SeqIO.parse(file, 'fasta'): + genes[seq.id] = Gene(seq.id) + genes[seq.id].species_id = sp.id + genes[seq.id].length = len(seq.seq) + sp.pangenome_size += 1 + file.close() + # fetch marker_id + path = '%s/marker_genes/phyeco.map' % args['db'] + file = utility.iopen(path) + reader = csv.DictReader(file, delimiter='\t') + for r in reader: + if r['gene_id'] in genes: + genes[r['gene_id']].marker_id=r['marker_id'] + file.close() + return genes + def build_pangenome_db(args, species): """ Build FASTA and BT2 database from pangene species centroids """ import Bio.SeqIO @@ -19,7 +92,7 @@ def build_pangenome_db(args, species): db_stats = {'total_length':0, 'total_seqs':0, 'species':0} for sp in species.values(): db_stats['species'] += 1 - infile = utility.iopen(sp.pan_genome) + infile = utility.iopen(sp.paths['centroids.ffn']) for r in Bio.SeqIO.parse(infile, 'fasta'): pangenome_fasta.write('>%s\n%s\n' % (r.id, str(r.seq).upper())) pangenome_map.write('%s\t%s\n' % (r.id, sp.id)) @@ -72,15 +145,21 @@ def pangenome_align(args): print(" checking bamfile integrity") utility.check_bamfile(args, bampath) -def count_mapped_bp(args): - """ Count number of bp mapped to each centroid across pangenomes """ - import pysam, numpy as np +def pangenome_coverage(args, species, genes): + """ Compute coverage of pangenome for species_id and write results to disk """ + count_mapped_bp(args, species, genes) + normalize(args, species, genes) + write_results(args, species, genes) + +def count_mapped_bp(args, species, genes): + """ Count number of bp mapped to each gene across pangenomes """ + import pysam bam_path = '/'.join([args['outdir'], 'genes/temp/pangenomes.bam']) aln_file = pysam.AlignmentFile(bam_path, "rb") - ref_to_length = dict([(i,j) for i,j in zip(aln_file.references, aln_file.lengths)]) - gene_to_cov = dict([(i,0.0) for i in aln_file.references]) + i, j = 0,0 + # loop over alignments, sum values per gene for index, aln in enumerate(aln_file.fetch(until_eof = True)): - query = aln.query_name + i += 1 if stream_bam.compute_perc_id(aln) < args['mapid']: continue elif stream_bam.compute_aln_cov(aln) < args['aln_cov']: @@ -91,145 +170,82 @@ def count_mapped_bp(args): continue else: gene_id = aln_file.getrname(aln.reference_id) - cov = len(aln.query_alignment_sequence)/float(ref_to_length[gene_id]) - gene_to_cov[gene_id] += cov - return gene_to_cov + genes[gene_id].reads += 1 + genes[gene_id].bases += len(aln.query_alignment_sequence) + genes[gene_id].depth += genes[gene_id].bases/float(genes[gene_id].length) + j += 1 + print(" total aligned reads: %s" % i) + print(" total mapped reads: %s" % j) + # loop over genes, sum values per species + for gene in genes.values(): + species[gene.species_id].reads += gene.reads + species[gene.species_id].bases += gene.bases + species[gene.species_id].depth += gene.depth + species[gene.species_id].genes.append(gene.depth) + # loop over species, compute summaries + for sp in species.values(): + sp.covered_genes = sum([1 for _ in sp.genes if _ > 0]) + sp.mean_coverage = np.mean([_ for _ in sp.genes if _ > 0]) + sp.fraction_covered = sp.covered_genes/float(sp.pangenome_size) -def compute_marker_cov(args, species, gene_to_cov, ref_to_species): - """ Count number of bp mapped to each marker marker gene """ - from numpy 
import median - # read in map of gene to marker - gene_to_marker = read_marker_map(args, species) - marker_ids = set([marker_id for gene_id, marker_id in gene_to_marker.items()]) - # init marker coverage - species_to_marker_to_cov = {} - for species_id in species: - species_to_marker_to_cov[species_id] = {} - for marker_id in marker_ids: - species_to_marker_to_cov[species_id][marker_id] = 0.0 - # compute marker coverages - for gene_id, marker_id in gene_to_marker.items(): - if gene_id in ref_to_species: - species_id = ref_to_species[gene_id] - if marker_id in marker_ids and gene_id in gene_to_cov: - species_to_marker_to_cov[species_id][marker_id] += gene_to_cov[gene_id] - # compute median marker cov - species_to_norm = {} - for species_id in species_to_marker_to_cov: - covs = list(species_to_marker_to_cov[species_id].values()) - species_to_norm[species_id] = median(covs) - return species_to_norm +def normalize(args, species, genes): + """ Count number of bp mapped to each marker gene """ + # compute marker depth + for gene in genes.values(): + if gene.marker_id is not None: + gene.depth = gene.depth + species[gene.species_id].markers[gene.marker_id] += gene.depth + # compute median marker depth + for sp in species.values(): + sp.marker_coverage = np.median(sp.markers.values()) + # normalize genes by median marker depth + for gene in genes.values(): + sp = species[gene.species_id] + if sp.marker_coverage > 0: + gene.copies = gene.depth/sp.marker_coverage -def compute_pangenome_coverage(args, species): - """ Compute coverage of pangenome for species_id and write results to disk """ - # map gene_id to species_id - ref_to_species = {} - for line in open('/'.join([args['outdir'], 'genes/temp/pangenomes.map'])): - gene_id, species_id = line.rstrip().split() - ref_to_species[gene_id] = species_id - +def write_results(args, species, genes): + """ Write results to disk """ # open outfiles for each species_id - outfiles = {} - for species_id in species: - outpath = '/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id]) - outfiles[species_id] = utility.iopen(outpath, 'w') - outfiles[species_id].write('\t'.join(['gene_id', 'coverage', 'copy_number'])+'\n') - - # parse bam into cov files for each species_id - gene_to_cov = count_mapped_bp(args) - - # compute normalization factor - species_to_norm = compute_marker_cov(args, species, gene_to_cov, ref_to_species) - + header = ['gene_id', 'count_reads', 'coverage', 'copy_number'] + for sp in species.values(): + path = '/'.join([args['outdir'], 'genes/output/%s.genes.gz' % sp.id]) + sp.out = utility.iopen(path, 'w') + sp.out.write('\t'.join(header)+'\n') # write to output files - for gene_id in sorted(gene_to_cov): - cov = gene_to_cov[gene_id] - species_id = ref_to_species[gene_id] - outfile = outfiles[species_id] - normcov = cov/species_to_norm[species_id] if species_to_norm[species_id] > 0 else 0.0 - outfile.write('\t'.join([str(x) for x in [gene_id, cov, normcov]])+'\n') + for gene_id in sorted(genes): + gene = genes[gene_id] + sp = species[gene.species_id] + values = [gene.id, gene.reads, gene.depth, gene.copies] + sp.out.write('\t'.join([str(_) for _ in values])+'\n') + # close output files + for sp in species.values(): + sp.out.close() + # summary stats + path = '/'.join([args['outdir'], 'genes/summary.txt']) + file = open(path, 'w') + header = ['species_id', 'pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage', 'count_reads'] + file.write('\t'.join(header)+'\n') + for sp in species.values(): + values 
= [sp.id, sp.pangenome_size, sp.covered_genes, sp.fraction_covered, sp.mean_coverage, sp.marker_coverage, sp.reads] + file.write('\t'.join([str(_) for _ in values])+'\n') + file.close() def remove_tmp(args): """ Remove specified temporary files """ import shutil shutil.rmtree('/'.join([args['outdir'], 'genes/temp'])) -def genes_summary(args): - """ Get summary of mapping statistics """ - # store stats - stats = {} - inpath = '%s/%s' % (args['outdir'], 'genes/temp/pangenomes.map') - for species_id in args['species_id']: - pangenome_size, covered_genes, total_coverage, marker_coverage = [0,0,0,0] - for r in utility.parse_file('/'.join([args['outdir'], 'genes/output/%s.genes.gz' % species_id])): - pangenome_size += 1 - coverage = float(r['coverage']) - normcov = float(r['copy_number']) - if coverage > 0: - covered_genes += 1 - total_coverage += coverage - if normcov > 0: - marker_coverage = coverage/normcov - stats[species_id] = {'pangenome_size':pangenome_size, - 'covered_genes':covered_genes, - 'fraction_covered':covered_genes/float(pangenome_size), - 'mean_coverage':total_coverage/covered_genes if covered_genes > 0 else 0.0, - 'marker_coverage':marker_coverage} - # write stats - fields = ['pangenome_size', 'covered_genes', 'fraction_covered', 'mean_coverage', 'marker_coverage'] - outfile = open('/'.join([args['outdir'], 'genes/summary.txt']), 'w') - outfile.write('\t'.join(['species_id'] + fields)+'\n') - for species_id in stats: - record = [species_id] + [str(stats[species_id][field]) for field in fields] - outfile.write('\t'.join(record)+'\n') - -def read_marker_map(args, species): - gene_to_marker = {} - path = '%s/marker_genes/phyeco.map' % args['db'] - for r in utility.parse_file(path): - if r['species_id'] in species: - gene_to_marker[r['gene_id']] = r['marker_id'] - return gene_to_marker - -class Species: - """ Base class for species """ - def __init__(self, id): - self.id = id - - def init_ref_db(self, ref_db): - self.dir = '%s/pan_genomes/%s' % (ref_db, self.id) - for ext in ['', '.gz']: - inpath = '%s/centroids.ffn%s' % (self.dir, ext) - if os.path.isfile(inpath): self.pan_genome = inpath - inpath = '%s/cluster_info.txt%s' % (self.dir, ext) - if os.path.isfile(inpath): self.cluster_info = inpath - self.gene_to_cluster = {} - self.cluster_to_gene = {} - for r in utility.parse_file(self.cluster_info): - self.gene_to_cluster[r['centroid']] = r['cluster_id'] - self.gene_to_cluster[r['cluster_id']] = r['centroid'] - -def initialize_species(args): - species = {} - splist = '%s/genes/species.txt' % args['outdir'] - if args['build_db']: - from midas.run.species import select_species - with open(splist, 'w') as outfile: - for id in select_species(args): - species[id] = Species(id) - outfile.write(id+'\n') - elif os.path.isfile(splist): - for line in open(splist): - species[id] = Species(line.rstrip()) - for sp in species.values(): - sp.init_ref_db(args['db']) - return species - def run_pipeline(args): """ Run entire pipeline """ - # Initialize species + # Initialize reference data + print("\nReading reference data") + start = time() species = initialize_species(args) + genes = initialize_genes(args, species) + print(" %s minutes" % round((time() - start)/60, 2) ) + print(" %s Gb maximum memory" % utility.max_mem_usage()) # Build pangenome database for selected species if args['build_db']: @@ -254,8 +270,7 @@ def run_pipeline(args): start = time() print("\nComputing coverage of pangenomes") args['log'].write("\nComputing coverage of pangenomes\n") - compute_pangenome_coverage(args, 
species) - genes_summary(args) + pangenome_coverage(args, species, genes) print(" %s minutes" % round((time() - start)/60, 2) ) print(" %s Gb maximum memory" % utility.max_mem_usage()) From 555d107fcce387f549d5b54999e37e2cfb51afbe Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:53:12 -0700 Subject: [PATCH 07/11] Added support for --cluster_pid, genes_reads.txt --- midas/merge/merge_genes.py | 59 +++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/midas/merge/merge_genes.py b/midas/merge/merge_genes.py index a6db79a..3e97420 100755 --- a/midas/merge/merge_genes.py +++ b/midas/merge/merge_genes.py @@ -9,45 +9,46 @@ from midas import utility from midas.merge import merge -def build_gene_matrices(species_id, samples, args): +def build_gene_matrices(sp, min_copy): """ Compute gene copy numbers for samples """ - for sample in samples: + for sample in sp.samples: sample.genes = {} - for type in ['presabs', 'copynum', 'depth']: - sample.genes[type] = defaultdict(float) - inpath = '%s/genes/output/%s.genes.gz' % (sample.dir, species_id) + for field, dtype in [('presabs',float), ('copynum',float), ('depth',float), ('reads',int)]: + sample.genes[field] = defaultdict(dtype) + inpath = '%s/genes/output/%s.genes.gz' % (sample.dir, sp.id) for r in utility.parse_file(inpath): if 'ref_id' in r: r['gene_id'] = r['ref_id'] # fix old fields if present if 'normalized_coverage' in r: r['copy_number'] = r['normalized_coverage'] if 'raw_coverage' in r: r['coverage'] = r['raw_coverage'] - gene_id = r['gene_id'] + gene_id = sp.map[r['gene_id']] sample.genes['copynum'][gene_id] += float(r['copy_number']) sample.genes['depth'][gene_id] += float(r['coverage']) - for sample in samples: + sample.genes['reads'][gene_id] += int(r['count_reads']) if 'count_reads' in r else 0 + for sample in sp.samples: for gene_id, copynum in sample.genes['copynum'].items(): - if copynum >= args['min_copy']: sample.genes['presabs'][gene_id] = 1 + if copynum >= min_copy: sample.genes['presabs'][gene_id] = 1 else: sample.genes['presabs'][gene_id] = 0 -def write_gene_matrices(species_id, samples, args): +def write_gene_matrices(sp): """ Compute pangenome matrices to file """ # open outfiles outfiles = {} - for type in ['presabs', 'copynum', 'depth']: - outfiles[type] = open('%s/%s/genes_%s.txt' % (args['outdir'], species_id, type), 'w') - outfiles[type].write('\t'.join(['gene_id'] + [s.id for s in samples])+'\n') + for type in ['presabs', 'copynum', 'depth', 'reads']: + outfiles[type] = open('%s/genes_%s.txt' % (sp.dir, type), 'w') + outfiles[type].write('\t'.join(['gene_id'] + [s.id for s in sp.samples])+'\n') # write values - genes = sorted(samples[0].genes['depth']) + genes = sorted(sp.samples[0].genes['depth']) for gene_id in genes: - for type in ['presabs', 'copynum', 'depth']: + for type in ['presabs', 'copynum', 'depth', 'reads']: outfiles[type].write(gene_id) - for sample in samples: + for sample in sp.samples: outfiles[type].write('\t%s' % str(sample.genes[type][gene_id])) outfiles[type].write('\n') for outfile in outfiles.values(): outfile.close() def write_readme(args, sp): - outfile = open('%s/%s/README' % (args['outdir'], sp.id), 'w') + outfile = open('%s/readme.txt' % sp.dir, 'w') outfile.write(""" Description of output files and file formats from 'merge_midas.py genes' @@ -61,12 +62,14 @@ def write_readme(args, sp): genes_presabs.txt the presence (1) or absence (0) of each gene per sample estimated by applying a threshold to gene copy-number values 
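To make the copy-number fields above concrete, here is a toy calculation; the marker-gene names and depths are hypothetical, and 0.35 is the default --min_copy cutoff documented for merge_midas.py genes:

```
# Toy example (hypothetical numbers) linking genes_depth, genes_copynum and
# genes_presabs: copy number = gene depth normalized by the median depth of
# universal single-copy marker genes; presence applies the --min_copy cutoff.
import numpy as np
marker_depth = {'marker_1': 4.1, 'marker_2': 3.9, 'marker_3': 4.0}  # hypothetical
norm = np.median(list(marker_depth.values()))       # ~4.0x, proxy for per-cell coverage
gene_depth = 1.2                                    # read-depth of one pan-genome gene
copy_number = gene_depth/norm if norm > 0 else 0.0  # ~0.3 copies per cell
presabs = 1 if copy_number >= 0.35 else 0           # -> 0 (absent) in this toy case
```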
+genes_reads.txt + number of reads mapped to each gene per sample genes_summary.txt alignment summary statistics per sample Output formats ############ -genes_depth.txt, genes_copynum.txt, genes_presabs.txt +genes_depth.txt, genes_copynum.txt, genes_presabs.txt, genes_reads.txt tab-delimited matrix files field names are sample ids row names are gene ids @@ -84,20 +87,30 @@ def write_readme(args, sp): """ % (args['db'], sp.id) ) outfile.close() +def read_cluster_map(sp, db, pid): + sp.map = {} + for ext in ['', '.gz']: + path = '/'.join([db, 'pan_genomes', sp.id, 'gene_info.txt%s' % ext]) + if os.path.isfile(path): + sp.gene_info = path + for r in utility.parse_file(sp.gene_info): + sp.map[r['centroid_99']] = r['centroid_%s' % pid] + def run_pipeline(args): print("Identifying species") species = merge.select_species(args, type='genes') - + for sp in species: - + print("Merging: %s for %s samples" % (sp.id, len(sp.samples))) - outdir = os.path.join(args['outdir'], sp.id) - if not os.path.isdir(outdir): os.mkdir(outdir) + sp.dir = os.path.join(args['outdir'], sp.id) + if not os.path.isdir(sp.dir): os.mkdir(sp.dir) + read_cluster_map(sp, args['db'], args['cluster_pid']) print(" building pangenome matrices") - build_gene_matrices(sp.id, sp.samples, args) - write_gene_matrices(sp.id, sp.samples, args) + build_gene_matrices(sp, min_copy=args['min_copy']) + write_gene_matrices(sp) print(" writing summary statistics") merge.write_summary_stats(sp.id, sp.samples, args, 'genes') From 09aaa4d8e3ba97e8282fc2c8ed78a85494291bd0 Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 12:54:07 -0700 Subject: [PATCH 08/11] Incremented version number --- midas/utility.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/midas/utility.py b/midas/utility.py index 5ca2b99..bcb38c5 100644 --- a/midas/utility.py +++ b/midas/utility.py @@ -6,7 +6,7 @@ import io, os, stat, sys, resource, gzip, platform, subprocess, bz2 -__version__ = '1.1.0' +__version__ = '1.2.0' def which(program): """ Mimics unix 'which' function """ diff --git a/setup.py b/setup.py index 9b89b2f..e208915 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name = 'MIDAS', - version = '1.1.0', + version = '1.2.0', description = 'An integrated pipeline and for estimating species and strain-level genomic variation from metagenomic data', license = 'GPL', author = 'Stephen Nayfach', From 51e31eff5beba189a14c6d096684d27dfc112eca Mon Sep 17 00:00:00 2001 From: snayfach Date: Tue, 1 Nov 2016 13:00:05 -0700 Subject: [PATCH 09/11] Minor changes --- midas/merge/annotate.py | 3 +-- midas/merge/merge_snps.py | 2 +- midas/merge/merge_species.py | 2 +- midas/merge/snp_matrix.py | 9 ++------- scripts/merge_midas.py | 12 +++++++++--- scripts/run_midas.py | 22 ++++++++++++---------- 6 files changed, 26 insertions(+), 24 deletions(-) diff --git a/midas/merge/annotate.py b/midas/merge/annotate.py index f79d92f..3120402 100755 --- a/midas/merge/annotate.py +++ b/midas/merge/annotate.py @@ -98,7 +98,6 @@ def annotate_site(site, genes, gene_index, contigs): # gene_index: current position in list of genes; global variable site.snp_types = {} site.amino_acids = {} - while True: # 1. fetch next gene # if there are no more genes, snp must be non-coding so break @@ -120,7 +119,7 @@ def annotate_site(site, genes, gene_index, contigs): # 4. 
otherwise, snp must be in gene # annotate site (1D-4D) and snp (SYN, NS) else: - site.gene_id = gene['gene_id'].split('|')[-1] + site.gene_id = gene['gene_id'] site.ref_codon, site.codon_pos = fetch_ref_codon(site, gene) if not all([_ in ['A','T','C','G'] for _ in site.ref_codon]): # check for invalid bases in codon site.site_type = 'NA'; site.gene_id = '' diff --git a/midas/merge/merge_snps.py b/midas/merge/merge_snps.py index d5f24c1..a1fe247 100755 --- a/midas/merge/merge_snps.py +++ b/midas/merge/merge_snps.py @@ -160,7 +160,7 @@ def filter_snp_matrix(species_id, samples, args): write_matrices(site, matrices) def write_readme(args, sp): - outfile = open('%s/%s/README' % (args['outdir'], sp.id), 'w') + outfile = open('%s/%s/readme.txt' % (args['outdir'], sp.id), 'w') outfile.write(""" Description of output files and file formats from 'merge_midas.py snps' diff --git a/midas/merge/merge_species.py b/midas/merge/merge_species.py index ae9f4ec..97a1d40 100644 --- a/midas/merge/merge_species.py +++ b/midas/merge/merge_species.py @@ -87,7 +87,7 @@ def identify_samples(args): return samples def write_readme(args): - outfile = open('%s/README' % args['outdir'], 'w') + outfile = open('%s/readme.txt' % args['outdir'], 'w') outfile.write(""" Description of output files and file formats from 'merge_midas.py species' diff --git a/midas/merge/snp_matrix.py b/midas/merge/snp_matrix.py index 7a6cfde..552bf4a 100644 --- a/midas/merge/snp_matrix.py +++ b/midas/merge/snp_matrix.py @@ -15,18 +15,13 @@ def __init__(self, files, samples, info=None): self.id, self.depth = next(files['depth']) self.id, self.alt_allele = next(files['alt_allele']) self.info = next(info) if info else None - self.ref_id, self.ref_pos, self.ref_allele = self.parse_id() + self.ref_id, self.ref_pos, self.ref_allele = self.id.rsplit('|', 2) + self.ref_pos = int(self.ref_pos) self.samples = samples except StopIteration: self.id = None - def parse_id(self): - ref_id = self.id.rsplit('|')[0] - ref_pos = int(self.id.split('|')[1]) - ref_allele = self.id.split('|')[2] - return ref_id, ref_pos, ref_allele - def sample_values(self): # return a dic mapping sample id to site values d = {} for index, sample in enumerate(self.samples): diff --git a/scripts/merge_midas.py b/scripts/merge_midas.py index 72bd2c0..9d8acc4 100755 --- a/scripts/merge_midas.py +++ b/scripts/merge_midas.py @@ -134,13 +134,18 @@ def genes_arguments(): species.add_argument('--species_id', dest='species_id', type=str, metavar='CHAR', help="""Comma-separated list of species ids""") species.add_argument('--max_species', type=int, metavar='INT', - help="""Maximum number of species to merge. useful for testing (use all)""") + help="""Maximum number of species to merge. Useful for testing (use all)""") sample = parser.add_argument_group('Sample filters (select subset of samples from INPUT)') sample.add_argument('--sample_depth', type=float, default=1.0, metavar='FLOAT', help="""Minimum read-depth across all genes with non-zero coverage (1.0)""") sample.add_argument('--max_samples', type=int, metavar='INT', - help="""Maximum number of samples to process. useful for testing (use all)""") - gene = parser.add_argument_group('Presence/Absence') + help="""Maximum number of samples to process. 
Useful for testing (use all)""") + gene = parser.add_argument_group('Quantification') + gene.add_argument('--cluster_pid', type=str, dest='cluster_pid', default='95', choices=['75', '80', '85', '90', '95', '99'], + help="""In the database, pan-genomes are defined at 6 different %% identity clustering cutoffs +CLUSTER_PID allows you to quantify gene content for any of these sets of gene clusters +By default, gene content is reported for genes clustered at 95%% identity (95) +""") gene.add_argument('--min_copy', type=float, default=0.35, metavar='FLOAT', help="""Genes >= MIN_COPY are classified as present Genes < MIN_COPY are classified as absent (0.35)""") @@ -295,6 +300,7 @@ def print_genes_arguments(args): print (" keep samples with >=%s mean coverage across genes with non-zero coverage" % args['sample_depth']) if args['max_samples']: print (" keep <= %s samples" % args['max_samples']) print ("Gene quantification criterea:") + print (" quantify genes clustered at %s%% identity" % args['cluster_pid']) print (" present (1): genes with copy number >= %s" % args['min_copy']) print (" absent (0): genes with copy number < %s" % args['min_copy']) print ("===============================") diff --git a/scripts/run_midas.py b/scripts/run_midas.py index 377c628..8213380 100755 --- a/scripts/run_midas.py +++ b/scripts/run_midas.py @@ -148,10 +148,12 @@ def print_species_arguments(args): lines.append("Input reads (2nd mate): %s" % args['m2']) lines.append("Remove temporary files: %s" % args['remove_temp']) lines.append("Word size for database search: %s" % args['word_size']) - if args['mapid']: lines.append("Minimum mapping identity: %s" % args['mapid']) + if args['mapid']: + lines.append("Minimum mapping identity: %s" % args['mapid']) lines.append("Minimum mapping alignment coverage: %s" % args['aln_cov']) lines.append("Number of reads to use from input: %s" % (args['max_reads'] if args['max_reads'] else 'use all')) - if args['read_length']: lines.append("Trim reads to %s-bp and discard reads with length < %s-bp" % (args['read_length'], args['read_length'])) + if args['read_length']: + lines.append("Trim reads from 3'/right end to %s-bp and discard reads with length < %s-bp" % (args['read_length'], args['read_length'])) lines.append("Number of threads for database search: %s" % args['threads']) lines.append("================================") args['log'].write('\n'.join(lines)+'\n') @@ -258,11 +260,11 @@ def gene_arguments(): map.add_argument('--mapid', type=float, metavar='FLOAT', default=94.0, help='Discard reads with alignment identity < MAPID (94.0)') map.add_argument('--mapq', type=int, metavar='INT', - default=20, help='Discard reads with mapping quality < MAPQ (10)') + default=0, help=argparse.SUPPRESS) # help='Discard reads with mapping quality < MAPQ (0)' map.add_argument('--aln_cov', type=float, metavar='FLOAT', default=0.75, help='Discard reads with alignment coverage < ALN_COV (0.75)') map.add_argument('--trim', type=int, default=0, metavar='INT', - help='Trim N base-pairs from read-tails (0)') + help='Trim N base-pairs from 3\'/right end of read (0)') args = vars(parser.parse_args()) if args['species_id']: args['species_id'] = args['species_id'].split(',') return args @@ -303,7 +305,7 @@ def print_gene_arguments(args): lines.append(" minimum alignment coverage of reads: %s" % args['aln_cov']) lines.append(" minimum read quality score: %s" % args['readq']) lines.append(" minimum mapping quality score: %s" % args['mapq']) - lines.append(" trim %s base-pairs from read-tails" % args['trim']) 
+ lines.append(" trim %s base-pairs from 3'/right end of read" % args['trim']) lines.append("================================") args['log'].write('\n'.join(lines)+'\n') sys.stdout.write('\n'.join(lines)+'\n') @@ -377,7 +379,7 @@ def snp_arguments(): snps.add_argument('--readq', type=int, metavar='INT', default=20, help='Discard reads with mean quality < READQ (20)') snps.add_argument('--trim', metavar='INT', type=int, default=0, - help='Trim N base-pairs from read-tails (0)') + help='Trim N base-pairs from 3\'/right end of read') snps.add_argument('--discard', default=False, action='store_true', help='Discard discordant read-pairs') snps.add_argument('--baq', default=False, action='store_true', @@ -424,7 +426,7 @@ def print_snp_arguments(args): lines.append(" minimum mapping quality score: %s" % args['mapq']) lines.append(" minimum base quality score: %s" % args['baseq']) lines.append(" minimum read quality score: %s" % args['readq']) - lines.append(" trim %s base-pairs from read-tails" % args['trim']) + lines.append(" trim %s base-pairs from 3'/right end of read" % args['trim']) if args['discard']: lines.append(" discard discordant read-pairs") if args['baq']: lines.append(" enable BAQ (per-base alignment quality)") if args['adjust_mq']: lines.append(" adjust MAPQ") @@ -574,7 +576,7 @@ def check_snps(args): sys.exit("\nError: BASEQ must be between 0 and 100\n") def write_readme(program, args): - outfile = open('%s/%s/README' % (args['outdir'], program), 'w') + outfile = open('%s/%s/readme.txt' % (args['outdir'], program), 'w') if program == 'species': outfile.write(""" Description of output files and file formats from 'run_midas.py species' @@ -599,7 +601,7 @@ def write_readme(program, args): coverage: estimated genome-coverage (i.e. read-depth) of species in metagenome relative_abundance: estimated relative abundance of species in metagenome -**Additional information for each species can be found in the reference database: +Additional information for each species can be found in the reference database: %s/marker_genes """ % args['db']) elif program == 'genes': @@ -638,7 +640,7 @@ def write_readme(program, args): mean_coverage: average read-depth across genes with at least 1 mapped read marker_coverage: median read-depth across 15 universal single copy genes -**Additional information for each species can be found in the reference database: +Additional information for each species can be found in the reference database: %s/pan_genomes """ % args['db']) elif program == 'snps': From 87d102556ee777be65ccbb18dbe72478c2914e02 Mon Sep 17 00:00:00 2001 From: snayfach Date: Wed, 2 Nov 2016 15:07:09 -0700 Subject: [PATCH 10/11] Minor changes --- docs/tutorial.md | 6 +++--- midas/analyze/track_strains.py | 2 ++ midas/run/genes.py | 7 ++++--- midas/run/snps.py | 6 +++--- scripts/call_consensus.py | 2 +- 5 files changed, 13 insertions(+), 10 deletions(-) diff --git a/docs/tutorial.md b/docs/tutorial.md index 7cef64b..01d695c 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -14,14 +14,14 @@ Install python dependencies as needed: [read more...] (install.md) Download & unpack reference database: -[http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz](http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.1.tar.gz) -`tar -zxvf midas_db_v1.1.tar.gz` +[http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz](http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz) +`tar -zxvf midas_db_v1.2.tar.gz` [read more...] 
(ref_db.md) Update your environment: `export PYTHONPATH=$PYTHONPATH:MIDAS` `export PATH=$PATH:MIDAS/scripts` -`export MIDAS_DB=midas_db_v1.1` +`export MIDAS_DB=midas_db_v1.2` Optionally, download & unpack example dataset: [http://lighthouse.ucsf.edu/MIDAS/example.tar.gz](http://lighthouse.ucsf.edu/MIDAS/example.tar.gz) diff --git a/midas/analyze/track_strains.py b/midas/analyze/track_strains.py index 2ab02cf..96b05d3 100644 --- a/midas/analyze/track_strains.py +++ b/midas/analyze/track_strains.py @@ -90,6 +90,8 @@ def call_markers(species, samples, args): # open marker list markers = utility.parse_file(species.paths['markers']) marker = fetch_marker(markers) # dictionary for 1st marker allele + if marker is None: + sys.exit("\nError: no marker alleles found in file: %s\n" % species.paths['markers']) # init markers per sample for sample in samples.values(): diff --git a/midas/run/genes.py b/midas/run/genes.py index 2f784ad..2bd37af 100644 --- a/midas/run/genes.py +++ b/midas/run/genes.py @@ -170,9 +170,11 @@ def count_mapped_bp(args, species, genes): continue else: gene_id = aln_file.getrname(aln.reference_id) + aln_len = len(aln.query_alignment_sequence) + gene_len = genes[gene_id].length genes[gene_id].reads += 1 - genes[gene_id].bases += len(aln.query_alignment_sequence) - genes[gene_id].depth += genes[gene_id].bases/float(genes[gene_id].length) + genes[gene_id].bases += aln_len + genes[gene_id].depth += aln_len/float(gene_len) j += 1 print(" total aligned reads: %s" % i) print(" total mapped reads: %s" % j) @@ -193,7 +195,6 @@ def normalize(args, species, genes): # compute marker depth for gene in genes.values(): if gene.marker_id is not None: - gene.depth = gene.depth species[gene.species_id].markers[gene.marker_id] += gene.depth # compute median marker depth for sp in species.values(): diff --git a/midas/run/snps.py b/midas/run/snps.py index 4f9851e..5076d48 100644 --- a/midas/run/snps.py +++ b/midas/run/snps.py @@ -19,9 +19,9 @@ def build_genome_db(args, species): db_stats['species'] += 1 infile = utility.iopen(sp.paths['fna']) for r in Bio.SeqIO.parse(infile, 'fasta'): - outfile.write('>%s\n%s\n' % (r.id, str(r.seq).upper())) - db_stats['total_length'] += len(r.seq) - db_stats['total_seqs'] += 1 + outfile.write('>%s\n%s\n' % (r.id, str(r.seq).upper())) + db_stats['total_length'] += len(r.seq) + db_stats['total_seqs'] += 1 infile.close() outfile.close() # print out database stats diff --git a/scripts/call_consensus.py b/scripts/call_consensus.py index 473f696..624e77b 100755 --- a/scripts/call_consensus.py +++ b/scripts/call_consensus.py @@ -146,7 +146,7 @@ def sequence_description(sample): desc = {} desc['length'] = len(sample.seq) desc['percent_missing'] = percent_missing(sample.seq) - desc['mean_depth'] = round(sample.sample_depth, 2) + desc['mean_depth'] = round(sample.mean_depth, 2) return desc def write_consensus(args, samples): From e5ca6badd208202c40634890e735237e2024d386 Mon Sep 17 00:00:00 2001 From: snayfach Date: Wed, 2 Nov 2016 15:27:51 -0700 Subject: [PATCH 11/11] updated docs --- docs/cnvs.md | 3 +-- docs/install.md | 4 ++-- docs/ref_db.md | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/cnvs.md b/docs/cnvs.md index 56e5126..9d9cdc1 100644 --- a/docs/cnvs.md +++ b/docs/cnvs.md @@ -44,9 +44,8 @@ Read alignment options (if using --align): Quantify genes options (if using --call_genes): --readq INT Discard reads with mean quality < READQ (20) --mapid FLOAT Discard reads with alignment identity < MAPID (94.0) - --mapq INT Discard reads with 
mapping quality < MAPQ (10)
   --aln_cov FLOAT     Discard reads with alignment coverage < ALN_COV (0.75)
-  --trim INT          Trim N base-pairs from read-tails (0)
+  --trim INT          Trim N base-pairs from 3'/right end of reads (0)
 ```

 ## Examples
diff --git a/docs/install.md b/docs/install.md
index c045073..c44600e 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -20,7 +20,7 @@ Or, download the latest source code:
 https://github.com/snayfach/MIDAS/archive/master.zip
 And unpack the project: `unzip MIDAS-master.zip`

-Or, download a stable release of the source code:
+Or, download a stable release of the source code:
 [https://github.com/snayfach/MIDAS/releases] (https://github.com/snayfach/MIDAS/releases)

 ### Install dependencies
@@ -45,7 +45,7 @@ If you still have installation issues, please open a new [issue] (https://github
 This will enable you to run MIDAS scripts:
 `export PYTHONPATH=$PYTHONPATH:/path/to/MIDAS`
-`export PATH=$PATH:/path/to/MIDAS/scripts`
+`export PATH=$PATH:/path/to/MIDAS/scripts`
 `export MIDAS_DB=/path/to/midas_database`

 Be sure to replace '/path/to' with the appropriate path on your system
diff --git a/docs/ref_db.md b/docs/ref_db.md
index 6f0cab4..6f2a5d8 100644
--- a/docs/ref_db.md
+++ b/docs/ref_db.md
@@ -12,8 +12,8 @@
 on Unix: `wget http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz`
 on OSX: `curl http://lighthouse.ucsf.edu/MIDAS/midas_db_v1.2.tar.gz > midas_db_v1.2.tar.gz`

 * This may take several minutes to several hours, depending on your internet speed
-* The entire database requires 32 GB of free space to download and decompress
-* Once decompressed, it takes 16G of free space
+* The entire database requires 36 GB of free space to download and decompress
+* Once decompressed, it takes 18 GB of free space
 * See midas_db_v1.2/README for more information
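Given the sizes quoted above, it may be worth confirming that enough disk space is available before starting the download; a minimal sketch (the ~36 GB figure comes from the bullets above, and the path is whichever disk you plan to use):

```
# Sketch: warn if there is less free space than the ~36 GB needed to
# download and decompress the database (figure taken from the bullets above)
import shutil
free_gb = shutil.disk_usage('.').free/1e9
if free_gb < 36:
    print("Warning: only %.1f GB free; the database needs ~36 GB to unpack" % free_gb)
```

 #### Step 2. unpack tarball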