diff --git a/assembly.py b/assembly.py index b26898265..6b3923b22 100755 --- a/assembly.py +++ b/assembly.py @@ -8,21 +8,39 @@ __commands__ = [] # built-ins -import argparse, logging, random, os, os.path, shutil, subprocess, glob +import argparse +import logging +import random +import os +import os.path +import shutil +import subprocess +import glob try: from itertools import zip_longest -except ImportError : +except ImportError: from itertools import izip_longest as zip_longest # intra-module -import util.cmd, util.file, util.vcf -import read_utils, taxon_filter -import tools, tools.picard, tools.samtools, tools.gatk, tools.novoalign -import tools.trinity, tools.mosaik, tools.muscle +import util.cmd +import util.file +import util.vcf +import read_utils +import taxon_filter +import tools +import tools.picard +import tools.samtools +import tools.gatk +import tools.novoalign +import tools.trinity +import tools.mosaik +import tools.muscle # third-party -import Bio.AlignIO, Bio.SeqIO, Bio.Data.IUPACData +import Bio.AlignIO +import Bio.SeqIO +import Bio.Data.IUPACData log = logging.getLogger(__name__) @@ -31,24 +49,24 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): ''' Take reads through Trimmomatic, Prinseq, and subsampling. This should probably move over to read_utils or taxon_filter. ''' - + # BAM -> fastq infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq'])) tools.picard.SamToFastqTool().execute(inBam, infq[0], infq[1]) - + # Trimmomatic trimfq = list(map(util.file.mkstempfname, ['.trim.1.fastq', '.trim.2.fastq'])) taxon_filter.trimmomatic(infq[0], infq[1], trimfq[0], trimfq[1], clipDb) os.unlink(infq[0]) os.unlink(infq[1]) - + # Prinseq rmdupfq = list(map(util.file.mkstempfname, ['.rmdup.1.fastq', '.rmdup.2.fastq'])) read_utils.rmdup_prinseq_fastq(trimfq[0], rmdupfq[0]) read_utils.rmdup_prinseq_fastq(trimfq[1], rmdupfq[1]) os.unlink(trimfq[0]) os.unlink(trimfq[1]) - + # Purge unmated purgefq = list(map(util.file.mkstempfname, ['.fix.1.fastq', '.fix.2.fastq'])) read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1]) @@ -57,21 +75,26 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): # Log count with open(purgefq[0], 'rt') as inf: - n = int(sum(1 for line in inf)/4) + n = int(sum(1 for line in inf) / 4) log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n) - + # Subsample subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq'])) cmd = [os.path.join(util.file.get_scripts_path(), 'subsampler.py'), - '-n', str(n_reads), - '-mode', 'p', - '-in', purgefq[0], purgefq[1], - '-out', subsampfq[0], subsampfq[1], - ] + '-n', + str(n_reads), + '-mode', + 'p', + '-in', + purgefq[0], + purgefq[1], + '-out', + subsampfq[0], + subsampfq[1],] subprocess.check_call(cmd) os.unlink(purgefq[0]) os.unlink(purgefq[1]) - + # Fastq -> BAM # Note: this destroys RG IDs! 
We should instead run the BAM->fastq step in a way # that breaks out the read groups and perform the above steps in a way that preserves @@ -82,28 +105,33 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000): if n == 0: # FastqToSam cannot deal with empty input # but Picard SamFormatConverter can deal with empty files - opts = ['INPUT='+tmp_header, 'OUTPUT='+outBam, 'VERBOSITY=ERROR'] + opts = ['INPUT=' + tmp_header, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR'] tools.picard.PicardTools().execute('SamFormatConverter', opts, JVMmemory='50m') else: - tools.picard.FastqToSamTool().execute( - subsampfq[0], subsampfq[1], 'Dummy', tmp_bam) + tools.picard.FastqToSamTool().execute(subsampfq[0], subsampfq[1], 'Dummy', tmp_bam) tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam) os.unlink(tmp_bam) os.unlink(tmp_header) os.unlink(subsampfq[0]) os.unlink(subsampfq[1]) + + def parser_trim_rmdup_subsamp(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input reads, unaligned BAM format.') - parser.add_argument('clipDb', - help='Trimmomatic clip DB.') - parser.add_argument('outBam', - help='Output reads, unaligned BAM format (currently, read groups and other header information are destroyed in this process).') - parser.add_argument('--n_reads', default=100000, type=int, - help='Subsample reads to no more than this many pairs. (default %(default)s)') - util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None))) + parser.add_argument('inBam', help='Input reads, unaligned BAM format.') + parser.add_argument('clipDb', help='Trimmomatic clip DB.') + parser.add_argument( + 'outBam', + help="""Output reads, unaligned BAM format (currently, read groups and other + header information are destroyed in this process).""") + parser.add_argument('--n_reads', + default=100000, + type=int, + help='Subsample reads to no more than this many pairs. (default %(default)s)') + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, trim_rmdup_subsamp_reads, split_args=True) return parser + + __commands__.append(('trim_rmdup_subsamp', parser_trim_rmdup_subsamp)) @@ -116,37 +144,36 @@ def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None, JVM subsamp_bam = outReads else: subsamp_bam = util.file.mkstempfname('.subsamp.bam') - + trim_rmdup_subsamp_reads(inBam, clipDb, subsamp_bam, n_reads=n_reads) subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq'])) tools.picard.SamToFastqTool().execute(subsamp_bam, subsampfq[0], subsampfq[1]) tools.trinity.TrinityTool().execute(subsampfq[0], subsampfq[1], outFasta, JVMmemory=JVMmemory, threads=threads) os.unlink(subsampfq[0]) os.unlink(subsampfq[1]) - + if not outReads: os.unlink(subsamp_bam) + def parser_assemble_trinity(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input unaligned reads, BAM format.') - parser.add_argument('clipDb', - help='Trimmomatic clip DB.') - parser.add_argument('outFasta', - help='Output assembly.') - parser.add_argument('--n_reads', default=100000, type=int, - help='Subsample reads to no more than this many pairs. 
(default %(default)s)') - parser.add_argument('--outReads', default=None, - help='Save the trimmomatic/prinseq/subsamp reads to a BAM file') + parser.add_argument('inBam', help='Input unaligned reads, BAM format.') + parser.add_argument('clipDb', help='Trimmomatic clip DB.') + parser.add_argument('outFasta', help='Output assembly.') + parser.add_argument('--n_reads', + default=100000, + type=int, + help='Subsample reads to no more than this many pairs. (default %(default)s)') + parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file') parser.add_argument('--JVMmemory', - default=tools.trinity.TrinityTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--threads', - default=1, - help='Number of threads (default: %(default)s)') - util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None))) + default=tools.trinity.TrinityTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)') + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, assemble_trinity, split_args=True) return parser + + __commands__.append(('assemble_trinity', parser_assemble_trinity)) @@ -175,33 +202,36 @@ def order_and_orient(inFasta, inReference, outFasta, inReads=None): Bio.SeqIO.write([seqObj], tmpInputFile, "fasta") tmp_prefix = util.file.mkstempfname(prefix='VFAT-') - cmd = [os.path.join(util.file.get_scripts_path(), 'vfat', 'orientContig.pl'), - inFasta, tmpInputFile, tmp_prefix, - '-musclepath', musclepath] + cmd = [os.path.join(util.file.get_scripts_path(), 'vfat', 'orientContig.pl'), inFasta, tmpInputFile, + tmp_prefix, '-musclepath', musclepath] subprocess.check_call(cmd) cmd = [os.path.join(util.file.get_scripts_path(), 'vfat', 'contigMerger.pl'), - tmp_prefix+'_orientedContigs', inReference, tmp_prefix, - '-musclepath', musclepath, - '-samtoolspath', tools.samtools.SamtoolsTool().install_and_get_path()] + tmp_prefix + '_orientedContigs', inReference, tmp_prefix, '-musclepath', musclepath, + '-samtoolspath', tools.samtools.SamtoolsTool().install_and_get_path()] if inReads: infq = list(map(util.file.mkstempfname, ['.in.1.fastq', '.in.2.fastq'])) tools.picard.SamToFastqTool().execute(inReads, infq[0], infq[1]) mosaik = tools.mosaik.MosaikTool() cmd = cmd + [ - '-readfq', infq[0], '-readfq2', infq[1], - '-mosaikpath', os.path.dirname(mosaik.install_and_get_path()), - '-mosaiknetworkpath', mosaik.get_networkFile(), + '-readfq', + infq[0], + '-readfq2', + infq[1], + '-mosaikpath', + os.path.dirname(mosaik.install_and_get_path()), + '-mosaiknetworkpath', + mosaik.get_networkFile(), ] subprocess.check_call(cmd) - shutil.move(tmp_prefix+'_assembly.fa', tmpOutputFile) + shutil.move(tmp_prefix + '_assembly.fa', tmpOutputFile) tempFastas.append(tmpOutputFile) - # append + # append util.file.concat(tempFastas, outFasta) for tmpFile in tempFastas: os.unlink(tmpFile) - for fn in glob.glob(tmp_prefix+'*'): + for fn in glob.glob(tmp_prefix + '*'): os.unlink(fn) with open(outFasta, 'rt') as inf: out_chr_count = len([1 for x in inf if x.startswith('>')]) @@ -211,29 +241,35 @@ def order_and_orient(inFasta, inReference, outFasta, inReads=None): raise Exception("error: expected {} chromosomes, only got {} chromosomes".format(ref_chr_count, out_chr_count)) return 0 + def parser_order_and_orient(parser=argparse.ArgumentParser()): - 
parser.add_argument('inFasta', - help='Input de novo assembly/contigs, FASTA format.') + parser.add_argument('inFasta', help='Input de novo assembly/contigs, FASTA format.') parser.add_argument('inReference', - help='Reference genome for ordering, orienting, and merging contigs, FASTA format.') - parser.add_argument('outFasta', - help='Output assembly, FASTA format, with the same number of chromosomes as inReference, and in the same order.') - parser.add_argument('--inReads', default=None, - help='Input reads in unaligned BAM format. These can be used to improve the merge process.') - util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None))) + help='Reference genome for ordering, orienting, and merging contigs, FASTA format.') + parser.add_argument( + 'outFasta', + help="""Output assembly, FASTA format, with the same number of + chromosomes as inReference, and in the same order.""") + parser.add_argument('--inReads', + default=None, + help='Input reads in unaligned BAM format. These can be used to improve the merge process.') + util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, order_and_orient, split_args=True) return parser + + __commands__.append(('order_and_orient', parser_order_and_orient)) class PoorAssemblyError(Exception): + def __init__(self, chr_idx, seq_len, non_n_count): super(PoorAssemblyError, self).__init__( 'Error: poor assembly quality, chr {}: contig length {}, unambiguous bases {}'.format( - chr_idx, seq_len, non_n_count)) + chr_idx, seq_len, non_n_count)) -def impute_from_reference(inFasta, inReference, outFasta, - minLengthFraction, minUnambig, replaceLength, newName=None): + +def impute_from_reference(inFasta, inReference, outFasta, minLengthFraction, minUnambig, replaceLength, newName=None): ''' This takes a de novo assembly, aligns against a reference genome, and imputes all missing positions (plus some of the chromosome ends) @@ -259,8 +295,8 @@ def impute_from_reference(inFasta, inReference, outFasta, with open(inFasta, 'r') as asmFastaFile: with open(inReference, 'r') as refFastaFile: - asmFasta = Bio.SeqIO.parse(asmFastaFile , 'fasta') - refFasta = Bio.SeqIO.parse(refFastaFile , 'fasta') + asmFasta = Bio.SeqIO.parse(asmFastaFile, 'fasta') + refFasta = Bio.SeqIO.parse(refFastaFile, 'fasta') for idx, (refSeqObj, asmSeqObj) in enumerate(zip_longest(refFasta, asmFasta)): # our zip fails if one file has more seqs than the other if not refSeqObj or not asmSeqObj: @@ -269,8 +305,8 @@ def impute_from_reference(inFasta, inReference, outFasta, minLength = len(refSeqObj) * minLengthFraction non_n_count = unambig_count(asmSeqObj.seq) seq_len = len(asmSeqObj) - if seq_len= args.minLength - and unambig_count(s.seq) >= len(s)*args.minUnambig], - outf, args.output_format) + if len(s) >= args.minLength and unambig_count(s.seq) >= len(s) * args.minUnambig + ], outf, args.output_format) return 0 -__commands__.append(('filter_short_seqs', parser_filter_short_seqs)) +__commands__.append(('filter_short_seqs', parser_filter_short_seqs)) + def parser_modify_contig(parser=argparse.ArgumentParser()): parser.add_argument("input", help="input alignment of reference and contig (should contain exactly 2 sequences)") parser.add_argument("output", help="Destination file for modified contigs") parser.add_argument("ref", help="reference sequence name (exact match required)") - parser.add_argument("-n", "--name", type=str, - help="fasta header output name (default: existing header)", - default=None) - 
parser.add_argument("-cn", "--call-reference-ns", - help="""should the reference sequence be called if there is an + parser.add_argument("-n", + "--name", + type=str, + help="fasta header output name (default: existing header)", + default=None) + parser.add_argument("-cn", + "--call-reference-ns", + help="""should the reference sequence be called if there is an N in the contig and a more specific base in the reference (default: %(default)s)""", - default=False, action="store_true", dest="call_reference_ns") - parser.add_argument("-t", "--trim-ends", - help="should ends of contig.fasta be trimmed to length of reference (default: %(default)s)", - default=False, action="store_true", dest="trim_ends") - parser.add_argument("-r5", "--replace-5ends", - help="should the 5'-end of contig.fasta be replaced by reference (default: %(default)s)", - default=False, action="store_true", dest="replace_5ends") - parser.add_argument("-r3", "--replace-3ends", - help="should the 3'-end of contig.fasta be replaced by reference (default: %(default)s)", - default=False, action="store_true", dest="replace_3ends") - parser.add_argument("-l", "--replace-length", - help="length of ends to be replaced (if replace-ends is yes) (default: %(default)s)", - default=10, type=int) - parser.add_argument("-f", "--format", - help="Format for input alignment (default: %(default)s)", - default="fasta") - parser.add_argument("-r", "--replace-end-gaps", + default=False, + action="store_true", + dest="call_reference_ns") + parser.add_argument("-t", + "--trim-ends", + help="should ends of contig.fasta be trimmed to length of reference (default: %(default)s)", + default=False, + action="store_true", + dest="trim_ends") + parser.add_argument("-r5", + "--replace-5ends", + help="should the 5'-end of contig.fasta be replaced by reference (default: %(default)s)", + default=False, + action="store_true", + dest="replace_5ends") + parser.add_argument("-r3", + "--replace-3ends", + help="should the 3'-end of contig.fasta be replaced by reference (default: %(default)s)", + default=False, + action="store_true", + dest="replace_3ends") + parser.add_argument("-l", + "--replace-length", + help="length of ends to be replaced (if replace-ends is yes) (default: %(default)s)", + default=10, + type=int) + parser.add_argument("-f", "--format", help="Format for input alignment (default: %(default)s)", default="fasta") + parser.add_argument( + "-r", + "--replace-end-gaps", help="Replace gaps at the beginning and end of the sequence with reference sequence (default: %(default)s)", - default=False, action="store_true", dest="replace_end_gaps") - parser.add_argument("-rn", "--remove-end-ns", - help="Remove leading and trailing N's in the contig (default: %(default)s)", - default=False, action="store_true", dest="remove_end_ns") - parser.add_argument("-ca", "--call-reference-ambiguous", - help="""should the reference sequence be called if the contig seq is ambiguous and + default=False, + action="store_true", + dest="replace_end_gaps") + parser.add_argument("-rn", + "--remove-end-ns", + help="Remove leading and trailing N's in the contig (default: %(default)s)", + default=False, + action="store_true", + dest="remove_end_ns") + parser.add_argument("-ca", + "--call-reference-ambiguous", + help="""should the reference sequence be called if the contig seq is ambiguous and the reference sequence is more informative & consistant with the ambiguous base (ie Y->C) (default: %(default)s)""", - default=False, action="store_true", dest="call_reference_ambiguous") - 
util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + default=False, + action="store_true", + dest="call_reference_ambiguous") + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, main_modify_contig) return parser + + def main_modify_contig(args): ''' Modifies an input contig. Depending on the options selected, can replace N calls with reference calls, replace ambiguous @@ -519,7 +594,7 @@ def main_modify_contig(args): ''' aln = Bio.AlignIO.read(args.input, args.format) - # TODO?: take list of alignments in, one per chromosome, rather than + # TODO?: take list of alignments in, one per chromosome, rather than # single alignment if len(aln) != 2: @@ -532,7 +607,7 @@ def main_modify_contig(args): consensus_idx = 0 else: raise NameError("reference name '%s' not in alignment" % args.ref) - + mc = ContigModifier(str(aln[ref_idx].seq), str(aln[consensus_idx].seq)) if args.remove_end_ns: mc.remove_end_ns() @@ -548,15 +623,17 @@ def main_modify_contig(args): mc.replace_5ends(args.replace_length) if args.replace_3ends: mc.replace_3ends(args.replace_length) - + with open(args.output, "wt") as f: if hasattr(args, "name"): - name = args.name + name = args.name else: name = aln[consensus_idx].name for line in util.file.fastaMaker([(name, mc.get_stripped_consensus())]): f.write(line) return 0 + + __commands__.append(('modify_contig', parser_modify_contig)) @@ -565,15 +642,16 @@ class ContigModifier(object): MUSCLE alignment to known reference genome author: rsealfon ''' + def __init__(self, ref, consensus): if len(ref) != len(consensus): raise Exception("improper alignment") - self.ref = list(ref) + self.ref = list(ref) self.consensus = list(consensus) self.len = len(ref) - + def get_stripped_consensus(self): - return ''.join(self.consensus).replace('-','') + return ''.join(self.consensus).replace('-', '') def call_reference_ns(self): log.debug("populating N's from reference...") @@ -617,7 +695,7 @@ def replace_5ends(self, replace_length): if self.ref[i] != "-": ct = ct + 1 if ct == replace_length: - for j in range(i+1): + for j in range(i + 1): self.consensus[j] = self.ref[j] break @@ -645,67 +723,74 @@ def remove_end_ns(self): break - class MutableSequence(object): + def __init__(self, name, start, stop, init_seq=None): - if not (stop>=start>=1): + if not (stop >= start >= 1): raise IndexError("coords out of bounds") - if init_seq==None: - self.seq = list('N' * (stop-start+1)) + if init_seq is None: + self.seq = list('N' * (stop - start + 1)) else: self.seq = list(init_seq) - if stop-start+1 != len(self.seq): + if stop - start + 1 != len(self.seq): raise Exception("wrong length") self.start = start self.stop = stop self.name = name self.deletions = [] + def modify(self, p, new_base): if not (self.start <= p <= self.stop): raise IndexError("position out of bounds") - i = p-self.start + i = p - self.start self.seq[i] = new_base + def replace(self, start, stop, new_seq): - if stop>start: + if stop > start: self.deletions.append((start, stop, new_seq)) self.__change__(start, stop, new_seq) + def __change__(self, start, stop, new_seq): if not (self.start <= start <= stop <= self.stop): raise IndexError("positions out of bounds") start -= self.start - stop -= self.start - if start==stop: + stop -= self.start + if start == stop: self.seq[start] = new_seq - for i in range(max(stop-start+1, len(new_seq))): - if start+i <= stop: - if i= ref length, fill out the rest of the bases - self.seq[start+i] = new_seq[i:] + 
self.seq[start + i] = new_seq[i:] else: - self.seq[start+i] = new_seq[i] + self.seq[start + i] = new_seq[i] else: # new allele is shorter than ref, so delete extra bases - self.seq[start+i] = '' + self.seq[start + i] = '' + def replay_deletions(self): for start, stop, new_seq in self.deletions: self.__change__(start, stop, new_seq) + def emit(self): return (self.name, ''.join(self.seq)) + def alleles_to_ambiguity(allelelist): ''' Convert a list of DNA bases to a single ambiguity base. All alleles must be one base long. ''' for a in allelelist: - if len(a)!=1: + if len(a) != 1: raise Exception("all alleles must be one base long") - if len(allelelist)==1: + if len(allelelist) == 1: return allelelist[0] else: - convert = dict([(tuple(sorted(v)),k) for k,v in Bio.Data.IUPACData.ambiguous_dna_values.items() if k!='X']) + convert = dict([(tuple(sorted(v)), k) for k, v in Bio.Data.IUPACData.ambiguous_dna_values.items() if k != 'X']) key = tuple(sorted(set(a.upper() for a in allelelist))) return convert[key] + def vcfrow_parse_and_call_snps(vcfrow, samples, min_dp=0, major_cutoff=0.5, min_dp_ratio=0.0): ''' Parse a single row of a VCF file, emit an iterator over each sample, call SNP genotypes using custom viral method based on read counts. @@ -717,74 +802,80 @@ def vcfrow_parse_and_call_snps(vcfrow, samples, min_dp=0, major_cutoff=0.5, min_ stop = start + len(vcfrow[3]) - 1 format_col = vcfrow[8].split(':') format_col = dict((format_col[i], i) for i in range(len(format_col))) - assert 'GT' in format_col and format_col['GT']==0 # required by VCF spec - assert len(vcfrow)==9+len(samples) + assert 'GT' in format_col and format_col['GT'] == 0 # required by VCF spec + assert len(vcfrow) == 9 + len(samples) info = [x.split('=') for x in vcfrow[7].split(';') if x != '.'] - info = dict(x for x in info if len(x)==2) - info_dp = int(info.get('DP',0)) + info = dict(x for x in info if len(x) == 2) + info_dp = int(info.get('DP', 0)) # process each sample for i in range(len(samples)): sample = samples[i] - rec = vcfrow[i+9].split(':') - + rec = vcfrow[i + 9].split(':') + # require a minimum read coverage - if len(alleles)==1: + if len(alleles) == 1: # simple invariant case - dp = ('DP' in format_col and len(rec)>format_col['DP']) and int(rec[format_col['DP']]) or 0 + dp = ('DP' in format_col and len(rec) > format_col['DP']) and int(rec[format_col['DP']]) or 0 if dp < min_dp: continue geno = alleles - if info_dp and float(dp)/info_dp < min_dp_ratio: - log.warn("dropping invariant call at %s:%s-%s %s (%s) due to low DP ratio (%s / %s = %s < %s)", - c,start,stop,sample,geno,dp,info_dp,float(dp)/info_dp,min_dp_ratio) + if info_dp and float(dp) / info_dp < min_dp_ratio: + log.warn("dropping invariant call at %s:%s-%s %s (%s) due to low DP ratio (%s / %s = %s < %s)", c, + start, stop, sample, geno, dp, info_dp, float(dp) / info_dp, min_dp_ratio) continue else: # variant: manually call the highest read count allele if it exceeds a threshold - assert ('AD' in format_col and len(rec)>format_col['AD']) + assert ('AD' in format_col and len(rec) > format_col['AD']) allele_depths = list(map(int, rec[format_col['AD']].split(','))) - assert len(allele_depths)==len(alleles) - allele_depths = [(allele_depths[i], alleles[i]) for i in range(len(alleles)) if allele_depths[i]>0] - allele_depths = list(reversed(sorted((n,a) for n,a in allele_depths if n>=min_dp))) + assert len(allele_depths) == len(alleles) + allele_depths = [(allele_depths[i], alleles[i]) for i in range(len(alleles)) if allele_depths[i] > 0] + allele_depths = 
list(reversed(sorted((n, a) for n, a in allele_depths if n >= min_dp))) if not allele_depths: continue - dp = sum(n for n,a in allele_depths) + dp = sum(n for n, a in allele_depths) - if allele_depths[0][0] > (dp*major_cutoff): + if allele_depths[0][0] > (dp * major_cutoff): # call a single allele at this position if it is a clear winner geno = [allele_depths[0][1]] else: # call multiple alleles at this position if there is no clear winner - geno = [a for n,a in allele_depths] + geno = [a for n, a in allele_depths] if geno: yield (c, start, stop, sample, geno) + def vcf_to_seqs(vcfIter, chrlens, samples, min_dp=0, major_cutoff=0.5, min_dp_ratio=0.0): ''' Take a VCF iterator and produce an iterator of chromosome x sample full sequences.''' seqs = {} cur_c = None for vcfrow in vcfIter: try: - for c,start,stop,s,alleles in vcfrow_parse_and_call_snps(vcfrow, samples, min_dp=min_dp, major_cutoff=major_cutoff, min_dp_ratio=min_dp_ratio): + for c, start, stop, s, alleles in vcfrow_parse_and_call_snps( + vcfrow, + samples, + min_dp=min_dp, + major_cutoff=major_cutoff, + min_dp_ratio=min_dp_ratio): # changing chromosome? if c != cur_c: - if cur_c!=None: + if cur_c is not None: # dump the previous chromosome before starting a new one for s in samples: - seqs[s].replay_deletions() # because of the order of VCF rows with indels + seqs[s].replay_deletions() # because of the order of VCF rows with indels yield seqs[s].emit() # prepare base sequences for this chromosome cur_c = c for s in samples: - name = len(samples)>1 and ("%s-%s" % (c,s)) or c + name = len(samples) > 1 and ("%s-%s" % (c, s)) or c seqs[s] = MutableSequence(name, 1, chrlens[c]) # modify sequence for this chromosome/sample/position - if len(alleles)==1: + if len(alleles) == 1: # call a single allele seqs[s].replace(start, stop, alleles[0]) - elif all(len(a)==1 for a in alleles): + elif all(len(a) == 1 for a in alleles): # call an ambiguous SNP seqs[s].replace(start, stop, alleles_to_ambiguity(alleles)) else: @@ -795,9 +886,9 @@ def vcf_to_seqs(vcfIter, chrlens, samples, min_dp=0, major_cutoff=0.5, min_dp_ra raise # at the end, dump the last chromosome - if cur_c!=None: + if cur_c is not None: for s in samples: - seqs[s].replay_deletions() # because of the order of VCF rows with indels + seqs[s].replay_deletions() # because of the order of VCF rows with indels yield seqs[s].emit() @@ -805,34 +896,45 @@ def parser_vcf_to_fasta(parser=argparse.ArgumentParser()): parser.add_argument("inVcf", help="Input VCF file") parser.add_argument("outFasta", help="Output FASTA file") parser.add_argument("--trim_ends", - action="store_true", dest="trim_ends", - default=False, - help="""If specified, we will strip off continuous runs of N's from the beginning + action="store_true", + dest="trim_ends", + default=False, + help="""If specified, we will strip off continuous runs of N's from the beginning and end of the sequences before writing to output. Interior N's will not be changed.""") - parser.add_argument("--min_coverage", dest="min_dp", type=int, - help="""Specify minimum read coverage (with full agreement) to make a call. + parser.add_argument("--min_coverage", + dest="min_dp", + type=int, + help="""Specify minimum read coverage (with full agreement) to make a call. 
[default: %(default)s]""", - default=3) - parser.add_argument("--major_cutoff", dest="major_cutoff", type=float, - help="""If the major allele is present at a frequency higher than this cutoff, + default=3) + parser.add_argument("--major_cutoff", + dest="major_cutoff", + type=float, + help="""If the major allele is present at a frequency higher than this cutoff, we will call an unambiguous base at that position. If it is equal to or below this cutoff, we will call an ambiguous base representing all possible alleles at that position. [default: %(default)s]""", - default=0.5) - parser.add_argument("--min_dp_ratio", dest="min_dp_ratio", type=float, - help="""The input VCF file often reports two read depth values (DP)--one for + default=0.5) + parser.add_argument("--min_dp_ratio", + dest="min_dp_ratio", + type=float, + help="""The input VCF file often reports two read depth values (DP)--one for the position as a whole, and one for the sample in question. We can optionally reject calls in which the sample read count is below a specified fraction of the total read count. This filter will not apply to any sites unless both DP values are reported. [default: %(default)s]""", - default=0.0) - parser.add_argument("--name", dest="name", nargs="*", - help="output sequence names (default: reference names in VCF file)", - default=[]) - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + default=0.0) + parser.add_argument("--name", + dest="name", + nargs="*", + help="output sequence names (default: reference names in VCF file)", + default=[]) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, main_vcf_to_fasta) return parser + + def main_vcf_to_fasta(args): ''' Take input genotypes (VCF) and construct a consensus sequence (fasta) by using majority-read-count alleles in the VCF. @@ -843,20 +945,23 @@ def main_vcf_to_fasta(args): ''' assert args.min_dp >= 0 assert 0.0 <= args.major_cutoff < 1.0 - + with util.vcf.VcfReader(args.inVcf) as vcf: chrlens = dict(vcf.chrlens()) samples = vcf.samples() assert len(samples) == 1, """Multiple sample columns were found in the intermediary VCF file - of the refine_assembly step, suggesting multiple sample names are present + of the refine_assembly step, suggesting multiple sample names are present upstream in the BAM file. Please correct this so there is only one sample in the BAM file.""" with open(args.outFasta, 'wt') as outf: chr_idx = 0 for header, seq in vcf_to_seqs(util.file.read_tabfile(args.inVcf), - chrlens, samples, min_dp=args.min_dp, major_cutoff=args.major_cutoff, - min_dp_ratio=args.min_dp_ratio): + chrlens, + samples, + min_dp=args.min_dp, + major_cutoff=args.major_cutoff, + min_dp_ratio=args.min_dp_ratio): if args.trim_ends: seq = seq.strip('Nn') if args.name: @@ -867,16 +972,19 @@ def main_vcf_to_fasta(args): # done log.info("done") return 0 -__commands__.append(('vcf_to_fasta', parser_vcf_to_fasta)) +__commands__.append(('vcf_to_fasta', parser_vcf_to_fasta)) + def parser_trim_fasta(parser=argparse.ArgumentParser()): parser.add_argument("inFasta", help="Input fasta file") parser.add_argument("outFasta", help="Output (trimmed) fasta file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, trim_fasta, split_args=True) return parser + + def trim_fasta(inFasta, outFasta): ''' Take input sequences (fasta) and trim any continuous sections of N's from the ends of them. 
Write trimmed sequences to an output fasta file. @@ -888,15 +996,17 @@ def trim_fasta(inFasta, outFasta): outf.write(line) log.info("done") return 0 -__commands__.append(('trim_fasta', parser_trim_fasta)) +__commands__.append(('trim_fasta', parser_trim_fasta)) + def deambig_base(base): ''' Take a single base (possibly a IUPAC ambiguity code) and return a random non-ambiguous base from among the possibilities ''' return random.choice(Bio.Data.IUPACData.ambiguous_dna_values[base.upper()]) + def deambig_fasta(inFasta, outFasta): ''' Take input sequences (fasta) and replace any ambiguity bases with a random unambiguous base from among the possibilities described by the ambiguity @@ -908,12 +1018,16 @@ def deambig_fasta(inFasta, outFasta): for line in util.file.fastaMaker([(record.id, ''.join(map(deambig_base, str(record.seq))))]): outf.write(line) return 0 + + def parser_deambig_fasta(parser=argparse.ArgumentParser()): parser.add_argument("inFasta", help="Input fasta file") parser.add_argument("outFasta", help="Output fasta file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, deambig_fasta, split_args=True) return parser + + __commands__.append(('deambig_fasta', parser_deambig_fasta)) @@ -921,41 +1035,47 @@ def vcf_dpdiff(vcfs): for vcf in vcfs: with util.vcf.VcfReader(vcf) as v: samples = v.samples() - assert len(samples)==1 + assert len(samples) == 1 for row in util.file.read_tabfile(vcf): - dp1 = int(dict(x.split('=') for x in row[7].split(';') if x != '.').get('DP',0)) + dp1 = int(dict(x.split('=') for x in row[7].split(';') if x != '.').get('DP', 0)) dp2 = 0 if 'DP' in row[8].split(':'): dpidx = row[8].split(':').index('DP') - if len(row[9].split(':'))>dpidx: + if len(row[9].split(':')) > dpidx: dp2 = int(row[9].split(':')[dpidx]) ratio = '' if dp1: - ratio = float(dp2)/dp1 - yield (row[0],row[1],samples[0],dp1,dp2,dp1-dp2,ratio) + ratio = float(dp2) / dp1 + yield (row[0], row[1], samples[0], dp1, dp2, dp1 - dp2, ratio) def parser_dpdiff(parser=argparse.ArgumentParser()): parser.add_argument("inVcfs", help="Input VCF file", nargs='+') parser.add_argument("outFile", help="Output flat file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, dpdiff, split_args=True) return parser + + def dpdiff(inVcfs, outFile): ''' Take input VCF files (all with only one sample each) and report on the discrepancies between the two DP fields (one in INFO and one in the sample's genotype column). 
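    An illustrative output row (tab-separated; these values are hypothetical, chosen only to show the column layout):
        chr1    1234    sampleA    50    40    10    0.8
    i.e. chr, pos, sample, dp_info (the INFO-field DP), dp_sample (the genotype-column DP), diff (dp_info - dp_sample), and ratio (dp_sample / dp_info).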
''' - header = ['chr','pos','sample','dp_info','dp_sample','diff','ratio'] + header = ['chr', 'pos', 'sample', 'dp_info', 'dp_sample', 'diff', 'ratio'] with open(outFile, 'wt') as outf: - outf.write('#'+'\t'.join(header)+'\n') + outf.write('#' + '\t'.join(header) + '\n') for row in vcf_dpdiff(inVcfs): - outf.write('\t'.join(map(str, row))+'\n') + outf.write('\t'.join(map(str, row)) + '\n') return 0 + + __commands__.append(('dpdiff', parser_dpdiff)) def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/broad_utils.py b/broad_utils.py index abb9b0a76..73ac3e8f8 100755 --- a/broad_utils.py +++ b/broad_utils.py @@ -7,80 +7,107 @@ __author__ = "dpark@broadinstitute.org" __commands__ = [] -import argparse, logging, os, os.path, time, json, glob, hashlib, base64 -import util.cmd, util.file +import argparse +import logging +import os +import os.path +import time +import json +import glob +import hashlib +import base64 +import util.cmd +import util.file import tools.picard log = logging.getLogger(__name__) - # ========================================== # *** get stuff from Picard json file *** # ========================================== + def get_json_from_picard(picardDir): ''' for example, /seq/walkup/picard/{flowcell_minus_first_char} ''' analysisDir = max( - (os.path.getmtime(os.path.join(picardDir, d)), d) - for d in os.listdir(picardDir) + (os.path.getmtime(os.path.join(picardDir, d)), d) for d in os.listdir(picardDir) if os.path.isdir(os.path.join(picardDir, d)))[1] jsonfile = list(glob.glob(os.path.join(picardDir, analysisDir, 'info', 'logs', '*.json'))) if len(jsonfile) != 1: raise Exception("error") return jsonfile[0] + + def get_run_date(jsonfile): with open(jsonfile, 'rt') as inf: runDate = json.load(inf)['workflow']['runDate'] return runDate + + def get_bustard_dir(jsonfile): with open(jsonfile, 'rt') as inf: bustard = json.load(inf)['workflow']['runFolder'] return bustard + def parser_get_bustard_dir(parser=argparse.ArgumentParser()): parser.add_argument('inDir', help='Picard directory') util.cmd.common_args(parser, (('loglevel', 'ERROR'),)) util.cmd.attach_main(parser, main_get_bustard_dir) return parser + + def main_get_bustard_dir(args): 'Find the basecalls directory from a Picard directory' print(get_bustard_dir(get_json_from_picard(args.inDir))) return 0 + + __commands__.append(('get_bustard_dir', parser_get_bustard_dir)) + + def parser_get_run_date(parser=argparse.ArgumentParser()): - parser.add_argument('inDir', help='Picard directory') + parser.add_argument('inDir', help='Picard directory') util.cmd.common_args(parser, (('loglevel', 'ERROR'),)) util.cmd.attach_main(parser, main_get_run_date) return parser + + def main_get_run_date(args): 'Find the sequencing run date from a Picard directory' print(get_run_date(get_json_from_picard(args.inDir))) return 0 -__commands__.append(('get_run_date', parser_get_run_date)) +__commands__.append(('get_run_date', parser_get_run_date)) + # =============== # *** misc *** # =============== + def iterate_lanes(runfile): for flowcellfile in util.file.read_tabfile(runfile): for lane in util.file.read_tabfile_dict(flowcellfile[0]): yield lane + + def iterate_wells(runfile): for lane in iterate_lanes(runfile): for well in util.file.read_tabfile_dict(lane['barcode_file']): - yield (lane,well) + yield (lane, well) + def get_all_samples(runfile): - return list(sorted(set(well['sample'] - for lane, well in iterate_wells(runfile)))) + return 
list(sorted(set(well['sample'] for lane, well in iterate_wells(runfile)))) + def get_all_libraries(runfile): - return list(sorted(set(well['sample'] + '.l' + well['library_id_per_sample'] - for lane, well in iterate_wells(runfile)))) + return list(sorted(set(well['sample'] + '.l' + well['library_id_per_sample'] for lane, well in iterate_wells( + runfile)))) -def get_run_id(well): + +def get_run_id(well): run_id = well['sample'] if well.get('library_id_per_sample'): run_id += '.l' + well['library_id_per_sample'] @@ -88,35 +115,40 @@ def get_run_id(well): run_id += '.r' + well['run_id_per_library'] return run_id + def get_all_runs(runfile): - return list(sorted(get_run_id(well) +'.'+ lane['flowcell'] +'.'+ lane['lane'] - for lane, well in iterate_wells(runfile))) + return list(sorted(get_run_id(well) + '.' + lane['flowcell'] + '.' + lane['lane'] for lane, well in iterate_wells( + runfile))) + def parser_get_all_names(parser=argparse.ArgumentParser()): - parser.add_argument('type', help='Type of name', - choices=['samples', 'libraries', 'runs']) + parser.add_argument('type', help='Type of name', choices=['samples', 'libraries', 'runs']) parser.add_argument('runfile', help='File with seq run information') util.cmd.common_args(parser, (('loglevel', 'ERROR'),)) util.cmd.attach_main(parser, main_get_all_names) return parser -def main_get_all_names(args) : + + +def main_get_all_names(args): 'Get all samples' - if args.type=='samples': + if args.type == 'samples': method = get_all_samples - elif args.type=='libraries': + elif args.type == 'libraries': method = get_all_libraries - elif args.type=='runs': + elif args.type == 'runs': method = get_all_runs for s in method(args.runfile): print(s) return 0 -__commands__.append(('get_all_names', parser_get_all_names)) +__commands__.append(('get_all_names', parser_get_all_names)) + # ============================= # *** make_barcodes_file *** # ============================= + def make_barcodes_file(inFile, outFile): 'Create input file for extract_barcodes' if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)): @@ -124,65 +156,78 @@ def make_barcodes_file(inFile, outFile): else: header = ['barcode_name', 'library_name', 'barcode_sequence_1'] with open(outFile, 'wt') as outf: - outf.write('\t'.join(header)+'\n') + outf.write('\t'.join(header) + '\n') for row in util.file.read_tabfile_dict(inFile): - out = {'barcode_sequence_1':row['barcode_1'], - 'barcode_sequence_2':row.get('barcode_2',''), - 'barcode_name':row['sample'], - 'library_name':row['sample']} + out = { + 'barcode_sequence_1': row['barcode_1'], + 'barcode_sequence_2': row.get('barcode_2', ''), + 'barcode_name': row['sample'], + 'library_name': row['sample'] + } if row.get('library_id_per_sample'): out['library_name'] += '.l' + row['library_id_per_sample'] - outf.write('\t'.join(out[h] for h in header)+'\n') + outf.write('\t'.join(out[h] for h in header) + '\n') + + def parser_make_barcodes_file(parser=argparse.ArgumentParser()): parser.add_argument('inFile', - help='''Input tab file w/header and 2-5 named columns (last three are optional): + help='''Input tab file w/header and 2-5 named columns (last three are optional): sample, barcode_1, barcode_2, library_id_per_sample, run_id_per_library''') parser.add_argument('outFile', help='Output BARCODE_FILE file for Picard.') util.cmd.attach_main(parser, make_barcodes_file, split_args=True) return parser + + __commands__.append(('make_barcodes_file', parser_make_barcodes_file)) # =========================== # *** extract_barcodes 
*** # =========================== + def parser_extract_barcodes(parser=argparse.ArgumentParser()): parser.add_argument('inDir', help='Bustard directory.') parser.add_argument('lane', help='Lane number.', type=int) parser.add_argument('barcodeFile', - help='''Input tab file w/header and four named columns: + help='''Input tab file w/header and four named columns: barcode_name, library_name, barcode_sequence_1, barcode_sequence_2''') parser.add_argument('outDir', help='Output directory for barcodes.') - parser.add_argument('--outMetrics', - help='Output metrics file. Default is to dump to a temp file.', - default=None) + parser.add_argument('--outMetrics', help='Output metrics file. Default is to dump to a temp file.', default=None) for opt in tools.picard.ExtractIlluminaBarcodesTool.option_list: - parser.add_argument('--'+opt, - help='Picard ExtractIlluminaBarcodes '+opt.upper()+' (default: %(default)s)', - default=tools.picard.ExtractIlluminaBarcodesTool.defaults.get(opt)) + parser.add_argument('--' + opt, + help='Picard ExtractIlluminaBarcodes ' + opt.upper() + ' (default: %(default)s)', + default=tools.picard.ExtractIlluminaBarcodesTool.defaults.get(opt)) parser.add_argument('--JVMmemory', - help='JVM virtual memory size (default: %(default)s)', - default = tools.picard.ExtractIlluminaBarcodesTool.jvmMemDefault) + help='JVM virtual memory size (default: %(default)s)', + default=tools.picard.ExtractIlluminaBarcodesTool.jvmMemDefault) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_extract_barcodes) return parser + + def main_extract_barcodes(args): 'Match every read in a lane against their barcode.' - out_metrics = (args.outMetrics==None) and util.file.mkstempfname('.metrics.txt') or args.outMetrics - picardOpts = dict((opt, getattr(args, opt)) - for opt in tools.picard.ExtractIlluminaBarcodesTool.option_list - if hasattr(args, opt) and getattr(args, opt)!=None) + out_metrics = (args.outMetrics is None) and util.file.mkstempfname('.metrics.txt') or args.outMetrics + picardOpts = dict((opt, getattr(args, opt)) for opt in tools.picard.ExtractIlluminaBarcodesTool.option_list + if hasattr(args, opt) and getattr(args, opt) != None) tools.picard.ExtractIlluminaBarcodesTool().execute( - os.path.join(args.inDir, 'Data', 'Intensities', 'BaseCalls'), args.lane, args.barcodeFile, - args.outDir, out_metrics, - picardOptions=picardOpts, JVMmemory=args.JVMmemory) + os.path.join(args.inDir, 'Data', 'Intensities', 'BaseCalls'), + args.lane, + args.barcodeFile, + args.outDir, + out_metrics, + picardOptions=picardOpts, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('extract_barcodes', parser_extract_barcodes)) # ============================== # *** make_library_params *** # ============================== + def make_params_file(inFile, bamDir, outFile): 'Create input file for illumina_basecalls' if any(row.get('barcode_2') for row in util.file.read_tabfile_dict(inFile)): @@ -190,99 +235,117 @@ def make_params_file(inFile, bamDir, outFile): else: header = ['OUTPUT', 'SAMPLE_ALIAS', 'LIBRARY_NAME', 'BARCODE_1'] with open(outFile, 'wt') as outf: - outf.write('\t'.join(header)+'\n') + outf.write('\t'.join(header) + '\n') rows = list(util.file.read_tabfile_dict(inFile)) - rows.append({'barcode_1':'N','barcode_2':'N','sample':'Unmatched'}) + rows.append({'barcode_1': 'N', 'barcode_2': 'N', 'sample': 'Unmatched'}) for row in rows: - out = {'BARCODE_1':row['barcode_1'], - 'BARCODE_2':row.get('barcode_2',''), - 
'SAMPLE_ALIAS':row['sample'], - 'LIBRARY_NAME':row['sample']} + out = { + 'BARCODE_1': row['barcode_1'], + 'BARCODE_2': row.get('barcode_2', ''), + 'SAMPLE_ALIAS': row['sample'], + 'LIBRARY_NAME': row['sample'] + } if row.get('library_id_per_sample'): out['LIBRARY_NAME'] += '.l' + row['library_id_per_sample'] run_id = out['LIBRARY_NAME'] if row.get('run_id_per_library'): run_id += '.r' + row['run_id_per_library'] out['OUTPUT'] = os.path.join(bamDir, run_id + ".bam") - outf.write('\t'.join(out[h] for h in header)+'\n') + outf.write('\t'.join(out[h] for h in header) + '\n') + + def parser_make_params_file(parser=argparse.ArgumentParser()): parser.add_argument('inFile', - help='''Input tab file w/header and four named columns: + help='''Input tab file w/header and four named columns: barcode_name, library_name, barcode_sequence_1, barcode_sequence_2''') parser.add_argument('bamDir', help='Directory for output bams') parser.add_argument('outFile', help='Output LIBRARY_PARAMS file for Picard') util.cmd.attach_main(parser, make_params_file, split_args=True) return parser + + __commands__.append(('make_params_file', parser_make_params_file)) # ============================= # *** illumina_basecalls *** # ============================= + def get_earliest_date(inDir): ''' Looks at the dates of all first-level members of a directory, plus the directory itself, and returns the earliest date seen. ''' fnames = [inDir] + [os.path.join(inDir, x) for x in os.listdir(inDir)] earliest = min(os.path.getmtime(fn) for fn in fnames) - #return time.strftime("%Y-%m-%d", time.localtime(earliest)) + # return time.strftime("%Y-%m-%d", time.localtime(earliest)) # what?? http://sourceforge.net/p/samtools/mailman/message/27441767/ return time.strftime("%m/%d/%Y", time.localtime(earliest)) + def short_hash(inString, length=None): ''' Returns a base32 encoding of a SHA1 hash of the inString, optionally truncated to a maximum length. The base32 encoding is uppercase A-Z and 2-7. 
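    Illustrative usage, mirroring the commented-out read_group_id call in main_illumina_basecalls below (the flowcell.lane string here is hypothetical):
        rg_id = short_hash('H01N1ADXX.1', 6)    # first six base32 characters of the SHA1 digest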
''' hash_obj = hashlib.sha1(inString.encode('utf-8')) b32_str = base64.b32encode(bytes(hash_obj.digest())).decode('utf-8') - if length>0 and len(b32_str)>length: + if length > 0 and len(b32_str) > length: b32_str = b32_str[:length] return b32_str - + + def parser_illumina_basecalls(parser=argparse.ArgumentParser()): parser.add_argument('inBustardDir', help='Bustard directory.') parser.add_argument('inBarcodesDir', help='Barcodes directory.') parser.add_argument('flowcell', help='Flowcell ID') parser.add_argument('lane', help='Lane number.', type=int) parser.add_argument('paramsFile', - help='''Input tab file w/header and five named columns: + help='''Input tab file w/header and five named columns: BARCODE_1, BARCODE_2, OUTPUT, SAMPLE_ALIAS, LIBRARY_NAME''') for opt in tools.picard.IlluminaBasecallsToSamTool.option_list: - if opt=='adapters_to_check': - parser.add_argument('--'+opt, nargs='*', - help='Picard ExtractIlluminaBarcodes '+opt.upper()+' (default: %(default)s)', - default=tools.picard.IlluminaBasecallsToSamTool.defaults.get(opt)) + if opt == 'adapters_to_check': + parser.add_argument('--' + opt, + nargs='*', + help='Picard ExtractIlluminaBarcodes ' + opt.upper() + ' (default: %(default)s)', + default=tools.picard.IlluminaBasecallsToSamTool.defaults.get(opt)) else: - parser.add_argument('--'+opt, - help='Picard ExtractIlluminaBarcodes '+opt.upper()+' (default: %(default)s)', - default=tools.picard.IlluminaBasecallsToSamTool.defaults.get(opt)) + parser.add_argument('--' + opt, + help='Picard ExtractIlluminaBarcodes ' + opt.upper() + ' (default: %(default)s)', + default=tools.picard.IlluminaBasecallsToSamTool.defaults.get(opt)) parser.add_argument('--JVMmemory', - help='JVM virtual memory size (default: %(default)s)', - default = tools.picard.IlluminaBasecallsToSamTool.jvmMemDefault) + help='JVM virtual memory size (default: %(default)s)', + default=tools.picard.IlluminaBasecallsToSamTool.jvmMemDefault) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_illumina_basecalls) return parser + + def main_illumina_basecalls(args): 'Demultiplex Illumina runs & produce BAM files, one per sample' - picardOpts = dict((opt, getattr(args, opt)) - for opt in tools.picard.IlluminaBasecallsToSamTool.option_list - if hasattr(args, opt) and getattr(args, opt)!=None) + picardOpts = dict((opt, getattr(args, opt)) for opt in tools.picard.IlluminaBasecallsToSamTool.option_list + if hasattr(args, opt) and getattr(args, opt) != None) if not picardOpts.get('run_start_date'): picardOpts['run_start_date'] = get_earliest_date(args.inBustardDir) - #if not picardOpts.get('read_group_id'): + # if not picardOpts.get('read_group_id'): # picardOpts['read_group_id'] = short_hash('{}.{}'.format(args.flowcell,args.lane), 6) tools.picard.IlluminaBasecallsToSamTool().execute( os.path.join(args.inBustardDir, 'Data/Intensities/BaseCalls'), - args.inBarcodesDir, args.flowcell, args.lane, args.paramsFile, - picardOptions=picardOpts, JVMmemory=args.JVMmemory) + args.inBarcodesDir, + args.flowcell, + args.lane, + args.paramsFile, + picardOptions=picardOpts, + JVMmemory=args.JVMmemory) return 0 -__commands__.append(('illumina_basecalls', parser_illumina_basecalls)) +__commands__.append(('illumina_basecalls', parser_illumina_basecalls)) # ======================= + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/docs/conf.py b/docs/conf.py index 
893fc5267..f993292cd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,19 +23,22 @@ # -- Mock out the heavyweight pip packages, esp those that require C ---- import mock -MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'pysam', - 'Bio', 'Bio.AlignIO', 'Bio.SeqIO', 'Bio.Data.IUPACData'] +MOCK_MODULES = ['numpy', 'scipy', 'matplotlib', 'pysam', 'Bio', 'Bio.AlignIO', 'Bio.SeqIO', 'Bio.Data.IUPACData'] for mod_name in MOCK_MODULES: - sys.modules[mod_name] = mock.Mock() - -# -- Obtain GIT version -- + sys.modules[mod_name] = mock.Mock() + + # -- Obtain GIT version -- import subprocess + + def _git_version(): cmd = ['git', 'describe', '--tags', '--always'] # omit "--dirty" from doc build out = subprocess.check_output(cmd) if type(out) != str: out = out.decode('utf-8') return out.strip() + + __version__ = _git_version() # -- General configuration ------------------------------------------------ @@ -46,12 +49,7 @@ def _git_version(): # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.pngmath', - 'sphinxarg.ext', -] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.pngmath', 'sphinxarg.ext',] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -114,7 +112,6 @@ def _git_version(): # If true, keep warnings as "system message" paragraphs in the built documents. #keep_warnings = False - # -- Options for HTML output ---------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for @@ -199,27 +196,21 @@ def _git_version(): # Output file base name for HTML help builder. htmlhelp_basename = 'viral-ngsdoc' - # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'viral-ngs.tex', 'viral-ngs Documentation', - 'Broad Institute Viral Genomics', 'manual'), -] +latex_documents = [('index', 'viral-ngs.tex', 'viral-ngs Documentation', 'Broad Institute Viral Genomics', 'manual'),] # The name of an image file (relative to this directory) to place at the top of # the title page. @@ -241,39 +232,30 @@ def _git_version(): # If false, no module index is generated. #latex_domain_indices = True - # -- Options for manual page output --------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'viral-ngs', 'viral-ngs Documentation', - ['Broad Institute Viral Genomics'], 1) -] +man_pages = [('index', 'viral-ngs', 'viral-ngs Documentation', ['Broad Institute Viral Genomics'], 1)] # If true, show URL addresses after external links. 
#man_show_urls = False - # -- Options for Texinfo output ------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'viral-ngs', 'viral-ngs Documentation', - 'Broad Institute Viral Genomics', 'viral-ngs', 'Viral genomics analysis pipelines', - 'Miscellaneous'), + ('index', 'viral-ngs', 'viral-ngs Documentation', 'Broad Institute Viral Genomics', 'viral-ngs', + 'Viral genomics analysis pipelines', 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. #texinfo_appendices = [] - # If false, no module index is generated. #texinfo_domain_indices = True - # How to display URL addresses: 'footnote', 'no', or 'inline'. #texinfo_show_urls = 'footnote' - # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False diff --git a/interhost.py b/interhost.py index dcf968894..0ea670fa2 100755 --- a/interhost.py +++ b/interhost.py @@ -7,17 +7,22 @@ __commands__ = [] # built-ins -import argparse, logging, os, array, bisect, json +import argparse +import logging +import os +import array +import bisect +import json from itertools import permutations from collections import OrderedDict, Sequence try: from itertools import zip_longest -except ImportError : +except ImportError: from itertools import izip_longest as zip_longest try: from UserDict import UserDict from UserDict import DictMixin -except ImportError: # for Py3 +except ImportError: # for Py3 from collections import UserDict from collections import MutableMapping as DictMixin @@ -26,8 +31,12 @@ from Bio import SeqIO # module-specific -import tools.muscle, tools.snpeff, tools.mafft -import util.cmd, util.file, util.vcf +import tools.muscle +import tools.snpeff +import tools.mafft +import util.cmd +import util.file +import util.vcf log = logging.getLogger(__name__) @@ -35,6 +44,8 @@ # CoordMapper extends DictMixin so that after the basic dict() interface methods are defined, # we get higher level dictionary interface methods for free + + class CoordMapper(DictMixin): """ Map (chrom, coordinate) between genome A and genome B. Coordinates are 1-based. @@ -49,14 +60,19 @@ class CoordMapper(DictMixin): put gaps in opposite species adjacent to each other without aligning a pair of real bases in between. """ - def __init__(self, alignerTool = tools.muscle.MuscleTool) : - """ The two genomes are described by fasta files with the same number of + + def __init__(self, alignerTool=tools.muscle.MuscleTool): + """ The two genomes are described by fasta files with the same number of chromosomes, and corresponding chromosomes must be in same order. """ - - # {chrA : {chrB: mapperAB, chrC: mapperAC}, chrB : {chrA: mapperBA, chrC: mapperBC}, chrC : {chrA: mapperCA, chrB: mapperCB} } - self.chrMaps = OrderedDict() - self.chrMapsUngapped = OrderedDict() + + # { + # chrA : {chrB: mapperAB, chrC: mapperAC}, + # chrB : {chrA: mapperBA, chrC: mapperBC}, + # chrC : {chrA: mapperCA, chrB: mapperCB} + # } + self.chrMaps = OrderedDict() + self.chrMapsUngapped = OrderedDict() self.alignerTool = alignerTool() def __getitem__(self, key): @@ -84,7 +100,7 @@ def __contains__(self, key): def keys(self): return self.chrMaps.keys() - def mapAtoB(self, fromChrom, fromPos = None, side = 0) : + def mapAtoB(self, fromChrom, fromPos=None, side=0): """ Map (chrom, coordinate) from genome A to genome B. 
If fromPos is None, map only the chromosome name If side is: @@ -93,11 +109,12 @@ def mapAtoB(self, fromChrom, fromPos = None, side = 0) : > 0, return the right-most position on B """ if len(self.chrMaps.keys()) != 4: - raise LookupError("CoordMapper.mapAtoB expects two input sequences and is provided only as a legacy function") + raise LookupError( + "CoordMapper.mapAtoB expects two input sequences and is provided only as a legacy function") return self.mapChr(fromChrom, list(self.chrMaps[fromChrom].keys())[0], fromPos, side) - - def mapBtoA(self, fromChrom, fromPos = None, side = 0) : + + def mapBtoA(self, fromChrom, fromPos=None, side=0): """ Map (chrom, coordinate) from genome B to genome A. If fromPos is None, map only the chromosome name If side is: @@ -106,7 +123,8 @@ def mapBtoA(self, fromChrom, fromPos = None, side = 0) : > 0, return the right-most position on A """ if len(self.chrMaps.keys()) != 4: - raise LookupError("CoordMapper.mapBtoA expects two input sequences and is provided only as a legacy function") + raise LookupError( + "CoordMapper.mapBtoA expects two input sequences and is provided only as a legacy function") return self.mapChr(fromChrom, list(self.chrMaps[fromChrom].keys())[0], fromPos, side) @@ -125,12 +143,11 @@ def mapChr(self, fromChrom, toChrom, fromPos=None, side=0, ungapped=False): raise KeyError("chr '%s' not found in CoordMapper relation map" % toChrom) mapper = self.chrMaps[fromChrom][toChrom] - #if not ungapped: + # if not ungapped: # mapper = self.chrMaps[fromChrom][toChrom] - #else: + # else: # mapper = self.chrMapsUngapped[fromChrom][toChrom] - if fromPos is None: return toChrom toPos = mapper(fromPos, 0) @@ -139,23 +156,29 @@ def mapChr(self, fromChrom, toChrom, fromPos=None, side=0, ungapped=False): return (toChrom, toPos) def load_alignments(self, aligned_files, a_idx=None, b_idx=None): - """ Loads aligned sequences into a CoordMapper instance. + """ Loads aligned sequences into a CoordMapper instance. Any number of sequences >1 may be read in. 
Mappers may be accessed via CoordMapper.chrMaps where chrMaps may look like: - ```{chrA : {chrB: mapperAB, chrC: mapperAC}, chrB : {chrA: mapperBA, chrC: mapperBC}, chrC : {chrA: mapperCA, chrB: mapperCB} }``` + ``` + { + chrA : {chrB: mapperAB, chrC: mapperAC}, + chrB : {chrA: mapperBA, chrC: mapperBC}, + chrC : {chrA: mapperCA, chrB: mapperCB} + } + ``` """ for alignOutFileName in aligned_files: - with open(alignOutFileName, 'rt') as alignOutFile : + with open(alignOutFileName, 'rt') as alignOutFile: seqs = list(SeqIO.parse(alignOutFile, 'fasta')) - #if len(list(seqs)) <2: + # if len(list(seqs)) <2: # raise Exception("Each aligned input file must contain >1 sequence.") # if mapping between specific sequences is specified if a_idx is not None and b_idx is not None: - assert a_idx>=0 and b_idx>=0 - assert a_idx= 0 and b_idx >= 0 + assert a_idx < len(seqs) and b_idx < len(seqs) + mapper = CoordMapper2Seqs(seqs[a_idx].seq, seqs[b_idx].seq) self.chrMaps.setdefault(seqs[a_idx].id, OrderedDict()) mapDict = OrderedDict() @@ -176,10 +199,11 @@ def load_alignments(self, aligned_files, a_idx=None, b_idx=None): self.chrMaps.setdefault(seq1.id, OrderedDict()) self.chrMapsUngapped.setdefault(seq1.id, OrderedDict()) # if the sequence we are mapping onto is already in the map - # raise an error + # raise an error # (could occur if same sequence is read in from multiple files) if (seq2.id in self.chrMaps[seq1.id]): - raise KeyError("duplicate sequence name '%s' already in chrMap for %s" % (seq2.id, seq1.id)) + raise KeyError( + "duplicate sequence name '%s' already in chrMap for %s" % (seq2.id, seq1.id)) mapper = CoordMapper2Seqs(seq1.seq, seq2.seq) mapDict = self.chrMaps[seq1.id] @@ -215,14 +239,15 @@ def align_and_load_sequences(self, unaligned_fasta_files, aligner=None): for f in alignOutFileNames: os.unlink(f) -class CoordMapper2Seqs(object) : + +class CoordMapper2Seqs(object): """ Map 1-based coordinates between two aligned sequences. - Result is a coordinate or an interval, as described in CoordMapper main + Result is a coordinate or an interval, as described in CoordMapper main comment string. Return None if beyond end. Input sequences must be already-aligned iterators through bases with gaps represented by dashes and all other characters assumed to be - real bases. + real bases. Assumptions: - Sequences (including gaps) are same length. - Each sequence has at least one real base. @@ -234,252 +259,288 @@ class CoordMapper2Seqs(object) : Implementation: mapArrays is a pair of arrays of equal length such that (mapArrays[0][n], mapArrays[1][n]) are the coordinates of a pair of - aligned real bases on the two sequences. The only pairs that are - included are the first, the last, and the pair immediately following + aligned real bases on the two sequences. The only pairs that are + included are the first, the last, and the pair immediately following any gap. Pairs are in increasing order. Coordinate mapping requires binary search in one of the arrays. Total space required, in bytes, is const + 8 * (number of indels). Time for a map in either direction is O(log(number of indels)). 
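A toy worked example of the mapping semantics described above (not part of this change; the two aligned strings are made up):

    from interhost import CoordMapper2Seqs

    m = CoordMapper2Seqs('AC-GT', 'ACTGT')   # seq0 carries a gap opposite the T of seq1
    m(1, 0)    # -> 1        A <-> A
    m(2, 0)    # -> [2, 3]   base adjacent to the gap maps to an interval on seq1
    m(3, 0)    # -> 4        G <-> G
    m(3, 1)    # -> 2        seq1's T is opposite the gap; clamps to the aligned base just before it
    m(6, 0)    # -> None     beyond the end of the alignment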
""" - - def __init__(self, seq0, seq1) : + + def __init__(self, seq0, seq1): self.mapArrays = [array.array('I'), array.array('I')] baseCount0 = 0 # Number of real bases in seq0 up to and including cur pos baseCount1 = 0 # Number of real bases in seq1 up to and including cur pos - beforeStart = True # Haven't yet reached first pair of aligned real bases - gapSinceLast = False # Have encounted a gap since last pair in mapArrays - for b0, b1 in zip_longest(seq0, seq1) : + beforeStart = True # Haven't yet reached first pair of aligned real bases + gapSinceLast = False # Have encounted a gap since last pair in mapArrays + for b0, b1 in zip_longest(seq0, seq1): if b0 is None or b1 is None: raise Exception('CoordMapper2Seqs: sequences must be same length.') realBase0 = b0 != '-' realBase1 = b1 != '-' baseCount0 += realBase0 baseCount1 += realBase1 - if realBase0 and realBase1 : - if beforeStart or gapSinceLast : + if realBase0 and realBase1: + if beforeStart or gapSinceLast: self.mapArrays[0].append(baseCount0) self.mapArrays[1].append(baseCount1) gapSinceLast = False beforeStart = False - finalPos0 = baseCount0 # Last pair of aligned real bases so far - finalPos1 = baseCount1 # Last pair of aligned real bases so far - else : + finalPos0 = baseCount0 # Last pair of aligned real bases so far + finalPos1 = baseCount1 # Last pair of aligned real bases so far + else: gapSinceLast = True if len(self.mapArrays[0]) == 0: raise Exception('CoordMapper2Seqs: no aligned bases.') - if self.mapArrays[0][-1] != finalPos0 : + if self.mapArrays[0][-1] != finalPos0: self.mapArrays[0].append(finalPos0) self.mapArrays[1].append(finalPos1) - def __call__(self, fromPos, fromWhich) : + def __call__(self, fromPos, fromWhich): """ fromPos: 1-based coordinate fromWhich: if 0, map from 1st sequence to 2nd, o.w. 2nd to 1st.""" - if fromPos != int(fromPos) : + if fromPos != int(fromPos): raise TypeError('CoordMapper2Seqs: pos %s is not an integer' % fromPos) fromArray = self.mapArrays[fromWhich] toArray = self.mapArrays[1 - fromWhich] - if fromPos < fromArray[0] or fromPos > fromArray[-1] : + if fromPos < fromArray[0] or fromPos > fromArray[-1]: result = None - elif fromPos == fromArray[-1] : + elif fromPos == fromArray[-1]: result = toArray[-1] - else : + else: insertInd = bisect.bisect(fromArray, fromPos) prevFromPos = fromArray[insertInd - 1] nextFromPos = fromArray[insertInd] prevToPos = toArray[insertInd - 1] nextToPos = toArray[insertInd] - assert(prevFromPos <= fromPos < nextFromPos) + assert (prevFromPos <= fromPos < nextFromPos) prevPlusOffset = prevToPos + (fromPos - prevFromPos) - if fromPos == nextFromPos - 1 and prevPlusOffset < nextToPos - 1 : + if fromPos == nextFromPos - 1 and prevPlusOffset < nextToPos - 1: result = [prevPlusOffset, nextToPos - 1] - else : + else: result = min(prevPlusOffset, nextToPos - 1) return result - # ========== snpEff annotation of VCF files ================== + def parser_snpEff(parser=argparse.ArgumentParser()): parser.add_argument("inVcf", help="Input VCF file") parser.add_argument("genomes", nargs='+', help="genome name") parser.add_argument("outVcf", help="Output VCF file") parser.add_argument("emailAddress", - help="""Your email address. To access the Genbank CoreNucleotide database, - NCBI requires you to specify your email address with each request. + help="""Your email address. To access the Genbank CoreNucleotide database, + NCBI requires you to specify your email address with each request. 
In case of excessive usage of the E-utilities, NCBI will attempt to contact a user at the email address provided before blocking access.""") - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, tools.snpeff.SnpEff().annotate_vcf, split_args=True) return parser -__commands__.append(('snpEff', parser_snpEff)) +__commands__.append(('snpEff', parser_snpEff)) + # ======================= # *** align_mafft *** # ======================= + def parser_general_mafft(parser=argparse.ArgumentParser()): - parser.add_argument('inFastas', nargs='+', - help='Input FASTA files.') + parser.add_argument('inFastas', nargs='+', help='Input FASTA files.') group = parser.add_mutually_exclusive_group() - group.add_argument('--localpair', default=None, action='store_true', - help='All pairwise alignments are computed with the Smith-Waterman algorithm.') - group.add_argument('--globalpair', default=None, action='store_true', - help='All pairwise alignments are computed with the Needleman-Wunsch algorithm.') - - parser.add_argument('--preservecase', default=None, action='store_true', - help='Preserve base or aa case, as well as symbols.') - parser.add_argument('--reorder', default=None, action='store_true', - help='Output is ordered aligned rather than in the order of the input (default: %(default)s).') - parser.add_argument('--gapOpeningPenalty', default=1.53, type=float, - help='Gap opening penalty (default: %(default)s).') - parser.add_argument('--ep', type=float, - help='Offset (works like gap extension penalty).') - parser.add_argument('--verbose', default=False, action='store_true', - help='Full output (default: %(default)s).') - parser.add_argument('--outputAsClustal', default=None, action='store_true', - help='Write output file in Clustal format rather than FASTA') - parser.add_argument('--maxiters', default = 0, type=int, - help='Maximum number of refinement iterations (default: %(default)s). Note: if "--localpair" or "--globalpair" is specified this defaults to 1000.') - parser.add_argument('--threads', default = -1, type=int, + group.add_argument('--localpair', + default=None, + action='store_true', + help='All pairwise alignments are computed with the Smith-Waterman algorithm.') + group.add_argument('--globalpair', + default=None, + action='store_true', + help='All pairwise alignments are computed with the Needleman-Wunsch algorithm.') + + parser.add_argument('--preservecase', + default=None, + action='store_true', + help='Preserve base or aa case, as well as symbols.') + parser.add_argument('--reorder', + default=None, + action='store_true', + help='Output is ordered aligned rather than in the order of the input (default: %(default)s).') + parser.add_argument('--gapOpeningPenalty', + default=1.53, + type=float, + help='Gap opening penalty (default: %(default)s).') + parser.add_argument('--ep', type=float, help='Offset (works like gap extension penalty).') + parser.add_argument('--verbose', default=False, action='store_true', help='Full output (default: %(default)s).') + parser.add_argument('--outputAsClustal', + default=None, + action='store_true', + help='Write output file in Clustal format rather than FASTA') + parser.add_argument( + '--maxiters', + default=0, + type=int, + help="""Maximum number of refinement iterations (default: %(default)s). 
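For orientation (not part of this diff): the options defined above feed the align_mafft and multichr_mafft commands below. A hypothetical call mirroring main_align_mafft would look roughly like this; the file names are invented:

    import tools.mafft

    tools.mafft.MafftTool().execute(
        inFastas=['genomeA.fasta', 'genomeB.fasta'],   # hypothetical inputs
        outFile='aligned.fasta',                       # hypothetical output
        localpair=True, globalpair=None,
        preservecase=True, reorder=None,
        gapOpeningPenalty=1.53, offset=None,
        verbose=False, outputAsClustal=None,
        maxiters=1000,                                 # help text: --localpair/--globalpair default this to 1000
        threads=-1)                                    # -1 means use all available cores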
+ Note: if "--localpair" or "--globalpair" is specified this defaults to 1000.""") + parser.add_argument( + '--threads', + default=-1, + type=int, help='Number of processing threads (default: %(default)s, where -1 indicates use of all available cores).') return parser + def parser_align_mafft(parser): parser = parser_general_mafft(parser) - - parser.add_argument('outFile', - help='Output file containing alignment result (default format: FASTA)') + + parser.add_argument('outFile', help='Output file containing alignment result (default format: FASTA)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_align_mafft) return parser + def main_align_mafft(args): ''' Run the mafft alignment on the input FASTA file.''' if int(args.threads) == 0 or int(args.threads) < -1: - raise argparse.ArgumentTypeError('Argument "--threads" must be non-zero. Specify "-1" to use all available cores.') - - tools.mafft.MafftTool().execute( - inFastas = args.inFastas, - outFile = args.outFile, - localpair = args.localpair, - globalpair = args.globalpair, - preservecase = args.preservecase, - reorder = args.reorder, - gapOpeningPenalty = args.gapOpeningPenalty, - offset = args.ep, - verbose = args.verbose, - outputAsClustal = args.outputAsClustal, - maxiters = args.maxiters, - threads = args.threads - ) + raise argparse.ArgumentTypeError( + 'Argument "--threads" must be non-zero. Specify "-1" to use all available cores.') + + tools.mafft.MafftTool().execute( + inFastas=args.inFastas, + outFile=args.outFile, + localpair=args.localpair, + globalpair=args.globalpair, + preservecase=args.preservecase, + reorder=args.reorder, + gapOpeningPenalty=args.gapOpeningPenalty, + offset=args.ep, + verbose=args.verbose, + outputAsClustal=args.outputAsClustal, + maxiters=args.maxiters, + threads=args.threads) return 0 + + __commands__.append(('align_mafft', parser_align_mafft)) # ======================= # *** multichr_mafft *** # ======================= + def parser_multichr_mafft(parser): parser = parser_general_mafft(parser) - parser.add_argument('outDirectory', - help='Location for the output files (default is cwd: %(default)s)') - parser.add_argument('--outFilePrefix', default="aligned", - help='Prefix for the output file name (default: %(default)s)') - parser.add_argument('--sampleRelationFile', default=None, - help="""If the parameter sampleRelationFile is specified - (as a file path), a JSON file will be written mapping + parser.add_argument('outDirectory', help='Location for the output files (default is cwd: %(default)s)') + parser.add_argument('--outFilePrefix', + default="aligned", + help='Prefix for the output file name (default: %(default)s)') + parser.add_argument('--sampleRelationFile', + default=None, + help="""If the parameter sampleRelationFile is specified + (as a file path), a JSON file will be written mapping sample name to sequence position in the output.""") - parser.add_argument('--sampleNameListFile', default=None, - help="""If the parameter sampleRelationFile is specified - (as a file path), a file will be written mapping - sample names in the order of their sequence + parser.add_argument('--sampleNameListFile', + default=None, + help="""If the parameter sampleRelationFile is specified + (as a file path), a file will be written mapping + sample names in the order of their sequence positions in the output.""") util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, 
multichr_mafft) return parser + def multichr_mafft(args): ''' Run the mafft alignment on a series of chromosomes provided in sample-partitioned FASTA files. Output as FASTA. (i.e. file1.fasta would contain chr1, chr2, chr3; file2.fasta would also contain chr1, chr2, chr3)''' if int(args.threads) == 0 or int(args.threads) < -1: - raise argparse.ArgumentTypeError('Argument "--threads" must be non-zero. Specify "-1" to use all available cores.') + raise argparse.ArgumentTypeError( + 'Argument "--threads" must be non-zero. Specify "-1" to use all available cores.') # get the absolute path to the output directory in case it has been specified as a relative path, # since MAFFT relies on its CWD for path resolution absoluteOutDirectory = os.path.abspath(args.outDirectory) # make the output directory if it does not exist - if not os.path.isdir( absoluteOutDirectory ): - os.makedirs( absoluteOutDirectory ) + if not os.path.isdir(absoluteOutDirectory): + os.makedirs(absoluteOutDirectory) # prefix for output files - prefix = "" if args.outFilePrefix == None else args.outFilePrefix + prefix = "" if args.outFilePrefix is None else args.outFilePrefix # reorder the data into new FASTA files, where each FASTA file has only variants of its respective chromosome transposedFiles = transposeChromosomeFiles(args.inFastas, args.sampleRelationFile, args.sampleNameListFile) # since the FASTA files are for idx, filePath in enumerate(transposedFiles): - + # execute MAFFT alignment. The input file is passed within a list, since argparse ordinarily # passes input files in this way, and the MAFFT tool expects lists, # but in this case we are creating the input file ourselves tools.mafft.MafftTool().execute( - inFastas = [os.path.abspath(filePath)], - outFile = os.path.join(absoluteOutDirectory, "{}_{}.fasta".format(prefix, idx+1)), - localpair = args.localpair, - globalpair = args.globalpair, - preservecase = args.preservecase, - reorder = args.reorder, - gapOpeningPenalty = args.gapOpeningPenalty, - offset = args.ep, - verbose = args.verbose, - outputAsClustal = args.outputAsClustal, - maxiters = args.maxiters, - threads = args.threads - ) + inFastas=[os.path.abspath(filePath)], + outFile=os.path.join(absoluteOutDirectory, "{}_{}.fasta".format(prefix, idx + 1)), + localpair=args.localpair, + globalpair=args.globalpair, + preservecase=args.preservecase, + reorder=args.reorder, + gapOpeningPenalty=args.gapOpeningPenalty, + offset=args.ep, + verbose=args.verbose, + outputAsClustal=args.outputAsClustal, + maxiters=args.maxiters, + threads=args.threads) return 0 + + __commands__.append(('multichr_mafft', parser_multichr_mafft)) # ============================ # modified version of rachel's call_snps_3.py follows + + def call_snps_3(inFasta, outVcf, REF="KJ660346.2"): - a=Bio.AlignIO.read(inFasta, "fasta") + a = Bio.AlignIO.read(inFasta, "fasta") ref_idx = find_ref(a, REF) with open(outVcf, 'wt') as outf: outf.write(vcf_header(a)) for row in make_vcf(a, ref_idx, REF): - outf.write('\t'.join(map(str, row))+'\n') + outf.write('\t'.join(map(str, row)) + '\n') + + def find_ref(a, ref): for i in range(len(a)): if a[i].id == ref: return i return -1 + + def vcf_header(a): header = "##fileformat=VCFv4.1\n" header += "##FORMAT=\n" header += "##contig=\n" - header += '#' + '\t'.join(['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'] + [x.id for x in a]) + '\n' + header += '#' + '\t'.join(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + [x.id for x in + a]) + '\n' return header + + def 
make_vcf(a, ref_idx, chrom): - bases=set(["A", "C", "G", "T"]) + bases = set(["A", "C", "G", "T"]) for i in range(len(a[0])): alt = [] for j in range(len(a)): if (a[j][i] != a[ref_idx][i]) and ((a[ref_idx][i] in bases) and (a[j][i] in bases)) and a[j][i] not in alt: alt.append(a[j][i]) if len(alt) > 0: - row = [chrom, i+1, '.', a[ref_idx][i], ','.join(alt), '.', '.', '.', 'GT'] + row = [chrom, i + 1, '.', a[ref_idx][i], ','.join(alt), '.', '.', '.', 'GT'] genos = [] for k in range(len(a)): if a[k][i] == a[ref_idx][i]: @@ -489,15 +550,16 @@ def make_vcf(a, ref_idx, chrom): else: for m in range(0, len(alt)): if a[k][i] == alt[m]: - genos.append(m+1) - yield row+genos + genos.append(m + 1) + yield row + genos + def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sampleNameListFile=None): ''' Input: a list of FASTA files representing a genome for each sample. Each file contains the same number of sequences (chromosomes, segments, etc) in the same order. If the parameter sampleRelationFile is specified (as a file path), - a JSON file will be written mapping sample name to sequence position + a JSON file will be written mapping sample name to sequence position in the output. Output: a list of FASTA files representing all samples for each chromosome/segment for input to a multiple sequence aligner. @@ -515,21 +577,22 @@ def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sample # write out json file containing relation of # sample name to position in output if sampleRelationFile: - with open( os.path.realpath(sampleRelationFile), "w") as outFile: + with open(os.path.realpath(sampleRelationFile), "w") as outFile: # dict mapping sample->index, zero indexed - sampleIdxMap = dict((os.path.basename(v).replace(".fasta",""),k) for k,v in enumerate(inputFilenamesList)) + sampleIdxMap = dict((os.path.basename(v).replace(".fasta", ""), k) + for k, v in enumerate(inputFilenamesList)) json.dump(sampleIdxMap, outFile, sort_keys=True, indent=4, separators=(',', ': ')) if sampleNameListFile: - with open( os.path.realpath(sampleNameListFile), "w" ) as outFile: - sampleNameList = [os.path.basename(v).replace(".fasta","\n") for v in inputFilenamesList] + with open(os.path.realpath(sampleNameListFile), "w") as outFile: + sampleNameList = [os.path.basename(v).replace(".fasta", "\n") for v in inputFilenamesList] outFile.writelines(sampleNameList) # for each interleaved record for chrRecordList in zip_longest(*fastaFiles): - if any(rec==None for rec in chrRecordList): + if any(rec is None for rec in chrRecordList): raise Exception("input files must all have the same number of sequences") - + outputFilename = util.file.mkstempfname('.fasta') outputFilenames.append(outputFilename) with open(outputFilename, "w") as outf: @@ -542,7 +605,10 @@ def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sample return outputFilenames + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/intrahost.py b/intrahost.py index 44fd0ed9e..7cb822d32 100755 --- a/intrahost.py +++ b/intrahost.py @@ -8,14 +8,24 @@ __commands__ = [] # built-ins -import argparse, logging, itertools, re, os, collections +import argparse +import logging +import itertools +import re +import os +import collections # third-party -import Bio.AlignIO, Bio.SeqIO, Bio.Data.IUPACData +import Bio.AlignIO +import Bio.SeqIO +import Bio.Data.IUPACData import pysam # module-specific -import util.cmd, 
util.file, util.vcf, util.misc +import util.cmd +import util.file +import util.vcf +import util.misc from util.stats import median, fisher_exact, chi2_contingency from interhost import CoordMapper from tools.vphaser2 import Vphaser2Tool @@ -25,66 +35,61 @@ # ============= class AlleleFieldParser ================= -class AlleleFieldParser(object) : + +class AlleleFieldParser(object): """ Class for converting between the string and list representation of the fields in the allele columns of vphaser_one_sample output (corresponding to the SNP_or_LP_Profile columns in the V-Phaser 2 output). """ - def __init__(self, field = None, - allele = None, fcount = None, rcount = None, libBiasPval = None, - libCounts = None) : + + def __init__(self, field=None, allele=None, fcount=None, rcount=None, libBiasPval=None, libCounts=None): """ Input is either the string stored in one of the allele columns. or set field values. """ - if field == None : + if field is None: self._allele = allele self._strandCounts = [fcount, rcount] self._libBiasPval = libBiasPval - self._libCounts = libCounts # libCounts is a list of 2-element lists - else : + self._libCounts = libCounts # libCounts is a list of 2-element lists + else: words = field.split(':') self._allele = words[0] self._strandCounts = [int(words[1]), int(words[2])] - self._libCounts = [[int(words[ii]), int(words[ii + 1])] - for ii in range(3, len(words) - 1, 2)] + self._libCounts = [[int(words[ii]), int(words[ii + 1])] for ii in range(3, len(words) - 1, 2)] self._libBiasPval = float(words[-1]) - - def __repr__(self) : + + def __repr__(self): """Convert to string representation.""" - return ':'.join([self._allele] + - list(map(str, self._strandCounts)) + - sum((list(map(str, libCount)) - for libCount in self._libCounts), - []) + - ['%.4g' % self._libBiasPval]) - - def allele(self) : + return ':'.join([self._allele] + list(map(str, self._strandCounts)) + sum((list(map( + str, libCount)) for libCount in self._libCounts), []) + ['%.4g' % self._libBiasPval]) + + def allele(self): """ Return allele: A, C, G, or T for SNVs, Dn with n > 0 for deletions, Ibases where bases is a string of two or more bases for inserts """ return self._allele - - def total(self) : + + def total(self): return sum(self._strandCounts) - def strand_counts(self) : + def strand_counts(self): "Return [# forward reads (all libs), # reverse reads (all libs)]" return self._strandCounts - - def allele_and_strand_counts(self) : + + def allele_and_strand_counts(self): return [self.allele()] + self.strand_counts() - def lib_counts(self) : + def lib_counts(self): """Yield [# forward reads, # reverse reads] for each library, in order of read groups in BAM file. """ - for counts in self._libCounts : + for counts in self._libCounts: yield counts - def lib_bias_pval(self) : + def lib_bias_pval(self): "Return a p-value on whether there is a library bias for this allele." return self._libBiasPval @@ -93,17 +98,18 @@ def lib_bias_pval(self) : defaultMinReads = 5 defaultMaxBias = 10 -def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads = None, - minReadsEach = None, maxBias = None, removeDoublyMappedReads=False) : + +def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads=None, + minReadsEach=None, maxBias=None, removeDoublyMappedReads=False): ''' Input: a single BAM file, representing reads from one sample, mapped to - its own consensus assembly. It may contain multiple read groups and + its own consensus assembly. It may contain multiple read groups and libraries. 
Output: a tab-separated file with no header containing filtered V Phaser-2 output variants with additional column for sequence/chrom name, and library counts and p-values appended to the counts for each allele. ''' - if minReadsEach != None and minReadsEach < 0: + if minReadsEach is not None and minReadsEach < 0: raise Exception('minReadsEach must be at least 0.') bamToProcess = inBam @@ -114,41 +120,40 @@ def vphaser_one_sample(inBam, inConsFasta, outTab, vphaserNumThreads = None, samtoolsTool.index(bamToProcess) variantIter = Vphaser2Tool().iterate(bamToProcess, vphaserNumThreads) filteredIter = filter_strand_bias(variantIter, minReadsEach, maxBias) - + libraryFilteredIter = compute_library_bias(filteredIter, bamToProcess, inConsFasta) - with util.file.open_or_gzopen(outTab, 'wt') as outf : - for row in libraryFilteredIter : + with util.file.open_or_gzopen(outTab, 'wt') as outf: + for row in libraryFilteredIter: outf.write('\t'.join(row) + '\n') -def filter_strand_bias(isnvs, minReadsEach = None, maxBias = None) : + +def filter_strand_bias(isnvs, minReadsEach=None, maxBias=None): ''' Take an iterator of V-Phaser output (plus chromosome name prepended) and perform hard filtering for strand bias ''' - alleleCol = 7 # First column of output with allele counts - if minReadsEach == None : + alleleCol = 7 # First column of output with allele counts + if minReadsEach is None: minReadsEach = defaultMinReads - if maxBias == None : + if maxBias is None: maxBias = defaultMaxBias for row in isnvs: #front = row[:alleleCol] - for fieldInd in range(len(row) - 1, alleleCol - 1, -1) : + for fieldInd in range(len(row) - 1, alleleCol - 1, -1): f, r = AlleleFieldParser(row[fieldInd]).strand_counts() - if (int(f) 0 and not - (maxBias >= (float(f)/float(r)) >= 1.0/maxBias))) : + if (int(f) < minReadsEach or int(r) < minReadsEach or + (minReadsEach > 0 and not (maxBias >= (float(f) / float(r)) >= 1.0 / maxBias))): del row[fieldInd] if len(row) > alleleCol + 1: - row[alleleCol:] = sorted(row[alleleCol:], - key = lambda field : AlleleFieldParser(field).total(), - reverse = True) + row[alleleCol:] = sorted(row[alleleCol:], key=lambda field: AlleleFieldParser(field).total(), reverse=True) mac = sum(AlleleFieldParser(field).total() for field in row[alleleCol + 1:]) tot = sum(AlleleFieldParser(field).total() for field in row[alleleCol:]) row[2] = AlleleFieldParser(row[alleleCol + 1]).allele() row[3] = AlleleFieldParser(row[alleleCol]).allele() - row[6] = '%.6g' % (100.0*mac/tot) + row[6] = '%.6g' % (100.0 * mac / tot) yield row -def compute_library_bias(isnvs, inBam, inConsFasta) : + +def compute_library_bias(isnvs, inBam, inConsFasta): ''' For each variant, compute read counts in each library and p-value for library bias; append them to string for each variant. Format is allele:totalF:totalR:1stLibFCount:1stLibRCount:2ndLibFCount:...:p-val. @@ -156,23 +161,22 @@ def compute_library_bias(isnvs, inBam, inConsFasta) : Note: Total was computed by vphaser, library counts by samtools mpileup, so total might not be sum of library counts. 
''' - alleleCol = 7 # First column of output with allele counts + alleleCol = 7 # First column of output with allele counts samtoolsTool = SamtoolsTool() - rgs_by_lib = sorted((rg['LB'],rg['ID']) - for rg in samtoolsTool.getReadGroups(inBam).values()) + rgs_by_lib = sorted((rg['LB'], rg['ID']) for rg in samtoolsTool.getReadGroups(inBam).values()) rgs_by_lib = itertools.groupby(rgs_by_lib, lambda x: x[0]) libBams = [] header_sam = util.file.mkstempfname('.sam') samtoolsTool.dumpHeader(inBam, header_sam) - for lib,rgs in rgs_by_lib: - rgs = list(id for lb,id in rgs) - + for lib, rgs in rgs_by_lib: + rgs = list(id for lb, id in rgs) + # Create libBam containing all the readgroups in rgs. # In samtools 1.1, this can be done by including -r multiple times on # a single command line, but that doesn't work in 0.1.19, so instead # extract readgroups one by one and then concatenate. rgBams = [] - for id in rgs : + for id in rgs: rgBam = util.file.mkstempfname('.bam') samtoolsTool.view(['-b', '-r', id], inBam, rgBam) samtoolsTool.index(rgBam) @@ -185,88 +189,88 @@ def compute_library_bias(isnvs, inBam, inConsFasta) : if len(rgBams) > 1: libBam = util.file.mkstempfname('.bam') samtoolsTool.merge(rgBams, libBam, ['-f', '-1', '-h', header_sam]) - for bam in rgBams : + for bam in rgBams: os.unlink(bam) else: # samtools merge cannot deal with only one (or zero) input bams libBam = rgBams[0] samtoolsTool.index(libBam) n_reads = samtoolsTool.count(libBam) - log.debug("LB:%s has %s reads in %s read groups (%s)", - lib, n_reads, len(rgs), ', '.join(rgs)) + log.debug("LB:%s has %s reads in %s read groups (%s)", lib, n_reads, len(rgs), ', '.join(rgs)) libBams.append(libBam) - - for row in isnvs : + + for row in isnvs: consensusAllele = row[3] pos = int(row[1]) if consensusAllele != 'i' else int(row[1]) - 1 chrom = row[0] - libCounts = [get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta) - for libBam in libBams] + libCounts = [get_mpileup_allele_counts(libBam, chrom, pos, inConsFasta) for libBam in libBams] numAlleles = len(row) - alleleCol countsMatrix = [[0] * numAlleles for lib in libBams] libCountsByAllele = [] - for alleleInd in range(numAlleles) : + for alleleInd in range(numAlleles): allele = row[alleleCol + alleleInd].split(':')[0] libCountsByAllele.append([]) - for libAlleleCounts, countsRow in zip(libCounts, countsMatrix) : + for libAlleleCounts, countsRow in zip(libCounts, countsMatrix): f, r = libAlleleCounts.get(allele, [0, 0]) libCountsByAllele[-1].append([f, r]) countsRow[alleleInd] += f + r - for alleleInd in range(numAlleles) : + for alleleInd in range(numAlleles): contingencyTable = [ - [ countsRow[alleleInd] for countsRow in countsMatrix], - [sum(countsRow) - countsRow[alleleInd] for countsRow in countsMatrix]] + [countsRow[alleleInd] for countsRow in countsMatrix], [sum(countsRow) - countsRow[alleleInd] + for countsRow in countsMatrix] + ] rowSums = map(sum, contingencyTable) dofs = len(libCounts) - 1 - if dofs < 1 : + if dofs < 1: pval = 1.0 - elif min(rowSums) ** dofs / dofs < 10000 : + elif min(rowSums) ** dofs / dofs < 10000: # At this cutoff, fisher_exact should take <~ 0.1 sec pval = fisher_exact(contingencyTable) - else : + else: pval = chi2_contingency(contingencyTable) - row[alleleCol + alleleInd] = str(AlleleFieldParser(None, - *(row[alleleCol + alleleInd].split(':') + - [pval, libCountsByAllele[alleleInd]]))) + row[alleleCol + alleleInd] = str(AlleleFieldParser(None, *(row[alleleCol + alleleInd].split(':') + + [pval, libCountsByAllele[alleleInd]]))) yield row for bam in 
libBams: os.unlink(bam) os.unlink(header_sam) -def parse_alleles_string(allelesStr) : + +def parse_alleles_string(allelesStr): # Return {allele : [forwardCount, reverseCount]} # For reference, allele is '.' rather than real allele - alleleCounts = {} # allele : [forwardCount, reverseCount] + alleleCounts = {} # allele : [forwardCount, reverseCount] pos = -1 digits = re.compile('[0-9]+') - while pos < len(allelesStr) - 1 : + while pos < len(allelesStr) - 1: pos += 1 c = allelesStr[pos] - if c in '.,' : + if c in '.,': allele = '.' isRev = c == ',' - elif c in '<>$*' : # Reference skip, end of read, placeholder - continue # Not interested in these - elif c == '^' : # Start of read - pos += 1 # Skip quality character + elif c in '<>$*': # Reference skip, end of read, placeholder + continue # Not interested in these + elif c == '^': # Start of read + pos += 1 # Skip quality character continue - elif c in 'ACGTNacgtn' : + elif c in 'ACGTNacgtn': allele = c.upper() isRev = c == c.lower() - elif c in '+-' : # e.g., +3aaa + elif c in '+-': # e.g., +3aaa mat = digits.match(allelesStr, pos + 1) - indelLen = int(allelesStr[mat.start() : mat.end()]) - indelStr = allelesStr[mat.end() : mat.end() + indelLen] + indelLen = int(allelesStr[mat.start():mat.end()]) + indelStr = allelesStr[mat.end():mat.end() + indelLen] allele = 'I' + indelStr.upper() if c == '+' else 'D' + str(indelLen) isRev = indelStr == indelStr.lower() pos += mat.end() - mat.start() + indelLen - else : + else: raise Exception('Unknown allele type %s' % c) alleleCounts.setdefault(allele, [0, 0]) alleleCounts[allele][isRev] += 1 return alleleCounts -def get_mpileup_allele_counts(inBam, chrom, pos, inConsFasta) : + +def get_mpileup_allele_counts(inBam, chrom, pos, inConsFasta): """ Return {allele : [forwardCount, reverseCount], ...} allele is: Iins for insertions where ins represents the inserted bases @@ -275,14 +279,11 @@ def get_mpileup_allele_counts(inBam, chrom, pos, inConsFasta) : 'i' or 'd', in which case report count for consensus. 
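A toy illustration of the pileup parsing above (the allele string is made up, not real samtools output):

    from intrahost import parse_alleles_string

    parse_alleles_string('.,Aa+2gt.-1C')
    # -> {'.': [2, 1], 'A': [1, 1], 'IGT': [0, 1], 'D1': [1, 0]}
    # keys:   '.' = reference allele, plain bases = SNVs,
    #         'I' + inserted bases = insertion, 'D' + deletion length = deletion
    # values: [forward read count, reverse read count]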
""" pileupFileName = util.file.mkstempfname('.txt') - SamtoolsTool().mpileup(inBam, pileupFileName, - ['-A', - '-r', '%s:%d-%d' % (chrom, pos, pos), - '-B', '-d', '50000', '-L', '50000', '-Q', '0', - '-f', inConsFasta]) - with open(pileupFileName) as pileupFile : + SamtoolsTool().mpileup(inBam, pileupFileName, ['-A', '-r', '%s:%d-%d' % (chrom, pos, pos), '-B', '-d', '50000', + '-L', '50000', '-Q', '0', '-f', inConsFasta]) + with open(pileupFileName) as pileupFile: words = pileupFile.readline().split('\t') - if len(words)<5: + if len(words) < 5: # empty output files means no reads pile up on this position return {} alleleCounts = parse_alleles_string(words[4]) @@ -297,52 +298,62 @@ def get_mpileup_allele_counts(inBam, chrom, pos, inConsFasta) : refAllele = words[2] alleleCounts['i'] = alleleCounts['d'] = alleleCounts[refAllele] = \ alleleCounts.get('.', [0, 0]) - + return alleleCounts -def parser_vphaser_one_sample(parser = argparse.ArgumentParser()) : - parser.add_argument("inBam", - help = "Input Bam file.") - parser.add_argument("inConsFasta", - help = "Consensus assembly fasta.") - parser.add_argument("outTab", help = "Tab-separated headerless output file.") - parser.add_argument("--vphaserNumThreads", type = int, default = None, - help="Number of threads in call to V-Phaser 2.") - parser.add_argument("--minReadsEach", type = int, default = defaultMinReads, - help = "Minimum number of reads on each strand (default: %(default)s).") - parser.add_argument("--maxBias", type = int, default = defaultMaxBias, - help = """Maximum allowable ratio of number of reads on the two strands + +def parser_vphaser_one_sample(parser=argparse.ArgumentParser()): + parser.add_argument("inBam", help="Input Bam file.") + parser.add_argument("inConsFasta", help="Consensus assembly fasta.") + parser.add_argument("outTab", help="Tab-separated headerless output file.") + parser.add_argument("--vphaserNumThreads", type=int, default=None, help="Number of threads in call to V-Phaser 2.") + parser.add_argument("--minReadsEach", + type=int, + default=defaultMinReads, + help="Minimum number of reads on each strand (default: %(default)s).") + parser.add_argument("--maxBias", + type=int, + default=defaultMaxBias, + help="""Maximum allowable ratio of number of reads on the two strands (default: %(default)s). 
Ignored if minReadsEach = 0.""") - parser.add_argument("--removeDoublyMappedReads", default=False, action="store_true", - help="""When calling V-Phaser, keep reads mapping to more than one contig.""") + parser.add_argument("--removeDoublyMappedReads", + default=False, + action="store_true", + help="""When calling V-Phaser, keep reads mapping to more than one contig.""") util.cmd.common_args(parser, (('loglevel', None), ('version', None))) - util.cmd.attach_main(parser, vphaser_one_sample, split_args = True) + util.cmd.attach_main(parser, vphaser_one_sample, split_args=True) return parser + + __commands__.append(('vphaser_one_sample', parser_vphaser_one_sample)) # ========== vphaser ================= -def parser_vphaser(parser = argparse.ArgumentParser()) : - parser.add_argument("inBam", - help = "Input Bam file.") - parser.add_argument("outTab", help = "Tab-separated headerless output file.") - parser.add_argument("--numThreads", type = int, default = None, - help="Number of threads in call to V-Phaser 2.") + +def parser_vphaser(parser=argparse.ArgumentParser()): + parser.add_argument("inBam", help="Input Bam file.") + parser.add_argument("outTab", help="Tab-separated headerless output file.") + parser.add_argument("--numThreads", type=int, default=None, help="Number of threads in call to V-Phaser 2.") util.cmd.common_args(parser, (('loglevel', None), ('version', None))) - util.cmd.attach_main(parser, vphaser_main, split_args = True) + util.cmd.attach_main(parser, vphaser_main, split_args=True) return parser -def vphaser_main(inBam, outTab, numThreads = None) : + + +def vphaser_main(inBam, outTab, numThreads=None): """ Run V-Phaser 2 on the input file without any additional filtering. - Combine the non-header lines of the CHROM.var.raw.txt files it produces, + Combine the non-header lines of the CHROM.var.raw.txt files it produces, adding CHROM as the first field on each line. """ - with open(outTab, 'wt') as outf : - for row in Vphaser2Tool().iterate(inBam, numThreads) : + with open(outTab, 'wt') as outf: + for row in Vphaser2Tool().iterate(inBam, numThreads): outf.write('\t'.join(row) + '\n') + + __commands__.append(('vphaser', parser_vphaser)) # ========== tabfile_values_rename ================= + def tabfile_values_rename(inFile, mapFile, outFile, col=0): ''' Take input tab file and copy to an output file while changing the values in a specific column based on a mapping file. @@ -361,25 +372,31 @@ def tabfile_values_rename(inFile, mapFile, outFile, col=0): for line in inf: row = line.rstrip('\n').split('\t') row[col] = name_map[row[col]] - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') + + def parser_tabfile_rename(parser=argparse.ArgumentParser()): parser.add_argument("inFile", help="Input flat file") parser.add_argument("mapFile", - help="""Map file. Two-column headerless file that maps input values to + help="""Map file. Two-column headerless file that maps input values to output values. This script will error if there are values in inFile that do not exist in mapFile.""") parser.add_argument("outFile", help="Output flat file") - parser.add_argument("--col_idx", dest="col", type=int, - help="""Which column number to replace (0-based index). [default: %(default)s]""", - default=0) - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + parser.add_argument("--col_idx", + dest="col", + type=int, + help="""Which column number to replace (0-based index). 
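A small sketch of the renaming behaviour of tabfile_values_rename above (hypothetical file contents):

    # mapFile: two tab-separated columns, old value -> new value
    #     sample_01    EBOV_patient_01
    #     sample_02    EBOV_patient_02
    # tabfile_values_rename(inFile, mapFile, outFile, col=0) rewrites column 0 of every
    # row of inFile through that mapping; a column-0 value missing from mapFile raises
    # a KeyError, which is the error the help text warns about.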
[default: %(default)s]""", + default=0) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, tabfile_values_rename, split_args=True) return parser -__commands__.append(('tabfile_rename', parser_tabfile_rename)) +__commands__.append(('tabfile_rename', parser_tabfile_rename)) + # ========== merge_to_vcf =========================== + def count_iter_items(iterable): """ Consume an iterable not reading it into memory; return the number of items. @@ -389,6 +406,7 @@ def count_iter_items(iterable): collections.deque(zip(iterable, counter), maxlen=0) # (consume at C speed) return next(counter) + def strip_accession_version(acc): ''' If this is a Genbank accession with a version number, remove the version number. @@ -398,6 +416,7 @@ def strip_accession_version(acc): acc = m.group(1) return acc + def parse_accession_str(chr_ref): ''' This tries to match an NCBI accession as defined here: @@ -408,8 +427,18 @@ def parse_accession_str(chr_ref): chr_ref = m.group("accession") return chr_ref -#def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version=False, naive_filter=False): -def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version=False, naive_filter=False, parse_accession=False): +# def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version=False, naive_filter=False): + + +def merge_to_vcf( + refFasta, + outVcf, + samples, + isnvs, + alignments, + strip_chr_version=False, + naive_filter=False, + parse_accession=False): ''' Combine and convert vPhaser2 parsed filtered output text files into VCF format. Assumption: consensus assemblies used in creating alignments do not extend beyond ends of reference. the number of alignment files equals the number of chromosomes / segments @@ -418,7 +447,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version # get IDs and sequence lengths for reference sequence with util.file.open_or_gzopen(refFasta, 'r') as inf: ref_chrlens = list((seq.id, len(seq)) for seq in Bio.SeqIO.parse(inf, 'fasta')) - + # use the output filepath specified if it is a .vcf, otherwise if it is gzipped we need # to write to a temp VCF and then compress to vcf.gz later if outVcf.endswith('.vcf.gz'): @@ -427,9 +456,9 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version tmpVcf = outVcf else: raise ValueError("outVcf must end in .vcf or .vcf.gz") - + log.info("loaded CoordMapper for all genomes, starting VCF merge...") - + # write header with open(tmpVcf, 'w') as outf: # write header @@ -437,7 +466,8 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version outf.write('##FORMAT=\n') outf.write('##FORMAT=\n') outf.write('##FORMAT=\n') - outf.write('##FORMAT=\n') + outf.write( + '##FORMAT=\n') # write out the contig lengths present in the reference genome for c, clen in ref_chrlens: if parse_accession: @@ -447,8 +477,8 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version outf.write('##contig=\n' % (c, clen)) # write out the name of the reference file used to generate the VCF outf.write('##reference=file://%s\n' % refFasta) - header = ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'] + samples - outf.write('#'+'\t'.join(header)+'\n') + header = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT'] + samples + outf.write('#' + '\t'.join(header) + '\n') # compress output if requested if outVcf.endswith('.vcf.gz'): @@ -459,7 +489,7 @@ def 
merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version if not len(ref_chrlens) == len(alignments): raise LookupError("there must be an alignment file for each chromosome/segment present in the reference") - # we are assuming that the reference sequences have the same IDs in the alignments and in the + # we are assuming that the reference sequences have the same IDs in the alignments and in the # reference fasta, but we need to relate each reference sequence (chromosome/segment) to a specific # alignment file and index within the alignment ref_seq_id_to_alignment_file = dict() @@ -473,25 +503,29 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version for alignmentFile in alignmentFiles: with util.file.open_or_gzopen(alignmentFile, 'r') as inf2: for idx, seq in enumerate(Bio.SeqIO.parse(inf2, 'fasta')): - if refSeq.id == seq.id: + if refSeq.id == seq.id: ref_seq_id_to_alignment_file[seq.id] = alignmentFile ref_seq_in_alignment_file[seq.id] = seq.seq.ungap('-') - + if len(ref_seq_id_to_alignment_file) < len(ref_chrlens): raise LookupError("Not all reference sequences found in alignments.") if not (len(samples) == len(isnvs)): - raise LookupError("There must be an isnv file for each sample. %s samples, %s isnv files" % (len(samples), len(isnvs))) + raise LookupError( + "There must be an isnv file for each sample. %s samples, %s isnv files" % (len(samples), len(isnvs))) for fileName in alignments: with util.file.open_or_gzopen(fileName, 'r') as inf: # get two independent iterators into the alignment file alignmentSeqIter, alignmentSeqIter2 = itertools.tee(Bio.SeqIO.parse(inf, 'fasta'), 2) number_of_aligned_sequences = count_iter_items(alignmentSeqIter2) - # -1 is to account for inclusion of reference in the alignement in addition + # -1 is to account for inclusion of reference in the alignement in addition # to the assemblies - if not (number_of_aligned_sequences-1) == len(isnvs) == len(samples): - raise LookupError("samples, isnvs, and alignments must have the same number of elements (plus an extra reference record in the alignment). %s does not have the right number of sequences" % fileName) + if not (number_of_aligned_sequences - 1) == len(isnvs) == len(samples): + raise LookupError( + """samples, isnvs, and alignments must have the same number of elements + (plus an extra reference record in the alignment). + %s does not have the right number of sequences""" % fileName) samp_to_isnv = dict(zip(samples, isnvs)) @@ -503,7 +537,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version #ref_sequence = ref_seq_in_alignment_file[refSequence.id] # make a coordmapper to map all alignments to this reference sequence cm = CoordMapper() - cm.load_alignments( [ref_seq_id_to_alignment_file[ref_sequence.id]] ) + cm.load_alignments([ref_seq_id_to_alignment_file[ref_sequence.id]]) # ======================== # to map from ref->sample @@ -525,7 +559,9 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version samp_to_seqIndex[sampleName] = seq.seq.ungap('-') if not len(samp_to_seqIndex) == len(samplesToUse): - raise LookupError("Sequence info not found in file %s for all sample names provided. Check alignment files." % alignmentFile) + raise LookupError( + "Sequence info not found in file %s for all sample names provided. Check alignment files." 
% + alignmentFile) for s in samplesToUse: for row in util.file.read_tabfile(samp_to_isnv[sampleIDMatch(s)]): @@ -534,181 +570,185 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version if row[0] == s_chrom: allele_fields = list(AlleleFieldParser(x) for x in row[7:] if x) row = { - 'sample' : s, - 'CHROM' : ref_sequence.id, - 's_chrom' : s_chrom, - 's_pos' : int(row[1]), - 's_alt' : row[2], - 's_ref' : row[3], - 'alleles' : list(x.allele_and_strand_counts() for x in allele_fields), - 'n_libs' : dict( - (x.allele(), sum(1 for f,r in x.lib_counts() if f+r>0)) - for x in allele_fields), - 'lib_bias':dict( - (x.allele(), x.lib_bias_pval()) - for x in allele_fields), + 'sample': s, + 'CHROM': ref_sequence.id, + 's_chrom': s_chrom, + 's_pos': int(row[1]), + 's_alt': row[2], + 's_ref': row[3], + 'alleles': list(x.allele_and_strand_counts() for x in allele_fields), + 'n_libs': dict( + (x.allele(), sum(1 for f, r in x.lib_counts() + if f + r > 0)) for x in allele_fields), + 'lib_bias': dict( + (x.allele(), x.lib_bias_pval()) for x in allele_fields), } # make a sorted allele list row['allele_counts'] = list(sorted( - [(a,int(f)+int(r)) for a,f,r in row['alleles']], - key=(lambda x:x[1]), reverse=True)) + [(a, int(f) + int(r)) for a, f, r in row['alleles']], + key=(lambda x: x[1]), + reverse=True)) # naive filter (quick and dirty) if naive_filter: # require 2 libraries for every allele call - row['allele_counts'] = list((a,n) - for a,n in row['allele_counts'] - if row['n_libs'][a]>=2) + row['allele_counts'] = list((a, n) for a, n in row['allele_counts'] + if row['n_libs'][a] >= 2) # recompute total read counts for remaining - tot_n = sum(n for a,n in row['allele_counts']) + tot_n = sum(n for a, n in row['allele_counts']) # require allele frequency >= 0.5% - row['allele_counts'] = list((a,n) - for a,n in row['allele_counts'] - if tot_n>0 and float(n)/tot_n >= 0.005) + row['allele_counts'] = list((a, n) for a, n in row['allele_counts'] + if tot_n > 0 and float(n) / tot_n >= 0.005) # drop this position:sample if no variation left if len(row['allele_counts']) < 2: - log.info("dropping iSNV at %s:%s (%s) because no variation remains after simple filtering", - row['s_chrom'], row['s_pos'], row['sample']) + log.info( + """dropping iSNV at %s:%s (%s) + because no variation remains after simple filtering""", row['s_chrom'], + row['s_pos'], row['sample']) continue # reposition vphaser deletions minus one to be consistent with # VCF conventions if row['s_alt'].startswith('D'): - for a,n in row['allele_counts']: - if a[0] not in ('D','i'): + for a, n in row['allele_counts']: + if a[0] not in ('D', 'i'): log.error("allele_counts: " + str(row['allele_counts'])) raise Exception("deletion alleles must always start with D or i") - row['s_pos'] = row['s_pos']-1 + row['s_pos'] = row['s_pos'] - 1 # map position back to reference coordinates - row['POS'] = cm.mapChr(s, ref_sequence.id, row['s_pos'], side = -1)[1] - row['END'] = cm.mapChr(s, ref_sequence.id, row['s_pos'], side = 1)[1] + row['POS'] = cm.mapChr(s, ref_sequence.id, row['s_pos'], side=-1)[1] + row['END'] = cm.mapChr(s, ref_sequence.id, row['s_pos'], side=1)[1] if row['POS'] == None or row['END'] == None: raise Exception('consensus extends beyond start or end of reference.') data.append(row) - + # sort all iSNVs (across all samples) and group by position data = sorted(data, key=(lambda row: row['POS'])) data = itertools.groupby(data, lambda row: row['POS']) - - # process one reference position at a time from + + # process one 
reference position at a time from for pos, rows in data: # each of the sample-specific variants for a given ref pos rows = list(rows) - + # define the length of this variation based on the largest deletion end = pos for row in rows: end = max(end, row['END']) - for a,n in row['allele_counts']: + for a, n in row['allele_counts']: if a.startswith('D'): # end of deletion in sample's coord space - local_end = row['s_pos']+int(a[1:]) + local_end = row['s_pos'] + int(a[1:]) # end of deletion in reference coord space - ref_end = cm.mapChr(row['s_chrom'], ref_sequence.id, local_end, side = 1)[1] - if ref_end == None: - raise Exception('consensus extends ' \ - 'beyond start or end of reference.') + ref_end = cm.mapChr(row['s_chrom'], ref_sequence.id, local_end, side=1)[1] + if ref_end is None: + raise Exception('consensus extends ' 'beyond start or end of reference.') end = max(end, ref_end) - + # find reference allele and consensus alleles - refAllele = str(ref_sequence[pos-1:end].seq) - consAlleles = {} # the full pos-to-end consensus assembly sequence for each sample - samp_offsets = {} # {sample : isnv's index in its consAllele string} - for row in rows : + refAllele = str(ref_sequence[pos - 1:end].seq) + consAlleles = {} # the full pos-to-end consensus assembly sequence for each sample + samp_offsets = {} # {sample : isnv's index in its consAllele string} + for row in rows: s_pos = row['s_pos'] sample = row['sample'] - if samp_offsets.get(sample, s_pos) != s_pos : + if samp_offsets.get(sample, s_pos) != s_pos: raise NotImplementedError('Sample %s has variants at 2 ' - 'positions %s mapped to same reference position (%s:%s)' % - (sample, (s_pos, samp_offsets[sample]), ref_sequence.id, pos)) + 'positions %s mapped to same reference position (%s:%s)' % + (sample, (s_pos, samp_offsets[sample]), ref_sequence.id, pos)) samp_offsets[sample] = s_pos for s in samplesToUse: # map ref to s - cons_start = cm.mapChr(ref_sequence.id, s, pos, side = -1)[1] - cons_stop = cm.mapChr(ref_sequence.id, s, end, side = 1)[1] - if cons_start == None or cons_stop == None : + cons_start = cm.mapChr(ref_sequence.id, s, pos, side=-1)[1] + cons_stop = cm.mapChr(ref_sequence.id, s, end, side=1)[1] + if cons_start is None or cons_stop is None: log.info("variant is outside consensus assembly " - "for %s at %s:%s-%s.", s, ref_sequence.id, pos, end) + "for %s at %s:%s-%s.", s, ref_sequence.id, pos, end) continue - - cons = samp_to_seqIndex[s]#.seq.ungap('-')#[ cm.mapChr(ref_sequence.id, s) ] - - allele = str(cons[cons_start-1:cons_stop]).upper() + + cons = samp_to_seqIndex[s] # .seq.ungap('-')#[ cm.mapChr(ref_sequence.id, s) ] + + allele = str(cons[cons_start - 1:cons_stop]).upper() if s in samp_offsets: samp_offsets[s] -= cons_start - if all(a in set(('A','C','T','G')) for a in allele): + if all(a in set(('A', 'C', 'T', 'G')) for a in allele): consAlleles[s] = allele else: - log.warning("dropping ambiguous consensus for %s at %s:%s-%s: %s", s, ref_sequence.id, pos, end, allele) - + log.warning("dropping ambiguous consensus for %s at %s:%s-%s: %s", s, ref_sequence.id, pos, + end, allele) + # define genotypes and fractions - iSNVs = {} # {sample : {allele : fraction, ...}, ...} - iSNVs_n_libs = {} # {sample : {allele : n libraries > 0, ...}, ...} - iSNVs_lib_bias = {} # {sample : {allele : pval, ...}, ...} + iSNVs = {} # {sample : {allele : fraction, ...}, ...} + iSNVs_n_libs = {} # {sample : {allele : n libraries > 0, ...}, ...} + iSNVs_lib_bias = {} # {sample : {allele : pval, ...}, ...} for s in samplesToUse: - + # get all 
rows for this sample and merge allele counts together - acounts = dict(itertools.chain.from_iterable(row['allele_counts'] - for row in rows if row['sample'] == s)) - nlibs = dict(itertools.chain.from_iterable(row['n_libs'].items() - for row in rows if row['sample'] == s)) - libbias = dict(itertools.chain.from_iterable(row['lib_bias'].items() - for row in rows if row['sample'] == s)) + acounts = dict(itertools.chain.from_iterable(row['allele_counts'] for row in rows if + row['sample'] == s)) + nlibs = dict(itertools.chain.from_iterable(row['n_libs'].items() for row in rows if + row['sample'] == s)) + libbias = dict(itertools.chain.from_iterable(row['lib_bias'].items() for row in rows if + row['sample'] == s)) if 'i' in acounts and 'd' in acounts: # This sample has both an insertion line and a deletion line at the same spot! # To keep the reference allele from be counted twice, once as an i and once # as a d, average the counts and get rid of one of them. - acounts['i'] = int(round((acounts['i'] + acounts['d'])/2.0,0)) + acounts['i'] = int(round((acounts['i'] + acounts['d']) / 2.0, 0)) del acounts['d'] nlibs['i'] = max(nlibs['i'], nlibs['d']) libbias['i'] = max(libbias['i'], libbias['d']) - + if acounts and s in consAlleles: # we have iSNV data on this sample consAllele = consAlleles[s] tot_n = sum(acounts.values()) - iSNVs[s] = {} # {allele : fraction, ...} + iSNVs[s] = {} # {allele : fraction, ...} iSNVs_n_libs[s] = {} iSNVs_lib_bias[s] = {} - for orig_a,n in acounts.items(): - f = float(n)/tot_n + for orig_a, n in acounts.items(): + f = float(n) / tot_n a = orig_a if a.startswith('I'): # insertion point is relative to each sample - insert_point = samp_offsets[s]+1 + insert_point = samp_offsets[s] + 1 a = consAllele[:insert_point] + a[1:] + consAllele[insert_point:] elif a.startswith('D'): # deletion is the first consensus base, plus remaining # consensus seq with the first few positions dropped off - cut_left = samp_offsets[s]+1 - cut_right = samp_offsets[s]+1+int(a[1:]) + cut_left = samp_offsets[s] + 1 + cut_right = samp_offsets[s] + 1 + int(a[1:]) a = consAllele[:cut_left] + consAllele[cut_right:] - elif a in ('i','d'): + elif a in ('i', 'd'): # this is vphaser's way of saying the "reference" (majority/consensus) # allele, in the face of other indel variants a = consAllele else: # this is a SNP - if a not in set(('A','C','T','G')): + if a not in set(('A', 'C', 'T', 'G')): raise Exception() - if f>0.5 and a!=consAllele[samp_offsets[s]]: + if f > 0.5 and a != consAllele[samp_offsets[s]]: log.warning("vPhaser and assembly pipelines mismatch at " - "%s:%d (%s) - consensus %s, vPhaser %s, f %.3f", - ref_sequence.id, pos, s, consAllele[samp_offsets[s]], a, f) + "%s:%d (%s) - consensus %s, vPhaser %s, f %.3f", ref_sequence.id, + pos, s, consAllele[samp_offsets[s]], a, f) new_allele = list(consAllele) new_allele[samp_offsets[s]] = a a = ''.join(new_allele) - if not (a and a==a.upper()): + if not (a and a == a.upper()): raise Exception() iSNVs[s][a] = f iSNVs_n_libs[s][a] = nlibs[orig_a] iSNVs_lib_bias[s][a] = libbias[orig_a] - if all(len(a)==1 for a in iSNVs[s].keys()): + if all(len(a) == 1 for a in iSNVs[s].keys()): if consAllele not in iSNVs[s]: - raise Exception("at %s:%s (%s), consensus allele %s not among iSNV alleles %s -- other cons alleles: %s" % ( - ref_sequence.id, pos, s, consAllele, ', '.join(iSNVs[s].keys()), ', '.join(consAlleles[s]))) + raise Exception( + """at %s:%s (%s), consensus allele %s + not among iSNV alleles %s -- other cons alleles: %s""" % ( + ref_sequence.id, pos, s, 
consAllele, ', '.join( + iSNVs[s].keys()), ', '.join( + consAlleles[s]))) elif s in consAlleles: # there is no iSNV data for this sample, so substitute the consensus allele - iSNVs[s] = {consAlleles[s]:1.0} + iSNVs[s] = {consAlleles[s]: 1.0} # get unique alleles list for this position, in this order: # first: reference allele, @@ -717,42 +757,41 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version # finally: all other alleles, sorted first by number of containing samples, # then by intrahost read frequency summed over the population, # then by the allele string itself. - alleles_cons = [a - for a,n in sorted(util.misc.histogram(consAlleles.values()).items(), - key=lambda x:x[1], reverse=True) - if a!=refAllele] - alleles_isnv = list(itertools.chain.from_iterable([iSNVs[s].items() for s in samplesToUse if s in iSNVs])) + alleles_cons = [a for a, n in sorted(util.misc.histogram(consAlleles.values()).items(), + key=lambda x: x[1], + reverse=True) if a != refAllele] + alleles_isnv = list(itertools.chain.from_iterable( + [iSNVs[s].items() for s in samplesToUse if s in iSNVs])) alleles_isnv2 = [] - for a in set(a for a,n in alleles_isnv): - counts = list(x[1] for x in alleles_isnv if x[0]==a) - if len(counts)>0 and sum(counts)>0: + for a in set(a for a, n in alleles_isnv): + counts = list(x[1] for x in alleles_isnv if x[0] == a) + if len(counts) > 0 and sum(counts) > 0: # if we filtered any alleles above, make sure to omit absent alleles - alleles_isnv2.append((len(counts),sum(counts),a)) + alleles_isnv2.append((len(counts), sum(counts), a)) else: log.info("dropped allele %s at position %s:%s", a, ref_sequence.id, pos) alleles_isnv = list(allele for n_samples, n_reads, allele in reversed(sorted(alleles_isnv2))) alleles = list(util.misc.unique([refAllele] + alleles_cons + alleles_isnv)) - # map alleles from strings to numeric indexes if not alleles: raise Exception() - elif len(alleles)==1: + elif len(alleles) == 1: # if we filtered any alleles above, skip this position if there is no variation left here log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos) continue - alleleMap = dict((a,i) for i,a in enumerate(alleles)) + alleleMap = dict((a, i) for i, a in enumerate(alleles)) # GT col emitted below - genos = [str(alleleMap.get(consAlleles.get(s),'.')) for s in samples] + genos = [str(alleleMap.get(consAlleles.get(s), '.')) for s in samples] # AF col emitted below, everything excluding the ref allele (one float per non-ref allele) - freqs = [(s in iSNVs) and ','.join(map(str, [iSNVs[s].get(a,0.0) for a in alleles[1:]])) or '.' + freqs = [(s in iSNVs) and ','.join(map(str, [iSNVs[s].get(a, 0.0) for a in alleles[1:]])) or '.' for s in samplesToUse] # NL col, everything including the ref allele (one int per allele) - nlibs = [(s in iSNVs_n_libs) and ','.join([str(iSNVs_n_libs[s].get(a,0)) for a in alleles]) or '.' + nlibs = [(s in iSNVs_n_libs) and ','.join([str(iSNVs_n_libs[s].get(a, 0)) for a in alleles]) or '.' for s in samplesToUse] # LB col, everything including the ref allele (one float per allele) - pvals = [(s in iSNVs_lib_bias) and ','.join([str(iSNVs_lib_bias[s].get(a,'.')) for a in alleles]) or '.' - for s in samplesToUse] + pvals = [(s in iSNVs_lib_bias) and ','.join([str(iSNVs_lib_bias[s].get(a, '.')) for a in alleles]) + or '.' 
for s in samplesToUse] # prepare output row and write to file c = ref_sequence.id @@ -760,37 +799,40 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, alignments, strip_chr_version c = parse_accession_str(c) if strip_chr_version: c = strip_accession_version(c) - out = [c, pos, '.', - alleles[0], ','.join(alleles[1:]), - '.', '.', '.', 'GT:AF:NL:LB'] + out = [c, pos, '.', alleles[0], ','.join(alleles[1:]), '.', '.', '.', 'GT:AF:NL:LB'] out = out + list(map(':'.join, zip(genos, freqs, nlibs, pvals))) - outf.write('\t'.join(map(str, out))+'\n') - + outf.write('\t'.join(map(str, out)) + '\n') + # compress output if requested if outVcf.endswith('.vcf.gz'): pysam.tabix_compress(tmpVcf, outVcf, force=True) pysam.tabix_index(outVcf, force=True, preset='vcf') os.unlink(tmpVcf) + def parser_merge_to_vcf(parser=argparse.ArgumentParser()): parser.add_argument("refFasta", - help="""The target reference genome. outVcf will use these + help="""The target reference genome. outVcf will use these chromosome names, coordinate spaces, and reference alleles""") - parser.add_argument("outVcf", - help="Output VCF file containing all variants") - parser.add_argument("--samples", nargs='+', required=True, - help="A list of sample names") - parser.add_argument("--isnvs", nargs='+', required=True, - help="""A list of file names from the output of vphaser_one_sample + parser.add_argument("outVcf", help="Output VCF file containing all variants") + parser.add_argument("--samples", nargs='+', required=True, help="A list of sample names") + parser.add_argument("--isnvs", + nargs='+', + required=True, + help="""A list of file names from the output of vphaser_one_sample These must be in the SAME ORDER as samples.""") - parser.add_argument("--alignments", nargs='+', required=True, - help="""a list of fasta files containing multialignment of input - assemblies, with one file per chromosome/segment. Each alignment - file will contain a line for each sample, as well as the + parser.add_argument("--alignments", + nargs='+', + required=True, + help="""a list of fasta files containing multialignment of input + assemblies, with one file per chromosome/segment. Each alignment + file will contain a line for each sample, as well as the reference genome to which they were aligned.""") parser.add_argument("--strip_chr_version", - default=False, action="store_true", dest="strip_chr_version", - help="""If set, strip any trailing version numbers from the + default=False, + action="store_true", + dest="strip_chr_version", + help="""If set, strip any trailing version numbers from the chromosome names. If the chromosome name ends with a period followed by integers, this is interepreted as a version number to be removed. This is because Genbank accession numbers are @@ -798,22 +840,28 @@ def parser_merge_to_vcf(parser=argparse.ArgumentParser()): corresponding version number. Default is false (leave chromosome names untouched).""") parser.add_argument("--naive_filter", - default=False, action="store_true", dest="naive_filter", - help="""If set, keep only the alleles that have at least + default=False, + action="store_true", + dest="naive_filter", + help="""If set, keep only the alleles that have at least two independent libraries of support and allele freq > 0.005. Default is false (do not filter at this stage).""") parser.add_argument("--parse_accession", - default=False, action="store_true", dest="parse_accession", - help="""If set, parse only the accession for the chromosome name. 
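Since the per-sample FORMAT columns are assembled from four parallel lists, a small invented example of the zip/join step above may make the GT:AF:NL:LB layout easier to read; all values here are hypothetical.

genos = ['0', '1']              # GT: consensus allele index per sample ('.' if unknown)
freqs = ['0.12', '.']           # AF: one frequency per ALT allele, '.' when no iSNV data
nlibs = ['2,1', '.']            # NL: per-allele library counts, REF included
pvals = ['0.8,0.4', '.']        # LB: per-allele library-bias values, REF included
sample_cols = list(map(':'.join, zip(genos, freqs, nlibs, pvals)))
# ['0:0.12:2,1:0.8,0.4', '1:.:.:.']  -- appended after the fixed VCF columns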
+ default=False, + action="store_true", + dest="parse_accession", + help="""If set, parse only the accession for the chromosome name. Helpful if snpEff has to create its own database""") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, merge_to_vcf, split_args=True) return parser -__commands__.append(('merge_to_vcf', parser_merge_to_vcf)) +__commands__.append(('merge_to_vcf', parser_merge_to_vcf)) + # =================================================== + def compute_Fws(vcfrow): format_col = vcfrow[8].split(':') if 'AF' not in format_col: @@ -821,21 +869,23 @@ def compute_Fws(vcfrow): af_idx = format_col.index('AF') freqs = [dat.split(':') for dat in vcfrow[9:]] - freqs = [float(dat[af_idx].split(',')[0]) for dat in freqs if len(dat)>af_idx and dat[af_idx]!='.' and dat[0]!='.' and int(dat[0])<=1] + freqs = [float(dat[af_idx].split(',')[0]) for dat in freqs + if len(dat) > af_idx and dat[af_idx] != '.' and dat[0] != '.' and int(dat[0]) <= 1] - if len(freqs)<2: + if len(freqs) < 2: return None - p_s = sum(freqs)/len(freqs) - H_s = 2 * p_s * (1.0-p_s) + p_s = sum(freqs) / len(freqs) + H_s = 2 * p_s * (1.0 - p_s) - if H_s==0.0: + if H_s == 0.0: return None - H_w = [2*p*(1.0-p) for p in freqs] - H_w = sum(H_w)/len(H_w) + H_w = [2 * p * (1.0 - p) for p in freqs] + H_w = sum(H_w) / len(H_w) return (H_s, 1.0 - H_w / H_s) + def add_Fws_vcf(inVcf, outVcf): '''Compute the Fws statistic on iSNV data. See Manske, 2012 (Nature)''' with open(outVcf, 'wt') as outf: @@ -844,34 +894,39 @@ def add_Fws_vcf(inVcf, outVcf): if line.startswith('##'): outf.write(line) elif line.startswith('#'): - outf.write('##INFO=\n') - outf.write('##INFO=\n') + outf.write( + '##INFO=\n') + outf.write( + '##INFO=\n') outf.write(line) else: row = line.strip('\n').split('\t') Fws = compute_Fws(row) - if Fws!=None: + if Fws is not None: row[7] = row[7] + ";PI=%s;FWS=%s" % Fws - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') + def parser_Fws(parser=argparse.ArgumentParser()): parser.add_argument("inVcf", help="Input VCF file") parser.add_argument("outVcf", help="Output VCF file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, add_Fws_vcf, split_args=True) return parser -__commands__.append(('Fws', parser_Fws)) +__commands__.append(('Fws', parser_Fws)) + # =================== iSNV_table ================================ + def parse_eff(eff_field): ''' parse the old snpEff "EFF" INFO field ''' out = {} - effs = [eff.rstrip(')').replace('(','|').split('|') for eff in eff_field.split(',')] - effs = [[eff[i] for i in (0,3,4,5,6,9,11)] for eff in effs] - effs = [eff for eff in effs if eff[5] not in ('sGP','ssGP') and int(eff[6])<2] - if not len(effs)==1: + effs = [eff.rstrip(')').replace('(', '|').split('|') for eff in eff_field.split(',')] + effs = [[eff[i] for i in (0, 3, 4, 5, 6, 9, 11)] for eff in effs] + effs = [eff for eff in effs if eff[5] not in ('sGP', 'ssGP') and int(eff[6]) < 2] + if not len(effs) == 1: raise Exception() eff = effs[0] if eff[2]: @@ -880,29 +935,30 @@ def parse_eff(eff_field): aa = aa[2:] m = re.search(r"(\d+)", aa) out['eff_aa_pos'] = int(m.group(1)) - (out['eff_type'], out['eff_codon_dna'], out['eff_aa'], out['eff_prot_len'], out['eff_gene'], out['eff_protein'], rank) = eff + (out['eff_type'], out['eff_codon_dna'], out['eff_aa'], out[ + 
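compute_Fws() above reduces each sample's leading AF value to a within-host heterozygosity and compares it with the population-level heterozygosity (Manske 2012). A stripped-down sketch of the same arithmetic on invented frequencies:

def fws_from_freqs(freqs):
    p_s = sum(freqs) / len(freqs)                      # mean iSNV frequency across samples
    H_s = 2 * p_s * (1.0 - p_s)                        # population-level heterozygosity
    if H_s == 0.0:
        return None
    H_w = sum(2 * p * (1.0 - p) for p in freqs) / len(freqs)   # mean within-host heterozygosity
    return (H_s, 1.0 - H_w / H_s)                      # (PI, FWS), as written into the INFO field

print(fws_from_freqs([0.05, 0.10, 0.0, 0.20]))         # approximately (0.16, 0.07)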
'eff_prot_len' + ], out['eff_gene'], out['eff_protein'], rank) = eff return out + class SnpEffException(Exception): pass -def parse_ann(ann_field, alleles, transcript_blacklist=set(('GP.2','GP.3'))): + +def parse_ann(ann_field, alleles, transcript_blacklist=set(('GP.2', 'GP.3'))): ''' parse the new snpEff "ANN" INFO field ''' - + # only work on alt alleles alleles = alleles[1:] effs = [eff.split('|') for eff in ann_field.split(',')] - effs = [(eff[0], dict((k,eff[i]) for k,i in - (('eff_type',1),('eff_gene',3),('eff_protein',6), - ('eff_codon_dna',9),('eff_aa',10), - ('eff_aa_pos',13),('eff_prot_len',13)))) - for eff in effs - if eff[6] not in transcript_blacklist] + effs = [(eff[0], dict((k, eff[i]) for k, i in (('eff_type', 1), ('eff_gene', 3), ('eff_protein', 6), ( + 'eff_codon_dna', 9), ('eff_aa', 10), ('eff_aa_pos', 13), ('eff_prot_len', 13)))) for eff in effs + if eff[6] not in transcript_blacklist] effs_dict = dict(effs) if not effs: return {} - + if len(effs) != len(effs_dict): raise SnpEffException("ANN field has non-unique alleles") for a in alleles: @@ -911,51 +967,61 @@ def parse_ann(ann_field, alleles, transcript_blacklist=set(('GP.2','GP.3'))): if len(effs) != len(set(alleles)): raise SnpEffException("ANN field has %s entries, but ALT field has %s unique alleles: %s" % ( len(effs), len(set(alleles)), ','.join(alleles))) - + out = {} for k in ('eff_type', 'eff_codon_dna', 'eff_aa', 'eff_aa_pos', 'eff_prot_len', 'eff_gene', 'eff_protein'): a_out = [] for a in alleles: v = effs_dict[a][k] - if k=='eff_codon_dna' and v.startswith('c.'): + if k == 'eff_codon_dna' and v.startswith('c.'): v = v[2:] - elif k=='eff_aa' and v.startswith('p.'): + elif k == 'eff_aa' and v.startswith('p.'): v = v[2:] - elif k=='eff_aa_pos' and '/' in v: + elif k == 'eff_aa_pos' and '/' in v: v = v.split('/')[0] - elif k=='eff_prot_len' and '/' in v: + elif k == 'eff_prot_len' and '/' in v: v = v.split('/')[1] - elif k=='eff_protein' and v=='GP.1': + elif k == 'eff_protein' and v == 'GP.1': v = 'Glycoprotein' if v: a_out.append(v) out[k] = ','.join(util.misc.unique(a_out)) return out + def iSNV_table(vcf_iter): for row in vcf_iter: info = dict(kv.split('=') for kv in row['INFO'].split(';') if kv and kv != '.') - samples = [k for k in row.keys() if k not in set(('CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT'))] + samples = [ + k for k in row.keys() if k not in set( + ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT')) + ] # compute Hs: heterozygosity in population based on consensus genotypes alone genos = [row[s].split(':')[0] for s in samples] genos = util.misc.histogram(int(gt) for gt in genos if gt != '.') n = sum(genos.values()) - Hs = 1.0 - sum(k*k/float(n*n) for k in genos.values()) + Hs = 1.0 - sum(k * k / float(n * n) for k in genos.values()) try: for s in samples: f = row[s].split(':')[1] - if f and f!='.': + if f and f != '.': freqs = list(map(float, f.split(','))) f = sum(freqs) - Hw = 1.0 - sum(p*p for p in [1.0-f]+freqs) - out = {'chr':row['CHROM'], 'pos':row['POS'], - 'alleles':"%s,%s" %(row['REF'],row['ALT']), 'sample':s, - 'iSNV_freq':f, 'Hw':Hw, 'Hs':Hs} + Hw = 1.0 - sum(p * p for p in [1.0 - f] + freqs) + out = { + 'chr': row['CHROM'], + 'pos': row['POS'], + 'alleles': "%s,%s" % (row['REF'], row['ALT']), + 'sample': s, + 'iSNV_freq': f, + 'Hw': Hw, + 'Hs': Hs + } if 'EFF' in info: - for k,v in parse_eff(info['EFF']).items(): + for k, v in parse_eff(info['EFF']).items(): out[k] = v if 'ANN' in info: - for k,v in parse_ann(info['ANN'], 
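parse_ann() above indexes into the pipe-delimited snpEff ANN entries by position (1, 3, 6, 9, 10, 13). The entry below is fabricated, with field meanings following the ANN spec as I understand it (annotation type, gene name, feature/transcript ID, HGVS.c, HGVS.p, AA.pos/AA.length); index 6 is also the field checked against the transcript blacklist.

ann_entry = ('A|missense_variant|MODERATE|GENE1|gene1|transcript|NP.1|'
             'protein_coding|1/1|c.123A>G|p.Lys41Arg|123/2217|123/2217|41/739|0')
fields = ann_entry.split('|')
print(fields[1], fields[3], fields[6], fields[9], fields[10], fields[13])
# missense_variant GENE1 NP.1 c.123A>G p.Lys41Arg 41/739
# parse_ann then strips the 'c.'/'p.' prefixes and splits '41/739' into
# eff_aa_pos (41) and eff_prot_len (739).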
alleles=out['alleles'].split(',')).items(): + for k, v in parse_ann(info['ANN'], alleles=out['alleles'].split(',')).items(): out[k] = v if 'PI' in info: out['Hs_snp'] = info['PI'] @@ -966,30 +1032,35 @@ def iSNV_table(vcf_iter): log.error("VCF parsing error at %s:%s", row['CHROM'], row['POS']) raise + def parser_iSNV_table(parser=argparse.ArgumentParser()): parser.add_argument("inVcf", help="Input VCF file") parser.add_argument("outFile", help="Output text file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, main_iSNV_table) return parser + + def main_iSNV_table(args): '''Convert VCF iSNV data to tabular text''' - header = ['chr','pos','sample','patient','time','alleles','iSNV_freq','Hw','Hs', - 'eff_type','eff_codon_dna','eff_aa','eff_aa_pos','eff_prot_len','eff_gene','eff_protein'] + header = ['chr', 'pos', 'sample', 'patient', 'time', 'alleles', 'iSNV_freq', 'Hw', 'Hs', 'eff_type', + 'eff_codon_dna', 'eff_aa', 'eff_aa_pos', 'eff_prot_len', 'eff_gene', 'eff_protein'] with util.file.open_or_gzopen(args.outFile, 'wt') as outf: - outf.write('\t'.join(header)+'\n') + outf.write('\t'.join(header) + '\n') for row in iSNV_table(util.file.read_tabfile_dict(args.inVcf)): sample_parts = row['sample'].split('.') row['patient'] = sample_parts[0] - if len(sample_parts)>1: + if len(sample_parts) > 1: row['time'] = sample_parts[1] - outf.write('\t'.join(map(str, [row.get(h,'') for h in header]))+'\n') + outf.write('\t'.join(map(str, [row.get(h, '') for h in header])) + '\n') return 0 -__commands__.append(('iSNV_table', parser_iSNV_table)) +__commands__.append(('iSNV_table', parser_iSNV_table)) + # =================================================== + def iSNP_per_patient(table, agg_fun=median): data = sorted(table, key=lambda row: (int(row['pos']), row['patient'])) data = itertools.groupby(data, lambda row: (int(row['pos']), row['patient'])) @@ -999,33 +1070,39 @@ def iSNP_per_patient(table, agg_fun=median): if set(r['time'] for r in rows if r.get('time')): f = agg_fun(list(float(r['iSNV_freq']) for r in rows)) row['iSNV_freq'] = f - row['Hw'] = 2 * f * (1.0-f) + row['Hw'] = 2 * f * (1.0 - f) row['sample'] = row['patient'] else: - assert len(rows)==1, "error, found multiple rows for %s:%s" % (row['pos'],row['patient']) + assert len(rows) == 1, "error, found multiple rows for %s:%s" % (row['pos'], row['patient']) yield row + + def parser_iSNP_per_patient(parser=argparse.ArgumentParser()): parser.add_argument("inFile", help="Input text file") parser.add_argument("outFile", help="Output text file") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, main_iSNP_per_patient) return parser + + def main_iSNP_per_patient(args): '''Aggregate tabular iSNP data per patient x position (all time points averaged)''' - header = ['pos','patient','alleles','iSNV_freq','Hw', - 'eff_type','eff_codon_dna','eff_aa','eff_aa_pos','eff_prot_len','eff_gene','eff_protein'] + header = ['pos', 'patient', 'alleles', 'iSNV_freq', 'Hw', 'eff_type', 'eff_codon_dna', 'eff_aa', 'eff_aa_pos', + 'eff_prot_len', 'eff_gene', 'eff_protein'] with open(args.outFile, 'wt') as outf: - outf.write('\t'.join(header)+'\n') + outf.write('\t'.join(header) + '\n') for row in iSNP_per_patient(util.file.read_tabfile_dict(args.inFile)): - outf.write('\t'.join(map(str, [row.get(h,'') for h in header]))+'\n') + 
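The two heterozygosity terms in iSNV_table() above are easy to sanity-check by hand; the counts and frequencies below are invented.

genos = {0: 6, 1: 4}                 # consensus genotype histogram across 10 samples
n = sum(genos.values())
Hs = 1.0 - sum(k * k / float(n * n) for k in genos.values())   # 1 - (36 + 16)/100 = 0.48

freqs = [0.15, 0.05]                 # one sample's non-reference iSNV frequencies
f = sum(freqs)
Hw = 1.0 - sum(p * p for p in [1.0 - f] + freqs)               # 1 - (0.64 + 0.0225 + 0.0025) = 0.335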
outf.write('\t'.join(map(str, [row.get(h, '') for h in header])) + '\n') return 0 -__commands__.append(('iSNP_per_patient', parser_iSNP_per_patient)) +__commands__.append(('iSNP_per_patient', parser_iSNP_per_patient)) + # =================================================== # ===============[ Utility functions ]================ + def sampleIDMatch(inputString): """ Given a sample name in the form of [sample] or [sample]-#, @@ -1039,7 +1116,10 @@ def sampleIDMatch(inputString): else: raise LookupError("The ID was not of the form (.*?)(?:-\d+|$)+, ex. 5985-0") + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/ncbi.py b/ncbi.py index 2e4113aea..d9f90d49c 100755 --- a/ncbi.py +++ b/ncbi.py @@ -6,16 +6,23 @@ __author__ = "tomkinsc@broadinstitute.org" __commands__ = [] -import argparse, logging, collections, shutil, os, os.path +import argparse +import logging +import collections +import shutil +import os +import os.path import Bio.SeqIO -import util.cmd, util.file, util.version, util.genbank +import util.cmd +import util.file +import util.version +import util.genbank import tools.tbl2asn import interhost log = logging.getLogger(__name__) - def fasta_chrlens(fasta): out = collections.OrderedDict() with open(fasta, 'rt') as inf: @@ -23,9 +30,10 @@ def fasta_chrlens(fasta): out[seq.id] = len(seq) return out + def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False): """ - This function is the feature transfer machinery used by tbl_transfer() + This function is the feature transfer machinery used by tbl_transfer() and tbl_transfer_prealigned(). cmap is an instance of CoordMapper. """ @@ -40,46 +48,47 @@ def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False): if not line.startswith('>Feature '): raise Exception("not sure how to handle a non-Feature record") refID = line[len('>Feature '):].strip() - if not ((refID.startswith('gb|') or refID.startswith('ref|')) and refID.endswith('|') and len(refID)>4): + if not ((refID.startswith('gb|') or refID.startswith('ref|')) and refID.endswith('|') and + len(refID) > 4): raise Exception("reference annotation does not refer to a GenBank or RefSeq accession") - refID = refID[refID.find("|")+1:-1] + refID = refID[refID.find("|") + 1:-1] refSeqID = [x for x in cmap.keys() if refID in x][0] #altid = cmap.mapChr(refSeqID, altid) - altid = list(set(cmap.keys()) - set([refSeqID]))[0] #cmap.mapChr(refSeqID, altid) + altid = list(set(cmap.keys()) - set([refSeqID]))[0] # cmap.mapChr(refSeqID, altid) line = '>Feature ' + altid feature_keep = True elif line[0] != '\t': # feature with numeric coordinates (map them) row = line.split('\t') - if not len(row)>=2: + if not len(row) >= 2: raise Exception("this line has only one column?") row[0] = int(row[0]) row[1] = int(row[1]) if row[1] >= row[0]: row[0] = cmap.mapChr(refSeqID, altid, row[0], -1)[1] - row[1] = cmap.mapChr(refSeqID, altid,row[1], 1)[1] + row[1] = cmap.mapChr(refSeqID, altid, row[1], 1)[1] else: # negative strand feature - row[0] = cmap.mapChr(refSeqID, altid, row[0], 1)[1] + row[0] = cmap.mapChr(refSeqID, altid, row[0], 1)[1] row[1] = cmap.mapChr(refSeqID, altid, row[1], -1)[1] - + if row[0] and row[1]: feature_keep = True - elif row[0]==None and row[1]==None: + elif row[0] == None and row[1] == None: # feature completely out of bounds feature_keep = False continue else: # feature overhangs end of sequence if oob_clip: - if row[0]==None: + if row[0] == None: 
row[0] = '<1' - if row[1]==None: + if row[1] == None: row[1] = '>{}'.format(alt_chrlens[altid]) else: feature_keep = False continue - line = '\t'.join(map(str,row)) + line = '\t'.join(map(str, row)) else: # feature notes if not feature_keep: @@ -88,7 +97,8 @@ def tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip=False): elif 'protein_id' in line: # skip any lines that refer to an explicit protein_id continue - outf.write(line+'\n') + outf.write(line + '\n') + def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False): ''' This function takes an NCBI TBL file describing features on a genome @@ -97,24 +107,30 @@ def tbl_transfer(ref_fasta, ref_tbl, alt_fasta, out_tbl, oob_clip=False): cmap = interhost.CoordMapper() cmap.align_and_load_sequences([ref_fasta, alt_fasta]) alt_chrlens = fasta_chrlens(alt_fasta) - + tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip) + def parser_tbl_transfer(parser=argparse.ArgumentParser()): parser.add_argument("ref_fasta", help="Input sequence of reference genome") parser.add_argument("ref_tbl", help="Input reference annotations (NCBI TBL format)") parser.add_argument("alt_fasta", help="Input sequence of new genome") parser.add_argument("out_tbl", help="Output file with transferred annotations") - parser.add_argument('--oob_clip', default=False, action='store_true', - help='''Out of bounds feature behavior. + parser.add_argument('--oob_clip', + default=False, + action='store_true', + help='''Out of bounds feature behavior. False: drop all features that are completely or partly out of bounds True: drop all features completely out of bounds but truncate any features that are partly out of bounds''') - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, tbl_transfer, split_args=True) return parser + + __commands__.append(('tbl_transfer', parser_tbl_transfer)) + def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, oob_clip=False): """ This breaks out the ref and alt sequences into separate fasta files, and then @@ -124,13 +140,13 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o This function expects to receive one fasta file containing a multialignment of a single segment/chromosome along with the respective reference sequence for that segment/chromosome. It also expects a reference containing all reference segments/chromosomes, so that the reference sequence can be identified in the input file by name. It - also expects a list of reference tbl files, where each file is named according to the ID present for its + also expects a list of reference tbl files, where each file is named according to the ID present for its corresponding sequence in the refFasta. For each non-reference sequence present in the inputFasta, two files are written: a fasta containing the segment/chromosome for the same, along with its corresponding feature table as created by tbl_transfer_common. 
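For readers new to the feature-transfer code above, here is a minimal usage sketch of the coordinate-mapping pattern it relies on. File and chromosome names are hypothetical, and the -1/+1 arguments appear to choose which side a gapped position snaps to; that reading is an assumption, not something stated in this patch.

import interhost

cmap = interhost.CoordMapper()
cmap.align_and_load_sequences(['ref.fasta', 'assembly.fasta'])
# project a reference feature spanning 100..1500 into the new assembly's coordinates;
# mapChr() returns a pair whose second element is the mapped position
start = cmap.mapChr('REF_CHR', 'ALT_CHR', 100, -1)[1]
end = cmap.mapChr('REF_CHR', 'ALT_CHR', 1500, 1)[1]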
""" - - ref_tbl = "" # must be identified in list of tables + + ref_tbl = "" # must be identified in list of tables ref_fasta_filename = "" matchingRefSeq = None @@ -142,7 +158,7 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o with util.file.open_or_gzopen(inputFasta, 'r') as inf: for seq in Bio.SeqIO.parse(inf, 'fasta'): with util.file.open_or_gzopen(refFasta, 'r') as reff: - for refSeq in Bio.SeqIO.parse(reff, 'fasta'): + for refSeq in Bio.SeqIO.parse(reff, 'fasta'): if seq.id == refSeq.id: ref_fasta_filename = util.file.mkstempfname('.fasta') matchingRefSeq = seq @@ -150,10 +166,10 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o if matchingRefSeq: break - if ref_fasta_filename == "": - raise KeyError("No reference was found in the input file %s" % (inputFasta) ) + if ref_fasta_filename == "": + raise KeyError("No reference was found in the input file %s" % (inputFasta)) - # identify the correct feature table source based on its filename, + # identify the correct feature table source based on its filename, # which should correspond to a unique component of the ref sequence ID (i.e. the genbank accession) for tblFilename in refAnnotTblFiles: # identify the correct feature table as the one that has an ID that is @@ -163,8 +179,8 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o ref_tbl = tblFilename break if ref_tbl == "": - raise KeyError("No reference table was found for the reference %s" % (matchingRefSeq.id) ) - + raise KeyError("No reference table was found for the reference %s" % (matchingRefSeq.id)) + # write out the desired sequences to separate fasta files with util.file.open_or_gzopen(inputFasta, 'r') as inf: for seq in Bio.SeqIO.parse(inf, 'fasta'): @@ -172,7 +188,7 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o # continue to the next sequence if seq.id == matchingRefSeq.id: continue - + alt_fasta_filename = "" combined_fasta_filename = "" @@ -183,119 +199,159 @@ def tbl_transfer_prealigned(inputFasta, refFasta, refAnnotTblFiles, outputDir, o Bio.SeqIO.write([matchingRefSeq, seq], outf, "fasta") # create a filepath for the output table - out_tbl = os.path.join(outputDir, seq.id+".tbl") + out_tbl = os.path.join(outputDir, seq.id + ".tbl") cmap = interhost.CoordMapper() cmap.load_alignments([combined_fasta_filename]) alt_chrlens = fasta_chrlens(combined_fasta_filename) - + tbl_transfer_common(cmap, ref_tbl, out_tbl, alt_chrlens, oob_clip) + def parser_tbl_transfer_prealigned(parser=argparse.ArgumentParser()): - parser.add_argument("inputFasta", help="""FASTA file containing input sequences, + parser.add_argument("inputFasta", + help="""FASTA file containing input sequences, including pre-made alignments and reference sequence""") parser.add_argument("refFasta", help="FASTA file containing the reference genome") - parser.add_argument("refAnnotTblFiles", nargs='+', help="""Name of the reference feature tables, - each of which should have a filename comrised of [refId].tbl - so they can be matched against the reference sequences""") + parser.add_argument("refAnnotTblFiles", + nargs='+', + help="""Name of the reference feature tables, + each of which should have a filename comrised of [refId].tbl + so they can be matched against the reference sequences""") parser.add_argument("outputDir", help="The output directory") - parser.add_argument('--oob_clip', default=False, action='store_true', - help='''Out of bounds feature behavior. 
+ parser.add_argument('--oob_clip', + default=False, + action='store_true', + help='''Out of bounds feature behavior. False: drop all features that are completely or partly out of bounds True: drop all features completely out of bounds but truncate any features that are partly out of bounds''') - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, tbl_transfer_prealigned, split_args=True) return parser -__commands__.append(('tbl_transfer_prealigned', parser_tbl_transfer_prealigned)) +__commands__.append(('tbl_transfer_prealigned', parser_tbl_transfer_prealigned)) -def fetch_fastas(accession_IDs, destinationDir, emailAddress, - forceOverwrite, combinedFilePrefix, fileExt, removeSeparateFiles, chunkSize): +def fetch_fastas(accession_IDs, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, fileExt, + removeSeparateFiles, chunkSize): ''' - This function downloads and saves the FASTA files + This function downloads and saves the FASTA files from the Genbank CoreNucleotide database given a given list of accession IDs. ''' - util.genbank.fetch_fastas_from_genbank(accession_IDs, destinationDir, emailAddress, - forceOverwrite, combinedFilePrefix, removeSeparateFiles, - fileExt, "fasta", chunkSize=chunkSize) - -def fetch_feature_tables(accession_IDs, destinationDir, emailAddress, - forceOverwrite, combinedFilePrefix, fileExt, removeSeparateFiles, chunkSize): + util.genbank.fetch_fastas_from_genbank(accession_IDs, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt, + "fasta", + chunkSize=chunkSize) + + +def fetch_feature_tables(accession_IDs, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, fileExt, + removeSeparateFiles, chunkSize): ''' - This function downloads and saves + This function downloads and saves feature tables from the Genbank CoreNucleotide database given a given list of accession IDs. ''' - util.genbank.fetch_feature_tables_from_genbank(accession_IDs, destinationDir, emailAddress, forceOverwrite, - combinedFilePrefix, removeSeparateFiles, - fileExt, "ft", chunkSize=chunkSize) - -def fetch_genbank_records(accession_IDs, destinationDir, emailAddress, - forceOverwrite, combinedFilePrefix, fileExt, removeSeparateFiles, chunkSize): + util.genbank.fetch_feature_tables_from_genbank(accession_IDs, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt, + "ft", + chunkSize=chunkSize) + + +def fetch_genbank_records(accession_IDs, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, fileExt, + removeSeparateFiles, chunkSize): ''' - This function downloads and saves + This function downloads and saves full flat text records from Genbank CoreNucleotide database given a given list of accession IDs. ''' - util.genbank.fetch_full_records_from_genbank(accession_IDs, destinationDir, emailAddress, forceOverwrite, - combinedFilePrefix, removeSeparateFiles, - fileExt, "gb", chunkSize=chunkSize) + util.genbank.fetch_full_records_from_genbank(accession_IDs, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt, + "gb", + chunkSize=chunkSize) + def parser_fetch_reference_common(parser=argparse.ArgumentParser()): parser.add_argument("emailAddress", - help="""Your email address. 
To access the Genbank CoreNucleotide database, - NCBI requires you to specify your email address with each request. + help="""Your email address. To access the Genbank CoreNucleotide database, + NCBI requires you to specify your email address with each request. In case of excessive usage of the E-utilities, NCBI will attempt to contact a user at the email address provided before blocking access. This email address should - be registered with NCBI. To register an email address, simply send - an email to eutilities@ncbi.nlm.nih.gov including your email address and + be registered with NCBI. To register an email address, simply send + an email to eutilities@ncbi.nlm.nih.gov including your email address and the tool name (tool='https://github.com/broadinstitute/viral-ngs').""") - parser.add_argument("destinationDir", - help="Output directory with where .fasta and .tbl files will be saved") - parser.add_argument("accession_IDs", nargs='+', - help="List of Genbank nuccore accession IDs") - parser.add_argument('--forceOverwrite', default=False, action='store_true', - help='''Overwrite existing files, if present.''') - parser.add_argument('--combinedFilePrefix', - help='''The prefix of the file containing the combined concatenated + parser.add_argument("destinationDir", help="Output directory with where .fasta and .tbl files will be saved") + parser.add_argument("accession_IDs", nargs='+', help="List of Genbank nuccore accession IDs") + parser.add_argument('--forceOverwrite', + default=False, + action='store_true', + help='''Overwrite existing files, if present.''') + parser.add_argument('--combinedFilePrefix', + help='''The prefix of the file containing the combined concatenated results returned by the list of accession IDs, in the order provided.''') - parser.add_argument('--fileExt', default=None, - help='''The extension to use for the downloaded files''') - parser.add_argument('--removeSeparateFiles', default=False, action='store_true', - help='''If specified, remove the individual files and leave only the combined file.''') - parser.add_argument('--chunkSize', default=1, type=int, - help='''Causes files to be downloaded from GenBank in chunks of N accessions. - Each chunk will be its own combined file, separate from any combined - file created via --combinedFilePrefix (default: %(default)s). If chunkSize is - unspecified and >500 accessions are provided, chunkSize will be set to 500 to + parser.add_argument('--fileExt', default=None, help='''The extension to use for the downloaded files''') + parser.add_argument('--removeSeparateFiles', + default=False, + action='store_true', + help='''If specified, remove the individual files and leave only the combined file.''') + parser.add_argument('--chunkSize', + default=1, + type=int, + help='''Causes files to be downloaded from GenBank in chunks of N accessions. + Each chunk will be its own combined file, separate from any combined + file created via --combinedFilePrefix (default: %(default)s). 
If chunkSize is + unspecified and >500 accessions are provided, chunkSize will be set to 500 to adhere to the NCBI guidelines on information retreival.''') return parser + def parser_fetch_fastas(parser): parser = parser_fetch_reference_common(parser) - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, fetch_fastas, split_args=True) return parser + + __commands__.append(('fetch_fastas', parser_fetch_fastas)) + def parser_fetch_feature_tables(parser): parser = parser_fetch_reference_common(parser) - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, fetch_feature_tables, split_args=True) return parser + + __commands__.append(('fetch_feature_tables', parser_fetch_feature_tables)) + def parser_fetch_genbank_records(parser): parser = parser_fetch_reference_common(parser) - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, fetch_genbank_records, split_args=True) return parser + + __commands__.append(('fetch_genbank_records', parser_fetch_genbank_records)) + def fasta2fsa(infname, outdir, biosample=None): ''' copy a fasta file to a new directory and change its filename to end in .fsa for NCBI's sake. ''' @@ -316,11 +372,11 @@ def fasta2fsa(infname, outdir, biosample=None): outf.write(line) return outfname + def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None): with open(cmt_fname, 'wt') as outf: outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n") - outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format( - util.version.get_version())) + outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version())) if name: outf.write("Assembly Name\t{}\n".format(name)) if coverage: @@ -332,9 +388,10 @@ def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=N outf.write("Sequencing Technology\t{}\n".format(seq_tech)) outf.write("StructuredCommentSuffix\t##Genome-Assembly-Data-END##\n") + def prep_genbank_files(templateFile, fasta_files, annotDir, - master_source_table=None, comment=None, sequencing_tech=None, - coverage_table=None, biosample_map=None): + master_source_table=None, comment=None, sequencing_tech=None, + coverage_table=None, biosample_map=None): ''' Prepare genbank submission files. Requires .fasta and .tbl files as input, as well as numerous other metadata files for the submission. Creates a directory full of files (.sqn in particular) that can be sent to GenBank. 
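A hypothetical call to make_structured_comment_file() above, together with the tab-separated .cmt content it would emit (sample name and sequencing technology are invented; the coverage branch is left out of this example):

make_structured_comment_file('SampleA.cmt', name='SampleA', seq_tech='Illumina HiSeq 2500')
# SampleA.cmt then contains one tab-separated key/value pair per line:
#   StructuredCommentPrefix    ##Genome-Assembly-Data-START##
#   Assembly Method            github.com/broadinstitute/viral-ngs v. <version>
#   Assembly Name              SampleA
#   Sequencing Technology      Illumina HiSeq 2500
#   StructuredCommentSuffix    ##Genome-Assembly-Data-END##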
@@ -345,14 +402,14 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, for row in util.file.read_tabfile_dict(coverage_table): if row.get('sample') and row.get('aln2self_cov_median'): coverage[row['sample']] = row['aln2self_cov_median'] - + # get biosample id map biosample = {} if biosample_map: for row in util.file.read_tabfile_dict(biosample_map): if row.get('sample') and row.get('BioSample'): biosample[row['sample']] = row['BioSample'] - + # make output directory util.file.mkdir_p(annotDir) for fn in fasta_files: @@ -363,39 +420,40 @@ def prep_genbank_files(templateFile, fasta_files, annotDir, fasta2fsa(fn, annotDir, biosample=biosample.get(sample)) # make .src files if master_source_table: - shutil.copy(master_source_table, os.path.join(annotDir, sample+'.src')) + shutil.copy(master_source_table, os.path.join(annotDir, sample + '.src')) # make .cmt files - make_structured_comment_file(os.path.join(annotDir, sample+'.cmt'), - name=sample, coverage=coverage.get(sample), seq_tech=sequencing_tech) - + make_structured_comment_file(os.path.join(annotDir, sample + '.cmt'), + name=sample, + coverage=coverage.get(sample), + seq_tech=sequencing_tech) + # run tbl2asn tbl2asn = tools.tbl2asn.Tbl2AsnTool() tbl2asn.execute(templateFile, annotDir, comment=comment, per_genome_comment=True) + def parser_prep_genbank_files(parser=argparse.ArgumentParser()): - parser.add_argument('templateFile', - help='Submission template file (.sbt) including author and contact info') - parser.add_argument("fasta_files", nargs='+', - help="Input fasta files") + parser.add_argument('templateFile', help='Submission template file (.sbt) including author and contact info') + parser.add_argument("fasta_files", nargs='+', help="Input fasta files") parser.add_argument("annotDir", - help="Output directory with genbank submission files (.tbl files must already be there)") - parser.add_argument('--comment', default=None, - help='comment field') - parser.add_argument('--sequencing_tech', default=None, - help='sequencing technology (e.g. Illumina HiSeq 2500)') - parser.add_argument('--master_source_table', default=None, - help='source modifier table') + help="Output directory with genbank submission files (.tbl files must already be there)") + parser.add_argument('--comment', default=None, help='comment field') + parser.add_argument('--sequencing_tech', default=None, help='sequencing technology (e.g. Illumina HiSeq 2500)') + parser.add_argument('--master_source_table', default=None, help='source modifier table') parser.add_argument("--biosample_map", - help="""A file with two columns and a header: sample and BioSample. + help="""A file with two columns and a header: sample and BioSample. This file may refer to samples that are not included in this submission.""") - parser.add_argument('--coverage_table', default=None, - help='''A genome coverage report file with a header row. The table must + parser.add_argument('--coverage_table', + default=None, + help='''A genome coverage report file with a header row. The table must have at least two columns named sample and aln2self_cov_median. All other columns are ignored. 
Rows referring to samples not in this submission are ignored.''') - util.cmd.common_args(parser, (('tmpDir',None), ('loglevel',None), ('version',None))) + util.cmd.common_args(parser, (('tmpDir', None), ('loglevel', None), ('version', None))) util.cmd.attach_main(parser, prep_genbank_files, split_args=True) return parser + + __commands__.append(('prep_genbank_files', parser_prep_genbank_files)) @@ -422,40 +480,45 @@ def prep_sra_table(lib_fname, biosampleFile, md5_fname, outFile): metadata.setdefault(s, {}) metadata[s]['filename'] = row[1] metadata[s]['MD5_checksum'] = row[0] - + with open(outFile, 'wt') as outf: header = ['biosample_accession', 'sample_name', 'library_ID', 'filename', 'MD5_checksum'] - outf.write('\t'.join(header)+'\n') + outf.write('\t'.join(header) + '\n') with open(lib_fname, 'rU') as inf: for line in inf: lib = line.rstrip('\n\r') parts = lib.split('.') - assert len(parts)>1 and parts[-1].startswith('l') + assert len(parts) > 1 and parts[-1].startswith('l') s = '.'.join(parts[:-1]) metadata.setdefault(s, {}) metadata[s]['library_ID'] = lib metadata[s]['sample_name'] = s - outf.write('\t'.join(metadata[s].get(h,'') for h in header)+'\n') + outf.write('\t'.join(metadata[s].get(h, '') for h in header) + '\n') + def parser_prep_sra_table(parser=argparse.ArgumentParser()): parser.add_argument('lib_fname', - help='A file that lists all of the library IDs that will be submitted in this batch') + help='A file that lists all of the library IDs that will be submitted in this batch') parser.add_argument("biosampleFile", - help="""A file with two columns and a header: sample and BioSample. + help="""A file with two columns and a header: sample and BioSample. This file may refer to samples that are not included in this submission.""") parser.add_argument("md5_fname", - help="""A file with two columns and no header. Two columns are MD5 checksum and filename. + help="""A file with two columns and no header. Two columns are MD5 checksum and filename. Should contain an entry for every bam file being submitted in this batch. 
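prep_sra_table() above keys everything on library IDs of the form <sample>.<lib>, where the final dot-delimited field starts with 'l', and on "md5sum *.cleaned.bam"-style checksum lines. A tiny invented illustration of the sample-name extraction:

lib_lines = ['SampleA.l1', 'SampleB.l1']
md5_lines = ['d41d8cd98f00b204e9800998ecf8427e  SampleA.l1.cleaned.bam']   # checksum, filename
for lib in lib_lines:
    parts = lib.split('.')
    assert len(parts) > 1 and parts[-1].startswith('l')
    sample = '.'.join(parts[:-1])     # 'SampleA', then 'SampleB'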
This is typical output from "md5sum *.cleaned.bam".""") parser.add_argument("outFile", - help="Output table that contains most of the variable columns needed for SRA submission.") - util.cmd.common_args(parser, (('loglevel',None), ('version',None))) + help="Output table that contains most of the variable columns needed for SRA submission.") + util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, prep_sra_table, split_args=True) return parser + + __commands__.append(('prep_sra_table', parser_prep_sra_table)) def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/pipes/Broad_LSF/cluster-submitter.py b/pipes/Broad_LSF/cluster-submitter.py index c1541ca0c..f11d34da2 100755 --- a/pipes/Broad_LSF/cluster-submitter.py +++ b/pipes/Broad_LSF/cluster-submitter.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 -import os, sys, re +import os +import sys +import re from snakemake.utils import read_job_properties LOGDIR = sys.argv[-2] @@ -13,21 +15,19 @@ jobname = "{rule}-{jobid}".format(rule=props["rule"], jobid=sm_jobid) if props["params"].get("logid"): jobname = "{rule}-{id}".format(rule=props["rule"], id=props["params"]["logid"]) -cmdline = "bsub -P {proj_name} -J {jobname} -r ".format( - proj_name='viral_ngs', jobname=jobname) +cmdline = "bsub -P {proj_name} -J {jobname} -r ".format(proj_name='viral_ngs', jobname=jobname) # log file output -if "-N" not in props["params"].get("LSF",""): - cmdline += "-oo {logdir}/LSF-{jobname}.txt ".format( - logdir=LOGDIR, jobname=jobname) +if "-N" not in props["params"].get("LSF", ""): + cmdline += "-oo {logdir}/LSF-{jobname}.txt ".format(logdir=LOGDIR, jobname=jobname) # pass memory resource request to LSF -mem = props.get('resources',{}).get('mem') +mem = props.get('resources', {}).get('mem') if mem: - cmdline += '-R "rusage[mem={}]" -M {} '.format(mem, 2*int(mem)) + cmdline += '-R "rusage[mem={}]" -M {} '.format(mem, 2 * int(mem)) # rule-specific LSF parameters (e.g. 
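To make the string concatenation above concrete, here is the bsub line a job would end up with for an invented rule and a mem resource of 4 (the -M kill limit is set to twice the rusage reservation, exactly as in the submitter; job name and log path are made up):

mem = 4
cmdline = "bsub -P viral_ngs -J align-17 -r "                     # jobname invented
cmdline += "-oo logs/LSF-align-17.txt "                           # log directory invented
cmdline += '-R "rusage[mem={}]" -M {} '.format(mem, 2 * int(mem))
print(cmdline)
# bsub -P viral_ngs -J align-17 -r -oo logs/LSF-align-17.txt -R "rusage[mem=4]" -M 8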
queue, runtime) -cmdline += props["params"].get("LSF","") + " " +cmdline += props["params"].get("LSF", "") + " " # figure out job dependencies dependencies = set(sys.argv[1:-2]) diff --git a/pipes/Broad_LSF/lsf-report.py b/pipes/Broad_LSF/lsf-report.py index d345cf3e0..bbb9c1451 100755 --- a/pipes/Broad_LSF/lsf-report.py +++ b/pipes/Broad_LSF/lsf-report.py @@ -5,10 +5,16 @@ __author__ = "dpark@broadinstitute.org" -import argparse, re, time, os, os.path, sys +import argparse +import re +import time +import os +import os.path +import sys + def read_lsf_logfile(infname): - out = {'logfile':infname} + out = {'logfile': infname} num_dash_lines = 0 with open(infname, 'rU', encoding='latin-1') as inf: for line in inf: @@ -42,18 +48,18 @@ def read_lsf_logfile(infname): line = line.split(':')[0] out['status'] = line elif ':' in line: - k,v = [s.strip() for s in line.split(':')] - out[k]=v + k, v = [s.strip() for s in line.split(':')] + out[k] = v if 'start_time' in out and 'end_time' in out: out['run_time'] = time.mktime(time.strptime(out['end_time'])) \ - time.mktime(time.strptime(out['start_time'])) return out + def read_all_logfiles(dirname): - header = ['job_id', 'job_name', 'job_prefix', 'job_suffix', 'queue', 'exec_host', - 'status', 'run_time', 'start_time', 'end_time', - 'CPU time', 'Max Memory', 'Max Swap', 'Max Processes', 'Max Threads', - 'logfile'] + header = ['job_id', 'job_name', 'job_prefix', 'job_suffix', 'queue', 'exec_host', 'status', 'run_time', + 'start_time', 'end_time', 'CPU time', 'Max Memory', 'Max Swap', 'Max Processes', 'Max Threads', 'logfile' + ] yield header for fname in os.listdir(dirname): try: @@ -61,25 +67,28 @@ def read_all_logfiles(dirname): except: print("Error parsing " + fname) raise - yield [str(row.get(h,'')) for h in header] + yield [str(row.get(h, '')) for h in header] + def parser_report(): parser = argparse.ArgumentParser( - description = "Read a directory full of LSF log files and produce a tabular report.") + description="Read a directory full of LSF log files and produce a tabular report.") parser.add_argument("logDir", help="Input directory of LSF log files") parser.add_argument("outFile", help="Output report file") return parser + def main_report(args): with open(args.outFile, 'wt') as outf: for row in read_all_logfiles(args.logDir): - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') return 0 + if __name__ == '__main__': argv = sys.argv[1:] parser = parser_report() - if len(argv)==0: + if len(argv) == 0: parser.print_help() else: args = parser.parse_args(argv) diff --git a/pipes/Broad_UGER/cluster-submitter.py b/pipes/Broad_UGER/cluster-submitter.py index 945386ee3..7abc61ac2 100755 --- a/pipes/Broad_UGER/cluster-submitter.py +++ b/pipes/Broad_UGER/cluster-submitter.py @@ -1,5 +1,7 @@ #!/usr/bin/env python3 -import os, sys, re +import os +import sys +import re from snakemake.utils import read_job_properties LOGDIR = sys.argv[-2] @@ -13,24 +15,22 @@ jobname = "{rule}-{jobid}".format(rule=props["rule"], jobid=sm_jobid) if props["params"].get("logid"): jobname = "{rule}-{id}".format(rule=props["rule"], id=props["params"]["logid"]) -cmdline = "qsub -P {proj_name} -N {jobname} -cwd -r y ".format( - proj_name='sabeti_lab', jobname=jobname) +cmdline = "qsub -P {proj_name} -N {jobname} -cwd -r y ".format(proj_name='sabeti_lab', jobname=jobname) # log file output -cmdline += "-o {logdir} -e {logdir} ".format( - logdir=LOGDIR, jobname=jobname) +cmdline += "-o {logdir} -e {logdir} ".format(logdir=LOGDIR, jobname=jobname) # pass memory 
resource request to cluster -mem = props.get('resources',{}).get('mem') +mem = props.get('resources', {}).get('mem') if mem: - cmdline += ' -l m_mem_free={}G,h_rss={}G '.format( mem, round(1.2*float(int(mem)),2) ) + cmdline += ' -l m_mem_free={}G,h_rss={}G '.format(mem, round(1.2 * float(int(mem)), 2)) -cores = props.get('resources',{}).get('cores') +cores = props.get('resources', {}).get('cores') if cores: - cmdLine += ' -pe smp {} '.format( int(cores) ) + cmdLine += ' -pe smp {} '.format(int(cores)) # rule-specific UGER parameters (e.g. queue) -cmdline += props["params"].get("UGER","") + " " +cmdline += props["params"].get("UGER", "") + " " # figure out job dependencies dependencies = set(sys.argv[1:-2]) diff --git a/read_utils.py b/read_utils.py index dd004e374..b8361ce96 100755 --- a/read_utils.py +++ b/read_utils.py @@ -8,214 +8,277 @@ __author__ = "irwin@broadinstitute.org, dpark@broadinstitute.org" __commands__ = [] -import argparse, logging, math, os, tempfile, shutil, subprocess +import argparse +import logging +import math +import os +import tempfile +import shutil +import subprocess from Bio import SeqIO -import util.cmd, util.file +import util.cmd +import util.file from util.file import mkstempfname -import tools.picard, tools.samtools, tools.mvicuna, tools.prinseq -import tools.novoalign, tools.gatk +import tools.picard +import tools.samtools +import tools.mvicuna +import tools.prinseq +import tools.novoalign +import tools.gatk log = logging.getLogger(__name__) - # ======================= # *** purge_unmated *** # ======================= -def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex='^@(\S+)/[1|2]$') : + +def purge_unmated(inFastq1, inFastq2, outFastq1, outFastq2, regex='^@(\S+)/[1|2]$'): '''Use mergeShuffledFastqSeqs to purge unmated reads, and put corresponding reads in the same order. Corresponding sequences must have sequence identifiers of the form SEQID/1 and SEQID/2. 
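The UGER branch above does the analogous thing with qsub: memory maps to m_mem_free/h_rss limits padded to 1.2x, and cores map to a -pe smp request. Values below are invented; note that the patch's '-pe smp' line assigns to cmdLine (capital L), a different name from the cmdline string the rest of that script builds.

mem, cores = 4, 2
cmdline = "qsub -P sabeti_lab -N align-17 -cwd -r y "             # jobname invented
cmdline += ' -l m_mem_free={}G,h_rss={}G '.format(mem, round(1.2 * float(int(mem)), 2))
cmdline += ' -pe smp {} '.format(int(cores))
# qsub -P sabeti_lab -N align-17 -cwd -r y  -l m_mem_free=4G,h_rss=4.8G  -pe smp 2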
''' tempOutput = mkstempfname() - mergeShuffledFastqSeqsPath = os.path.join(util.file.get_scripts_path(), - 'mergeShuffledFastqSeqs.pl') - cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', regex, - '-f1', inFastq1, '-f2', inFastq2, '-o', tempOutput] + mergeShuffledFastqSeqsPath = os.path.join(util.file.get_scripts_path(), 'mergeShuffledFastqSeqs.pl') + cmdline = [mergeShuffledFastqSeqsPath, '-t', '-r', regex, '-f1', inFastq1, '-f2', inFastq2, '-o', tempOutput] log.debug(' '.join(cmdline)) subprocess.check_call(cmdline) shutil.move(tempOutput + '.1.fastq', outFastq1) shutil.move(tempOutput + '.2.fastq', outFastq2) return 0 + def parser_purge_unmated(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq1', - help='Input fastq file; 1st end of paired-end reads.') - parser.add_argument('inFastq2', - help='Input fastq file; 2nd end of paired-end reads.') - parser.add_argument('outFastq1', - help='Output fastq file; 1st end of paired-end reads.') - parser.add_argument('outFastq2', - help='Output fastq file; 2nd end of paired-end reads.') + parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.') + parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.') + parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.') + parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.') parser.add_argument("--regex", - help="Perl regular expression to parse paired read IDs (default: %(default)s)", - default='^@(\S+)/[1|2]$') + help="Perl regular expression to parse paired read IDs (default: %(default)s)", + default='^@(\S+)/[1|2]$') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, purge_unmated, split_args=True) return parser + + __commands__.append(('purge_unmated', parser_purge_unmated)) # ========================= # *** fastq_to_fasta *** # ========================= -def fastq_to_fasta(inFastq, outFasta) : + + +def fastq_to_fasta(inFastq, outFasta): ''' Convert from fastq format to fasta format. Warning: output reads might be split onto multiple lines. ''' - + # Do this with biopython rather than prinseq, because if the latter fails # it doesn't return an error. (On the other hand, prinseq # can guarantee that output lines are not split...) 
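purge_unmated() above delegates the pairing to mergeShuffledFastqSeqs.pl, with the read-pair relationship expressed only through the --regex option. A quick demonstration of the default pattern on an invented old-style read ID:

import re
regex = r'^@(\S+)/[1|2]$'
rid1 = '@M01234:55:000000000-A1B2C:1:1101:12345:6789/1'
rid2 = '@M01234:55:000000000-A1B2C:1:1101:12345:6789/2'
print(re.match(regex, rid1).group(1) == re.match(regex, rid2).group(1))   # True
# both mates yield the same captured ID, so the pair is kept and emitted in matching order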
- inFile = util.file.open_or_gzopen(inFastq) + inFile = util.file.open_or_gzopen(inFastq) outFile = util.file.open_or_gzopen(outFasta, 'w') - for rec in SeqIO.parse(inFile, 'fastq') : + for rec in SeqIO.parse(inFile, 'fastq'): SeqIO.write([rec], outFile, 'fasta') outFile.close() return 0 + + def parser_fastq_to_fasta(parser=argparse.ArgumentParser()): parser.add_argument('inFastq', help='Input fastq file.') parser.add_argument('outFasta', help='Output fasta file.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, fastq_to_fasta, split_args=True) return parser + + __commands__.append(('fastq_to_fasta', parser_fastq_to_fasta)) # =============================== # *** index_fasta_samtools *** # =============================== + def parser_index_fasta_samtools(parser=argparse.ArgumentParser()): parser.add_argument('inFasta', help='Reference genome, FASTA format.') util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, main_index_fasta_samtools) return parser + + def main_index_fasta_samtools(args): '''Index a reference genome for Samtools.''' tools.samtools.SamtoolsTool().faidx(args.inFasta, overwrite=True) return 0 + + __commands__.append(('index_fasta_samtools', parser_index_fasta_samtools)) # ============================= # *** index_fasta_picard *** # ============================= + def parser_index_fasta_picard(parser=argparse.ArgumentParser()): parser.add_argument('inFasta', help='Input reference genome, FASTA format.') - parser.add_argument('--JVMmemory', default = tools.picard.CreateSequenceDictionaryTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s CreateSequenceDictionary, OPTIONNAME=value ...') + parser.add_argument('--JVMmemory', + default=tools.picard.CreateSequenceDictionaryTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s CreateSequenceDictionary, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_index_fasta_picard) return parser + + def main_index_fasta_picard(args): '''Create an index file for a reference genome suitable for Picard/GATK.''' tools.picard.CreateSequenceDictionaryTool().execute( - args.inFasta, overwrite=True, - picardOptions=args.picardOptions, JVMmemory=args.JVMmemory) + args.inFasta, + overwrite=True, + picardOptions=args.picardOptions, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('index_fasta_picard', parser_index_fasta_picard)) # ============================= # *** mkdup_picard *** # ============================= + def parser_mkdup_picard(parser=argparse.ArgumentParser()): parser.add_argument('inBams', help='Input reads, BAM format.', nargs='+') parser.add_argument('outBam', help='Output reads, BAM format.') - parser.add_argument('--outMetrics', - help='Output metrics file. Default is to dump to a temp file.', - default=None) + parser.add_argument('--outMetrics', help='Output metrics file. 
Default is to dump to a temp file.', default=None) parser.add_argument("--remove", - help="Instead of marking duplicates, remove them entirely (default: %(default)s)", - default=False, action="store_true", dest="remove") - parser.add_argument('--JVMmemory', default = tools.picard.MarkDuplicatesTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s MarkDuplicates, OPTIONNAME=value ...') + help="Instead of marking duplicates, remove them entirely (default: %(default)s)", + default=False, + action="store_true", + dest="remove") + parser.add_argument('--JVMmemory', + default=tools.picard.MarkDuplicatesTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s MarkDuplicates, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_mkdup_picard) return parser -def main_mkdup_picard(args) : + + +def main_mkdup_picard(args): '''Mark or remove duplicate reads from BAM file.''' opts = list(args.picardOptions) if args.remove: opts = ['REMOVE_DUPLICATES=true'] + opts tools.picard.MarkDuplicatesTool().execute( - args.inBams, args.outBam, args.outMetrics, - picardOptions=opts, JVMmemory=args.JVMmemory) + args.inBams, + args.outBam, + args.outMetrics, + picardOptions=opts, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('mkdup_picard', parser_mkdup_picard)) # ============================= # *** revert_bam_picard *** # ============================= + def parser_revert_bam_picard(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format.') parser.add_argument('outBam', help='Output reads, BAM format.') - parser.add_argument('--JVMmemory', default = tools.picard.RevertSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s RevertSam, OPTIONNAME=value ...') + parser.add_argument('--JVMmemory', + default=tools.picard.RevertSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s RevertSam, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_revert_bam_picard) return parser -def main_revert_bam_picard(args) : + + +def main_revert_bam_picard(args): '''Revert BAM to raw reads''' opts = list(args.picardOptions) - tools.picard.RevertSamTool().execute( - args.inBam, args.outBam, - picardOptions=opts, JVMmemory=args.JVMmemory) + tools.picard.RevertSamTool().execute(args.inBam, args.outBam, picardOptions=opts, JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('revert_bam_picard', parser_revert_bam_picard)) # ========================= # *** generic picard *** # ========================= + def parser_picard(parser=argparse.ArgumentParser()): parser.add_argument('command', help='picard command') - parser.add_argument('--JVMmemory', default = tools.picard.PicardTools.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard, OPTIONNAME=value ...') + 
parser.add_argument('--JVMmemory', + default=tools.picard.PicardTools.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_picard) return parser -def main_picard(args) : + + +def main_picard(args): '''Generic Picard runner.''' - tools.picard.PicardTools().execute(args.command, - picardOptions=args.picardOptions, JVMmemory=args.JVMmemory) + tools.picard.PicardTools().execute(args.command, picardOptions=args.picardOptions, JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('picard', parser_picard)) # =================== # *** sort_bam *** # =================== + def parser_sort_bam(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', help='Input bam file.') - parser.add_argument('outBam', help='Output bam file, sorted.') + parser.add_argument('inBam', help='Input bam file.') + parser.add_argument('outBam', help='Output bam file, sorted.') parser.add_argument('sortOrder', - help='How to sort the reads. [default: %(default)s]', - choices = tools.picard.SortSamTool.valid_sort_orders, - default = tools.picard.SortSamTool.default_sort_order) + help='How to sort the reads. [default: %(default)s]', + choices=tools.picard.SortSamTool.valid_sort_orders, + default=tools.picard.SortSamTool.default_sort_order) parser.add_argument("--index", - help="Index outBam (default: %(default)s)", - default=False, action="store_true", dest="index") + help="Index outBam (default: %(default)s)", + default=False, + action="store_true", + dest="index") parser.add_argument("--md5", - help="MD5 checksum outBam (default: %(default)s)", - default=False, action="store_true", dest="md5") - parser.add_argument('--JVMmemory', default = tools.picard.SortSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s SortSam, OPTIONNAME=value ...') + help="MD5 checksum outBam (default: %(default)s)", + default=False, + action="store_true", + dest="md5") + parser.add_argument('--JVMmemory', + default=tools.picard.SortSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s SortSam, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_sort_bam) return parser -def main_sort_bam(args) : + + +def main_sort_bam(args): '''Sort BAM file''' opts = list(args.picardOptions) if args.index: @@ -223,147 +286,185 @@ def main_sort_bam(args) : if args.md5: opts = ['CREATE_MD5_FILE=true'] + opts tools.picard.SortSamTool().execute( - args.inBam, args.outBam, args.sortOrder, - picardOptions=opts, JVMmemory=args.JVMmemory) + args.inBam, + args.outBam, + args.sortOrder, + picardOptions=opts, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('sort_bam', parser_sort_bam)) # ==================== # *** merge_bams *** # ==================== + def parser_merge_bams(parser=argparse.ArgumentParser()): - parser.add_argument('inBams', help='Input bam files.', nargs='+') - parser.add_argument('outBam', help='Output bam file.') - parser.add_argument('--JVMmemory', default = tools.picard.MergeSamFilesTool.jvmMemDefault, - 
help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s MergeSamFiles, OPTIONNAME=value ...') + parser.add_argument('inBams', help='Input bam files.', nargs='+') + parser.add_argument('outBam', help='Output bam file.') + parser.add_argument('--JVMmemory', + default=tools.picard.MergeSamFilesTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s MergeSamFiles, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_merge_bams) return parser -def main_merge_bams(args) : + + +def main_merge_bams(args): '''Merge multiple BAMs into one''' opts = list(args.picardOptions) + ['USE_THREADING=true'] - tools.picard.MergeSamFilesTool().execute( - args.inBams, args.outBam, - picardOptions=opts, JVMmemory=args.JVMmemory) + tools.picard.MergeSamFilesTool().execute(args.inBams, args.outBam, picardOptions=opts, JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('merge_bams', parser_merge_bams)) # ==================== # *** filter_bam *** # ==================== + def parser_filter_bam(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', help='Input bam file.') - parser.add_argument('readList', help='Input file of read IDs.') - parser.add_argument('outBam', help='Output bam file.') + parser.add_argument('inBam', help='Input bam file.') + parser.add_argument('readList', help='Input file of read IDs.') + parser.add_argument('outBam', help='Output bam file.') parser.add_argument("--exclude", - help="""If specified, readList is a list of reads to remove from input. + help="""If specified, readList is a list of reads to remove from input. 
Default behavior is to treat readList as an inclusion list (all unnamed reads are removed).""", - default=False, action="store_true", dest="exclude") - parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s FilterSamReads, OPTIONNAME=value ...') + default=False, + action="store_true", + dest="exclude") + parser.add_argument('--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s FilterSamReads, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_filter_bam) return parser -def main_filter_bam(args) : + + +def main_filter_bam(args): '''Filter BAM file by read name''' tools.picard.FilterSamReadsTool().execute( - args.inBam, args.exclude, args.readList, args.outBam, - picardOptions=args.picardOptions, JVMmemory=args.JVMmemory) + args.inBam, + args.exclude, + args.readList, + args.outBam, + picardOptions=args.picardOptions, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('filter_bam', parser_filter_bam)) # ======================= # *** bam_to_fastq *** # ======================= -def bam_to_fastq(inBam, outFastq1, outFastq2, outHeader = None, - JVMmemory = tools.picard.SamToFastqTool.jvmMemDefault, picardOptions = []) : + + +def bam_to_fastq(inBam, outFastq1, outFastq2, outHeader=None, + JVMmemory=tools.picard.SamToFastqTool.jvmMemDefault, picardOptions=[]): ''' Convert a bam file to a pair of fastq paired-end read files and optional text header. 
''' - tools.picard.SamToFastqTool().execute(inBam, outFastq1, outFastq2, - picardOptions=picardOptions, JVMmemory=JVMmemory) - if outHeader : + tools.picard.SamToFastqTool().execute(inBam, + outFastq1, + outFastq2, + picardOptions=picardOptions, + JVMmemory=JVMmemory) + if outHeader: tools.samtools.SamtoolsTool().dumpHeader(inBam, outHeader) return 0 + + def parser_bam_to_fastq(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input bam file.') - parser.add_argument('outFastq1', - help='Output fastq file; 1st end of paired-end reads.') - parser.add_argument('outFastq2', - help='Output fastq file; 2nd end of paired-end reads.') - parser.add_argument('--outHeader', - help='Optional text file name that will receive bam header.', - default=None) - parser.add_argument('--JVMmemory', default = tools.picard.SamToFastqTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='Optional arguments to Picard\'s SamToFastq, OPTIONNAME=value ...') + parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.') + parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.') + parser.add_argument('--outHeader', help='Optional text file name that will receive bam header.', default=None) + parser.add_argument('--JVMmemory', + default=tools.picard.SamToFastqTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='Optional arguments to Picard\'s SamToFastq, OPTIONNAME=value ...') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, bam_to_fastq, split_args=True) return parser + + __commands__.append(('bam_to_fastq', parser_bam_to_fastq)) # ======================= # *** fastq_to_bam *** # ======================= -def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName = None, header = None, - JVMmemory = tools.picard.FastqToSamTool.jvmMemDefault, picardOptions = []) : + +def fastq_to_bam(inFastq1, inFastq2, outBam, sampleName=None, header=None, + JVMmemory=tools.picard.FastqToSamTool.jvmMemDefault, picardOptions=[]): ''' Convert a pair of fastq paired-end read files and optional text header to a single bam file. ''' - if header : + if header: fastqToSamOut = mkstempfname('.bam') - else : + else: fastqToSamOut = outBam - if sampleName == None : - sampleName = 'Dummy' # Will get overwritten by rehead command - if header : + if sampleName is None: + sampleName = 'Dummy' # Will get overwritten by rehead command + if header: # With the header option, rehead will be called after FastqToSam. # This will invalidate any md5 file, which would be a slow to construct # on our own, so just disallow and let the caller run md5sum if desired. 
- if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() - for opt in picardOptions) : - raise Exception('CREATE_MD5_FILE is not allowed with --header.') + if any(opt.lower() == 'CREATE_MD5_FILE=True'.lower() for opt in picardOptions): + raise Exception("""CREATE_MD5_FILE is not allowed with '--header.'""") tools.picard.FastqToSamTool().execute( - inFastq1, inFastq2, sampleName, fastqToSamOut, - picardOptions=picardOptions, JVMmemory=JVMmemory) - - if header : + inFastq1, + inFastq2, + sampleName, + fastqToSamOut, + picardOptions=picardOptions, + JVMmemory=JVMmemory) + + if header: tools.samtools.SamtoolsTool().reheader(fastqToSamOut, header, outBam) - + return 0 + def parser_fastq_to_bam(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq1', - help='Input fastq file; 1st end of paired-end reads.') - parser.add_argument('inFastq2', - help='Input fastq file; 2nd end of paired-end reads.') + parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.') + parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.') parser.add_argument('outBam', help='Output bam file.') - headerGroup = parser.add_mutually_exclusive_group(required = True) - headerGroup.add_argument('--sampleName', - help='Sample name to insert into the read group header.') - headerGroup.add_argument('--header', - help='Optional text file containing header.') - parser.add_argument('--JVMmemory', default = tools.picard.FastqToSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--picardOptions', default = [], nargs='*', - help='''Optional arguments to Picard\'s FastqToSam, - OPTIONNAME=value ... Note that header-related options will be + headerGroup = parser.add_mutually_exclusive_group(required=True) + headerGroup.add_argument('--sampleName', help='Sample name to insert into the read group header.') + headerGroup.add_argument('--header', help='Optional text file containing header.') + parser.add_argument('--JVMmemory', + default=tools.picard.FastqToSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--picardOptions', + default=[], + nargs='*', + help='''Optional arguments to Picard\'s FastqToSam, + OPTIONNAME=value ... Note that header-related options will be overwritten by HEADER if present.''') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, fastq_to_bam, split_args=True) return parser -__commands__.append(('fastq_to_bam', parser_fastq_to_bam)) +__commands__.append(('fastq_to_bam', parser_fastq_to_bam)) + # ====================== # *** split_reads *** # ====================== @@ -371,78 +472,89 @@ def parser_fastq_to_bam(parser=argparse.ArgumentParser()): defaultMaxReads = 1000 defaultFormat = 'fastq' -def split_reads(inFileName, outPrefix, outSuffix = "", - maxReads = None, numChunks = None, - indexLen = defaultIndexLen, format = defaultFormat) : - '''Split fasta or fastq file into chunks of maxReads reads or into + +def split_reads(inFileName, outPrefix, outSuffix="", + maxReads=None, numChunks=None, + indexLen=defaultIndexLen, format=defaultFormat): + '''Split fasta or fastq file into chunks of maxReads reads or into numChunks chunks named outPrefix01, outPrefix02, etc. If both maxReads and numChunks are None, use defaultMaxReads. The number of characters in file names after outPrefix is indexLen; if not specified, use defaultIndexLen. 
''' - if maxReads == None : - if numChunks == None : + if maxReads is None: + if numChunks is None: maxReads = defaultMaxReads - else : - with util.file.open_or_gzopen(inFileName, 'rt') as inFile : + else: + with util.file.open_or_gzopen(inFileName, 'rt') as inFile: totalReadCount = 0 - for rec in SeqIO.parse(inFile, format) : + for rec in SeqIO.parse(inFile, format): totalReadCount += 1 maxReads = int(totalReadCount / numChunks + 0.5) - with util.file.open_or_gzopen(inFileName, 'rt') as inFile : + with util.file.open_or_gzopen(inFileName, 'rt') as inFile: readsWritten = 0 curIndex = 0 outFile = None - for rec in SeqIO.parse(inFile, format) : - if outFile == None : + for rec in SeqIO.parse(inFile, format): + if outFile is None: indexstring = "%0" + str(indexLen) + "d" - outFileName = outPrefix + (indexstring % (curIndex+1)) + outSuffix + outFileName = outPrefix + (indexstring % (curIndex + 1)) + outSuffix outFile = util.file.open_or_gzopen(outFileName, 'wt') SeqIO.write([rec], outFile, format) readsWritten += 1 - if readsWritten == maxReads : + if readsWritten == maxReads: outFile.close() outFile = None readsWritten = 0 curIndex += 1 - if outFile != None : + if outFile is not None: outFile.close() - + return 0 + def parser_split_reads(parser=argparse.ArgumentParser()): parser.add_argument('inFileName', help='Input fastq or fasta file.') parser.add_argument('outPrefix', - help='Output files will be named ${outPrefix}01${outSuffix}, ${outPrefix}02${outSuffix}...') - group = parser.add_mutually_exclusive_group(required = False) - group.add_argument('--maxReads', type = int, default=None, - help = '''Maximum number of reads per chunk (default {:d} if neither + help='Output files will be named ${outPrefix}01${outSuffix}, ${outPrefix}02${outSuffix}...') + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--maxReads', + type=int, + default=None, + help='''Maximum number of reads per chunk (default {:d} if neither maxReads nor numChunks is specified).'''.format(defaultMaxReads)) - group.add_argument('--numChunks', type = int, default=None, - help = 'Number of output files, if maxReads is not specified.') - parser.add_argument('--indexLen', type = int, default = defaultIndexLen, - help = '''Number of characters to append to outputPrefix for each + group.add_argument('--numChunks', + type=int, + default=None, + help='Number of output files, if maxReads is not specified.') + parser.add_argument('--indexLen', + type=int, + default=defaultIndexLen, + help='''Number of characters to append to outputPrefix for each output file (default %(default)s). Number of files must not exceed 10^INDEXLEN.''') - parser.add_argument('--format', choices = ['fastq', 'fasta'], - default = defaultFormat, - help='Input fastq or fasta file (default: %(default)s).') + parser.add_argument('--format', + choices=['fastq', 'fasta'], + default=defaultFormat, + help='Input fastq or fasta file (default: %(default)s).') parser.add_argument('--outSuffix', - default = '', - help = '''Output filename suffix (e.g. .fastq or .fastq.gz). + default='', + help='''Output filename suffix (e.g. .fastq or .fastq.gz). A suffix ending in .gz will cause the output file to be gzip compressed. Default is no suffix.''') util.cmd.attach_main(parser, split_reads, split_args=True) return parser + + __commands__.append(('split_reads', parser_split_reads)) -def split_bam(inBam, outBams) : +def split_bam(inBam, outBams): '''Split BAM file equally into several output BAM files. 
     '''
     samtools = tools.samtools.SamtoolsTool()
     picard = tools.picard.PicardTools()
-    
+
     # get totalReadCount and maxReads
     # maxReads = totalReadCount / num files, but round up to the nearest
     # even number in order to keep read pairs together (assuming the input
@@ -450,26 +562,25 @@ def split_bam(inBam, outBams) :
     # accomplished by Picard RevertSam with SANITIZE=true)
     totalReadCount = samtools.count(inBam)
     maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
-    log.info("splitting %d reads into %d files of %d reads each",
-        totalReadCount, len(outBams), maxReads)
-    
+    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)
+
     # load BAM header into memory
     header = samtools.getHeader(inBam)
     if 'SO:queryname' not in header[0]:
         raise Exception('Input BAM file must be sorted in queryname order')
-    
+
     # dump to bigsam
     bigsam = mkstempfname('.sam')
     samtools.view([], inBam, bigsam)
-    
+
     # split bigsam into little ones
     with util.file.open_or_gzopen(bigsam, 'rt') as inf:
         for outBam in outBams:
-            log.info("preparing file "+outBam)
+            log.info("preparing file " + outBam)
             tmp_sam_reads = mkstempfname('.sam')
             with open(tmp_sam_reads, 'wt') as outf:
                 for row in header:
-                    outf.write('\t'.join(row)+'\n')
+                    outf.write('\t'.join(row) + '\n')
                 for i in range(maxReads):
                     line = inf.readline()
                     if not line:
@@ -478,30 +589,36 @@ def split_bam(inBam, outBams) :
                 if outBam == outBams[-1]:
                     for line in inf:
                         outf.write(line)
-            picard.execute("SamFormatConverter", [
-                'INPUT='+tmp_sam_reads, 'OUTPUT='+outBam,
-                'VERBOSITY=WARNING'], JVMmemory='512m')
+            picard.execute("SamFormatConverter",
+                           [
+                               'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
+                           ],
+                           JVMmemory='512m')
             os.unlink(tmp_sam_reads)
     os.unlink(bigsam)
+
+
 def parser_split_bam(parser=argparse.ArgumentParser()):
     parser.add_argument('inBam', help='Input BAM file.')
     parser.add_argument('outBams', nargs='+', help='Output BAM files')
     util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None)))
     util.cmd.attach_main(parser, split_bam, split_args=True)
     return parser
-__commands__.append(('split_bam', parser_split_bam))
+__commands__.append(('split_bam', parser_split_bam))
+
 # ============================
 # *** dup_remove_mvicuna ***
 # ============================
+
 def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
     # Run M-Vicuna on FASTQ files
     outFastq1 = mkstempfname('.1.fastq')
     outFastq2 = mkstempfname('.2.fastq')
     tools.mvicuna.MvicunaTool().rmdup((inFastq1, inFastq2), (outFastq1, outFastq2), None)
-    
+
     # Make a list of reads to keep
     with open(readList, 'at') as outf:
         for fq in (outFastq1, outFastq2):
@@ -511,98 +628,97 @@ def mvicuna_fastqs_to_readlist(inFastq1, inFastq2, readList):
             if (line_num % 4) == 0:
                 id = line.rstrip('\n')[1:]
                 if id.endswith('/1'):
-                    outf.write(id[:-2]+'\n')
+                    outf.write(id[:-2] + '\n')
             line_num += 1
     os.unlink(outFastq1)
     os.unlink(outFastq2)
+
 def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
     ''' Remove duplicate reads from BAM file using M-Vicuna.
         The primary advantage to this approach over Picard's MarkDuplicates tool
         is that Picard requires that input reads are aligned to a reference,
         and M-Vicuna can operate on unaligned reads.
     '''
-    
+
     # Convert BAM -> FASTQ pairs per read group and load all read groups
     tempDir = tempfile.mkdtemp()
-    tools.picard.SamToFastqTool().per_read_group(inBam, tempDir,
-        picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
-    read_groups = [x[1:] for x in
-        tools.samtools.SamtoolsTool().getHeader(inBam)
-        if x[0]=='@RG']
-    read_groups = [dict(pair.split(':',1) for pair in rg) for rg in read_groups]
-    
+    tools.picard.SamToFastqTool().per_read_group(inBam, tempDir, picardOptions=['VALIDATION_STRINGENCY=LENIENT'])
+    read_groups = [x[1:] for x in tools.samtools.SamtoolsTool().getHeader(inBam) if x[0] == '@RG']
+    read_groups = [dict(pair.split(':', 1) for pair in rg) for rg in read_groups]
+
    # Collect FASTQ pairs for each library
     lb_to_files = {}
     for rg in read_groups:
-        lb_to_files.setdefault(rg.get('LB','none'), set())
+        lb_to_files.setdefault(rg.get('LB', 'none'), set())
         fname = rg['ID']
         if 'PU' in rg:
             fname = rg['PU']
-        lb_to_files[rg.get('LB','none')].add(os.path.join(tempDir, fname))
+        lb_to_files[rg.get('LB', 'none')].add(os.path.join(tempDir, fname))
     log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))
-    
+
     # For each library, merge FASTQs and run rmdup for entire library
     readList = mkstempfname('.keep_reads.txt')
     for lb, files in lb_to_files.items():
         log.info("executing M-Vicuna DupRm on library " + lb)
-        
+
         # create merged FASTQs per library
         infastqs = (mkstempfname('.1.fastq'), mkstempfname('.2.fastq'))
         for d in range(2):
             with open(infastqs[d], 'wt') as outf:
                 for fprefix in files:
-                    fn = '%s_%d.fastq' % (fprefix, d+1)
+                    fn = '%s_%d.fastq' % (fprefix, d + 1)
                     if os.path.isfile(fn):
                         with open(fn, 'rt') as inf:
                             for line in inf:
                                 outf.write(line)
                         os.unlink(fn)
                     else:
-                        log.warn("no reads found in %s, assuming that's because there's no reads in that read group", fn)
-        
+                        log.warn("no reads found in %s, assuming that's because "
+                                 "there are no reads in that read group", fn)
+
         # M-Vicuna DupRm to see what we should keep (append IDs to running file)
         mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
         for fn in infastqs:
             os.unlink(fn)
-    
+
     # Filter original input BAM against keep-list
     tools.picard.FilterSamReadsTool().execute(inBam, False, readList, outBam, JVMmemory=JVMmemory)
     return 0
+
 def parser_rmdup_mvicuna_bam(parser=argparse.ArgumentParser()):
     parser.add_argument('inBam', help='Input reads, BAM format.')
     parser.add_argument('outBam', help='Output reads, BAM format.')
-    parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault,
-        help='JVM virtual memory size (default: %(default)s)')
+    parser.add_argument('--JVMmemory',
+                        default=tools.picard.FilterSamReadsTool.jvmMemDefault,
+                        help='JVM virtual memory size (default: %(default)s)')
     util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None)))
     util.cmd.attach_main(parser, rmdup_mvicuna_bam, split_args=True)
     return parser
+
+
 __commands__.append(('rmdup_mvicuna_bam', parser_rmdup_mvicuna_bam))
 def parser_dup_remove_mvicuna(parser=argparse.ArgumentParser()):
-    parser.add_argument('inFastq1',
-        help='Input fastq file; 1st end of paired-end reads.')
-    parser.add_argument('inFastq2',
-        help='Input fastq file; 2nd end of paired-end reads.')
-    parser.add_argument('pairedOutFastq1',
-        help='Output fastq file; 1st end of paired-end reads.')
-    parser.add_argument('pairedOutFastq2',
-        help='Output fastq file; 2nd end of paired-end reads.')
-    parser.add_argument('--unpairedOutFastq',
-        default=None,
-        help='File name of output unpaired reads')
+
parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.') + parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.') + parser.add_argument('pairedOutFastq1', help='Output fastq file; 1st end of paired-end reads.') + parser.add_argument('pairedOutFastq2', help='Output fastq file; 2nd end of paired-end reads.') + parser.add_argument('--unpairedOutFastq', default=None, help='File name of output unpaired reads') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_dup_remove_mvicuna) return parser + + def main_dup_remove_mvicuna(args): '''Run mvicuna's duplicate removal operation on paired-end reads.''' tools.mvicuna.MvicunaTool().rmdup( - (args.inFastq1, args.inFastq2), - (args.pairedOutFastq1, args.pairedOutFastq2), - args.unpairedOutFastq) + (args.inFastq1, args.inFastq2), (args.pairedOutFastq1, args.pairedOutFastq2), args.unpairedOutFastq) return 0 + + __commands__.append(('dup_remove_mvicuna', parser_dup_remove_mvicuna)) @@ -617,35 +733,32 @@ def rmdup_prinseq_fastq(inFastq, outFastq): shutil.copyfile(inFastq, outFastq) else: cmd = [ - 'perl', tools.prinseq.PrinseqTool().install_and_get_path(), - '-ns_max_n', '1', - '-derep', '1', - '-fastq', inFastq, - '-out_bad', 'null', - '-line_width', '0', - '-out_good', outFastq[:-6] - ] + 'perl', tools.prinseq.PrinseqTool().install_and_get_path(), '-ns_max_n', '1', '-derep', '1', '-fastq', + inFastq, '-out_bad', 'null', '-line_width', '0', '-out_good', outFastq[:-6] + ] log.debug(' '.join(cmd)) subprocess.check_call(cmd) + + def parser_rmdup_prinseq_fastq(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq1', - help='Input fastq file; 1st end of paired-end reads.') - parser.add_argument('inFastq2', - help='Input fastq file; 2nd end of paired-end reads.') - parser.add_argument('outFastq1', - help='Output fastq file; 1st end of paired-end reads.') - parser.add_argument('outFastq2', - help='Output fastq file; 2nd end of paired-end reads.') + parser.add_argument('inFastq1', help='Input fastq file; 1st end of paired-end reads.') + parser.add_argument('inFastq2', help='Input fastq file; 2nd end of paired-end reads.') + parser.add_argument('outFastq1', help='Output fastq file; 1st end of paired-end reads.') + parser.add_argument('outFastq2', help='Output fastq file; 2nd end of paired-end reads.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_rmdup_prinseq_fastq) return parser + + def main_rmdup_prinseq_fastq(args): - ''' Run prinseq-lite's duplicate removal operation on paired-end + ''' Run prinseq-lite's duplicate removal operation on paired-end reads. Also removes reads with more than one N. ''' rmdup_prinseq_fastq(args.inFastq1, args.outFastq1) rmdup_prinseq_fastq(args.inFastq2, args.outFastq2) return 0 + + __commands__.append(('rmdup_prinseq_fastq', parser_rmdup_prinseq_fastq)) @@ -654,103 +767,127 @@ def filter_bam_mapped_only(inBam, outBam): aligned (-F 4) with a non-zero mapping quality (-q 1) and are not marked as a PCR/optical duplicate (-F 1024). 
''' - tools.samtools.SamtoolsTool().view( - ['-b', '-q', '1', '-F', '1028'], inBam, outBam) + tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'], inBam, outBam) tools.picard.BuildBamIndexTool().execute(outBam) return 0 + + def parser_filter_bam_mapped_only(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input aligned reads, BAM format.') - parser.add_argument('outBam', - help='Output sorted indexed reads, filtered to aligned-only, BAM format.') + parser.add_argument('inBam', help='Input aligned reads, BAM format.') + parser.add_argument('outBam', help='Output sorted indexed reads, filtered to aligned-only, BAM format.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, filter_bam_mapped_only, split_args=True) return parser -__commands__.append(('filter_bam_mapped_only', parser_filter_bam_mapped_only)) +__commands__.append(('filter_bam_mapped_only', parser_filter_bam_mapped_only)) + # ======= Novoalign ======== -def parser_novoalign(parser=argparse.ArgumentParser()) : + +def parser_novoalign(parser=argparse.ArgumentParser()): parser.add_argument('inBam', help='Input reads, BAM format.') parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Novoindex.') parser.add_argument('outBam', help='Output reads, BAM format (aligned).') - parser.add_argument('--options', default = '-r Random', - help='Novoalign options (default: %(default)s)') - parser.add_argument('--min_qual', default = 0, - help='Filter outBam to minimum mapping quality (default: %(default)s)') - parser.add_argument('--JVMmemory', default = tools.picard.SortSamTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--options', default='-r Random', help='Novoalign options (default: %(default)s)') + parser.add_argument('--min_qual', + default=0, + help='Filter outBam to minimum mapping quality (default: %(default)s)') + parser.add_argument('--JVMmemory', + default=tools.picard.SortSamTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_novoalign) return parser + + def main_novoalign(args): '''Align reads with Novoalign. 
Sort and index BAM output.''' - tools.novoalign.NovoalignTool().execute(args.inBam, args.refFasta, args.outBam, - options=args.options.split(), min_qual=args.min_qual, JVMmemory=args.JVMmemory) + tools.novoalign.NovoalignTool().execute( + args.inBam, + args.refFasta, + args.outBam, + options=args.options.split(), + min_qual=args.min_qual, + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('novoalign', parser_novoalign)) + def parser_novoindex(parser=argparse.ArgumentParser()): parser.add_argument('refFasta', help='Reference genome, FASTA format.') util.cmd.common_args(parser, (('loglevel', None), ('version', None))) util.cmd.attach_main(parser, tools.novoalign.NovoalignTool().index_fasta, split_args=True) return parser -__commands__.append(('novoindex', parser_novoindex)) +__commands__.append(('novoindex', parser_novoindex)) + # ========= GATK ========== + def parser_gatk_ug(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input reads, BAM format.') - parser.add_argument('refFasta', - help='Reference genome, FASTA format, pre-indexed by Picard.') + parser.add_argument('inBam', help='Input reads, BAM format.') + parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.') parser.add_argument('outVcf', - help='''Output calls in VCF format. If this filename ends with .gz, + help='''Output calls in VCF format. If this filename ends with .gz, GATK will BGZIP compress the output and produce a Tabix index file as well.''') parser.add_argument('--options', - default = '--min_base_quality_score 15 -ploidy 4', - help='UnifiedGenotyper options (default: %(default)s)') - parser.add_argument('--JVMmemory', default = tools.gatk.GATKTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + default='--min_base_quality_score 15 -ploidy 4', + help='UnifiedGenotyper options (default: %(default)s)') + parser.add_argument('--JVMmemory', + default=tools.gatk.GATKTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_gatk_ug) return parser + + def main_gatk_ug(args): '''Call genotypes using the GATK UnifiedGenotyper.''' - tools.gatk.GATKTool().ug(args.inBam, args.refFasta, args.outVcf, - options=args.options.split(), JVMmemory=args.JVMmemory) + tools.gatk.GATKTool().ug(args.inBam, + args.refFasta, + args.outVcf, + options=args.options.split(), + JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('gatk_ug', parser_gatk_ug)) + def parser_gatk_realign(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input reads, BAM format, aligned to refFasta.') - parser.add_argument('refFasta', - help='Reference genome, FASTA format, pre-indexed by Picard.') - parser.add_argument('outBam', - help='Realigned reads.') - parser.add_argument('--JVMmemory', default = tools.gatk.GATKTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('inBam', help='Input reads, BAM format, aligned to refFasta.') + parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard.') + parser.add_argument('outBam', help='Realigned reads.') + parser.add_argument('--JVMmemory', + default=tools.gatk.GATKTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_gatk_realign) - 
parser.add_argument('--threads', - default=1, - help='Number of threads (default: %(default)s)') + parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)') return parser + + def main_gatk_realign(args): '''Local realignment of BAM files with GATK IndelRealigner.''' tools.gatk.GATKTool().local_realign( - args.inBam, args.refFasta, args.outBam, JVMmemory=args.JVMmemory, threads=args.threads) + args.inBam, + args.refFasta, + args.outBam, + JVMmemory=args.JVMmemory, + threads=args.threads) return 0 -__commands__.append(('gatk_realign', parser_gatk_realign)) +__commands__.append(('gatk_realign', parser_gatk_realign)) + # ========================= def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, - novoalign_options='', JVMmemory=None, threads=1): + novoalign_options='', JVMmemory=None, threads=1): ''' Take reads, align to reference with Novoalign, mark duplicates with Picard, realign indels with GATK, and optionally filter final file to mapped/non-dupe reads. @@ -758,60 +895,63 @@ def align_and_fix(inBam, refFasta, outBamAll=None, outBamFiltered=None, if not (outBamAll or outBamFiltered): log.warn("are you sure you meant to do nothing?") return - + bam_aligned = mkstempfname('.aligned.bam') tools.novoalign.NovoalignTool().execute( - inBam, refFasta, bam_aligned, - options=novoalign_options.split(), JVMmemory=JVMmemory) - + inBam, + refFasta, + bam_aligned, + options=novoalign_options.split(), + JVMmemory=JVMmemory) + bam_mkdup = mkstempfname('.mkdup.bam') tools.picard.MarkDuplicatesTool().execute( - [bam_aligned], bam_mkdup, - picardOptions=['CREATE_INDEX=true'], JVMmemory=JVMmemory) + [bam_aligned], + bam_mkdup, + picardOptions=['CREATE_INDEX=true'], + JVMmemory=JVMmemory) os.unlink(bam_aligned) - + bam_realigned = mkstempfname('.realigned.bam') - tools.gatk.GATKTool().local_realign( - bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory, threads=threads) + tools.gatk.GATKTool().local_realign(bam_mkdup, refFasta, bam_realigned, JVMmemory=JVMmemory, threads=threads) os.unlink(bam_mkdup) - + if outBamAll: shutil.copyfile(bam_realigned, outBamAll) tools.picard.BuildBamIndexTool().execute(outBamAll) if outBamFiltered: - tools.samtools.SamtoolsTool().view( - ['-b', '-q', '1', '-F', '1028'], - bam_realigned, outBamFiltered) + tools.samtools.SamtoolsTool().view(['-b', '-q', '1', '-F', '1028'], bam_realigned, outBamFiltered) tools.picard.BuildBamIndexTool().execute(outBamFiltered) os.unlink(bam_realigned) - + def parser_align_and_fix(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input unaligned reads, BAM format.') - parser.add_argument('refFasta', - help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.') - parser.add_argument('--outBamAll', default = None, - help='''Aligned, sorted, and indexed reads. Unmapped reads are + parser.add_argument('inBam', help='Input unaligned reads, BAM format.') + parser.add_argument('refFasta', help='Reference genome, FASTA format, pre-indexed by Picard and Novoalign.') + parser.add_argument('--outBamAll', + default=None, + help='''Aligned, sorted, and indexed reads. Unmapped reads are retained and duplicate reads are marked, not removed.''') - parser.add_argument('--outBamFiltered', default = None, - help='''Aligned, sorted, and indexed reads. Unmapped reads and + parser.add_argument('--outBamFiltered', + default=None, + help='''Aligned, sorted, and indexed reads. 
Unmapped reads and duplicate reads are removed from this file.''') - parser.add_argument('--novoalign_options', default = '-r Random', - help='Novoalign options (default: %(default)s)') - parser.add_argument('--JVMmemory', default = '4g', - help='JVM virtual memory size (default: %(default)s)') - parser.add_argument('--threads', - default=1, - help='Number of threads (default: %(default)s)') + parser.add_argument('--novoalign_options', default='-r Random', help='Novoalign options (default: %(default)s)') + parser.add_argument('--JVMmemory', default='4g', help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--threads', default=1, help='Number of threads (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, align_and_fix, split_args=True) return parser + + __commands__.append(('align_and_fix', parser_align_and_fix)) # ========================= + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/reports.py b/reports.py index 9e8a121d2..6aebb7d70 100755 --- a/reports.py +++ b/reports.py @@ -5,9 +5,17 @@ __author__ = "dpark@broadinstitute.org" __commands__ = [] -import argparse, logging, subprocess, glob, os, time -import pysam, Bio.SeqIO -import util.cmd, util.file, util.misc +import argparse +import logging +import subprocess +import glob +import os +import time +import pysam +import Bio.SeqIO +import util.cmd +import util.file +import util.misc import tools.samtools import assembly from util.stats import mean, median @@ -16,35 +24,45 @@ def get_assembly_stats(sample, - cov_thresholds=(1,5,20,100), - assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly', - align_dir='data/02_align_to_self', reads_dir='data/01_per_sample', - raw_reads_dir='data/00_raw'): + cov_thresholds=(1, 5, 20, 100), + assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly', + align_dir='data/02_align_to_self', reads_dir='data/01_per_sample', + raw_reads_dir='data/00_raw'): ''' Fetch assembly-level statistics for a given sample ''' - out = {'sample':sample} + out = {'sample': sample} samtools = tools.samtools.SamtoolsTool() - header = ['sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt', - 'assembled_trinity', 'trinity_in_reads', - 'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig', - 'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup', 'aln2self_pct_nondup', - 'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0', - ] + ['aln2self_cov_%dX'%t for t in cov_thresholds] - + header = ['sample', + 'reads_raw', + 'reads_cleaned', + 'reads_taxfilt', + 'assembled_trinity', + 'trinity_in_reads', + 'n_contigs', + 'contig_len', + 'unambig_bases', + 'pct_unambig', + 'aln2self_reads_tot', + 'aln2self_reads_aln', + 'aln2self_reads_rmdup', + 'aln2self_pct_nondup', + 'aln2self_cov_median', + 'aln2self_cov_mean', + 'aln2self_cov_mean_non0',] + ['aln2self_cov_%dX' % t for t in cov_thresholds] + # per-sample unaligned read stats for adj in ('cleaned', 'taxfilt'): reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam'))) if os.path.isfile(reads_bam): - out['reads_'+adj] = samtools.count(reads_bam) - out['reads_raw'] = sum(samtools.count(bam) - for bam in glob.glob(os.path.join(raw_reads_dir, sample+"*.bam"))) - + out['reads_' + adj] = samtools.count(reads_bam) + out['reads_raw'] = sum(samtools.count(bam) for bam in glob.glob(os.path.join(raw_reads_dir, 
sample + "*.bam"))) + # pre-assembly stats - out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp, - sample + '.assembly1-trinity.fasta')) and 1 or 0 + out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp, sample + + '.assembly1-trinity.fasta')) and 1 or 0 sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam') if os.path.isfile(sub_bam): - out['trinity_in_reads'] = samtools.count(sub_bam) - + out['trinity_in_reads'] = samtools.count(sub_bam) + # assembly stats assembly_fname = os.path.join(assembly_dir, sample + '.fasta') if not os.path.isfile(assembly_fname): @@ -53,14 +71,12 @@ def get_assembly_stats(sample, out['n_contigs'] = 0 return (header, out) with open(assembly_fname, 'rt') as inf: - counts = [(len(s), assembly.unambig_count(s.seq)) - for s in Bio.SeqIO.parse(inf, 'fasta') - if len(s)>0] + counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0] out['n_contigs'] = len(counts) - out['contig_len'] = ','.join(str(x) for x,y in counts) - out['unambig_bases'] = ','.join(str(y) for x,y in counts) - out['pct_unambig'] = ','.join(str(float(y)/x) for x,y in counts) - + out['contig_len'] = ','.join(str(x) for x, y in counts) + out['unambig_bases'] = ','.join(str(y) for x, y in counts) + out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts) + # read counts from align-to-self bam_fname = os.path.join(align_dir, sample + '.bam') if not os.path.isfile(bam_fname): @@ -70,63 +86,74 @@ def get_assembly_stats(sample, out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028']) if out['aln2self_reads_aln']: out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln'] - + # genome coverage stats bam_fname = os.path.join(align_dir, sample + '.mapped.bam') with pysam.AlignmentFile(bam_fname, 'rb') as bam: coverages = list([pcol.nsegments for pcol in bam.pileup()]) out['aln2self_cov_median'] = median(coverages) - out['aln2self_cov_mean'] = "%0.3f"%mean(coverages) - out['aln2self_cov_mean_non0'] = "%0.3f"%mean([n for n in coverages if n>0]) + out['aln2self_cov_mean'] = "%0.3f" % mean(coverages) + out['aln2self_cov_mean_non0'] = "%0.3f" % mean([n for n in coverages if n > 0]) for thresh in cov_thresholds: - out['aln2self_cov_%dX'%thresh] = sum(1 for n in coverages if n>=thresh) - + out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh) + return (header, out) -def assembly_stats(samples, outFile, - cov_thresholds, assembly_dir, assembly_tmp, align_dir, - reads_dir, raw_reads_dir): + +def assembly_stats(samples, outFile, cov_thresholds, assembly_dir, assembly_tmp, align_dir, reads_dir, raw_reads_dir): ''' Fetch assembly-level statistics for a given sample ''' header_written = False with open(outFile, 'wt') as outf: for sample in samples: - log.info("fetching stats on "+sample) + log.info("fetching stats on " + sample) header, out = get_assembly_stats(sample, - cov_thresholds=cov_thresholds, assembly_dir=assembly_dir, - assembly_tmp=assembly_tmp, align_dir=align_dir, - reads_dir=reads_dir, raw_reads_dir=raw_reads_dir) + cov_thresholds=cov_thresholds, + assembly_dir=assembly_dir, + assembly_tmp=assembly_tmp, + align_dir=align_dir, + reads_dir=reads_dir, + raw_reads_dir=raw_reads_dir) if not header_written: - outf.write('\t'.join(map(str, header))+'\n') + outf.write('\t'.join(map(str, header)) + '\n') header_written = True - outf.write('\t'.join([str(out.get(h,'')) for h in header])+'\n') + outf.write('\t'.join([str(out.get(h, '')) for h in 
header]) + '\n') outf.flush() + + def parser_assembly_stats(parser=argparse.ArgumentParser()): parser.add_argument('samples', nargs='+', help='Sample names.') parser.add_argument('outFile', help='Output report file.') - parser.add_argument('--cov_thresholds', nargs='+', type=int, - default=(1,5,20,100), - help='Genome coverage thresholds to report on. (default: %(default)s)') - parser.add_argument('--assembly_dir', default='data/02_assembly', - help='Directory with assembly outputs. (default: %(default)s)') - parser.add_argument('--assembly_tmp', default='tmp/02_assembly', - help='Directory with assembly temp files. (default: %(default)s)') - parser.add_argument('--align_dir', default='data/02_align_to_self', - help='Directory with reads aligned to own assembly. (default: %(default)s)') - parser.add_argument('--reads_dir', default='data/01_per_sample', - help='Directory with unaligned filtered read BAMs. (default: %(default)s)') - parser.add_argument('--raw_reads_dir', default='data/00_raw', - help='Directory with unaligned raw read BAMs. (default: %(default)s)') + parser.add_argument('--cov_thresholds', + nargs='+', + type=int, + default=(1, 5, 20, 100), + help='Genome coverage thresholds to report on. (default: %(default)s)') + parser.add_argument('--assembly_dir', + default='data/02_assembly', + help='Directory with assembly outputs. (default: %(default)s)') + parser.add_argument('--assembly_tmp', + default='tmp/02_assembly', + help='Directory with assembly temp files. (default: %(default)s)') + parser.add_argument('--align_dir', + default='data/02_align_to_self', + help='Directory with reads aligned to own assembly. (default: %(default)s)') + parser.add_argument('--reads_dir', + default='data/01_per_sample', + help='Directory with unaligned filtered read BAMs. (default: %(default)s)') + parser.add_argument('--raw_reads_dir', + default='data/00_raw', + help='Directory with unaligned raw read BAMs. 
(default: %(default)s)') util.cmd.attach_main(parser, assembly_stats, split_args=True) return parser -__commands__.append(('assembly_stats', parser_assembly_stats)) +__commands__.append(('assembly_stats', parser_assembly_stats)) + def get_refalign_stats(sample): pass - def consolidate_fastqc(inDirs, outFile): '''Consolidate multiple FASTQC reports into one.''' with util.file.open_or_gzopen(outFile, 'wt') as outf: @@ -136,22 +163,26 @@ def consolidate_fastqc(inDirs, outFile): out = {} with open(os.path.join(sdir, 'summary.txt'), 'rt') as inf: for line in inf: - v,k,fn = line.strip().split('\t') + v, k, fn = line.strip().split('\t') out[k] = v - if out_n==0: + if out_n == 0: header.append(k) if not fn.endswith('.bam'): raise out['Sample'] = fn[:-len('.bam')] - if out_n==0: - outf.write('\t'.join(header)+'\n') - outf.write('\t'.join([out.get(h,'') for h in header])+'\n') + if out_n == 0: + outf.write('\t'.join(header) + '\n') + outf.write('\t'.join([out.get(h, '') for h in header]) + '\n') out_n += 1 + + def parser_consolidate_fastqc(parser=argparse.ArgumentParser()): parser.add_argument('inDirs', help='Input FASTQC directories.', nargs='+') parser.add_argument('outFile', help='Output report file.') util.cmd.attach_main(parser, consolidate_fastqc, split_args=True) return parser + + __commands__.append(('consolidate_fastqc', parser_consolidate_fastqc)) @@ -161,12 +192,13 @@ def get_bam_info(bamstats_dir): with util.file.open_or_gzopen(fn, 'rt') as inf: bam = {} for line in inf: - k,v = line.rstrip('\n').split('\t') + k, v = line.rstrip('\n').split('\t') bam[k] = v libs.setdefault(bam['Sample'], {}) libs[bam['Sample']][bam['BAM']] = bam['Total reads'] return libs + def get_lib_info(runfile): libs = {} for lane in util.file.read_tabfile_dict(runfile): @@ -177,23 +209,18 @@ def get_lib_info(runfile): if plate.lower().startswith('plate'): plate = plate[5:] well_id = well['Well'][0].upper() + "%02d" % int(well['Well'][1:]) - dat = [well['sample'], - lane['flowcell']+'.'+lane['lane'], - well['barcode_1']+'-'+well['barcode_2'], - plate.strip()+':'+well_id, - get_earliest_date(lane['bustard_dir']), - well.get('Tube_ID','')] + dat = [well['sample'], lane['flowcell'] + '.' 
+ lane['lane'], well['barcode_1'] + '-' + well['barcode_2'], + plate.strip() + ':' + well_id, get_earliest_date(lane['bustard_dir']), well.get('Tube_ID', '')] libs[libname].append(dat) return libs + def get_earliest_date(inDir): fnames = [inDir] + [os.path.join(inDir, x) for x in os.listdir(inDir)] earliest = min(os.path.getmtime(fn) for fn in fnames) return time.strftime("%Y-%m-%d", time.localtime(earliest)) - - def consolidate_spike_count(inDir, outFile): '''Consolidate multiple spike count reports into one.''' with open(outFile, 'wt') as outf: @@ -207,19 +234,24 @@ def consolidate_spike_count(inDir, outFile): for line in inf: if not line.startswith('Input bam'): spike, count = line.strip().split('\t') - outf.write('\t'.join([s, spike, count])+'\n') + outf.write('\t'.join([s, spike, count]) + '\n') + + def parser_consolidate_spike_count(parser=argparse.ArgumentParser()): parser.add_argument('inDir', help='Input spike count directory.') parser.add_argument('outFile', help='Output report file.') util.cmd.attach_main(parser, consolidate_spike_count, split_args=True) return parser -__commands__.append(('consolidate_spike_count', parser_consolidate_spike_count)) +__commands__.append(('consolidate_spike_count', parser_consolidate_spike_count)) # ======================= + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/taxon_filter.py b/taxon_filter.py index b658502ac..d9da128ac 100755 --- a/taxon_filter.py +++ b/taxon_filter.py @@ -4,14 +4,25 @@ ''' __author__ = "dpark@broadinstitute.org, irwin@broadinstitute.org," \ - + "hlevitin@broadinstitute.org" + + "hlevitin@broadinstitute.org" __commands__ = [] -import argparse, logging, subprocess, os, tempfile, shutil +import argparse +import logging +import subprocess +import os +import tempfile +import shutil from Bio import SeqIO -import util.cmd, util.file -import tools, tools.blast -import tools.last, tools.prinseq, tools.trimmomatic, tools.bmtagger, tools.picard +import util.cmd +import util.file +import tools +import tools.blast +import tools.last +import tools.prinseq +import tools.trimmomatic +import tools.bmtagger +import tools.picard from util.file import mkstempfname from util.misc import batch_iterator import read_utils @@ -22,90 +33,93 @@ # *** deplete_human *** # ======================= + def parser_deplete_human(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input BAM file.') - parser.add_argument('revertBam', - help='Output BAM: read markup reverted with Picard.') - parser.add_argument('bmtaggerBam', - help='Output BAM: depleted of human reads with BMTagger.') - parser.add_argument('rmdupBam', - help='Output BAM: bmtaggerBam run through M-Vicuna duplicate removal.') + parser.add_argument('inBam', help='Input BAM file.') + parser.add_argument('revertBam', help='Output BAM: read markup reverted with Picard.') + parser.add_argument('bmtaggerBam', help='Output BAM: depleted of human reads with BMTagger.') + parser.add_argument('rmdupBam', help='Output BAM: bmtaggerBam run through M-Vicuna duplicate removal.') parser.add_argument('blastnBam', - help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.') + help='Output BAM: rmdupBam run through another depletion of human reads with BLASTN.') parser.add_argument('--taxfiltBam', - help='Output BAM: blastnBam run through taxonomic selection via LASTAL.', - default=None) - parser.add_argument('--bmtaggerDbs', nargs='+', 
required=True, - help='''Reference databases (one or more) to deplete from input. + help='Output BAM: blastnBam run through taxonomic selection via LASTAL.', + default=None) + parser.add_argument('--bmtaggerDbs', + nargs='+', + required=True, + help='''Reference databases (one or more) to deplete from input. For each db, requires prior creation of db.bitmask by bmtool, and db.srprism.idx, db.srprism.map, etc. by srprism mkindex.''') - parser.add_argument('--blastDbs', nargs='+', required=True, - help='One or more reference databases for blast to deplete from input.') + parser.add_argument('--blastDbs', + nargs='+', + required=True, + help='One or more reference databases for blast to deplete from input.') parser.add_argument('--lastDb', - help='One reference database for last (required if --taxfiltBam is specified).', - default=None) - parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size for Picard FilterSamReads (default: %(default)s)') + help='One reference database for last (required if --taxfiltBam is specified).', + default=None) + parser.add_argument('--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size for Picard FilterSamReads (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_deplete_human) return parser + + def main_deplete_human(args): ''' Run the entire depletion pipeline: bmtagger, mvicuna, blastn. Optionally, use lastal to select a specific taxon of interest.''' - tools.picard.RevertSamTool().execute(args.inBam, args.revertBam, - picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']) - multi_db_deplete_bam(args.revertBam, args.bmtaggerDbs, deplete_bmtagger_bam, args.bmtaggerBam, JVMmemory=args.JVMmemory) + tools.picard.RevertSamTool().execute(args.inBam, + args.revertBam, + picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true']) + multi_db_deplete_bam( + args.revertBam, + args.bmtaggerDbs, + deplete_bmtagger_bam, + args.bmtaggerBam, + JVMmemory=args.JVMmemory) read_utils.rmdup_mvicuna_bam(args.bmtaggerBam, args.rmdupBam, JVMmemory=args.JVMmemory) multi_db_deplete_bam(args.rmdupBam, args.blastDbs, deplete_blastn_bam, args.blastnBam, JVMmemory=args.JVMmemory) if args.taxfiltBam and args.lastDb: filter_lastal_bam(args.blastnBam, args.lastDb, args.taxfiltBam, JVMmemory=args.JVMmemory) return 0 -__commands__.append(('deplete_human', parser_deplete_human)) +__commands__.append(('deplete_human', parser_deplete_human)) + # ========================== # *** trim_trimmomatic *** # ========================== -def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, - clipFasta): + +def trimmomatic(inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta): '''Trim read sequences with Trimmomatic.''' trimmomaticPath = tools.trimmomatic.TrimmomaticTool().install_and_get_path() tmpUnpaired1 = mkstempfname() tmpUnpaired2 = mkstempfname() # This java program has a lot of argments... 
- javaCmd = ['java', '-Xmx2g', - '-Djava.io.tmpdir='+tempfile.tempdir, - '-classpath', - trimmomaticPath, - 'org.usadellab.trimmomatic.TrimmomaticPE', - inFastq1, - inFastq2, - pairedOutFastq1, - tmpUnpaired1, - pairedOutFastq2, - tmpUnpaired2, - 'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', - 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta) - ] + javaCmd = ['java', '-Xmx2g', '-Djava.io.tmpdir=' + tempfile.tempdir, '-classpath', trimmomaticPath, + 'org.usadellab.trimmomatic.TrimmomaticPE', inFastq1, inFastq2, pairedOutFastq1, tmpUnpaired1, + pairedOutFastq2, tmpUnpaired2, 'LEADING:20', 'TRAILING:20', 'SLIDINGWINDOW:4:25', 'MINLEN:30', + 'ILLUMINACLIP:{}:2:30:12'.format(clipFasta)] log.debug(' '.join(javaCmd)) subprocess.check_call(javaCmd) os.unlink(tmpUnpaired1) os.unlink(tmpUnpaired2) + def parser_trim_trimmomatic(parser=argparse.ArgumentParser()): - parser.add_argument("inFastq1", help = "Input reads 1") - parser.add_argument("inFastq2", help = "Input reads 2") - parser.add_argument("pairedOutFastq1", help = "Paired output 1") - parser.add_argument("pairedOutFastq2", help = "Paired output 2") - parser.add_argument("clipFasta", - help = "Fasta file with adapters, PCR sequences, etc. to clip off") + parser.add_argument("inFastq1", help="Input reads 1") + parser.add_argument("inFastq2", help="Input reads 2") + parser.add_argument("pairedOutFastq1", help="Paired output 1") + parser.add_argument("pairedOutFastq2", help="Paired output 2") + parser.add_argument("clipFasta", help="Fasta file with adapters, PCR sequences, etc. to clip off") util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, trimmomatic, split_args=True) return parser + + __commands__.append(('trim_trimmomatic', parser_trim_trimmomatic)) # ======================= @@ -117,9 +131,8 @@ def lastal_get_hits(inFastq, db, outList): lastalPath = tools.last.Lastal().install_and_get_path() mafSortPath = tools.last.MafSort().install_and_get_path() mafConvertPath = tools.last.MafConvert().install_and_get_path() - noBlastLikeHitsPath = os.path.join( util.file.get_scripts_path(), - 'noBlastLikeHits.py') - + noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py') + lastalOut = mkstempfname('.lastal') with open(lastalOut, 'wt') as outf: cmd = [lastalPath, '-Q1', db, inFastq] @@ -127,7 +140,7 @@ def lastal_get_hits(inFastq, db, outList): subprocess.check_call(cmd, stdout=outf) # everything below this point in this method should be replaced with # our own code that just reads lastal output and makes a list of read names - + mafSortOut = mkstempfname('.mafsort') with open(mafSortOut, 'wt') as outf: with open(lastalOut, 'rt') as inf: @@ -135,21 +148,21 @@ def lastal_get_hits(inFastq, db, outList): log.debug('cat ' + lastalOut + ' | ' + ' '.join(cmd) + ' > ' + mafSortOut) subprocess.check_call(cmd, stdin=inf, stdout=outf) os.unlink(lastalOut) - + mafConvertOut = mkstempfname('.mafconvert') with open(mafConvertOut, 'wt') as outf: cmd = [mafConvertPath, 'tab', mafSortOut] log.debug(' '.join(cmd) + ' > ' + mafConvertOut) subprocess.check_call(cmd, stdout=outf) os.unlink(mafSortOut) - + filteredFastq = mkstempfname('.filtered.fastq') with open(filteredFastq, 'wt') as outf: cmd = [noBlastLikeHitsPath, '-b', mafConvertOut, '-r', inFastq, '-m', 'hit'] log.debug(' '.join(cmd) + ' > ' + filteredFastq) subprocess.check_call(cmd, stdout=outf) os.unlink(mafConvertOut) - + with open(outList, 'wt') as outf: with open(filteredFastq, 'rt') as inf: line_num = 
0 @@ -158,9 +171,10 @@ def lastal_get_hits(inFastq, db, outList): id = line.rstrip('\n\r')[1:] if id.endswith('/1') or id.endswith('/2'): id = id[:-2] - outf.write(id+'\n') + outf.write(id + '\n') line_num += 1 + def filter_lastal_bam(inBam, db, outBam, JVMmemory=None): ''' Restrict input reads to those that align to the given reference database using LASTAL. @@ -169,7 +183,7 @@ def filter_lastal_bam(inBam, db, outBam, JVMmemory=None): inReads1 = mkstempfname('.1.fastq') inReads2 = mkstempfname('.2.fastq') tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2) - + # look for hits in inReads1 and inReads2 hitList1 = mkstempfname('.1.hits') hitList2 = mkstempfname('.2.hits') @@ -177,28 +191,34 @@ def filter_lastal_bam(inBam, db, outBam, JVMmemory=None): os.unlink(inReads1) lastal_get_hits(inReads2, db, hitList2) os.unlink(inReads2) - + # merge hits hitList = mkstempfname('.hits') with open(hitList, 'wt') as outf: subprocess.check_call(['sort', '-u', hitList1, hitList2], stdout=outf) os.unlink(hitList1) os.unlink(hitList2) - + # filter original BAM file against keep list tools.picard.FilterSamReadsTool().execute(inBam, False, hitList, outBam, JVMmemory=JVMmemory) os.unlink(hitList) + + def parser_filter_lastal_bam(parser=argparse.ArgumentParser()): - parser.add_argument("inBam", help="Input reads") - parser.add_argument("db", help="Database of taxa we keep") + parser.add_argument("inBam", help="Input reads") + parser.add_argument("db", help="Database of taxa we keep") parser.add_argument("outBam", help="Output reads, filtered to refDb") - parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, filter_lastal_bam, split_args=True) return parser + + __commands__.append(('filter_lastal_bam', parser_filter_lastal_bam)) + def filter_lastal(inFastq, refDb, outFastq): ''' Restrict input reads to those that align to the given reference database using LASTAL. Also, remove duplicates with prinseq. 
@@ -209,8 +229,7 @@ def filter_lastal(inFastq, refDb, outFastq): mafSortPath = tools.last.MafSort().install_and_get_path() mafConvertPath = tools.last.MafConvert().install_and_get_path() prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path() - noBlastLikeHitsPath = os.path.join( util.file.get_scripts_path(), - 'noBlastLikeHits.py') + noBlastLikeHitsPath = os.path.join(util.file.get_scripts_path(), 'noBlastLikeHits.py') # each pipe separated cmd gets own line # unfortunately, it doesn't seem to work to do .format(**locals()) on the @@ -219,18 +238,17 @@ def filter_lastal(inFastq, refDb, outFastq): '{lastalPath} -Q1 {refDb} {inFastq}'.format(**locals()), '| {mafSortPath} -n2'.format(**locals()), '| {mafConvertPath} tab /dev/stdin > {tempFilePath}'.format(**locals()), - ]) + ]) log.debug(lastalCmd) assert not os.system(lastalCmd) # filter inFastq against lastal hits filteredFastq = mkstempfname('.filtered.fastq') with open(filteredFastq, 'wt') as outf: - noBlastLikeHitsCmd = [ - noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit'] + noBlastLikeHitsCmd = [noBlastLikeHitsPath, '-b', tempFilePath, '-r', inFastq, '-m', 'hit'] log.debug(' '.join(noBlastLikeHitsCmd) + ' > ' + filteredFastq) subprocess.check_call(noBlastLikeHitsCmd, stdout=outf) - + # remove duplicate reads and reads with multiple Ns if os.path.getsize(filteredFastq) == 0: # prinseq-lite fails on empty file input (which can happen in real life @@ -239,34 +257,31 @@ def filter_lastal(inFastq, refDb, outFastq): shutil.copyfile(filteredFastq, outFastq) else: prinseqCmd = [ - 'perl', prinseqPath, - '-ns_max_n', '1', - '-derep', '1', - '-fastq', filteredFastq, - '-out_bad', 'null', - '-line_width', '0', - '-out_good', outFastq[:-6] - ] + 'perl', prinseqPath, '-ns_max_n', '1', '-derep', '1', '-fastq', filteredFastq, '-out_bad', 'null', + '-line_width', '0', '-out_good', outFastq[:-6] + ] log.debug(' '.join(prinseqCmd)) subprocess.check_call(prinseqCmd) os.unlink(filteredFastq) + def parser_filter_lastal(parser=argparse.ArgumentParser()): parser.add_argument("inFastq", help="Input fastq file") - parser.add_argument("refDb", - help="Reference database to retain from input") - parser.add_argument("outFastq", help = "Output fastq file") + parser.add_argument("refDb", help="Reference database to retain from input") + parser.add_argument("outFastq", help="Output fastq file") util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, filter_lastal, split_args=True) return parser -__commands__.append(('filter_lastal', parser_filter_lastal)) +__commands__.append(('filter_lastal', parser_filter_lastal)) + # ============================ # *** partition_bmtagger *** # ============================ -def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None) : + +def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None): """ Use bmtagger to partition the input reads into ones that match at least one of the databases and ones that don't match any of the databases. @@ -277,7 +292,7 @@ def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None) : outBam: the output BAM files to hold the unmatched reads. 
""" bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path() - + # bmtagger calls several executables in the same directory, and blastn; # make sure they are accessible through $PATH blastnPath = tools.blast.BlastnTool().install_and_get_path() @@ -288,24 +303,23 @@ def deplete_bmtagger_bam(inBam, db, outBam, JVMmemory=None) : path = [d] + path path = os.pathsep.join(path) os.environ['PATH'] = path - + inReads1 = mkstempfname('.1.fastq') inReads2 = mkstempfname('.2.fastq') tools.picard.SamToFastqTool().execute(inBam, inReads1, inReads2) - + tempDir = tempfile.mkdtemp() matchesFile = mkstempfname('.txt') - cmdline = [bmtaggerPath, - '-b', db+'.bitmask', '-x', db+'.srprism', '-T', tempDir, - '-q1', '-1', inReads1, '-2', inReads2, - '-o', matchesFile] + cmdline = [bmtaggerPath, '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', inReads1, '-2', + inReads2, '-o', matchesFile] log.debug(' '.join(cmdline)) subprocess.check_call(cmdline) - + tools.picard.FilterSamReadsTool().execute(inBam, True, matchesFile, outBam, JVMmemory=JVMmemory) os.unlink(matchesFile) -def select_reads(inFastq, outFastq, selectorFcn) : + +def select_reads(inFastq, outFastq, selectorFcn): """ selectorFcn: Bio.SeqRecord.SeqRecord -> bool Output in outFastq all reads from inFastq for which @@ -314,14 +328,14 @@ def select_reads(inFastq, outFastq, selectorFcn) : on BAM files) which is likely much faster. This is the slowest step of partition_bmtagger currently. """ - with util.file.open_or_gzopen(inFastq, 'rt') as inFile : - with util.file.open_or_gzopen(outFastq, 'wt') as outFile : - for rec in SeqIO.parse(inFile, 'fastq') : - if selectorFcn(rec) : + with util.file.open_or_gzopen(inFastq, 'rt') as inFile: + with util.file.open_or_gzopen(outFastq, 'wt') as outFile: + for rec in SeqIO.parse(inFile, 'fastq'): + if selectorFcn(rec): SeqIO.write([rec], outFile, 'fastq') -def partition_bmtagger(inFastq1, inFastq2, databases, - outMatch = None, outNoMatch = None) : + +def partition_bmtagger(inFastq1, inFastq2, databases, outMatch=None, outNoMatch=None): """ Use bmtagger to partition the input reads into ones that match at least one of the databases and ones that don't match any of the databases. @@ -334,7 +348,7 @@ def partition_bmtagger(inFastq1, inFastq2, databases, hold the matching or unmatched reads. """ bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path() - + # bmtagger calls several executables in the same directory, and blastn; # make sure they are accessible through $PATH blastnPath = tools.blast.BlastnTool().install_and_get_path() @@ -345,56 +359,55 @@ def partition_bmtagger(inFastq1, inFastq2, databases, path = [d] + path path = os.pathsep.join(path) os.environ['PATH'] = path - + # bmtagger's list of matches strips /1 and /2 from ends of reads strip12 = lambda id : id[:-2] if id.endswith('/1') or id.endswith('/2') \ - else id - + else id + tempDir = tempfile.mkdtemp() matchesFiles = [mkstempfname() for db in databases] curReads1, curReads2 = inFastq1, inFastq2 for count, (db, matchesFile) in \ - enumerate(zip(databases, matchesFiles)) : + enumerate(zip(databases, matchesFiles)): """ Loop invariants: At the end of the kth loop, curReadsN has the original reads depleted by all matches to the first k databases, and matchesFiles[:k] contain the list of matching read names. 
""" - cmdline = [bmtaggerPath, - '-b', db+'.bitmask', '-x', db+'.srprism', '-T', tempDir, - '-q1', '-1', curReads1, '-2', curReads2, - '-o', matchesFile] + cmdline = [bmtaggerPath, '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', curReads1, + '-2', curReads2, '-o', matchesFile] log.debug(' '.join(cmdline)) subprocess.check_call(cmdline) prevReads1, prevReads2 = curReads1, curReads2 - if count < len(databases) - 1 : + if count < len(databases) - 1: curReads1, curReads2 = mkstempfname(), mkstempfname() - elif outNoMatch != None : + elif outNoMatch is not None: # Final time through loop, output depleted to requested files curReads1, curReads2 = outNoMatch[0], outNoMatch[1] - else : + else: # No need to calculate final depleted file. No one asked for it. # Technically, this violates the loop invariant ;-) break log.debug("starting select_reads") with open(matchesFile) as inf: matches = set(line.strip() for line in inf) - noMatchFcn = lambda rec : strip12(rec.id) not in matches + noMatchFcn = lambda rec: strip12(rec.id) not in matches select_reads(prevReads1, curReads1, noMatchFcn) select_reads(prevReads2, curReads2, noMatchFcn) - if outMatch != None : + if outMatch is not None: log.debug("preparing outMatch files") allMatches = set() for matchesFile in matchesFiles: with open(matchesFile) as inf: newMatches = set(line.strip() for line in inf) allMatches = allMatches.union(newMatches) - matchFcn = lambda rec : strip12(rec.id) in allMatches + matchFcn = lambda rec: strip12(rec.id) in allMatches select_reads(inFastq1, outMatch[0], matchFcn) select_reads(inFastq2, outMatch[1], matchFcn) log.debug("partition_bmtagger complete") + def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2): """ Use bmtagger to partition the input reads into ones that match at least one @@ -411,7 +424,7 @@ def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2): """ bmtaggerPath = tools.bmtagger.BmtaggerShTool().install_and_get_path() blastnPath = tools.blast.BlastnTool().install_and_get_path() - + # bmtagger calls several executables in the same directory, and blastn; # make sure they are accessible through $PATH path = os.environ['PATH'].split(os.pathsep) @@ -421,19 +434,17 @@ def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2): path = [d] + path path = os.pathsep.join(path) os.environ['PATH'] = path - + tempDir = tempfile.mkdtemp() curReads1, curReads2 = inFastq1, inFastq2 tempfiles = [] for db in databases: outprefix = mkstempfname() - cmdline = [bmtaggerPath, '-X', - '-b', db+'.bitmask', '-x', db+'.srprism', '-T', tempDir, - '-q1', '-1', curReads1, '-2', curReads2, - '-o', outprefix] + cmdline = [bmtaggerPath, '-X', '-b', db + '.bitmask', '-x', db + '.srprism', '-T', tempDir, '-q1', '-1', + curReads1, '-2', curReads2, '-o', outprefix] log.debug(' '.join(cmdline)) subprocess.check_call(cmdline) - curReads1, curReads2 = [outprefix+suffix for suffix in ('_1.fastq', '_2.fastq')] + curReads1, curReads2 = [outprefix + suffix for suffix in ('_1.fastq', '_2.fastq')] tempfiles += [curReads1, curReads2] shutil.copyfile(curReads1, outFastq1) shutil.copyfile(curReads2, outFastq2) @@ -441,27 +452,28 @@ def deplete_bmtagger(inFastq1, inFastq2, databases, outFastq1, outFastq2): os.unlink(fn) log.debug("deplete_bmtagger complete") + def parser_partition_bmtagger(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq1', - help='Input fastq file; 1st end of paired-end reads.') + parser.add_argument('inFastq1', help='Input fastq file; 1st end 
of paired-end reads.') parser.add_argument('inFastq2', - help='Input fastq file; 2nd end of paired-end reads. '\ - 'Must have same names as inFastq1') - parser.add_argument('refDbs', nargs='+', - help='''Reference databases (one or more) to deplete from input. + help='Input fastq file; 2nd end of paired-end reads. ' + 'Must have same names as inFastq1') + parser.add_argument('refDbs', + nargs='+', + help='''Reference databases (one or more) to deplete from input. For each db, requires prior creation of db.bitmask by bmtool, and db.srprism.idx, db.srprism.map, etc. by srprism mkindex. ''') - parser.add_argument('--outMatch', nargs = 2, - help='Filenames for fastq output of matching reads.') - parser.add_argument('--outNoMatch', nargs = 2, - help='Filenames for fastq output of unmatched reads.') + parser.add_argument('--outMatch', nargs=2, help='Filenames for fastq output of matching reads.') + parser.add_argument('--outNoMatch', nargs=2, help='Filenames for fastq output of unmatched reads.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_partition_bmtagger) return parser -def main_partition_bmtagger(args) : - ''' Use bmtagger to partition input reads into ones that - match at least one of several databases and ones that don't match + + +def main_partition_bmtagger(args): + ''' Use bmtagger to partition input reads into ones that + match at least one of several databases and ones that don't match any of the databases. ''' inFastq1 = args.inFastq1 @@ -471,31 +483,38 @@ def main_partition_bmtagger(args) : outNoMatch = args.outNoMatch assert outMatch or outNoMatch # comment this out until we can figure out why bmtagger -X fails only on Travis - #if outMatch==None: + # if outMatch==None: # deplete_bmtagger(inFastq1, inFastq2, databases, outNoMatch[0], outNoMatch[1]) - #else: + # else: # partition_bmtagger(inFastq1, inFastq2, databases, outMatch, outNoMatch) - #return 0 + # return 0 partition_bmtagger(inFastq1, inFastq2, databases, outMatch, outNoMatch) + + __commands__.append(('partition_bmtagger', parser_partition_bmtagger)) + def parser_deplete_bam_bmtagger(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input BAM file.') - parser.add_argument('refDbs', nargs='+', - help='''Reference databases (one or more) to deplete from input. + parser.add_argument('inBam', help='Input BAM file.') + parser.add_argument('refDbs', + nargs='+', + help='''Reference databases (one or more) to deplete from input. For each db, requires prior creation of db.bitmask by bmtool, and db.srprism.idx, db.srprism.map, etc. 
by srprism mkindex.''') - parser.add_argument('outBam', - help='Output BAM file.') - parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('outBam', help='Output BAM file.') + parser.add_argument('--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_deplete_bam_bmtagger) return parser -def main_deplete_bam_bmtagger(args) : + + +def main_deplete_bam_bmtagger(args): '''Use bmtagger to deplete input reads against several databases.''' multi_db_deplete_bam(args.inBam, args.refDbs, deplete_bmtagger_bam, args.outBam, JVMmemory=args.JVMmemory) + + __commands__.append(('deplete_bam_bmtagger', parser_deplete_bam_bmtagger)) @@ -509,11 +528,11 @@ def multi_db_deplete_bam(inBam, refDbs, deplete_method, outBam, JVMmemory=None): tmpBamIn = tmpBamOut shutil.copyfile(tmpBamIn, outBam) - # ======================== # *** deplete_blastn *** # ======================== + def blastn_chunked_fasta(fasta, db, chunkSize=1000000): """ Helper function: blastn a fasta file, overcoming apparent memory leaks on @@ -524,18 +543,16 @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000): blastnPath = tools.blast.BlastnTool().install_and_get_path() hits_files = [] - record_iter = SeqIO.parse(open(fasta, "rt"),"fasta") - for batch in batch_iterator(record_iter, chunkSize) : + record_iter = SeqIO.parse(open(fasta, "rt"), "fasta") + for batch in batch_iterator(record_iter, chunkSize): chunk_fasta = mkstempfname('.fasta') with open(chunk_fasta, "wt") as handle: SeqIO.write(batch, handle, "fasta") batch = None chunk_hits = mkstempfname('.hits.txt') - blastnCmd = [blastnPath, '-db', db, - '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6', - '-max_target_seqs', '2', - '-query', chunk_fasta, '-out', chunk_hits] + blastnCmd = [blastnPath, '-db', db, '-word_size', '16', '-evalue', '1e-6', '-outfmt', '6', '-max_target_seqs', + '2', '-query', chunk_fasta, '-out', chunk_hits] log.debug(' '.join(blastnCmd)) subprocess.check_call(blastnCmd) @@ -544,50 +561,50 @@ def blastn_chunked_fasta(fasta, db, chunkSize=1000000): return hits_files -def deplete_blastn(inFastq, outFastq, refDbs) : + +def deplete_blastn(inFastq, outFastq, refDbs): 'Use blastn to remove reads that match at least one of the databases.' 
- - ## Get tools - noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), - 'noBlastHits_v3.py') - - ## Convert to fasta + + # Get tools + noBlastHits_v3Path = os.path.join(util.file.get_scripts_path(), 'noBlastHits_v3.py') + + # Convert to fasta inFasta = mkstempfname('.fasta') read_utils.fastq_to_fasta(inFastq, inFasta) - - ## Run blastn using each of the databases in turn + + # Run blastn using each of the databases in turn blastOutFiles = [] - for db in refDbs : + for db in refDbs: log.info("running blastn on %s against %s", inFastq, db) blastOutFiles += blastn_chunked_fasta(inFasta, db) - ## Combine results from different databases + # Combine results from different databases blastOutCombined = mkstempfname('.txt') catCmd = ['cat'] + blastOutFiles log.debug(' '.join(catCmd) + '> ' + blastOutCombined) with open(blastOutCombined, 'wt') as outf: - subprocess.check_call(catCmd, stdout = outf) + subprocess.check_call(catCmd, stdout=outf) - ## run noBlastHits_v3.py to extract reads with no blast hits + # run noBlastHits_v3.py to extract reads with no blast hits # TODO: slurp the small amount of code in this script into here - noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, - '-r', inFastq, '-m', 'nohit'] + noBlastHitsCmd = ['python', noBlastHits_v3Path, '-b', blastOutCombined, '-r', inFastq, '-m', 'nohit'] log.debug(' '.join(noBlastHitsCmd) + '> ' + outFastq) - with util.file.open_or_gzopen(outFastq, 'wt') as outf : - subprocess.check_call(noBlastHitsCmd, stdout = outf) + with util.file.open_or_gzopen(outFastq, 'wt') as outf: + subprocess.check_call(noBlastHitsCmd, stdout=outf) + def parser_deplete_blastn(parser=argparse.ArgumentParser()): - parser.add_argument('inFastq', - help='Input fastq file.') - parser.add_argument('outFastq', - help='Output fastq file with matching reads removed.') - parser.add_argument('refDbs', nargs='+', - help='One or more reference databases for blast.') + parser.add_argument('inFastq', help='Input fastq file.') + parser.add_argument('outFastq', help='Output fastq file with matching reads removed.') + parser.add_argument('refDbs', nargs='+', help='One or more reference databases for blast.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, deplete_blastn, split_args=True) return parser + + __commands__.append(('deplete_blastn', parser_deplete_blastn)) + def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs): 'Use blastn to remove reads that match at least one of the databases.' 
tmpfq1_a = mkstempfname('.fastq') @@ -604,25 +621,24 @@ def deplete_blastn_paired(infq1, infq2, outfq1, outfq2, refDbs): # purge fq1 of read pairs lost in fq2 read_utils.purge_unmated(tmpfq1_b, tmpfq2_c, outfq1, outfq2) + def parser_deplete_blastn_paired(parser=argparse.ArgumentParser()): - parser.add_argument('infq1', - help='Input fastq file.') - parser.add_argument('infq2', - help='Input fastq file.') - parser.add_argument('outfq1', - help='Output fastq file with matching reads removed.') - parser.add_argument('outfq2', - help='Output fastq file with matching reads removed.') - parser.add_argument('refDbs', nargs='+', - help='One or more reference databases for blast.') + parser.add_argument('infq1', help='Input fastq file.') + parser.add_argument('infq2', help='Input fastq file.') + parser.add_argument('outfq1', help='Output fastq file with matching reads removed.') + parser.add_argument('outfq2', help='Output fastq file with matching reads removed.') + parser.add_argument('refDbs', nargs='+', help='One or more reference databases for blast.') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, deplete_blastn_paired, split_args=True) return parser + + __commands__.append(('deplete_blastn_paired', parser_deplete_blastn_paired)) + def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None): 'Use blastn to remove reads that match at least one of the databases.' - + blastnPath = tools.blast.BlastnTool().install_and_get_path() fastq1 = mkstempfname('.1.fastq') fastq2 = mkstempfname('.2.fastq') @@ -630,10 +646,10 @@ def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None): blast_hits = mkstempfname('.blast_hits.txt') halfBam = mkstempfname('.half.bam') blastOutFile = mkstempfname('.hits.txt') - + # Initial BAM -> FASTQ pair tools.picard.SamToFastqTool().execute(inBam, fastq1, fastq2) - + # Find BLAST hits against FASTQ1 read_utils.fastq_to_fasta(fastq1, fasta) os.unlink(fastq1) @@ -647,15 +663,15 @@ def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None): id = line.split('\t')[0].strip() if id.endswith('/1') or id.endswith('/2'): id = id[:-2] - outf.write(id+'\n') + outf.write(id + '\n') os.unlink(blastOutFile) - + # Deplete BAM of hits in FASTQ1 tools.picard.FilterSamReadsTool().execute(inBam, True, blast_hits, halfBam, JVMmemory=JVMmemory) - + # Depleted BAM -> FASTQ pair tools.picard.SamToFastqTool().execute(halfBam, fastq1, fastq2) - + # Find BLAST hits against FASTQ2 (which is already smaller than before) read_utils.fastq_to_fasta(fastq2, fasta) os.unlink(fastq1) @@ -669,42 +685,47 @@ def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None): id = line.split('\t')[0].strip() if id.endswith('/1') or id.endswith('/2'): id = id[:-2] - outf.write(id+'\n') + outf.write(id + '\n') os.unlink(blastOutFile) - + # Deplete BAM of hits against FASTQ2 tools.picard.FilterSamReadsTool().execute(halfBam, True, blast_hits, outBam, JVMmemory=JVMmemory) - + # Clean up for fn in (fasta, blast_hits, halfBam): os.unlink(fn) - + + def parser_deplete_blastn_bam(parser=argparse.ArgumentParser()): - parser.add_argument('inBam', - help='Input BAM file.') - parser.add_argument('refDbs', nargs='+', - help='One or more reference databases for blast.') - parser.add_argument('outBam', - help='Output BAM file with matching reads removed.') - parser.add_argument("--chunkSize", type=int, default=1000000, - help='FASTA chunk size (default: %(default)s)') - 
parser.add_argument('--JVMmemory', default = tools.picard.FilterSamReadsTool.jvmMemDefault, - help='JVM virtual memory size (default: %(default)s)') + parser.add_argument('inBam', help='Input BAM file.') + parser.add_argument('refDbs', nargs='+', help='One or more reference databases for blast.') + parser.add_argument('outBam', help='Output BAM file with matching reads removed.') + parser.add_argument("--chunkSize", type=int, default=1000000, help='FASTA chunk size (default: %(default)s)') + parser.add_argument('--JVMmemory', + default=tools.picard.FilterSamReadsTool.jvmMemDefault, + help='JVM virtual memory size (default: %(default)s)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, main_deplete_blastn_bam) return parser + + def main_deplete_blastn_bam(args): '''Use blastn to remove reads that match at least one of the specified databases.''' - def wrapper(inBam, db, outBam, JVMmemory=None) : + + def wrapper(inBam, db, outBam, JVMmemory=None): return deplete_blastn_bam(inBam, db, outBam, chunkSize=args.chunkSize, JVMmemory=JVMmemory) + multi_db_deplete_bam(args.inBam, args.refDbs, wrapper, args.outBam, JVMmemory=args.JVMmemory) return 0 + + __commands__.append(('deplete_blastn_bam', parser_deplete_blastn_bam)) - + # ======================== # *** lastal_build_db *** # ======================== + def lastal_build_db(inputFasta, outputDirectory, outputFilePrefix): if outputFilePrefix: @@ -714,27 +735,27 @@ def lastal_build_db(inputFasta, outputDirectory, outputFilePrefix): fileNameSansExtension = os.path.splitext(baseName)[0] outPrefix = fileNameSansExtension - tools.last.Lastdb().execute( - inputFasta = inputFasta, - outputDirectory = outputDirectory, - outputFilePrefix = outPrefix - ) + tools.last.Lastdb().execute(inputFasta=inputFasta, outputDirectory=outputDirectory, outputFilePrefix=outPrefix) + def parser_lastal_build_db(parser=argparse.ArgumentParser()): - parser.add_argument('inputFasta', - help='Location of the input FASTA file') - parser.add_argument('outputDirectory', - help='Location for the output files (default is cwd: %(default)s)') + parser.add_argument('inputFasta', help='Location of the input FASTA file') + parser.add_argument('outputDirectory', help='Location for the output files (default is cwd: %(default)s)') parser.add_argument('--outputFilePrefix', - help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)') + help='Prefix for the output file name (default: inputFasta name, sans ".fasta" extension)') util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None))) util.cmd.attach_main(parser, lastal_build_db, split_args=True) return parser + + __commands__.append(('lastal_build_db', parser_lastal_build_db)) # ======================== + def full_parser(): return util.cmd.make_parser(__commands__, __doc__) + + if __name__ == '__main__': util.cmd.main_argparse(__commands__, __doc__) diff --git a/test/__init__.py b/test/__init__.py index 6d1fdc2cb..155df4419 100644 --- a/test/__init__.py +++ b/test/__init__.py @@ -2,33 +2,41 @@ __author__ = "irwin@broadinstitute.org" -import filecmp, os, unittest +import filecmp +import os +import unittest import util.file -def assert_equal_contents(testCase, filename1, filename2) : + +def assert_equal_contents(testCase, filename1, filename2): 'Assert contents of two files are equal for a unittest.TestCase' testCase.assertTrue(filecmp.cmp(filename1, filename2, shallow=False)) -class 
TestCaseWithTmp(unittest.TestCase) : + +class TestCaseWithTmp(unittest.TestCase): 'Base class for tests that use tempDir' - def setUp(self) : + + def setUp(self): util.file.set_tmpDir(type(self).__name__) - def tearDown(self) : + def tearDown(self): util.file.destroy_tmpDir() - def assertEqualContents(self, f1, f2) : + def assertEqualContents(self, f1, f2): assert_equal_contents(self, f1, f2) """ When "nose" executes python scripts for automated testing, it excludes ones with -the executable bit set (in case they aren't import safe). To prevent any of the +the executable bit set (in case they aren't import safe). To prevent any of the tests in this folder from being silently excluded, assure this bit is not set. """ -def assert_none_executable() : + + +def assert_none_executable(): testDir = os.path.dirname(__file__) - assert all(not os.access(os.path.join(testDir, filename), os.X_OK) - for filename in os.listdir(testDir) + assert all(not os.access(os.path.join(testDir, filename), os.X_OK) for filename in os.listdir(testDir) if filename.endswith('.py')) + + assert_none_executable() diff --git a/test/integration/__init__.py b/test/integration/__init__.py index e69de29bb..8b1378917 100644 --- a/test/integration/__init__.py +++ b/test/integration/__init__.py @@ -0,0 +1 @@ + diff --git a/test/integration/test_assembly.py b/test/integration/test_assembly.py index c670c970e..fe780ceb7 100644 --- a/test/integration/test_assembly.py +++ b/test/integration/test_assembly.py @@ -2,19 +2,27 @@ __author__ = "dpark@broadinstitute.org" -import assembly, util.cmd, util.file, tools.novoalign +import assembly +import util.cmd +import util.file +import tools.novoalign import Bio.SeqIO import unittest -import os, shutil, tempfile, argparse, itertools +import os +import shutil +import tempfile +import argparse +import itertools from test import TestCaseWithTmp + class TestAssemble(TestCaseWithTmp): ''' Test the de novo assembly pipeline ''' def test_ref_assisted_assembly(self): novoalign = tools.novoalign.NovoalignTool() novoalign.install() - + # prep inputs orig_ref = os.path.join(util.file.get_test_input_path(), 'ebov-makona.fasta') refGenome = util.file.mkstempfname('.ref.fasta') @@ -22,24 +30,20 @@ def test_ref_assisted_assembly(self): novoalign.index_fasta(refGenome) inBam = os.path.join(util.file.get_test_input_path(), 'G5012.3.testreads.bam') outFasta = util.file.mkstempfname('.refined.fasta') - + # run refine_assembly - args = [refGenome, inBam, outFasta, - "--chr_names", 'G5012.3', - "--min_coverage", '3', - "--novo_params", "-r Random -l 30 -g 40 -x 20 -t 502"] + args = [refGenome, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params", + "-r Random -l 30 -g 40 -x 20 -t 502"] args = assembly.parser_refine_assembly().parse_args(args) args.func_main(args) self.assertTrue(os.path.isfile(outFasta)) self.assertTrue(os.path.getsize(outFasta) > 1000) - + # check assembly quality with open(outFasta, 'rt') as inf: seq = Bio.SeqIO.read(inf, 'fasta') self.assertGreater(len(seq), 17000) self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95) - # in order to test the actual de novo pipeline, we need to add a clip db for trimmomatic # then we should test from G5012.3.testreads.bam all the way through the assembly pipe - diff --git a/test/integration/test_ncbi.py b/test/integration/test_ncbi.py index 4f4fe4739..1c9b60aeb 100644 --- a/test/integration/test_ncbi.py +++ b/test/integration/test_ncbi.py @@ -3,7 +3,12 @@ __author__ = "tomkinsc@broadinstitute.org" # built-ins 
-import unittest, os, argparse, pickle, shutil, tempfile +import unittest +import os +import argparse +import pickle +import shutil +import tempfile from collections import OrderedDict import logging @@ -14,32 +19,25 @@ log = logging.getLogger(__name__) -class TestNcbiFetch(TestCaseWithTmp) : + +class TestNcbiFetch(TestCaseWithTmp): + def setUp(self): super(TestNcbiFetch, self).setUp() # these are Orungo accessions - self.accessions = [ "JQ610675.1", - "JQ610676.1", - "JQ610677.1", - "JQ610678.1", - "JQ610679.1", - "JQ610680.1", - "JQ610681.1", - "JQ610682.1", - "JQ610683.1", - "JQ610684.1"] + self.accessions = ["JQ610675.1", "JQ610676.1", "JQ610677.1", "JQ610678.1", "JQ610679.1", "JQ610680.1", + "JQ610681.1", "JQ610682.1", "JQ610683.1", "JQ610684.1"] self.myInputDir = util.file.get_test_input_path(self) def perform_download_and_check(self, parser_func, additional_args, expected_files, null_files): - + tempDir = tempfile.gettempdir() - args = [ "viral-ngs-test@example.com", - tempDir] + args = ["viral-ngs-test@example.com", tempDir] args.extend(self.accessions) - args.extend(additional_args) + args.extend(additional_args) args = parser_func(argparse.ArgumentParser()).parse_args(args) args.func_main(args) @@ -49,238 +47,299 @@ def perform_download_and_check(self, parser_func, additional_args, expected_file for fileName in expected_files: createdFilePath = os.path.join(tempDir, fileName) log.info("createdFilePath: {}".format(createdFilePath)) - assert os.path.exists(createdFilePath), "File that should have been created does not exist: %s" % createdFilePath + assert os.path.exists( + createdFilePath), "File that should have been created does not exist: %s" % createdFilePath self.assertEqualContents(createdFilePath, os.path.join(self.myInputDir, fileName)) for fileName in null_files: shouldNotExistFilePath = os.path.join(tempDir, fileName) - assert not os.path.exists(shouldNotExistFilePath), "File exists but it should not: %s" % shouldNotExistFilePath + assert not os.path.exists( + shouldNotExistFilePath), "File exists but it should not: %s" % shouldNotExistFilePath + class TestFastaFetch(TestNcbiFetch): + def setUp(self): super(TestFastaFetch, self).setUp() def test_download(self): args = [] - expectedFiles = [ a+".fasta" for a in self.accessions ] - null_files = [] + expectedFiles = [a + ".fasta" for a in self.accessions] + null_files = [] - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_concat(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.fasta" ] + expectedFiles = ["orungo.fasta"] null_files = [] - - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_removal_of_intermediates(self): args = ["--combinedFilePrefix", "orungo", "--removeSeparateFiles"] - expectedFiles = [ "orungo.fasta" ] - null_files = [ a+".fasta" for a in self.accessions ] - - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + expectedFiles = ["orungo.fasta"] + null_files = [a + ".fasta" for a in self.accessions] + + 
self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_individual_preexistance(self): # since the arguments are positional, including an accession here makes a duplicate that should # raise an Error args = [self.accessions[0]] args.extend(["--combinedFilePrefix", "orungo"]) - expectedFiles = [ "orungo.fasta" ] + expectedFiles = ["orungo.fasta"] null_files = [] - + with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_combined_preexistance(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.fasta" ] + expectedFiles = ["orungo.fasta"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # an error should be raised the second time the call is made with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_overwrite(self): args = ["--combinedFilePrefix", "orungo", "--forceOverwrite"] - expectedFiles = [ "orungo.fasta" ] + expectedFiles = ["orungo.fasta"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # no error should be raised the second time the call is made - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_different_file_extension(self): args = ["--fileExt", "fa", "--combinedFilePrefix", "orungo"] - expectedFiles = [ a+".fa" for a in self.accessions ] + expectedFiles = [a + ".fa" for a in self.accessions] expectedFiles.append("orungo.fa") - null_files = [] + null_files = [] - self.perform_download_and_check(ncbi.parser_fetch_fastas, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_fastas, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) class TestFeatureTableFetch(TestNcbiFetch): + def setUp(self): super(TestFeatureTableFetch, self).setUp() def test_download(self): args = [] - expectedFiles = [ a+".tbl" for a in self.accessions ] - null_files = [] + expectedFiles = [a + ".tbl" for a in self.accessions] + null_files = [] - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + 
additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_concat(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.tbl" ] + expectedFiles = ["orungo.tbl"] null_files = [] - - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_removal_of_intermediates(self): args = ["--combinedFilePrefix", "orungo", "--removeSeparateFiles"] - expectedFiles = [ "orungo.tbl" ] - null_files = [ a+".tbl" for a in self.accessions ] - - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + expectedFiles = ["orungo.tbl"] + null_files = [a + ".tbl" for a in self.accessions] + + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_individual_preexistance(self): # since the arguments are positional, including an accession here makes a duplicate that should # raise an Error args = [self.accessions[0]] args.extend(["--combinedFilePrefix", "orungo"]) - expectedFiles = [ "orungo.tbl" ] + expectedFiles = ["orungo.tbl"] null_files = [] - + with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_combined_preexistance(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.tbl" ] + expectedFiles = ["orungo.tbl"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # an error should be raised the second time the call is made with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_overwrite(self): args = ["--combinedFilePrefix", "orungo", "--forceOverwrite"] - expectedFiles = [ "orungo.tbl" ] + expectedFiles = ["orungo.tbl"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # no error should be raised the second time the call is made - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def 
test_different_file_extension(self): args = ["--fileExt", "table", "--combinedFilePrefix", "orungo"] - expectedFiles = [ a+".table" for a in self.accessions ] + expectedFiles = [a + ".table" for a in self.accessions] expectedFiles.append("orungo.table") - null_files = [] + null_files = [] + + self.perform_download_and_check(ncbi.parser_fetch_feature_tables, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) - self.perform_download_and_check(ncbi.parser_fetch_feature_tables, - additional_args=args, expected_files=expectedFiles, null_files=null_files) class TestGenbankRecordFetch(TestNcbiFetch): + def setUp(self): super(TestGenbankRecordFetch, self).setUp() def test_download(self): args = [] - expectedFiles = [ a+".gbk" for a in self.accessions ] - null_files = [] + expectedFiles = [a + ".gbk" for a in self.accessions] + null_files = [] - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_concat(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.gbk" ] + expectedFiles = ["orungo.gbk"] null_files = [] - - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_removal_of_intermediates(self): args = ["--combinedFilePrefix", "orungo", "--removeSeparateFiles"] - expectedFiles = [ "orungo.gbk" ] - null_files = [ a+".gbk" for a in self.accessions ] - - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + expectedFiles = ["orungo.gbk"] + null_files = [a + ".gbk" for a in self.accessions] + + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_individual_preexistance(self): # since the arguments are positional, including an accession here makes a duplicate that should # raise an Error args = [self.accessions[0]] args.extend(["--combinedFilePrefix", "orungo"]) - expectedFiles = [ "orungo.gbk" ] + expectedFiles = ["orungo.gbk"] null_files = [] - + with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_combined_preexistance(self): args = ["--combinedFilePrefix", "orungo"] - expectedFiles = [ "orungo.gbk" ] + expectedFiles = ["orungo.gbk"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # an error should be raised the second time the call is made with self.assertRaises(AssertionError): - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, 
expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_overwrite(self): args = ["--combinedFilePrefix", "orungo", "--forceOverwrite"] - expectedFiles = [ "orungo.gbk" ] + expectedFiles = ["orungo.gbk"] null_files = [] - + # call once to create the combined file - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) # no error should be raised the second time the call is made - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) def test_different_file_extension(self): args = ["--fileExt", "gb", "--combinedFilePrefix", "orungo"] - expectedFiles = [ a+".gb" for a in self.accessions ] + expectedFiles = [a + ".gb" for a in self.accessions] expectedFiles.append("orungo.gb") - null_files = [] + null_files = [] - self.perform_download_and_check(ncbi.parser_fetch_genbank_records, - additional_args=args, expected_files=expectedFiles, null_files=null_files) + self.perform_download_and_check(ncbi.parser_fetch_genbank_records, + additional_args=args, + expected_files=expectedFiles, + null_files=null_files) diff --git a/test/unit/__init__.py b/test/unit/__init__.py index e69de29bb..8b1378917 100644 --- a/test/unit/__init__.py +++ b/test/unit/__init__.py @@ -0,0 +1 @@ + diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index f1b6b256d..8cfcabbfa 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -2,18 +2,29 @@ __author__ = "dpark@broadinstitute.org" -import assembly, util.cmd, util.file -import Bio.SeqIO, Bio.Data.IUPACData -import unittest, argparse -import os, shutil, tempfile, argparse, itertools +import assembly +import util.cmd +import util.file +import Bio.SeqIO +import Bio.Data.IUPACData +import unittest +import argparse +import os +import shutil +import tempfile +import argparse +import itertools from test import TestCaseWithTmp + def makeFasta(seqs, outFasta): with open(outFasta, 'wt') as outf: for line in util.file.fastaMaker(seqs): outf.write(line) + class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in assembly.__commands__: parser = parser_fun(argparse.ArgumentParser()) @@ -21,12 +32,13 @@ def test_help_parser_for_each_command(self): class TestAmbiguityBases(unittest.TestCase): + def test_non_failure(self): ''' Make sure that alleles_to_ambiguity runs without errors for every possible combination of inputs. Check that the output is one-character long and uppercase. 
''' - bases = ('A','C','T','G') - for i in range(1,5): + bases = ('A', 'C', 'T', 'G') + for i in range(1, 5): for alleles in itertools.permutations(bases, i): out = assembly.alleles_to_ambiguity(alleles) self.assertEqual(1, len(out)) @@ -35,11 +47,13 @@ def test_non_failure(self): class TestMutableSequence(unittest.TestCase): ''' Test the MutableSequence class ''' + def test_bad_coords(self): self.assertRaises(Exception, assembly.MutableSequence, 'chr', 0, 4) self.assertRaises(Exception, assembly.MutableSequence, 'chr', 5, 4) self.assertRaises(Exception, assembly.MutableSequence, 'chr', -2, 4) self.assertRaises(Exception, assembly.MutableSequence, 'chr', 5, 6, 'G') + def test_good_coords(self): x = assembly.MutableSequence('chr', 1, 5) x = assembly.MutableSequence('chr', 5, 5) @@ -47,6 +61,7 @@ def test_good_coords(self): x = assembly.MutableSequence('chr name with spaces 5 @#$ --', 1, 5) x = assembly.MutableSequence('chr', 5, 5, 'A') x = assembly.MutableSequence('chr', 5, 6, 'AT') + def test_modify_one(self): x = assembly.MutableSequence('chr', 5, 8, 'ATCG') self.assertRaises(Exception, x.modify, 4, 'G') @@ -64,11 +79,13 @@ def test_modify_one(self): self.assertEqual(x.emit(), ('chr', 'GjGG')) x.modify(8, 'Y') self.assertEqual(x.emit(), ('chr', 'GjGY')) + def test_modify_blank(self): x = assembly.MutableSequence('chr', 5, 8) self.assertEqual(x.emit(), ('chr', 'NNNN')) x.modify(6, 'G') self.assertEqual(x.emit(), ('chr', 'NGNN')) + def test_modify_insertions(self): x = assembly.MutableSequence('chr', 5, 8, 'ATCG') x.modify(6, 'insert') @@ -77,6 +94,7 @@ def test_modify_insertions(self): self.assertEqual(x.emit(), ('chr', 'AinsertCtail')) x.modify(5, 'headA') self.assertEqual(x.emit(), ('chr', 'headAinsertCtail')) + def test_modify_deletions(self): x = assembly.MutableSequence('chr', 5, 8, 'ATCG') self.assertRaises(Exception, x.replace, 6, 9, 'AT') @@ -92,10 +110,12 @@ def test_modify_deletions(self): self.assertEqual(x.emit(), ('chr', 'Ay123G')) x.modify(7, 'z') self.assertEqual(x.emit(), ('chr', 'AyzG')) + def test_modify_deletions_simple(self): x = assembly.MutableSequence('chr', 5, 8, 'ATCG') x.replace(6, 7, 'T') self.assertEqual(x.emit(), ('chr', 'ATG')) + def test_modify_deletions_remember(self): x = assembly.MutableSequence('chr', 5, 8, 'ATCG') x.replace(6, 7, 'T') @@ -104,35 +124,37 @@ def test_modify_deletions_remember(self): self.assertEqual(x.emit(), ('chr', 'ATxG')) x.replay_deletions() self.assertEqual(x.emit(), ('chr', 'ATG')) - - + class TestManualSnpCaller(unittest.TestCase): ''' Test the vcfrow_parse_and_call_snps method.. lots of edge cases. ''' + def test_missing_dp(self): ''' VCF files might contain rows with no calls or any kind of data and that's okay. ''' row = ['chr10', '105', '.', 'G', '.', '.', '.', '.', 'GT', './.'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=1)) self.assertEqual(out, []) + def test_dp_inaccurate(self): ''' The DP might not equal the sum of the ADs and that's okay apparently. 
''' row = ['chr10', '105', '.', 'G', 'A', '.', '.', '.', 'GT:DP:AD', '0/1/1:5:2,2'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=1)) - self.assertEqual(set(out[0][4]), set(['G','A'])) + self.assertEqual(set(out[0][4]), set(['G', 'A'])) row = ['chr10', '105', '.', 'G', 'A', '.', '.', '.', 'GT:DP:AD', '0/1/1:2:3,3'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(set(out[0][4]), set(['G','A'])) + self.assertEqual(set(out[0][4]), set(['G', 'A'])) row = ['chr10', '105', '.', 'G', 'A', '.', '.', '.', 'GT:DP:AD', '0/1/1:10:2,0'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) self.assertEqual(out, []) + def test_invariant_sites(self): ''' Invariant site handling is slightly different in code, so test it specially. ''' row = ['LASV.l', '1', '.', 'T', '.', '.', '.', '.', 'GT:DP', '0/0:3'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(out, [('LASV.l',1,1,'s1',['T'])]) + self.assertEqual(out, [('LASV.l', 1, 1, 's1', ['T'])]) row = ['LASV.l', '1', '.', 'T', '.', '.', '.', '.', 'GT', '0/0'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=0)) - self.assertEqual(out, [('LASV.l',1,1,'s1',['T'])]) + self.assertEqual(out, [('LASV.l', 1, 1, 's1', ['T'])]) row = ['LASV.l', '1', '.', 'T', '.', '.', '.', '.', 'GT', '0/0'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=1)) self.assertEqual(out, []) @@ -141,7 +163,8 @@ def test_invariant_sites(self): self.assertEqual(out, []) row = ['LASV.l', '1', '.', 'T', '.', '.', '.', '.', 'GT:DP', './.:10'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=1)) - self.assertEqual(out, [('LASV.l',1,1,'s1',['T'])]) + self.assertEqual(out, [('LASV.l', 1, 1, 's1', ['T'])]) + def test_het_edgecases(self): ''' The interplay between min_coverage and major_cutoff is not obvious, here's what I understand from Kristian about the desired behavior. 
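Aside (illustrative only, not part of the patch): the het-calling edge cases exercised in the next hunk can be reproduced interactively with a single VCF-style data row. This sketch simply mirrors the first row from test_het_edgecases below; it is not an authoritative statement of the min_coverage/major_cutoff rules:

    import assembly

    # one VCF data row: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample
    row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:3,4,5,0']
    calls = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3))
    # each call is a (chrom, start, end, sample, alleles) tuple; per the test below,
    # this row is expected to yield the allele set {'G', 'A', 'C'}
    print(calls)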
@@ -156,22 +179,23 @@ def test_het_edgecases(self): ''' row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:3,4,5,0'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(set(out[0][4]), set(['G','A','C'])) + self.assertEqual(set(out[0][4]), set(['G', 'A', 'C'])) row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:2,3,0,3'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(set(out[0][4]), set(['A','T'])) + self.assertEqual(set(out[0][4]), set(['A', 'T'])) row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:0,2,0,2'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) self.assertEqual(out, []) row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:0,2,0,2'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=2)) - self.assertEqual(set(out[0][4]), set(['A','T'])) + self.assertEqual(set(out[0][4]), set(['A', 'T'])) row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:2,0,3,0'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) self.assertEqual(out[0][4], ['C']) row = ['thecontig', '105000', '.', 'G', 'A,C,T', '.', '.', '.', 'GT:AD', '0/1:0,2,3,4'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) self.assertEqual(out[0][4], ['T']) + def test_indels(self): ''' Indel handling ''' row = ['thecontig', '105000', '.', 'G', 'GA,T', '.', '.', '.', 'GT:AD', '0/1:5,10,1'] @@ -179,27 +203,29 @@ def test_indels(self): self.assertEqual(set(out[0][4]), set(['GA'])) row = ['thecontig', '105000', '.', 'G', 'GA,T', '.', '.', '.', 'GT:AD', '0/1:5,5,2'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(set(out[0][4]), set(['G','GA'])) + self.assertEqual(set(out[0][4]), set(['G', 'GA'])) row = ['thecontig', '105000', '.', 'G', 'GA,T', '.', '.', '.', 'GT:AD', '0/1:5,5,3'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'], min_dp=3)) - self.assertEqual(set(out[0][4]), set(['G','GA','T'])) + self.assertEqual(set(out[0][4]), set(['G', 'GA', 'T'])) row = ['thecontig', '105000', '.', 'AT', 'A', '.', '.', '.', 'GT:AD', '0/1:2,10'] out = list(assembly.vcfrow_parse_and_call_snps(row, ['s1'])) - self.assertEqual(out, [('thecontig',105000,105001,'s1',['A'])]) + self.assertEqual(out, [('thecontig', 105000, 105001, 's1', ['A'])]) + def test_vcf_to_seqs_indels1(self): input = ['thecontig', '5', '.', 'AT', 'A', '.', '.', '.', 'GT:AD', '0/1:2,10'] - actual = assembly.vcf_to_seqs([input], {'thecontig':10}, ['s1'], min_dp=2) + actual = assembly.vcf_to_seqs([input], {'thecontig': 10}, ['s1'], min_dp=2) actual = list(actual)[0][1].strip('N') self.assertEqual(actual, 'A') - actual = assembly.vcf_to_seqs([input], {'thecontig':10}, ['s1'], min_dp=2) + actual = assembly.vcf_to_seqs([input], {'thecontig': 10}, ['s1'], min_dp=2) actual = list(actual)[0][1] self.assertEqual(actual, 'NNNNANNNN') + def test_vcf_to_seqs_indels2(self): ''' More end-to-end indel handling ''' myInputDir = util.file.get_test_input_path(self) - input = os.path.join(myInputDir, 'indel.vcf.gz') + input = os.path.join(myInputDir, 'indel.vcf.gz') expected = os.path.join(myInputDir, 'output.fasta') - chrlens = {'EBOV_2014_G6060.1':18962} + chrlens = {'EBOV_2014_G6060.1': 18962} samples = ['G6060.1'] expected = str(Bio.SeqIO.read(expected, 'fasta').seq) actual = assembly.vcf_to_seqs(util.file.read_tabfile(input), chrlens, samples, min_dp=2) @@ 
-207,9 +233,9 @@ def test_vcf_to_seqs_indels2(self): self.assertEqual(actual, expected) - class TestDeambigAndTrimFasta(TestCaseWithTmp): ''' Test the deambig_fasta and trim_fasta commands. ''' + def run_method(self, inseqs, parser_fun): fasta_in = util.file.mkstempfname() fasta_out = util.file.mkstempfname() @@ -217,21 +243,23 @@ def run_method(self, inseqs, parser_fun): args = parser_fun(argparse.ArgumentParser()).parse_args([fasta_in, fasta_out]) args.func_main(args) return (fasta_in, fasta_out) + def test_trim_fasta(self): ''' Simple test of the trim_fasta command ''' - inseqs = ['NNnnNNnNaslkdfjasdkfNNNN','NNNnnN','NNN123','ATCG'] - expected = ['aslkdfjasdkf','','123','ATCG'] + inseqs = ['NNnnNNnNaslkdfjasdkfNNNN', 'NNNnnN', 'NNN123', 'ATCG'] + expected = ['aslkdfjasdkf', '', '123', 'ATCG'] expected = dict((str(i), expected[i]) for i in range(len(expected))) fasta_in, fasta_out = self.run_method(inseqs, assembly.parser_trim_fasta) with open(fasta_out, 'rt') as fa: for record in Bio.SeqIO.parse(fa, 'fasta'): self.assertIn(record.id, expected) self.assertEqual(str(record.seq), expected[record.id]) + def test_deambig_fasta(self): ''' Simple test of the deambig_fasta command ''' - table = [(k,v) for k,v in Bio.Data.IUPACData.ambiguous_dna_values.items() if k!='X'] - keys = [k for k,v in table] - vals = [set(v) for k,v in table] + table = [(k, v) for k, v in Bio.Data.IUPACData.ambiguous_dna_values.items() if k != 'X'] + keys = [k for k, v in table] + vals = [set(v) for k, v in table] keys = keys + [k.lower() for k in keys] vals = vals + vals inseq = ''.join(keys) diff --git a/test/unit/test_interhost.py b/test/unit/test_interhost.py index deea9048c..52abe2fa2 100644 --- a/test/unit/test_interhost.py +++ b/test/unit/test_interhost.py @@ -3,15 +3,21 @@ __author__ = "irwin@broadinstitute.org" import interhost -import test, util.file -import unittest, argparse, itertools +import test +import util.file +import unittest +import argparse +import itertools + class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in interhost.__commands__: parser = parser_fun(argparse.ArgumentParser()) helpstring = parser.format_help() + def makeTempFasta(seqs): fn = util.file.mkstempfname('.fasta') with open(fn, 'wt') as outf: @@ -19,55 +25,48 @@ def makeTempFasta(seqs): outf.write(line) return fn + class TestCoordMapper(test.TestCaseWithTmp): + def setUp(self): super(TestCoordMapper, self).setUp() - self.genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ('chr2', 'AGTCGGTTTTCAG'), - ]) - self.genomeB = makeTempFasta([ - ('first_chrom', 'GCACGTACGTATTTGCAAATC'), - ('second_chr', 'AGTCGGTTTCCAC'), - ]) + self.genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'), ('chr2', 'AGTCGGTTTTCAG'),]) + self.genomeB = makeTempFasta([('first_chrom', 'GCACGTACGTATTTGCAAATC'), ('second_chr', 'AGTCGGTTTCCAC'),]) self.cm = interhost.CoordMapper() self.cm.align_and_load_sequences([self.genomeA, self.genomeB]) - + def test_no_indels(self): - for pos in range(1,14): + for pos in range(1, 14): self.assertEqual(self.cm.mapAtoB('chr2', pos), ('second_chr', pos)) self.assertEqual(self.cm.mapBtoA('second_chr', pos), ('chr2', pos)) - - def test_map_indels(self) : - expLists = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, [11, 13], 14, 15, 16, 17, - 18, 19, 20, 21], - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, - 17, 18, 19, 20, 21], - ] + + def test_map_indels(self): + expLists = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, [11, 13], 14, 15, 16, 17, 18, 19, 20, 21], + 
[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, 17, 18, 19, 20, 21],] for mapper, fromChrom, goodRange, toChrom, expected in [ - [self.cm.mapAtoB, 'chr1', range(3, 22), 'first_chrom', expLists[0]], - [self.cm.mapBtoA, 'first_chrom', range(1, 22), 'chr1', expLists[1]]] : + [self.cm.mapAtoB, 'chr1', range(3, 22), 'first_chrom', expLists[0]], [self.cm.mapBtoA, 'first_chrom', + range(1, 22), 'chr1', expLists[1]] + ]: result = [mapper(fromChrom, pos) for pos in goodRange] - for chrom, mappedPos in result : + for chrom, mappedPos in result: self.assertEqual(chrom, toChrom) - self.assertEqual(expected, - [mappedPos for chrom, mappedPos in result]) - + self.assertEqual(expected, [mappedPos for chrom, mappedPos in result]) + def test_side_param(self): - self.assertEqual(self.cm.mapAtoB('chr1', 13), ('first_chrom', [11,13])) - self.assertEqual(self.cm.mapAtoB('chr1', 13, 0), ('first_chrom', [11,13])) + self.assertEqual(self.cm.mapAtoB('chr1', 13), ('first_chrom', [11, 13])) + self.assertEqual(self.cm.mapAtoB('chr1', 13, 0), ('first_chrom', [11, 13])) self.assertEqual(self.cm.mapAtoB('chr1', 13, -1), ('first_chrom', 11)) self.assertEqual(self.cm.mapAtoB('chr1', 13, 1), ('first_chrom', 13)) self.assertEqual(self.cm.mapAtoB('chr1', 12), ('first_chrom', 10)) self.assertEqual(self.cm.mapAtoB('chr1', 12, 0), ('first_chrom', 10)) self.assertEqual(self.cm.mapAtoB('chr1', 12, -1), ('first_chrom', 10)) self.assertEqual(self.cm.mapAtoB('chr1', 12, 1), ('first_chrom', 10)) - + def test_oob_errors(self): - for pos in [-1, 0, 1, 2, 22, 23, 24] : + for pos in [-1, 0, 1, 2, 22, 23, 24]: self.assertEqual(self.cm.mapAtoB('chr1', pos), ('first_chrom', None)) - for pos in [-1, 0, 14, 15] : - self.assertEqual(self.cm.mapBtoA('second_chr', pos), ('chr2', None)) + for pos in [-1, 0, 14, 15]: + self.assertEqual(self.cm.mapBtoA('second_chr', pos), ('chr2', None)) def test_invalid_pos_error(self): with self.assertRaises(TypeError): @@ -80,18 +79,13 @@ def test_invalid_chr_error(self): self.cm.mapAtoB('nonexistentchr', 2) with self.assertRaises(KeyError): self.cm.mapBtoA('nonexistentchr', 2) - + def test_unequal_genomes_error(self): - genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ('chr2', 'AGTCGGTTTTCAG'), - ]) - genomeB = makeTempFasta([ - ('first_chrom', 'GCACGTACGTATTTGCAAATC') - ]) + genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'), ('chr2', 'AGTCGGTTTTCAG'),]) + genomeB = makeTempFasta([('first_chrom', 'GCACGTACGTATTTGCAAATC')]) with self.assertRaises(Exception): cm = interhost.CoordMapper(genomeA, genomeB) - + def test_map_chr_only(self): self.assertEqual(self.cm.mapAtoB('chr1'), 'first_chrom') self.assertEqual(self.cm.mapBtoA('first_chrom'), 'chr1') @@ -99,23 +93,25 @@ def test_map_chr_only(self): self.assertEqual(self.cm.mapBtoA('second_chr'), 'chr2') with self.assertRaises(KeyError): self.cm.mapAtoB('nonexistentchr') - + + class TestCoordMapperMultipleSeqs(test.TestCaseWithTmp): + def setUp(self): super(TestCoordMapperMultipleSeqs, self).setUp() self.genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ('chr2', 'AGTCGGTTTTCAG'), - ('chr3', 'GACTTTTGGCTGA'), - ]) + ('chr1', 'ATGCACGTACGTATGCAAATCGG'), + ('chr2', 'AGTCGGTTTTCAG'), + ('chr3', 'GACTTTTGGCTGA'), + ]) self.genomeB = makeTempFasta([ - ('first_chrom', 'GCACGTACGTATTTGCAAATC'), - ('second_chr', 'AGTCGGTTTCCAC'), - ('third_chr', 'CACCTTTGGCTGA'), - ]) + ('first_chrom', 'GCACGTACGTATTTGCAAATC'), + ('second_chr', 'AGTCGGTTTCCAC'), + ('third_chr', 'CACCTTTGGCTGA'), + ]) self.cm = interhost.CoordMapper() 
self.cm.align_and_load_sequences([self.genomeA, self.genomeB]) - + def test_legacy_call(self): ''' If mapAtoB or mapBtoA is called on a CoordMapper object with >2 sequences, @@ -125,40 +121,37 @@ def test_legacy_call(self): self.assertRaises(LookupError, self.cm.mapBtoA, 'chr2', 1) def test_no_indels(self): - for pos in range(1,14): - self.assertEqual( self.cm.mapChr('chr2', 'second_chr', pos), ('second_chr', pos) ) - self.assertEqual( self.cm.mapChr('second_chr', 'chr2', pos), ('chr2', pos) ) - - def test_map_indels(self) : - expLists = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, [11, 13], 14, 15, 16, 17, - 18, 19, 20, 21], - [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, - 17, 18, 19, 20, 21], - ] + for pos in range(1, 14): + self.assertEqual(self.cm.mapChr('chr2', 'second_chr', pos), ('second_chr', pos)) + self.assertEqual(self.cm.mapChr('second_chr', 'chr2', pos), ('chr2', pos)) + + def test_map_indels(self): + expLists = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, [11, 13], 14, 15, 16, 17, 18, 19, 20, 21], + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13, 14, 15, 16, 17, 18, 19, 20, 21],] for mapper, fromChrom, goodRange, toChrom, expected in [ - [self.cm.mapChr, 'chr1', range(3, 22), 'first_chrom', expLists[0]], - [self.cm.mapChr, 'first_chrom', range(1, 22), 'chr1', expLists[1]]] : + [self.cm.mapChr, 'chr1', range(3, 22), 'first_chrom', expLists[0]], [self.cm.mapChr, 'first_chrom', + range(1, 22), 'chr1', expLists[1]] + ]: result = [mapper(fromChrom, toChrom, pos) for pos in goodRange] - for chrom, mappedPos in result : + for chrom, mappedPos in result: self.assertEqual(chrom, toChrom) - self.assertEqual(expected, - [mappedPos for chrom, mappedPos in result]) - + self.assertEqual(expected, [mappedPos for chrom, mappedPos in result]) + def test_side_param(self): - self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13), ('first_chrom', [11,13])) - self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13, 0), ('first_chrom', [11,13])) + self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13), ('first_chrom', [11, 13])) + self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13, 0), ('first_chrom', [11, 13])) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13, -1), ('first_chrom', 11)) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 13, 1), ('first_chrom', 13)) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 12), ('first_chrom', 10)) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 12, 0), ('first_chrom', 10)) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 12, -1), ('first_chrom', 10)) self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', 12, 1), ('first_chrom', 10)) - + def test_oob_errors(self): - for pos in [-1, 0, 1, 2, 22, 23, 24] : + for pos in [-1, 0, 1, 2, 22, 23, 24]: self.assertEqual(self.cm.mapChr('chr1', 'first_chrom', pos), ('first_chrom', None)) - for pos in [-1, 0, 14, 15] : - self.assertEqual(self.cm.mapChr('second_chr', 'chr2', pos), ('chr2', None)) + for pos in [-1, 0, 14, 15]: + self.assertEqual(self.cm.mapChr('second_chr', 'chr2', pos), ('chr2', None)) def test_invalid_pos_error(self): with self.assertRaises(TypeError): @@ -169,51 +162,33 @@ def test_invalid_pos_error(self): def test_invalid_chr_error(self): self.assertRaises(KeyError, self.cm.mapChr, 'nonexistentchr', 'chr1', 2) self.assertRaises(KeyError, self.cm.mapChr, 'chr1', 'nonexistentchr', 2) - + def test_unequal_genomes_error(self): - genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ('chr2', 'AGTCGGTTTTCAG'), - ]) - genomeB = makeTempFasta([ - 
('first_chrom', 'GCACGTACGTATTTGCAAATC') - ]) + genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'), ('chr2', 'AGTCGGTTTTCAG'),]) + genomeB = makeTempFasta([('first_chrom', 'GCACGTACGTATTTGCAAATC')]) with self.assertRaises(Exception): cm = interhost.CoordMapper() cm.align_and_load_sequences([genomeA, genomeB]) - + def test_duplicate_chr_names_error(self): - genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ]) - genomeB = makeTempFasta([ - ('chr1', 'GCACGTACGTATTTGCAAATC') - ]) + genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'),]) + genomeB = makeTempFasta([('chr1', 'GCACGTACGTATTTGCAAATC')]) with self.assertRaises(Exception): cm = interhost.CoordMapper() cm.align_and_load_sequences([genomeA, genomeB]) def test_multiple_input_genomes(self): - genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ]) - genomeB = makeTempFasta([ - ('first_chr', 'ATGCACTACGTATGCAAATCGG') - ]) - genomeC = makeTempFasta([ - ('chr_one', 'ATGCACGTACGTATGCAATCGG') - ]) + genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'),]) + genomeB = makeTempFasta([('first_chr', 'ATGCACTACGTATGCAAATCGG')]) + genomeC = makeTempFasta([('chr_one', 'ATGCACGTACGTATGCAATCGG')]) cm = interhost.CoordMapper() cm.align_and_load_sequences([genomeA, genomeB, genomeC]) # check that toChrom is in the map self.assertEqual(cm.mapChr('chr1', 'chr_one'), 'chr_one') def test_single_chr_error(self): - genomeA = makeTempFasta([ - ('chr1', 'ATGCACGTACGTATGCAAATCGG'), - ]) - genomeB = makeTempFasta([ - ]) + genomeA = makeTempFasta([('chr1', 'ATGCACGTACGTATGCAAATCGG'),]) + genomeB = makeTempFasta([]) with self.assertRaises(Exception): cm = interhost.CoordMapper() cm.align_and_load_sequences([genomeA, genomeB]) @@ -227,56 +202,40 @@ def test_map_chr_only(self): self.assertEqual(self.cm.mapChr('third_chr', 'chr3'), 'chr3') self.assertRaises(KeyError, self.cm.mapChr, 'nonexistentchr', 'chr1') + class TestSpecificAlignments(test.TestCaseWithTmp): """ For the most part, CoordMapper2Seqs is tested implicitly when CoordMapper is tested. Focus here on special cases that are hard or impossible to get out of the aligner. 
""" - def test_basic_alignment(self) : - alignment = makeTempFasta([ - ('s1', 'ATCG'), - ('s2', 'ACCG'), - ('s3', 'AG-T'), - ]) + + def test_basic_alignment(self): + alignment = makeTempFasta([('s1', 'ATCG'), ('s2', 'ACCG'), ('s3', 'AG-T'),]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) - def test_unequal_len(self) : - alignment = makeTempFasta([ - ('s1', 'AA'), - ('s2', 'A'), - ]) + def test_unequal_len(self): + alignment = makeTempFasta([('s1', 'AA'), ('s2', 'A'),]) cm = interhost.CoordMapper() - with self.assertRaises(Exception) : + with self.assertRaises(Exception): cm.load_alignments([alignment]) - def test_no_real_bases_in_sample(self) : - alignment1 = makeTempFasta([ - ('s1', 'AA'), - ('s2', '--'), - ]) + def test_no_real_bases_in_sample(self): + alignment1 = makeTempFasta([('s1', 'AA'), ('s2', '--'),]) cm = interhost.CoordMapper() - with self.assertRaises(Exception) : + with self.assertRaises(Exception): cm.load_alignments([alignment1]) - - alignment2 = makeTempFasta([ - ('s1', '--'), - ('s2', 'AA'), - ('s3', 'TT'), - ]) + + alignment2 = makeTempFasta([('s1', '--'), ('s2', 'AA'), ('s3', 'TT'),]) cm = interhost.CoordMapper() - with self.assertRaises(Exception) : + with self.assertRaises(Exception): cm.load_alignments([alignment2]) - def test_no_real_bases_at_position(self) : - alignment = makeTempFasta([ - ('s1', 'AT-G'), - ('s2', 'AC-G'), - ('s3', 'AG-T'), - ]) + def test_no_real_bases_at_position(self): + alignment = makeTempFasta([('s1', 'AT-G'), ('s2', 'AC-G'), ('s3', 'AG-T'),]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) - for i in (1,2,3): + for i in (1, 2, 3): self.assertEqual(cm.mapChr('s1', 's2', i), ('s2', i)) self.assertEqual(cm.mapChr('s2', 's1', i), ('s1', i)) self.assertEqual(cm.mapChr('s1', 's3', i), ('s3', i)) @@ -284,56 +243,49 @@ def test_no_real_bases_at_position(self) : self.assertEqual(cm.mapChr('s2', 's3', i), ('s3', i)) self.assertEqual(cm.mapChr('s3', 's2', i), ('s2', i)) - def test_aligned_gaps(self) : - alignment = makeTempFasta([ - ('s1', 'ATCG'), - ('s2', 'AC-G'), - ('s3', 'AG-T'), - ]) + def test_aligned_gaps(self): + alignment = makeTempFasta([('s1', 'ATCG'), ('s2', 'AC-G'), ('s3', 'AG-T'),]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) - for i in (1,2,3): + for i in (1, 2, 3): self.assertEqual(cm.mapChr('s2', 's3', i), ('s3', i)) self.assertEqual(cm.mapChr('s3', 's2', i), ('s2', i)) - for x,y in ((1,1), (2,2), (3,2), (4,3)): + for x, y in ((1, 1), (2, 2), (3, 2), (4, 3)): self.assertEqual(cm.mapChr('s1', 's2', x), ('s2', y)) self.assertEqual(cm.mapChr('s1', 's3', x), ('s3', y)) - for x,y in ((1,1), (2,[2,3]), (3,4)): + for x, y in ((1, 1), (2, [2, 3]), (3, 4)): self.assertEqual(cm.mapChr('s2', 's1', x), ('s1', y)) self.assertEqual(cm.mapChr('s3', 's1', x), ('s1', y)) - def test_adjacent_gaps(self) : + def test_adjacent_gaps(self): alignment = makeTempFasta([ - ('s1', 'ATCTG'), - ('s2', 'AC--G'), - ('s3', 'A-TTG'), - ('s4', 'A-C-G'), - ('s5', 'A--CG'), - ]) + ('s1', 'ATCTG'), + ('s2', 'AC--G'), + ('s3', 'A-TTG'), + ('s4', 'A-C-G'), + ('s5', 'A--CG'), + ]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) - for x,y in ((1,1), (2,2), (3,2), (4,2), (5,3)): + for x, y in ((1, 1), (2, 2), (3, 2), (4, 2), (5, 3)): self.assertEqual(cm.mapChr('s1', 's2', x), ('s2', y)) - for x,y in ((1,1), (2,[2,4]), (3,5)): + for x, y in ((1, 1), (2, [2, 4]), (3, 5)): self.assertEqual(cm.mapChr('s2', 's1', x), ('s1', y)) - for x,y in ((1,1), (2,1), (3,2), (4,3), (5,4)): + for x, y in ((1, 1), (2, 1), (3, 2), (4, 3), 
(5, 4)): self.assertEqual(cm.mapChr('s1', 's3', x), ('s3', y)) - for x,y in ((1,[1,2]), (2,3), (3,4), (4,5)): + for x, y in ((1, [1, 2]), (2, 3), (3, 4), (4, 5)): self.assertEqual(cm.mapChr('s3', 's1', x), ('s1', y)) - for x,y in ((1,1), (2,[2,3]), (3,4)): + for x, y in ((1, 1), (2, [2, 3]), (3, 4)): self.assertEqual(cm.mapChr('s2', 's3', x), ('s3', y)) - for x,y in ((1,1), (2,2), (3,2), (4,3)): + for x, y in ((1, 1), (2, 2), (3, 2), (4, 3)): self.assertEqual(cm.mapChr('s3', 's2', x), ('s2', y)) - for a,b in itertools.combinations(('s2', 's4', 's5'), 2): - for i in (1,2,3): + for a, b in itertools.combinations(('s2', 's4', 's5'), 2): + for i in (1, 2, 3): self.assertEqual(cm.mapChr(a, b, i), (b, i)) self.assertEqual(cm.mapChr(b, a, i), (a, i)) - def test_one_real_base(self) : - alignment = makeTempFasta([ - ('s1', 'AC-'), - ('s2', '-CA'), - ]) + def test_one_real_base(self): + alignment = makeTempFasta([('s1', 'AC-'), ('s2', '-CA'),]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) self.assertEqual(cm.mapChr('s1', 's2', 1), ('s2', None)) @@ -341,18 +293,13 @@ def test_one_real_base(self) : self.assertEqual(cm.mapChr('s2', 's1', 1), ('s1', 2)) self.assertEqual(cm.mapChr('s2', 's1', 2), ('s1', None)) - def test_exactly_two_pairs(self) : - alignment = makeTempFasta([ - ('s1', 'A--T'), - ('s2', 'AGGT'), - ]) + def test_exactly_two_pairs(self): + alignment = makeTempFasta([('s1', 'A--T'), ('s2', 'AGGT'),]) cm = interhost.CoordMapper() cm.load_alignments([alignment]) - self.assertEqual(cm.mapChr('s1', 's2', 1), ('s2', [1,3])) + self.assertEqual(cm.mapChr('s1', 's2', 1), ('s2', [1, 3])) self.assertEqual(cm.mapChr('s1', 's2', 2), ('s2', 4)) self.assertEqual(cm.mapChr('s2', 's1', 1), ('s1', 1)) self.assertEqual(cm.mapChr('s2', 's1', 2), ('s1', 1)) self.assertEqual(cm.mapChr('s2', 's1', 3), ('s1', 1)) self.assertEqual(cm.mapChr('s2', 's1', 4), ('s1', 2)) - - diff --git a/test/unit/test_intrahost.py b/test/unit/test_intrahost.py index f5a1f3c9b..a3dcd085d 100644 --- a/test/unit/test_intrahost.py +++ b/test/unit/test_intrahost.py @@ -4,23 +4,37 @@ # built-ins from collections import OrderedDict -import os, os.path, shutil, tempfile, itertools, argparse, unittest +import os +import os.path +import shutil +import tempfile +import itertools +import argparse +import unittest # third-party -import Bio, Bio.SeqRecord, Bio.Seq +import Bio +import Bio.SeqRecord +import Bio.Seq # module-specific -import intrahost, util.file, util.vcf, test +import intrahost +import util.file +import util.vcf +import test from intrahost import AlleleFieldParser import interhost import tools.mafft + class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in intrahost.__commands__: parser = parser_fun(argparse.ArgumentParser()) helpstring = parser.format_help() + def makeTempFasta(seqs): fn = util.file.mkstempfname('.fasta') with open(fn, 'wt') as outf: @@ -28,58 +42,60 @@ def makeTempFasta(seqs): outf.write(line) return fn + class MockVphaserOutput: ''' This creates test data that pretends to be the output from tools.vphaser2.Vphaser2Tool.iterate ''' + def __init__(self): self.isnvs = {} self.chroms = [] + def add_snp(self, chrom, pos, acounts, libinfo=None): # Add an iSNP at this chrom,pos. acounts is a list of triples: # (allele, fwd count, rev count). 
- assert type(pos) == int and pos>0 and len(acounts)>1 - for a,f,r in acounts: - assert a in ('A','C','G','T') - assert f>=0 and r>=0 and f+r>0 + assert isinstance(pos, int) and pos > 0 and len(acounts) > 1 + for a, f, r in acounts: + assert a in ('A', 'C', 'G', 'T') + assert f >= 0 and r >= 0 and f + r > 0 # Add libinfo to acounts - if libinfo == None : - acounts = [[a, f, r, 1, [[f, r]]] for a,f,r in acounts] - else : + if libinfo is None: + acounts = [[a, f, r, 1, [[f, r]]] for a, f, r in acounts] + else: # ... raise Exception - acounts = reversed(sorted((acount[1]+acount[2],) + tuple(acount) - for acount in acounts)) + acounts = reversed(sorted((acount[1] + acount[2],) + tuple(acount) for acount in acounts)) acounts = [tuple(acount[1:]) for acount in acounts] if chrom not in self.chroms: self.chroms.append(chrom) self.isnvs.setdefault(chrom, {}) self.isnvs[chrom].setdefault(pos, {}) self.isnvs[chrom][pos]['snp'] = acounts + def add_indel(self, chrom, pos, acounts, libinfo=None): ''' Add an iSNP at this chrom,pos. acounts is a list of triples: (allele, fwd count, rev count). allele is simply a small sequence with no "I" or "D" prefixing it (we'll add the I/D). ''' - assert type(pos) == int and pos>0 and len(acounts)>1 - for a,f,r in acounts: - assert type(a) == str - assert f>=0 and r>=0 and f+r>0 - assert '' in set(a for a,f,r in acounts) - + assert isinstance(pos, int) and pos > 0 and len(acounts) > 1 + for a, f, r in acounts: + assert isinstance(a, str) + assert f >= 0 and r >= 0 and f + r > 0 + assert '' in set(a for a, f, r in acounts) + # Add libinfo to acounts - if libinfo == None : - acounts = [[a, f, r, 1, [[f, r]]] for a,f,r in acounts] - else : + if libinfo is None: + acounts = [[a, f, r, 1, [[f, r]]] for a, f, r in acounts] + else: # ... raise Exception - - acounts = reversed(sorted((acount[1]+acount[2],) + tuple(acount) - for acount in acounts)) + + acounts = reversed(sorted((acount[1] + acount[2],) + tuple(acount) for acount in acounts)) acounts = [list(acount[1:]) for acount in acounts] - + # vphaser funniness here if acounts[0][0] == '': # this is a set of insertions against the consensus @@ -94,12 +110,13 @@ def add_indel(self, chrom, pos, acounts, libinfo=None): assert acounts[0][0][n_deleted:] == acounts[i][0] acounts[i][0] = 'D' + str(n_deleted) acounts[0][0] = 'i' - + if chrom not in self.chroms: self.chroms.append(chrom) self.isnvs.setdefault(chrom, {}) self.isnvs[chrom].setdefault(pos, {}) self.isnvs[chrom][pos]['lp'] = acounts + def __iter__(self): for c in self.chroms: for p in sorted(self.isnvs[c].keys()): @@ -108,7 +125,7 @@ def __iter__(self): mac = sum(acount[1] + acount[2] for acount in acounts[1:]) tot = sum(acount[1] + acount[2] for acount in acounts) yield [c, str(p), acounts[1][0], acounts[0][0], - '0.5', model, str(float(mac)/tot*100.0)] \ + '0.5', model, str(float(mac) / tot * 100.0)] \ + [str(AlleleFieldParser(None, *acount)) for acount in acounts] @@ -121,19 +138,19 @@ class TestPerSample(test.TestCaseWithTmp): These unit tests mock the vphaser tool output and just test the filtering/statistics/etc. 
''' + def test_single_strand_bias_hard_filter(self): data = MockVphaserOutput() - data.add_snp('c1', 100, [('A',10,20), ('T',5,2), ('C',30,500), ('G',60,40)]) - data.add_snp('c2', 100, [('C',10,2), ('T',2,8)]) + data.add_snp('c1', 100, [('A', 10, 20), ('T', 5, 2), ('C', 30, 500), ('G', 60, 40)]) + data.add_snp('c2', 100, [('C', 10, 2), ('T', 2, 8)]) output = list(intrahost.filter_strand_bias(data)) - expected = ['c1', '100', 'A', 'G', None, 'snp', 23.076923076923078, - 'G:60:40:60:40:1', 'A:10:20:10:20:1'] + expected = ['c1', '100', 'A', 'G', None, 'snp', 23.076923076923078, 'G:60:40:60:40:1', 'A:10:20:10:20:1'] self.assertEqual(len(output), 1) self.assertEqual(output[0][:4], expected[:4]) self.assertEqual(output[0][5], expected[5]) self.assertAlmostEqual(float(output[0][6]), expected[6], places=4) self.assertEqual(output[0][7:], expected[7:]) - + def test_vphaser_one_sample(self): # Files here were created as follows: # - in.bam was copied from input directory for TestVPhaser2; see notes @@ -151,8 +168,7 @@ def test_vphaser_one_sample(self): inBam = os.path.join(myInputDir, 'in.bam') refFasta = os.path.join(myInputDir, 'ref.fasta') outTab = util.file.mkstempfname('.txt') - intrahost.vphaser_one_sample(inBam, refFasta, outTab, - vphaserNumThreads = 4, minReadsEach = 6, maxBias = 3) + intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3) expected = os.path.join(myInputDir, 'vphaser_one_sample_expected.txt') self.assertEqualContents(outTab, expected) @@ -171,8 +187,7 @@ def test_vphaser_one_sample_indels(self): inBam = os.path.join(myInputDir, 'in.indels.bam') refFasta = os.path.join(myInputDir, 'ref.indels.fasta') outTab = util.file.mkstempfname('.txt') - intrahost.vphaser_one_sample(inBam, refFasta, outTab, - vphaserNumThreads = 4, minReadsEach = 0) + intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=0) expected = os.path.join(myInputDir, 'vphaser_one_sample_indels_expected.txt') self.assertEqualContents(outTab, expected) @@ -183,8 +198,7 @@ def test_vphaser_one_sample_2libs(self): inBam = os.path.join(myInputDir, 'in.2libs.bam') refFasta = os.path.join(myInputDir, 'ref.fasta') outTab = util.file.mkstempfname('.txt') - intrahost.vphaser_one_sample(inBam, refFasta, outTab, - vphaserNumThreads = 4, minReadsEach = 6, maxBias = 3) + intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3) expected = os.path.join(myInputDir, 'vphaser_one_sample_2libs_expected.txt') self.assertEqualContents(outTab, expected) @@ -200,8 +214,7 @@ def test_vphaser_one_sample_3libs_and_chi2(self): inBam = os.path.join(myInputDir, 'in.3libs.bam') refFasta = os.path.join(myInputDir, 'ref.fasta') outTab = util.file.mkstempfname('.txt') - intrahost.vphaser_one_sample(inBam, refFasta, outTab, - vphaserNumThreads = 4, minReadsEach = 6, maxBias = 3) + intrahost.vphaser_one_sample(inBam, refFasta, outTab, vphaserNumThreads=4, minReadsEach=6, maxBias=3) expected = os.path.join(myInputDir, 'vphaser_one_sample_3libs_expected.txt') self.assertEqualContents(outTab, expected) @@ -209,43 +222,50 @@ def test_vphaser_one_sample_3libs_and_chi2(self): class VcfMergeRunner: ''' This creates test data and feeds it to intrahost.merge_to_vcf ''' + def __init__(self, ref_genome=None): - self.genomes = {} # {sample : {chrom : bases, ...}, ...} - self.genomeFastas = OrderedDict() # {sample: fastaFileName, ... - self.alignedFastas = [] # [chr1, chr2, ...] 
+ self.genomes = {} # {sample : {chrom : bases, ...}, ...} + self.genomeFastas = OrderedDict() # {sample: fastaFileName, ... + self.alignedFastas = [] # [chr1, chr2, ...] self.isnvs = {} self.sample_order = [] - self.sequence_order = OrderedDict() # { sample: [seq1, seq2, ... ], ... } + self.sequence_order = OrderedDict() # { sample: [seq1, seq2, ... ], ... } if ref_genome: self.set_ref(ref_genome) + def set_ref(self, genome): self.ref = makeTempFasta(genome) + def add_genome(self, sample_name, genome): self.genomes[sample_name] = dict(genome) self.genomeFastas[sample_name] = makeTempFasta(genome) if sample_name not in self.sample_order: self.sample_order.append(sample_name) self.isnvs.setdefault(sample_name, MockVphaserOutput()) + def add_snp(self, sample, chrom, pos, acounts, libinfo=None): assert sample in self.genomeFastas assert chrom in self.genomes[sample] assert 1 <= pos <= len(self.genomes[sample][chrom]) - assert self.genomes[sample][chrom][pos - 1] in [a for a,f,r in acounts] + assert self.genomes[sample][chrom][pos - 1] in [a for a, f, r in acounts] self.isnvs[sample].add_snp(chrom, pos, acounts, libinfo) + def add_indel(self, sample, chrom, pos, acounts, libinfo=None): assert sample in self.genomeFastas assert sample in self.genomeFastas assert chrom in self.genomes[sample] assert 1 <= pos <= len(self.genomes[sample][chrom]) - if acounts[0][0] != '' : # deletion - assert self.genomes[sample][chrom][pos - 1 :].startswith(acounts[0][0]) + if acounts[0][0] != '': # deletion + assert self.genomes[sample][chrom][pos - 1:].startswith(acounts[0][0]) self.isnvs[sample].add_indel(chrom, pos, acounts, libinfo) + def dump_isnv_tmp_file(self, sample): fn = util.file.mkstempfname('.txt') with open(fn, 'wt') as outf: for row in self.isnvs[sample]: outf.write('\t'.join(map(str, row)) + '\n') return fn + def run_and_get_vcf_rows(self, retree=1): outVcf = util.file.mkstempfname('.vcf.gz') @@ -253,10 +273,8 @@ def run_and_get_vcf_rows(self, retree=1): seqIds = list(itertools.chain.from_iterable(self.sequence_order.values())) - intrahost.merge_to_vcf(self.ref, outVcf, - seqIds, - list(self.dump_isnv_tmp_file(s) for s in self.sample_order), - self.alignedFastas) + intrahost.merge_to_vcf(self.ref, outVcf, seqIds, list(self.dump_isnv_tmp_file(s) for s in self.sample_order), + self.alignedFastas) with util.vcf.VcfReader(outVcf) as vcf: rows = list(vcf.get()) return rows @@ -273,9 +291,9 @@ def multi_align_samples(self, retree=1): for sampleName, fastaFile in genomeKVIterator: with util.file.open_or_gzopen(fastaFile, 'r') as inf: - for seq in Bio.SeqIO.parse(inf, 'fasta'): + for seq in Bio.SeqIO.parse(inf, 'fasta'): self.sequence_order.setdefault(sampleName, default=[]) - self.sequence_order[sampleName].append( seq.id ) + self.sequence_order[sampleName].append(seq.id) inputFastas = [] inputFastas.append(self.ref) @@ -284,36 +302,37 @@ def multi_align_samples(self, retree=1): # since the FASTA files are for idx, filePath in enumerate(transposedFiles): - + outFile = util.file.mkstempfname('.fasta') outFilePath = os.path.dirname(outFile) alignedOutFile = tools.mafft.MafftTool().execute( - inFastas = [os.path.abspath(filePath)], - outFile = os.path.join(outFilePath, "{}{}.fasta".format("aligned", idx)), - localpair = False, - globalpair = True, - preservecase = True, - reorder = None, - gapOpeningPenalty = None, - offset = None, - verbose = False, - outputAsClustal = None, - maxiters = 1000, - threads = -1, - retree = retree - ) + inFastas=[os.path.abspath(filePath)], + outFile=os.path.join(outFilePath, 
"{}{}.fasta".format("aligned", idx)), + localpair=False, + globalpair=True, + preservecase=True, + reorder=None, + gapOpeningPenalty=None, + offset=None, + verbose=False, + outputAsClustal=None, + maxiters=1000, + threads=-1, + retree=retree) self.alignedFastas.append(alignedOutFile) + class TestVcfMerge(test.TestCaseWithTmp): ''' This tests step 2 of the iSNV calling process (intrahost.merge_to_vcf), which gets really nasty and tricky and has lots of edge cases. These unit tests mock the vphaser tool output and just tests the merge and VCF stuff. ''' + def test_empty_output(self): ref = makeTempFasta([('ref1', 'ATCGCA')]) - s1 = makeTempFasta([('s1_1', 'ATCGCA')]) + s1 = makeTempFasta([('s1_1', 'ATCGCA')]) emptyfile = util.file.mkstempfname('.txt') outVcf = util.file.mkstempfname('.vcf') #intrahost.merge_to_vcf(ref, outVcf, ['s1'], [emptyfile], [s1]) @@ -329,40 +348,47 @@ def test_empty_output(self): with util.file.open_or_gzopen(outVcf, 'rt') as inf: for line in inf: self.assertTrue(line.startswith('#')) - + def test_headers_with_two_samps(self): ref = makeTempFasta([('ref1', 'ATCGTTCA'), ('ref2', 'GGCCC')]) - s1 = makeTempFasta([('s1_1', 'ATCGCA'), ('s1_2', 'GGCCC')]) - s2 = makeTempFasta([('s2_1', 'ATCGTTCA'), ('s2_2', 'GGCCC')]) + s1 = makeTempFasta([('s1_1', 'ATCGCA'), ('s1_2', 'GGCCC')]) + s2 = makeTempFasta([('s2_1', 'ATCGTTCA'), ('s2_2', 'GGCCC')]) emptyfile = util.file.mkstempfname('.txt') outVcf = util.file.mkstempfname('.vcf.gz') - self.assertRaises(LookupError, intrahost.merge_to_vcf, ref, outVcf, ['s1', 's2'], [emptyfile, emptyfile], [s1, s2]) + self.assertRaises( + LookupError, intrahost.merge_to_vcf, ref, outVcf, [ + 's1', 's2' + ], [ + emptyfile, emptyfile + ], [ + s1, s2 + ]) with util.vcf.VcfReader(outVcf) as vcf: self.assertEqual(vcf.samples(), ['s1', 's2']) - self.assertEqual(vcf.chrlens(), {'ref1':8, 'ref2':5}) - + self.assertEqual(vcf.chrlens(), {'ref1': 8, 'ref2': 5}) + def test_simple_snps(self): merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) merger.add_genome('s1', [('s1_1', 'ATCGGAC')]) - # ATCGGAC- - merger.add_genome('s2', [('s2_1', 'TCGGACT')]) - # -TCGGACT - merger.add_genome('s3', [('s3_1', 'TCGGACT')]) - # -TCGGACT + # ATCGGAC- + merger.add_genome('s2', [('s2_1', 'TCGGACT')]) + # -TCGGACT + merger.add_genome('s3', [('s3_1', 'TCGGACT')]) + # -TCGGACT merger.add_snp('s1', 's1_1', 3, [('C', 80, 80), ('A', 20, 20)]) merger.add_snp('s2', 's2_1', 2, [('C', 90, 90), ('A', 10, 10)]) merger.add_snp('s3', 's3_1', 5, [('A', 70, 70), ('T', 30, 30)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 2) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 3) + self.assertEqual(rows[0].pos + 1, 3) self.assertEqual(rows[0].ref, 'C') self.assertEqual(rows[0].alt, 'A') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.1') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.0') self.assertEqual(rows[1].contig, 'ref1') - self.assertEqual(rows[1].pos+1, 6) + self.assertEqual(rows[1].pos + 1, 6) self.assertEqual(rows[1].ref, 'A') self.assertEqual(rows[1].alt, 'T') self.assertEqual(':'.join(rows[1][0].split(':')[:2]), '0:0.0') @@ -372,15 +398,15 @@ def test_simple_snps(self): def test_snps_downstream_of_indels(self): merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) - merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) - merger.add_genome('s3', [('s3_1', 'TCGGCCT')]) + merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) + 
merger.add_genome('s3', [('s3_1', 'TCGGCCT')]) merger.add_snp('s1', 's1_1', 8, [('A', 80, 80), ('C', 20, 20)]) merger.add_snp('s2', 's2_1', 7, [('A', 90, 90), ('C', 10, 10)]) merger.add_snp('s3', 's3_1', 5, [('C', 70, 70), ('A', 30, 30)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 6) + self.assertEqual(rows[0].pos + 1, 6) self.assertEqual(rows[0].ref, 'A') self.assertEqual(rows[0].alt, 'C') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2') @@ -393,12 +419,12 @@ def test_sample_major_allele_not_ref_allele(self): merger = VcfMergeRunner([('ref1', 'ATCG')]) merger.add_genome('s1', [('s1_1', 'ATAGCCC')]) merger.add_snp('s1', 's1_1', 3, [('C', 10, 10), ('A', 90, 90)]) - # we need to specify retree as None + # we need to specify retree as None # so the test sequence aligns as expected rows = merger.run_and_get_vcf_rows(retree=None) self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 3) + self.assertEqual(rows[0].pos + 1, 3) self.assertEqual(rows[0].ref, 'C') self.assertEqual(rows[0].alt, 'A') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.9') @@ -418,13 +444,13 @@ def test_backfill_sample_from_assembly(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 3) + self.assertEqual(rows[0].pos + 1, 3) self.assertEqual(rows[0].ref, 'C') self.assertEqual(rows[0].alt, 'A') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.1') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:1.0') self.assertEqual(rows[0][2], '.:.:.:.') - + def test_simple_insertions(self): # IA, ITCG, etc # V-Phaser outputs a position that is just prior to where the new @@ -434,7 +460,7 @@ def test_simple_insertions(self): # For example: ATCG -> ATAAACG is considered "IAAA" at position 2. # This is not the same as the VCF convention, which includes the # initial invariant base as part of the allele (it's a T -> TAAA - # variant at position 2). + # variant at position 2). merger = VcfMergeRunner([('ref1', 'ATCG')]) merger.add_genome('s1', [('s1_1', 'ATCG')]) merger.add_genome('s2', [('s2_1', 'ATCG')]) @@ -445,13 +471,13 @@ def test_simple_insertions(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 2) + self.assertEqual(rows[0].pos + 1, 2) self.assertEqual(rows[0].ref, 'T') self.assertEqual(rows[0].alt, 'TAA,TGCC,TAT') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.2,0.0,0.0') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.0,0.0,0.1') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.05,0.15,0.0') - + def test_simple_deletions(self): # D1, D2, etc... 
# V-Phaser outputs a position that describes the deleted base and @@ -466,18 +492,18 @@ def test_simple_deletions(self): merger.add_genome('s2', [('s2_1', 'ATCGAT')]) merger.add_genome('s3', [('s3_1', 'ATCGAT')]) merger.add_indel('s1', 's1_1', 3, [('CG', 80, 80), ('', 20, 20)]) - merger.add_indel('s2', 's2_1', 3, [('C', 90, 90), ('', 10, 10)]) + merger.add_indel('s2', 's2_1', 3, [('C', 90, 90), ('', 10, 10)]) merger.add_indel('s3', 's3_1', 3, [('CG', 80, 80), ('G', 15, 15), ('', 5, 5)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 2) + self.assertEqual(rows[0].pos + 1, 2) self.assertEqual(rows[0].ref, 'TCG') self.assertEqual(rows[0].alt, 'TG,T') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '0:0.0,0.2') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '0:0.1,0.0') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '0:0.15,0.05') - + def test_deletion_spans_deletion(self): # sample assembly has deletion against reference and isnv deletes even more # POS is anchored right before the deletion @@ -490,11 +516,11 @@ def test_deletion_spans_deletion(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'GTTC') self.assertEqual(rows[0].alt, 'GC,G') self.assertEqual(rows[0][0], '1:0.9,0.1:0,1,1:.,1.0,1.0') - + def test_insertion_spans_deletion(self): # sample assembly has deletion against reference and isnv inserts back into it # POS is anchored right before the deletion @@ -507,37 +533,37 @@ def test_insertion_spans_deletion(self): merger.add_genome('s1', [('s1_1', 'ATCGCACC')]) merger.add_genome('s2', [('s2_1', 'ATCGCACC')]) merger.add_genome('s3', [('s3_1', 'ATCGCACC')]) - merger.add_indel('s1', 's1_1', 4, [('', 70, 70), ('T', 30, 30)]) - merger.add_indel('s2', 's2_1', 4, [('', 80, 80), ('TT', 20, 20)]) + merger.add_indel('s1', 's1_1', 4, [('', 70, 70), ('T', 30, 30)]) + merger.add_indel('s2', 's2_1', 4, [('', 80, 80), ('TT', 20, 20)]) merger.add_indel('s3', 's3_1', 4, [('', 90, 90), ('TTC', 10, 10)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'GTT') self.assertEqual(rows[0].alt, 'G,GT,GTTC') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.0,0.0') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.9,0.0,0.1') - + def test_snp_within_insertion(self): # sample assembly has insertion against reference and isnp modifies it # REF: ATCG--GA # S1: ATCGTTGA - # isnv: C - # isnv: C - # isnv: C + # isnv: C + # isnv: C + # isnv: C merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) - merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) - merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) + merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) + merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) merger.add_snp('s1', 's1_1', 4, [('G', 70, 70), ('C', 30, 30)]) merger.add_snp('s2', 's2_1', 4, [('T', 80, 80), ('C', 20, 20)]) merger.add_snp('s3', 's3_1', 5, [('T', 90, 90), ('C', 10, 10)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + 
self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'G') self.assertEqual(rows[0].alt, 'GTT,CTT,GCT,GTC') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0,0.0') @@ -552,9 +578,9 @@ def test_2snps_within_insertion_same_sample(self): # REF: ATCG--GACT # S1: ATCGTTGACT # isnv: C - # isnv: C + # isnv: C merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) - merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) + merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) merger.add_snp('s1', 's1_1', 5, [('T', 80, 80), ('C', 20, 20)]) merger.add_snp('s1', 's1_1', 6, [('T', 90, 90), ('C', 10, 10)]) self.assertRaises(NotImplementedError, merger.run_and_get_vcf_rows) @@ -576,19 +602,19 @@ def test_deletion_past_end_of_some_consensus(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 2) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'GAA') # multiple options because the allele frequencies can be the same - self.assertIn(rows[0].alt, ['C,G,A','G,C,A']) - self.assertIn(rows[0][0], ['2:0.0,0.7,0.3:0,0,1,1:.,.,1.0,1.0','1:0.7,0.0,0.3:0,1,0,1:.,1.0,.,1.0']) - self.assertIn(rows[0][1], ['1:1.0,0.0,0.0:.:.','2:0.0,1.0,0.0:.:.']) + self.assertIn(rows[0].alt, ['C,G,A', 'G,C,A']) + self.assertIn(rows[0][0], ['2:0.0,0.7,0.3:0,0,1,1:.,.,1.0,1.0', '1:0.7,0.0,0.3:0,1,0,1:.,1.0,.,1.0']) + self.assertIn(rows[0][1], ['1:1.0,0.0,0.0:.:.', '2:0.0,1.0,0.0:.:.']) self.assertEqual(rows[1].contig, 'ref1') - self.assertEqual(rows[1].pos+1, 7) + self.assertEqual(rows[1].pos + 1, 7) self.assertEqual(rows[1].ref, 'C') self.assertEqual(rows[1].alt, 'T') self.assertEqual(rows[1][0], '0:0.0:.:.') self.assertEqual(rows[1][1], '1:0.8:1,1:1.0,1.0') - + def test_snp_past_end_of_some_consensus(self): # Some sample contains SNP beyond the end of the consensus # sequence of another. 
It should skip latter rather @@ -604,7 +630,7 @@ def test_snp_past_end_of_some_consensus(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 2) + self.assertEqual(rows[0].pos + 1, 2) self.assertEqual(rows[0].ref, 'T') self.assertEqual(rows[0].alt, 'G') self.assertEqual(rows[0][0], '0:0.3:1,1:1.0,1.0') @@ -622,9 +648,9 @@ def test_deletion_within_insertion(self): # isnv: x (position 7, D1) s4 => GTT merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) - merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) - merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) - merger.add_genome('s4', [('s4_1', 'TCGTTGACT')]) + merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) + merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) + merger.add_genome('s4', [('s4_1', 'TCGTTGACT')]) merger.add_indel('s1', 's1_1', 5, [('TTG', 40, 40), ('TG', 30, 30), ('G', 20, 20), ('', 10, 10)]) merger.add_indel('s2', 's2_1', 4, [('T', 80, 80), ('', 20, 20)]) merger.add_indel('s3', 's3_1', 5, [('TG', 85, 85), ('G', 10, 10), ('', 5, 5)]) @@ -632,14 +658,14 @@ def test_deletion_within_insertion(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'GG') self.assertEqual(rows[0].alt, 'GTTG,GTG,GTT,G,GT') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.4,0.3,0.0,0.1,0.0') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.2,0.0,0.0,0.0') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.85,0.1,0.0,0.0,0.05') self.assertEqual(':'.join(rows[0][3].split(':')[:2]), '1:0.9,0.0,0.1,0.0,0.0') - + def test_insertion_within_insertion(self): # sample assembly has insertion against reference and isnv puts even more in # REF: ATCG--GA @@ -649,21 +675,21 @@ def test_insertion_within_insertion(self): # isnv: ^ (position 6, IA) merger = VcfMergeRunner([('ref1', 'ATCGGACT')]) merger.add_genome('s1', [('s1_1', 'ATCGTTGACT')]) - merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) - merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) + merger.add_genome('s2', [('s2_1', 'TCGTTGACT')]) + merger.add_genome('s3', [('s3_1', 'TCGTTGACT')]) merger.add_indel('s1', 's1_1', 4, [('', 70, 70), ('A', 30, 30)]) merger.add_indel('s2', 's2_1', 4, [('', 80, 80), ('A', 20, 20)]) merger.add_indel('s3', 's3_1', 5, [('', 90, 90), ('A', 10, 10)]) rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 4) + self.assertEqual(rows[0].pos + 1, 4) self.assertEqual(rows[0].ref, 'G') self.assertEqual(rows[0].alt, 'GTT,GATT,GTAT,GTTA') self.assertEqual(':'.join(rows[0][0].split(':')[:2]), '1:0.7,0.3,0.0,0.0') self.assertEqual(':'.join(rows[0][1].split(':')[:2]), '1:0.8,0.0,0.2,0.0') self.assertEqual(':'.join(rows[0][2].split(':')[:2]), '1:0.9,0.0,0.0,0.1') - + def test_indel_collapse(self): # vphaser describes insertions and deletions separately # test appropriate collapse of coincident insertions and deletions into @@ -684,12 +710,9 @@ def test_indel_collapse(self): rows = merger.run_and_get_vcf_rows() self.assertEqual(len(rows), 1) self.assertEqual(rows[0].contig, 'ref1') - self.assertEqual(rows[0].pos+1, 2) + self.assertEqual(rows[0].pos + 1, 2) self.assertEqual(rows[0].ref, 'TC') self.assertEqual(rows[0].alt, 'T,TAC') - self.assertEqual(rows[0][0].split(':')[0], '0') # 
s1 is 0.5 TC, 0.3 T, 0.2 TAC + self.assertEqual(rows[0][0].split(':')[0], '0') # s1 is 0.5 TC, 0.3 T, 0.2 TAC for actual, expected in zip(rows[0][0].split(':')[1].split(','), [0.3, 0.2]): self.assertAlmostEqual(float(actual), expected, places=2) - - - diff --git a/test/unit/test_ncbi.py b/test/unit/test_ncbi.py index 52657c937..adc203ec0 100644 --- a/test/unit/test_ncbi.py +++ b/test/unit/test_ncbi.py @@ -3,15 +3,16 @@ __author__ = "tomkinsc@broadinstitute.org" # built-ins -import unittest, argparse +import unittest +import argparse -#module-specific +# module-specific import ncbi + class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in ncbi.__commands__: parser = parser_fun(argparse.ArgumentParser()) helpstring = parser.format_help() - - diff --git a/test/unit/test_read_utils.py b/test/unit/test_read_utils.py index 91b73a048..26eaa65cf 100644 --- a/test/unit/test_read_utils.py +++ b/test/unit/test_read_utils.py @@ -2,18 +2,30 @@ __author__ = "irwin@broadinstitute.org" -import unittest, os, tempfile, argparse, filecmp -import util, util.file, read_utils, tools, tools.samtools +import unittest +import os +import tempfile +import argparse +import filecmp +import util +import util.file +import read_utils +import tools +import tools.samtools from test import TestCaseWithTmp + class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in read_utils.__commands__: parser = parser_fun(argparse.ArgumentParser()) helpstring = parser.format_help() -class TestPurgeUnmated(TestCaseWithTmp) : - def test_purge_unmated(self) : + +class TestPurgeUnmated(TestCaseWithTmp): + + def test_purge_unmated(self): myInputDir = util.file.get_test_input_path(self) inFastq1 = os.path.join(myInputDir, 'in1.fastq') inFastq2 = os.path.join(myInputDir, 'in2.fastq') @@ -46,8 +58,10 @@ def test_purge_unmated_sra(self): self.assertEqualContents(outFastq1, expected1Fastq) self.assertEqualContents(outFastq2, expected2Fastq) -class TestFastqToFasta(TestCaseWithTmp) : - def test_fastq_to_fasta(self) : + +class TestFastqToFasta(TestCaseWithTmp): + + def test_fastq_to_fasta(self): myInputDir = util.file.get_test_input_path(self) inFastq = os.path.join(myInputDir, 'in.fastq') outFasta = util.file.mkstempfname('.fasta') @@ -59,11 +73,13 @@ def test_fastq_to_fasta(self) : expectedFasta = os.path.join(myInputDir, 'expected.fasta') self.assertEqualContents(outFasta, expectedFasta) -class TestFastqBam(TestCaseWithTmp) : + +class TestFastqBam(TestCaseWithTmp): 'Class for testing fastq <-> bam conversions' - def test_fastq_bam(self) : + + def test_fastq_bam(self): myInputDir = util.file.get_test_input_path(self) - + # Define file names inFastq1 = os.path.join(myInputDir, 'in1.fastq') inFastq2 = os.path.join(myInputDir, 'in2.fastq') @@ -77,17 +93,20 @@ def test_fastq_bam(self) : outFastq1 = util.file.mkstempfname('.fastq') outFastq2 = util.file.mkstempfname('.fastq') outHeader = util.file.mkstempfname('.txt') - + # in1.fastq, in2.fastq -> out.bam; header params from command-line parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser()) - args = parser.parse_args([inFastq1, inFastq2, outBamCmd, - '--sampleName', 'FreeSample', - '--JVMmemory', '1g', - '--picardOptions', - 'LIBRARY_NAME=Alexandria', - 'PLATFORM=9.75', - 'SEQUENCING_CENTER=KareemAbdul-Jabbar', - ]) + args = parser.parse_args([inFastq1, + inFastq2, + outBamCmd, + '--sampleName', + 'FreeSample', + '--JVMmemory', + '1g', + '--picardOptions', + 
'LIBRARY_NAME=Alexandria', + 'PLATFORM=9.75', + 'SEQUENCING_CENTER=KareemAbdul-Jabbar',]) args.func_main(args) # samtools view for out.sam and compare to expected @@ -95,51 +114,57 @@ def test_fastq_bam(self) : samtools.view(['-h'], outBamCmd, outSam) # picard.sam.FastqToSam outputs header fields in different order for # java version 1.8 vs 1.7/1.6, so compare both - self.assertTrue(filecmp.cmp(outSam, expected1_7Sam, shallow=False) or - filecmp.cmp(outSam, expected1_8Sam, shallow=False)) - + self.assertTrue(filecmp.cmp(outSam, + expected1_7Sam, + shallow=False) or filecmp.cmp(outSam, + expected1_8Sam, + shallow=False)) + # in1.fastq, in2.fastq, inHeader.txt -> out.bam; header from txt parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser()) - args = parser.parse_args([inFastq1, inFastq2, outBamTxt, - '--header', inHeader]) + args = parser.parse_args([inFastq1, inFastq2, outBamTxt, '--header', inHeader]) args.func_main(args) # out.bam -> out1.fastq, out2.fastq, outHeader.txt; trim 1 base from 1 parser = read_utils.parser_bam_to_fastq(argparse.ArgumentParser()) - args = parser.parse_args([outBamTxt, outFastq1, outFastq2, - '--outHeader', outHeader, - '--JVMmemory', '1g', - '--picardOptions', 'READ1_TRIM=1', - ]) + args = parser.parse_args([outBamTxt, + outFastq1, + outFastq2, + '--outHeader', + outHeader, + '--JVMmemory', + '1g', + '--picardOptions', + 'READ1_TRIM=1',]) args.func_main(args) # compare to out1.fastq, out2.fastq, outHeader.txt to in and expected - self.assertEqualContents(outFastq1, expectedFastq1) # 1 base trimmed + self.assertEqualContents(outFastq1, expectedFastq1) # 1 base trimmed self.assertEqualContents(outFastq2, inFastq2) self.assertEqualContents(outHeader, inHeader) - -class TestSplitReads(TestCaseWithTmp) : + +class TestSplitReads(TestCaseWithTmp): 'Test various options of split_reads command.' - def test_max_reads(self) : + + def test_max_reads(self): 'Test splitting fastq using --maxReads option, with indexLen 1.' myInputDir = util.file.get_test_input_path(self) inFastq = os.path.join(myInputDir, 'in.fastq') outPrefix = util.file.mkstempfname() - + # Split parser = read_utils.parser_split_reads(argparse.ArgumentParser()) - args = parser.parse_args([inFastq, outPrefix, '--maxReads', '4', - '--indexLen', '1']) + args = parser.parse_args([inFastq, outPrefix, '--maxReads', '4', '--indexLen', '1']) args.func_main(args) - + # Check that results match expected expectedFastq1 = os.path.join(myInputDir, 'expected.fastq.1') expectedFastq2 = os.path.join(myInputDir, 'expected.fastq.2') self.assertEqualContents(outPrefix + '1', expectedFastq1) self.assertEqualContents(outPrefix + '2', expectedFastq2) - def test_num_chunks(self) : + def test_num_chunks(self): 'Test spliting fastq.gz using --numChunks option, with default indexLen.' myInputDir = util.file.get_test_input_path(self) inFastq = os.path.join(myInputDir, 'in.fastq.gz') @@ -149,7 +174,7 @@ def test_num_chunks(self) : parser = read_utils.parser_split_reads(argparse.ArgumentParser()) args = parser.parse_args([inFastq, outPrefix, '--numChunks', '3']) args.func_main(args) - + # Check that results match expected expectedFastq1 = os.path.join(myInputDir, 'expected.fastq.01') expectedFastq2 = os.path.join(myInputDir, 'expected.fastq.02') @@ -158,7 +183,7 @@ def test_num_chunks(self) : self.assertEqualContents(outPrefix + '02', expectedFastq2) self.assertEqualContents(outPrefix + '03', expectedFastq3) - def test_fasta(self) : + def test_fasta(self): 'Test splitting fasta file.' 
myInputDir = util.file.get_test_input_path(self) inFasta = os.path.join(myInputDir, 'in.fasta') @@ -166,18 +191,17 @@ def test_fasta(self) : # Split parser = read_utils.parser_split_reads(argparse.ArgumentParser()) - args = parser.parse_args([inFasta, outPrefix, '--numChunks', '2', - '--format', 'fasta']) + args = parser.parse_args([inFasta, outPrefix, '--numChunks', '2', '--format', 'fasta']) args.func_main(args) - + # Check that results match expected expectedFasta1 = os.path.join(myInputDir, 'expected.fasta.01') expectedFasta2 = os.path.join(myInputDir, 'expected.fasta.02') self.assertEqualContents(outPrefix + '01', expectedFasta1) self.assertEqualContents(outPrefix + '02', expectedFasta2) - -class TestMvicuna(TestCaseWithTmp) : + +class TestMvicuna(TestCaseWithTmp): """ Input consists of 3 read pairs. Second read pair is identical to first. @@ -189,10 +213,11 @@ class TestMvicuna(TestCaseWithTmp) : [IJ:]I have no idea if this is the correct behavior, but test checks that it doesn't change. """ - def test_mvicuna(self) : + + def test_mvicuna(self): tempDir = tempfile.mkdtemp() myInputDir = util.file.get_test_input_path(self) - + # Run mvicuna inFastq1 = os.path.join(myInputDir, 'in.1.fastq') inFastq2 = os.path.join(myInputDir, 'in.2.fastq') @@ -200,17 +225,13 @@ def test_mvicuna(self) : pairedOutFastq2 = os.path.join(tempDir, 'pairedOut.2.fastq') unpairedOutFastq = os.path.join(tempDir, 'unpairedOut.fastq') args = read_utils.parser_dup_remove_mvicuna(argparse.ArgumentParser()).parse_args( - [inFastq1, inFastq2, - pairedOutFastq1, pairedOutFastq2, - '--unpairedOutFastq', unpairedOutFastq]) + [inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, '--unpairedOutFastq', unpairedOutFastq]) args.func_main(args) - + # Compare to expected - for filename in ['pairedOut.1.fastq', 'pairedOut.2.fastq', - 'unpairedOut.fastq'] : - self.assertEqualContents( - os.path.join(tempDir, filename), - os.path.join(myInputDir, 'expected_' + filename)) + for filename in ['pairedOut.1.fastq', 'pairedOut.2.fastq', 'unpairedOut.fastq']: + self.assertEqualContents(os.path.join(tempDir, filename), os.path.join(myInputDir, 'expected_' + filename)) + if __name__ == '__main__': unittest.main() diff --git a/test/unit/test_snake.py b/test/unit/test_snake.py index 5ae4fe1fc..3d1105d9e 100644 --- a/test/unit/test_snake.py +++ b/test/unit/test_snake.py @@ -2,20 +2,26 @@ __author__ = "dpark@broadinstitute.org" -import util.cmd, util.file -import unittest, argparse -import sys, os, subprocess, shutil, tempfile, argparse, itertools +import util.cmd +import util.file +import unittest +import argparse +import sys +import os +import subprocess +import shutil +import tempfile +import argparse +import itertools from test import TestCaseWithTmp -if sys.version_info>=(3,2): +if sys.version_info >= (3, 2): import snakemake -def setup_dummy_simple( - sample_names=('G1234', 'G5678', 'G3671.1_r1', 'G3680-1_4', - '9876', 'x.y-7b')): +def setup_dummy_simple(sample_names=('G1234', 'G5678', 'G3671.1_r1', 'G3680-1_4', '9876', 'x.y-7b')): ''' Set up a very simple project directory with empty input files. 
''' - + workdir = tempfile.mkdtemp() os.mkdir(os.path.join(workdir, 'data')) os.mkdir(os.path.join(workdir, 'ref_genome')) @@ -23,36 +29,33 @@ def setup_dummy_simple( os.mkdir(os.path.join(workdir, 'log')) os.mkdir(os.path.join(workdir, 'reports')) os.mkdir(os.path.join(workdir, 'tmp')) - + for s in sample_names: - with open(os.path.join(workdir, 'data', '00_raw', s+'.bam'), 'wt') as outf: + with open(os.path.join(workdir, 'data', '00_raw', s + '.bam'), 'wt') as outf: pass - for fn in ('samples-assembly.txt', 'samples-depletion.txt', - 'samples-runs.txt', 'samples-assembly-failures.txt'): + for fn in ('samples-assembly.txt', 'samples-depletion.txt', 'samples-runs.txt', 'samples-assembly-failures.txt'): with open(os.path.join(workdir, fn), 'wt') as outf: for s in sample_names: - outf.write(s+'\n') - - shutil.copy(os.path.join(util.file.get_project_path(), - 'pipes', 'Snakefile'), workdir) - shutil.copy(os.path.join(util.file.get_project_path(), - 'pipes', 'config.json'), workdir) - - os.symlink(util.file.get_project_path(), - os.path.join(workdir, 'bin')) - + outf.write(s + '\n') + + shutil.copy(os.path.join(util.file.get_project_path(), 'pipes', 'Snakefile'), workdir) + shutil.copy(os.path.join(util.file.get_project_path(), 'pipes', 'config.json'), workdir) + + os.symlink(util.file.get_project_path(), os.path.join(workdir, 'bin')) + return workdir -@unittest.skipIf(sys.version_info<(3,2), "python version is too old for snakemake") +@unittest.skipIf(sys.version_info < (3, 2), "python version is too old for snakemake") class TestSimpleDryRuns(TestCaseWithTmp): + def setUp(self): super(TestSimpleDryRuns, self).setUp() self.workdir = setup_dummy_simple() - self.env = {'GATK_PATH':os.environ.get('GATK_PATH'), - 'NOVOALIGN_PATH':os.environ.get('NOVOALIGN_PATH')} + self.env = {'GATK_PATH': os.environ.get('GATK_PATH'), 'NOVOALIGN_PATH': os.environ.get('NOVOALIGN_PATH')} + def tearDown(self): - for k,v in self.env.items(): + for k, v in self.env.items(): if v: os.environ[k] = v super(TestSimpleDryRuns, self).tearDown() @@ -62,11 +65,13 @@ def test_dryrun_all(self): self.assertTrue(snakemake.snakemake( os.path.join(self.workdir, 'Snakefile'), #configfile=os.path.join(self.workdir, 'config.json'), - workdir=self.workdir, dryrun=True)) + workdir=self.workdir, + dryrun=True)) self.assertTrue(snakemake.snakemake( os.path.join(self.workdir, 'Snakefile'), #configfile=os.path.join(self.workdir, 'config.json'), - workdir=self.workdir, dryrun=True, + workdir=self.workdir, + dryrun=True, targets=['all'])) def test_dryrun_all_assemble(self): @@ -74,7 +79,8 @@ def test_dryrun_all_assemble(self): self.assertTrue(snakemake.snakemake( os.path.join(self.workdir, 'Snakefile'), #configfile=os.path.join(self.workdir, 'config.json'), - workdir=self.workdir, dryrun=True, + workdir=self.workdir, + dryrun=True, targets=['all_assemble'])) def test_dryrun_all_deplete(self): @@ -82,5 +88,6 @@ def test_dryrun_all_deplete(self): self.assertTrue(snakemake.snakemake( os.path.join(self.workdir, 'Snakefile'), #configfile=os.path.join(self.workdir, 'config.json'), - workdir=self.workdir, dryrun=True, + workdir=self.workdir, + dryrun=True, targets=['all_deplete'])) diff --git a/test/unit/test_taxon_filter.py b/test/unit/test_taxon_filter.py index 92e4a107b..741e9227c 100644 --- a/test/unit/test_taxon_filter.py +++ b/test/unit/test_taxon_filter.py @@ -1,23 +1,34 @@ # Unit tests for taxon_filter.py __author__ = "dpark@broadinstitute.org, irwin@broadinstitute.org," \ - + "hlevitin@broadinstitute.org" - -import unittest, os, tempfile, 
shutil, subprocess, argparse -import taxon_filter, util.file, tools.last, tools.bmtagger, tools.blast, read_utils + + "hlevitin@broadinstitute.org" + +import unittest +import os +import tempfile +import shutil +import subprocess +import argparse +import taxon_filter +import util.file +import tools.last +import tools.bmtagger +import tools.blast +import read_utils from test import assert_equal_contents, TestCaseWithTmp class TestCommandHelp(unittest.TestCase): + def test_help_parser_for_each_command(self): for cmd_name, parser_fun in taxon_filter.__commands__: parser = parser_fun(argparse.ArgumentParser()) helpstring = parser.format_help() -class TestTrimmomatic(TestCaseWithTmp) : +class TestTrimmomatic(TestCaseWithTmp): - def test_trimmomatic(self) : + def test_trimmomatic(self): myInputDir = util.file.get_test_input_path(self) inFastq1 = os.path.join(myInputDir, 'in1.fastq') inFastq2 = os.path.join(myInputDir, 'in2.fastq') @@ -25,8 +36,7 @@ def test_trimmomatic(self) : pairedOutFastq2 = util.file.mkstempfname() clipFasta = os.path.join(myInputDir, 'clip.fasta') parser = taxon_filter.parser_trim_trimmomatic(argparse.ArgumentParser()) - args = parser.parse_args([inFastq1, inFastq2, pairedOutFastq1, - pairedOutFastq2, clipFasta]) + args = parser.parse_args([inFastq1, inFastq2, pairedOutFastq1, pairedOutFastq2, clipFasta]) args.func_main(args) # Check that results match expected @@ -35,9 +45,10 @@ def test_trimmomatic(self) : assert_equal_contents(self, pairedOutFastq1, expected1Fastq) assert_equal_contents(self, pairedOutFastq2, expected2Fastq) -class TestFilterLastal(TestCaseWithTmp) : - def test_filter_lastal(self) : +class TestFilterLastal(TestCaseWithTmp): + + def test_filter_lastal(self): # Create refDbs commonInputDir = util.file.get_test_input_path() myInputDir = util.file.get_test_input_path(self) @@ -48,17 +59,17 @@ def test_filter_lastal(self) : subprocess.check_call([lastdbPath, refDbs, refFasta]) # Call main_filter_lastal - inFastq = os.path.join( myInputDir, 'in.fastq') + inFastq = os.path.join(myInputDir, 'in.fastq') outFastq = util.file.mkstempfname('.fastq') - args = taxon_filter.parser_filter_lastal(argparse.ArgumentParser()).parse_args([ - inFastq, refDbs, outFastq]) + args = taxon_filter.parser_filter_lastal(argparse.ArgumentParser()).parse_args([inFastq, refDbs, outFastq]) args.func_main(args) # Check that results match expected expectedFastq = os.path.join(myInputDir, 'expected.fastq') assert_equal_contents(self, outFastq, expectedFastq) -class TestBmtagger(TestCaseWithTmp) : + +class TestBmtagger(TestCaseWithTmp): """ How test data was created: humanChr1Subset.fa has 200 bases from human chr1 @@ -68,12 +79,13 @@ class TestBmtagger(TestCaseWithTmp) : in[12].fastq "reads" are from humanChr[19]Subset.fa and ebola genome, with arbitrary quality scores. 
""" - def setUp(self) : + + def setUp(self): TestCaseWithTmp.setUp(self) self.tempDir = tempfile.mkdtemp() myInputDir = util.file.get_test_input_path(self) srprismPath = tools.bmtagger.SrprismTool().install_and_get_path() - for db in ['humanChr1Subset', 'humanChr9Subset'] : + for db in ['humanChr1Subset', 'humanChr9Subset']: # .map file is > 100M, so recreate instead of copying dbfa = os.path.join(myInputDir, db + '.fa') dbsrprism = os.path.join(self.tempDir, db + '.srprism') @@ -81,46 +93,36 @@ def setUp(self) : # .bitmask and .srprism.* files must be in same dir, so copy shutil.copy(os.path.join(myInputDir, db + '.bitmask'), self.tempDir) - def test_partition_bmtagger(self) : - outMatch = [os.path.join(self.tempDir, 'outMatch.{}.fastq'.format(n)) - for n in '12'] - outNoMatch = [os.path.join(self.tempDir, 'outNoMatch.{}.fastq'.format(n)) - for n in '12'] + def test_partition_bmtagger(self): + outMatch = [os.path.join(self.tempDir, 'outMatch.{}.fastq'.format(n)) for n in '12'] + outNoMatch = [os.path.join(self.tempDir, 'outNoMatch.{}.fastq'.format(n)) for n in '12'] myInputDir = util.file.get_test_input_path(self) args = taxon_filter.parser_partition_bmtagger(argparse.ArgumentParser()).parse_args( - [os.path.join(myInputDir, 'in1.fastq'), - os.path.join(myInputDir, 'in2.fastq'), - os.path.join(self.tempDir, 'humanChr1Subset'), - os.path.join(self.tempDir, 'humanChr9Subset'), - '--outMatch', outMatch[0], outMatch[1], - '--outNoMatch', outNoMatch[0], outNoMatch[1]]) + [os.path.join(myInputDir, 'in1.fastq'), os.path.join(myInputDir, 'in2.fastq'), os.path.join( + self.tempDir, 'humanChr1Subset'), os.path.join(self.tempDir, 'humanChr9Subset'), '--outMatch', + outMatch[0], outMatch[1], '--outNoMatch', outNoMatch[0], outNoMatch[1]]) args.func_main(args) - + # Compare to expected - for case in ['Match.1', 'Match.2', 'NoMatch.1', 'NoMatch.2'] : - assert_equal_contents(self, - os.path.join(self.tempDir, 'out' + case + '.fastq'), - os.path.join(myInputDir, 'expected.' + case + '.fastq')) + for case in ['Match.1', 'Match.2', 'NoMatch.1', 'NoMatch.2']: + assert_equal_contents(self, os.path.join(self.tempDir, 'out' + case + '.fastq'), + os.path.join(myInputDir, 'expected.' + case + '.fastq')) - def test_deplete_bmtagger(self) : + def test_deplete_bmtagger(self): myInputDir = util.file.get_test_input_path(self) args = taxon_filter.parser_partition_bmtagger(argparse.ArgumentParser()).parse_args( - [os.path.join(myInputDir, 'in1.fastq'), - os.path.join(myInputDir, 'in2.fastq'), - os.path.join(self.tempDir, 'humanChr1Subset'), - os.path.join(self.tempDir, 'humanChr9Subset'), - '--outNoMatch', - os.path.join(self.tempDir, 'deplete.1.fastq'), - os.path.join(self.tempDir, 'deplete.2.fastq')]) + [os.path.join(myInputDir, 'in1.fastq'), os.path.join(myInputDir, 'in2.fastq'), os.path.join( + self.tempDir, 'humanChr1Subset'), os.path.join(self.tempDir, 'humanChr9Subset'), '--outNoMatch', + os.path.join(self.tempDir, 'deplete.1.fastq'), os.path.join(self.tempDir, 'deplete.2.fastq')]) args.func_main(args) - + # Compare to expected - for case in ['1', '2'] : - assert_equal_contents(self, - os.path.join(self.tempDir, 'deplete.' + case + '.fastq'), - os.path.join(myInputDir, 'expected.NoMatch.' + case + '.fastq')) + for case in ['1', '2']: + assert_equal_contents(self, os.path.join(self.tempDir, 'deplete.' + case + '.fastq'), + os.path.join(myInputDir, 'expected.NoMatch.' 
+ case + '.fastq')) + -class TestDepleteBlastn(TestCaseWithTmp) : +class TestDepleteBlastn(TestCaseWithTmp): ''' How test data was created: humanChr1Subset.fa has 200 bases from human chr1 @@ -128,7 +130,8 @@ class TestDepleteBlastn(TestCaseWithTmp) : in.fastq "reads" are from humanChr[19]Subset.fa and ebola genome, with arbitrary quality scores. ''' - def test_deplete_blastn(self) : + + def test_deplete_blastn(self): tempDir = tempfile.mkdtemp() myInputDir = util.file.get_test_input_path(self) @@ -136,28 +139,25 @@ def test_deplete_blastn(self) : makeblastdbPath = tools.blast.MakeblastdbTool().install_and_get_path() dbnames = ['humanChr1Subset.fa', 'humanChr9Subset.fa'] refDbs = [] - for dbname in dbnames : + for dbname in dbnames: refDb = os.path.join(tempDir, dbname) os.symlink(os.path.join(myInputDir, dbname), refDb) refDbs.append(refDb) - subprocess.check_call([ - makeblastdbPath, '-dbtype', 'nucl', '-in', refDb]) + subprocess.check_call([makeblastdbPath, '-dbtype', 'nucl', '-in', refDb]) # Run deplete_blastn outFile = os.path.join(tempDir, 'out.fastq') args = taxon_filter.parser_deplete_blastn(argparse.ArgumentParser()).parse_args( - [os.path.join(myInputDir, 'in.fastq'), - outFile, - refDbs[0], - refDbs[1]]) + [os.path.join(myInputDir, 'in.fastq'), outFile, refDbs[0], refDbs[1]]) args.func_main(args) # Compare to expected - assert_equal_contents(self, outFile, - os.path.join(myInputDir, 'expected.fastq')) + assert_equal_contents(self, outFile, os.path.join(myInputDir, 'expected.fastq')) -class TestDepleteBlastnBam(TestCaseWithTmp) : - def test_deplete_blastn_bam(self) : + +class TestDepleteBlastnBam(TestCaseWithTmp): + + def test_deplete_blastn_bam(self): tempDir = tempfile.mkdtemp() myInputDir = util.file.get_test_input_path(self) @@ -165,44 +165,42 @@ def test_deplete_blastn_bam(self) : makeblastdbPath = tools.blast.MakeblastdbTool().install_and_get_path() dbnames = ['humanChr1Subset.fa', 'humanChr9Subset.fa'] refDbs = [] - for dbname in dbnames : + for dbname in dbnames: refDb = os.path.join(tempDir, dbname) os.symlink(os.path.join(myInputDir, dbname), refDb) refDbs.append(refDb) - subprocess.check_call([ - makeblastdbPath, '-dbtype', 'nucl', '-in', refDb]) + subprocess.check_call([makeblastdbPath, '-dbtype', 'nucl', '-in', refDb]) # convert the input fastq's to a bam inFastq1 = os.path.join(myInputDir, "in1.fastq") inFastq2 = os.path.join(myInputDir, "in2.fastq") inBam = os.path.join(tempDir, 'in.bam') parser = read_utils.parser_fastq_to_bam(argparse.ArgumentParser()) - args = parser.parse_args([inFastq1, inFastq2, inBam, - '--sampleName', 'FreeSample', - '--JVMmemory', '1g', - '--picardOptions', - 'LIBRARY_NAME=Alexandria', - 'PLATFORM=9.75', - 'SEQUENCING_CENTER=KareemAbdul-Jabbar', - ]) + args = parser.parse_args([inFastq1, + inFastq2, + inBam, + '--sampleName', + 'FreeSample', + '--JVMmemory', + '1g', + '--picardOptions', + 'LIBRARY_NAME=Alexandria', + 'PLATFORM=9.75', + 'SEQUENCING_CENTER=KareemAbdul-Jabbar',]) args.func_main(args) # Run deplete_blastn_bam outBam = os.path.join(tempDir, 'out.bam') args = taxon_filter.parser_deplete_blastn_bam(argparse.ArgumentParser()).parse_args( - [inBam, - refDbs[0], - refDbs[1], - outBam, - "--chunkSize", "1"]) + [inBam, refDbs[0], refDbs[1], outBam, "--chunkSize", "1"]) args.func_main(args) # samtools view for out.sam and compare to expected outSam = os.path.join(tempDir, 'out.sam') samtools = tools.samtools.SamtoolsTool() samtools.view(['-h'], outBam, outSam) - assert_equal_contents(self, outSam, - os.path.join(myInputDir, 
'expected.sam')) + assert_equal_contents(self, outSam, os.path.join(myInputDir, 'expected.sam')) + if __name__ == '__main__': unittest.main() diff --git a/test/unit/test_tools.py b/test/unit/test_tools.py index bf8c88ca3..6ebed223e 100644 --- a/test/unit/test_tools.py +++ b/test/unit/test_tools.py @@ -4,30 +4,40 @@ import tools from tools import * -import unittest, tempfile, shutil, os, logging -import util.cmd, util.file +import unittest +import tempfile +import shutil +import os +import logging +import util.cmd +import util.file from test import TestCaseWithTmp log = logging.getLogger(__name__) + class TestToolsInstallation(TestCaseWithTmp): + def setUp(self): super(TestToolsInstallation, self).setUp() util.cmd.setup_logger('INFO') + def testAllToolInstallers(self): - def iter_leaf_subclasses(aClass) : + + def iter_leaf_subclasses(aClass): "Iterate over subclasses at all levels that don't themselves have a subclass" isLeaf = True - for subclass in aClass.__subclasses__() : + for subclass in aClass.__subclasses__(): isLeaf = False - for leafClass in iter_leaf_subclasses(subclass) : + for leafClass in iter_leaf_subclasses(subclass): yield leafClass - if isLeaf : + if isLeaf: yield aClass + '''Load every tool's default chain of install methods and try them.''' for tool_class in iter_leaf_subclasses(tools.Tool): t = tool_class() t.install() self.assertTrue(t.is_installed(), "installation of tool %s failed" % tool_class.__name__) - log.info(".. installation of %s succeeded with installer %s" % (tool_class.__name__, t.installed_method.__class__.__name__)) - + log.info(".. installation of %s succeeded with installer %s" % + (tool_class.__name__, t.installed_method.__class__.__name__)) diff --git a/test/unit/test_tools_bwa.py b/test/unit/test_tools_bwa.py index 2c44ba68d..fc228dd5f 100644 --- a/test/unit/test_tools_bwa.py +++ b/test/unit/test_tools_bwa.py @@ -2,32 +2,33 @@ __author__ = "hlevitin@broadinstitute.org" -import unittest, os.path, shutil -import util.file, tools.bwa +import unittest +import os.path +import shutil +import util.file +import tools.bwa from test import TestCaseWithTmp -class TestToolBwa(TestCaseWithTmp) : - def setUp(self) : +class TestToolBwa(TestCaseWithTmp): + + def setUp(self): super(TestToolBwa, self).setUp() self.bwa = tools.bwa.Bwa() self.bwa.install() - def test_index(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'ebola.fasta') + def test_index(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) - expected_fasta = os.path.join( - util.file.get_test_input_path(self), - 'ebola_expected.fasta') + expected_fasta = os.path.join(util.file.get_test_input_path(self), 'ebola_expected.fasta') self.bwa.execute('index', [inRef]) for ext in ('amb', 'ann', 'bwt', 'pac', 'sa'): - self.assertEqualContents(inRef+'.'+ext, expected_fasta+'.'+ext) + self.assertEqualContents(inRef + '.' + ext, expected_fasta + '.' 
+ ext) - def test_aln(self) : + def test_aln(self): expectedDir = util.file.get_test_input_path(self) # can used expected out for index as input @@ -36,10 +37,8 @@ def test_aln(self) : output = util.file.mkstempfname('.sai') expect = os.path.join(expectedDir, 'ebola_aln_expected.sai') - self.bwa.execute('aln', [reference, fastq], - options={'-q': 5, '-t': 4}, - post_cmd=" > {}".format(output)) - + self.bwa.execute('aln', [reference, fastq], options={'-q': 5, '-t': 4}, post_cmd=" > {}".format(output)) + self.assertEqualContents(output, expect) diff --git a/test/unit/test_tools_mosaik.py b/test/unit/test_tools_mosaik.py index 907ffdd0d..9f0adfac6 100644 --- a/test/unit/test_tools_mosaik.py +++ b/test/unit/test_tools_mosaik.py @@ -2,11 +2,15 @@ __author__ = "mlin@dnanexus.com" -import unittest, os.path, shutil -import util.file, tools.mosaik +import unittest +import os.path +import shutil +import util.file +import tools.mosaik from test import TestCaseWithTmp -class TestToolMosaik(TestCaseWithTmp) : + +class TestToolMosaik(TestCaseWithTmp): def setUp(self): super(TestToolMosaik, self).setUp() @@ -19,4 +23,8 @@ def test_get_networkFile(self): # TO DO: further testing of Mosaik invocations # system($mosaikpath."MosaikBuild -q ".$option{fq}." -q2 ".$option{fq2}." -out $readdat -st ".$option{st}." -mfl ".$option{mfl}); - # system($mosaikpath."MosaikAligner -in $readdat -out $output -ia $refdat -hs ".$option{hs}." -act ".$option{act}." -mm 500 -mmp ".$option{mmp}." -minp ".$option{minp}." -ms ".$option{ms}." -mms ".$option{mms}." -gop ".$option{gop}." -hgop ".$option{hgop}." -gep ".$option{gep}."$bw -m ".$option{m}." -annpe ".$option{annpe}." -annse ".$option{annse}); + # system($mosaikpath."MosaikAligner -in $readdat -out $output -ia $refdat + # -hs ".$option{hs}." -act ".$option{act}." -mm 500 -mmp ".$option{mmp}." + # -minp ".$option{minp}." -ms ".$option{ms}." -mms ".$option{mms}." -gop + # ".$option{gop}." -hgop ".$option{hgop}." -gep ".$option{gep}."$bw -m + # ".$option{m}." -annpe ".$option{annpe}." 
-annse ".$option{annse}); diff --git a/test/unit/test_tools_novoalign.py b/test/unit/test_tools_novoalign.py index b0d5c5437..493e7159e 100644 --- a/test/unit/test_tools_novoalign.py +++ b/test/unit/test_tools_novoalign.py @@ -2,12 +2,17 @@ __author__ = "dpark@broadinstitute.org" -import unittest, os.path, shutil -import util.file, tools.novoalign, tools.samtools +import unittest +import os.path +import shutil +import util.file +import tools.novoalign +import tools.samtools import pysam from test import TestCaseWithTmp -class TestToolNovoalign(TestCaseWithTmp) : + +class TestToolNovoalign(TestCaseWithTmp): def setUp(self): super(TestToolNovoalign, self).setUp() @@ -15,9 +20,8 @@ def setUp(self): self.novoalign.install() self.samtools = tools.samtools.SamtoolsTool() - def test_index(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'ebola.fasta') + def test_index(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) self.novoalign.index_fasta(inRef) @@ -25,64 +29,57 @@ def test_index(self) : self.assertTrue(os.path.isfile(outfile)) self.assertTrue(os.path.getsize(outfile)) - def test_align(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'ebola.fasta') + def test_align(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) self.novoalign.index_fasta(inRef) - reads = os.path.join(util.file.get_test_input_path(self), - 'ebov_reads.bam') + reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam') outBam = util.file.mkstempfname('.bam') self.novoalign.execute(reads, inRef, outBam) self.assertTrue(os.path.isfile(outBam)) self.assertTrue(os.path.getsize(outBam)) - self.assertTrue(os.path.isfile(outBam[:-1]+'i')) + self.assertTrue(os.path.isfile(outBam[:-1] + 'i')) - def test_align_filter(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'ebola.fasta') + def test_align_filter(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'ebola.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) self.novoalign.index_fasta(inRef) - reads = os.path.join(util.file.get_test_input_path(self), - 'ebov_reads.bam') + reads = os.path.join(util.file.get_test_input_path(self), 'ebov_reads.bam') outBam = util.file.mkstempfname('.bam') self.novoalign.execute(reads, inRef, outBam, min_qual=1) self.assertTrue(os.path.isfile(outBam)) self.assertTrue(os.path.getsize(outBam)) - self.assertTrue(os.path.isfile(outBam[:-1]+'i')) + self.assertTrue(os.path.isfile(outBam[:-1] + 'i')) - def test_multi_read_groups(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'G5012.3.fasta') + def test_multi_read_groups(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'G5012.3.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) self.novoalign.index_fasta(inRef) - + # align with Novoalign (BAM input, BAM output) - reads = os.path.join(util.file.get_test_input_path(), - 'G5012.3.subset.bam') + reads = os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam') outBam = util.file.mkstempfname('.bam') self.novoalign.execute(reads, inRef, outBam) self.assertTrue(os.path.isfile(outBam)) self.assertTrue(os.path.getsize(outBam)) - self.assertTrue(os.path.isfile(outBam[:-1]+'i')) + self.assertTrue(os.path.isfile(outBam[:-1] + 'i')) sam_in = 
util.file.mkstempfname('.in.sam') sam_out = util.file.mkstempfname('.out.sam') self.samtools.view([], reads, sam_in) self.samtools.view([], outBam, sam_out) - + # assert that all reads are present in output - self.assertEqual(self.samtools.count(reads), - self.samtools.count(outBam)) - + self.assertEqual(self.samtools.count(reads), self.samtools.count(outBam)) + # assert that all read groups are described exactly the same # in the output header (not necessarily same order, but same content) orig_rgs = self.samtools.getReadGroups(reads) new_rgs = self.samtools.getReadGroups(outBam) - self.assertTrue(len(orig_rgs)>1) - self.assertTrue(len(new_rgs)>1) + self.assertTrue(len(orig_rgs) > 1) + self.assertTrue(len(new_rgs) > 1) for rgid in new_rgs.keys(): self.assertIn(rgid, orig_rgs) orig_rg = orig_rgs[rgid] @@ -94,12 +91,12 @@ def test_multi_read_groups(self) : del orig_rg['DT'] del new_rg['DT'] self.assertEqual(orig_rgs[rgid], new_rgs[rgid]) - + # if any RGs are missing, it should be because they were never really there for rgid in orig_rgs.keys(): if rgid not in new_rgs: self.assertEqual(0, self.samtools.count(reads, ['-r', rgid])) - + # assert that all reads retained their original RG assignments read_to_rg = {} read_seen = set() @@ -107,35 +104,32 @@ def test_multi_read_groups(self) : for read in inf: read = read.rstrip('\n').split('\t') tags = [t[5:] for t in read[11:] if t.startswith('RG:Z:')] - self.assertTrue(len(tags)==1) + self.assertTrue(len(tags) == 1) read_to_rg[read[0]] = tags[0] with open(sam_out, 'rt') as inf: for read in inf: read = read.rstrip('\n').split('\t') tags = [t[5:] for t in read[11:] if t.startswith('RG:Z:')] - self.assertTrue(len(tags)==1) + self.assertTrue(len(tags) == 1) self.assertIn(read[0], read_to_rg) self.assertEqual(tags[0], read_to_rg[read[0]]) read_seen.add(read[0]) self.assertEqual(len(read_seen), len(read_to_rg)) - + # clean up for fn in (sam_in, sam_out, outBam, inRef): os.unlink(fn) - def test_multi_read_groups_filter(self) : - orig_ref = os.path.join(util.file.get_test_input_path(), - 'G5012.3.fasta') + def test_multi_read_groups_filter(self): + orig_ref = os.path.join(util.file.get_test_input_path(), 'G5012.3.fasta') inRef = util.file.mkstempfname('.fasta') shutil.copyfile(orig_ref, inRef) self.novoalign.index_fasta(inRef) - + # align with Novoalign (BAM input, BAM output) - reads = os.path.join(util.file.get_test_input_path(), - 'G5012.3.subset.bam') + reads = os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam') outBam = util.file.mkstempfname('.bam') self.novoalign.execute(reads, inRef, outBam, min_qual=1) self.assertTrue(os.path.isfile(outBam)) self.assertTrue(os.path.getsize(outBam)) - self.assertTrue(os.path.isfile(outBam[:-1]+'i')) - + self.assertTrue(os.path.isfile(outBam[:-1] + 'i')) diff --git a/test/unit/test_tools_picard.py b/test/unit/test_tools_picard.py index ded62b8cf..7de843c84 100644 --- a/test/unit/test_tools_picard.py +++ b/test/unit/test_tools_picard.py @@ -2,11 +2,19 @@ __author__ = "dpark@broadinstitute.org" -import unittest, os, tempfile, shutil -import util, util.file, tools, tools.picard +import unittest +import os +import tempfile +import shutil +import util +import util.file +import tools +import tools.picard from test import TestCaseWithTmp + class TestToolPicard(TestCaseWithTmp): + def test_fasta_index(self): orig_ref = os.path.join(util.file.get_test_input_path(self), 'in.fasta') expected_dict = os.path.join(util.file.get_test_input_path(self), 'in.dict') @@ -17,9 +25,9 @@ def test_fasta_index(self): inRef 
= util.file.mkstempfname(ext) shutil.copyfile(orig_ref, inRef) outDict = inRef[:-len(ext)] + '.dict' - + picard_index.execute(inRef) - + # the dict files will not be exactly the same, just the first 3 cols with open(outDict, 'rt') as inf: actual_first3 = [x.strip().split('\t')[:3] for x in inf.readlines()] diff --git a/test/unit/test_tools_samtools.py b/test/unit/test_tools_samtools.py index 5e302d88b..86acf523b 100644 --- a/test/unit/test_tools_samtools.py +++ b/test/unit/test_tools_samtools.py @@ -2,17 +2,24 @@ __author__ = "dpark@broadinstitute.org" -import unittest, os, tempfile, shutil -import util, util.file, tools, tools.samtools +import unittest +import os +import tempfile +import shutil +import util +import util.file +import tools +import tools.samtools from test import TestCaseWithTmp + class TestToolSamtools(TestCaseWithTmp): def test_count_bam(self): sam = os.path.join(util.file.get_test_input_path(self), 'simple.sam') n = tools.samtools.SamtoolsTool().count(sam, ['-S']) self.assertEqual(n, 2) - + def test_fasta_index(self): orig_ref = os.path.join(util.file.get_test_input_path(self), 'in.fasta') expected_fai = os.path.join(util.file.get_test_input_path(self), 'in.fasta.fai') @@ -24,5 +31,6 @@ def test_fasta_index(self): samtools.faidx(inRef) self.assertEqualContents(outFai, expected_fai) + if __name__ == '__main__': unittest.main() diff --git a/test/unit/test_tools_vphaser2.py b/test/unit/test_tools_vphaser2.py index 3954e90d4..17b907494 100644 --- a/test/unit/test_tools_vphaser2.py +++ b/test/unit/test_tools_vphaser2.py @@ -2,25 +2,28 @@ __author__ = "irwin@broadinstitute.org" -import unittest, os, pickle +import unittest +import os +import pickle import util.file from intrahost import vphaser_main from test import TestCaseWithTmp -class TestVPhaser2(TestCaseWithTmp) : - def test_vphaser2(self) : + +class TestVPhaser2(TestCaseWithTmp): + + def test_vphaser2(self): myInputDir = util.file.get_test_input_path(self) inBam = os.path.join(myInputDir, 'in.bam') outTab = util.file.mkstempfname('.txt') - vphaser_main(inBam, outTab, numThreads = 8) - with open(outTab, 'rt') as outf : - recs = map(lambda s : s.strip('\n').split('\t'), outf.readlines()) - with open(os.path.join(myInputDir, 'expected.cpickle'), 'rb') as expf : + vphaser_main(inBam, outTab, numThreads=8) + with open(outTab, 'rt') as outf: + recs = map(lambda s: s.strip('\n').split('\t'), outf.readlines()) + with open(os.path.join(myInputDir, 'expected.cpickle'), 'rb') as expf: expectedRecs = pickle.load(expf) # Vphaser2 p-val calculation is unstable and sometimes varies from # run to run, so exclude it from comparison. - self.assertEqual([rec[:4] + rec[5:] for rec in recs], - [rec[:4] + rec[5:] for rec in expectedRecs]) + self.assertEqual([rec[:4] + rec[5:] for rec in recs], [rec[:4] + rec[5:] for rec in expectedRecs]) """ Creation of in.bam: Start with test file that ships with V-Phaser 2. @@ -33,7 +36,7 @@ def test_vphaser2(self) : Move the @SQ line from c2.sam to c1.sam and delete header of c2.sam. 
cat c1.sam c2.sam >new.sam samtools view -bh new.sam >new.bam - + Creation of expected.cpickle: cPickle.dump(list(Vphaser2Tool().iterate(inBam, numThreads = 8)), open('expected.cpickle', 'w')) diff --git a/test/unit/test_util_annot.py b/test/unit/test_util_annot.py index 28fa355c6..e8e03962d 100644 --- a/test/unit/test_util_annot.py +++ b/test/unit/test_util_annot.py @@ -5,7 +5,9 @@ import util.annot import unittest + class TestGeneDb(unittest.TestCase): + def setUp(self): pass @@ -16,4 +18,3 @@ def testNothingAtAll(self): def testTautology(self): '''here we test 1 = 1''' self.assertEqual(1, 1) - diff --git a/test/unit/test_util_vcf.py b/test/unit/test_util_vcf.py index ee443d556..5f223b3e0 100644 --- a/test/unit/test_util_vcf.py +++ b/test/unit/test_util_vcf.py @@ -2,9 +2,9 @@ __author__ = "dpark@broadinstitute.org" -import util.vcf, util.file +import util.vcf +import util.file import unittest - ''' TODO make_intervals @@ -18,91 +18,97 @@ keeps breaking each time we refactor. ''' + class StubGenome: ''' This is a mock genome that should satisfy very simple functions in util.vcf like get_chrlens, GenomePosition, make_intervals, sliding_windows, etc. It simply contains a list of chromosome names and lengths. ''' + def __init__(self, chromlist): - self.chrs = [c for c,clen in chromlist] + self.chrs = [c for c, clen in chromlist] self.chrlen = dict(chromlist) - self.totlen = sum(clen for c,clen in chromlist) + self.totlen = sum(clen for c, clen in chromlist) + def chrlens(self): return [(c, self.chrlen[c]) for c in self.chrs] + class TestGenomePosition(unittest.TestCase): ''' Test the GenomePosition class which maps chr,pos pairs to a single gpos int and vice versa ''' def test_fail_OOB_get_gpos(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) - invalids = [('SDF',0),('SDF',124),('ASDF',-1),('lala',48),('lala',200),('sdf',80),('la',2),(None,3)] + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) + invalids = [('SDF', 0), ('SDF', 124), ('ASDF', -1), ('lala', 48), ('lala', 200), ('sdf', 80), ('la', 2), + (None, 3)] gmap = util.vcf.GenomePosition(genome) - for c,p in invalids: + for c, p in invalids: self.assertRaises(Exception, gmap.get_gpos, c, p) def test_fail_OOB_get_chr_pos(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) - invalids = [0,-1,genome.totlen+1,genome.totlen*2] + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) + invalids = [0, -1, genome.totlen + 1, genome.totlen * 2] gmap = util.vcf.GenomePosition(genome) for gpos in invalids: self.assertRaises(Exception, gmap.get_chr_pos, gpos) def test_fail_non_int_pos(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) - invalids = [('SDF',5.3),('lala','10'),('ASDF',None)] + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) + invalids = [('SDF', 5.3), ('lala', '10'), ('ASDF', None)] gmap = util.vcf.GenomePosition(genome) - for c,p in invalids: + for c, p in invalids: self.assertRaises(Exception, gmap.get_gpos, c, p) self.assertRaises(Exception, gmap.get_chr_pos, p) def test_spotcheck_edges(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) - knowns = [('SDF',1,1), ('SDF',123,123), ('ASDF',1,124), ('ASDF',256,379), ('lala',1,380), ('lala',47,426)] + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) + knowns = [('SDF', 1, 1), ('SDF', 123, 123), ('ASDF', 1, 124), ('ASDF', 256, 379), ('lala', 1, 380), + ('lala', 47, 426)] gmap = util.vcf.GenomePosition(genome) - for c,p,gpos in knowns: - 
self.assertEqual(gpos, gmap.get_gpos(c,p)) - self.assertEqual((c,p), gmap.get_chr_pos(gpos)) + for c, p, gpos in knowns: + self.assertEqual(gpos, gmap.get_gpos(c, p)) + self.assertEqual((c, p), gmap.get_chr_pos(gpos)) def test_equality_1chrGenome(self): - genome = StubGenome([('one chr',10)]) + genome = StubGenome([('one chr', 10)]) c = genome.chrs[0] gmap = util.vcf.GenomePosition(genome) - for i in range(1,genome.totlen+1): - self.assertEqual(i, gmap.get_gpos(c,i)) # c,p -> gpos should produce p=gpos - self.assertEqual((c,i), gmap.get_chr_pos(i)) # gpos -> c,p should produce p=gpos + for i in range(1, genome.totlen + 1): + self.assertEqual(i, gmap.get_gpos(c, i)) # c,p -> gpos should produce p=gpos + self.assertEqual((c, i), gmap.get_chr_pos(i)) # gpos -> c,p should produce p=gpos def test_equality_3chrGenome(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) gmap = util.vcf.GenomePosition(genome) # test gpos -> c,p -> same gpos - for gpos in range(1, genome.totlen+1): - c,p = gmap.get_chr_pos(gpos) - gpos2 = gmap.get_gpos(c,p) + for gpos in range(1, genome.totlen + 1): + c, p = gmap.get_chr_pos(gpos) + gpos2 = gmap.get_gpos(c, p) self.assertEqual(gpos, gpos2) # test c,p -> gpos -> same c,p - for c,clen in genome.chrlen.items(): - for p in range(1,clen+1): - gpos = gmap.get_gpos(c,p) - c2,p2 = gmap.get_chr_pos(gpos) - self.assertEqual(c,c2) - self.assertEqual(p,p2) + for c, clen in genome.chrlen.items(): + for p in range(1, clen + 1): + gpos = gmap.get_gpos(c, p) + c2, p2 = gmap.get_chr_pos(gpos) + self.assertEqual(c, c2) + self.assertEqual(p, p2) def test_gpos_inbounds(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) gmap = util.vcf.GenomePosition(genome) - for c,clen in genome.chrlen.items(): - for p in range(1,clen+1): - gpos = gmap.get_gpos(c,p) + for c, clen in genome.chrlen.items(): + for p in range(1, clen + 1): + gpos = gmap.get_gpos(c, p) self.assertIsNotNone(gpos) self.assertIsInstance(gpos, int) self.assertLessEqual(1, gpos) self.assertLessEqual(gpos, genome.totlen) def test_chr_pos_inbounds(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) gmap = util.vcf.GenomePosition(genome) - for gpos in range(1,genome.totlen+1): - c,p = gmap.get_chr_pos(gpos) + for gpos in range(1, genome.totlen + 1): + c, p = gmap.get_chr_pos(gpos) self.assertIsNotNone(c) self.assertIn(c, genome.chrlen) self.assertIsNotNone(p) @@ -111,24 +117,24 @@ def test_chr_pos_inbounds(self): self.assertLessEqual(p, genome.chrlen[c]) def test_unique_gpos(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) gmap = util.vcf.GenomePosition(genome) seen = set() - for c,clen in genome.chrlen.items(): - for p in range(1,clen+1): - gpos = gmap.get_gpos(c,p) + for c, clen in genome.chrlen.items(): + for p in range(1, clen + 1): + gpos = gmap.get_gpos(c, p) self.assertNotIn(gpos, seen) seen.add(gpos) self.assertEqual(len(seen), genome.totlen) def test_unique_chr_pos(self): - genome = StubGenome([('SDF',123), ('ASDF',256), ('lala',47)]) + genome = StubGenome([('SDF', 123), ('ASDF', 256), ('lala', 47)]) gmap = util.vcf.GenomePosition(genome) seen = set() - for gpos in range(1,genome.totlen+1): - c,p = gmap.get_chr_pos(gpos) - self.assertNotIn((c,p), seen) - seen.add((c,p)) + for gpos 
in range(1, genome.totlen + 1): + c, p = gmap.get_chr_pos(gpos) + self.assertNotIn((c, p), seen) + seen.add((c, p)) self.assertEqual(len(seen), genome.totlen) @@ -144,7 +150,7 @@ def setUp(self): self.basestring = str def test_sample_names(self): - expected = ['3D7','SenT001.08','SenT001.11','SenT002.07','SenT002.09'] + expected = ['3D7', 'SenT001.08', 'SenT001.11', 'SenT002.07', 'SenT002.09'] vcfdb = util.vcf.VcfReader(self.vcf_fname) self.assertEqual(expected, vcfdb.samples()) for s in vcfdb.samples(): @@ -154,7 +160,7 @@ def test_get_one_base(self): vcfdb = util.vcf.VcfReader(self.vcf_fname) genos = vcfdb.get_snp_genos(self.vcf_window[0], 1726432) alleles = [genos[s] for s in vcfdb.samples()] - expected = ['T','T','T','G','T'] + expected = ['T', 'T', 'T', 'G', 'T'] self.assertEqual(expected, alleles) for a in alleles: self.assertIsInstance(a, self.basestring) @@ -167,7 +173,6 @@ def test_get_positions_edges(self): def test_get_range_edges(self): vcfdb = util.vcf.VcfReader(self.vcf_fname) - out = [p for c,p,alleles,genos in vcfdb.get_range(self.vcf_window[0])] + out = [p for c, p, alleles, genos in vcfdb.get_range(self.vcf_window[0])] self.assertEqual(out[0], self.vcf_window[1]) self.assertEqual(out[-1], self.vcf_window[2]) - diff --git a/tools/__init__.py b/tools/__init__.py index c6995c2fd..6a15ad48c 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -2,7 +2,10 @@ __author__ = "dpark@broadinstitute.org,irwin@broadinstitute.org" -import os, logging, tempfile, shutil +import os +import logging +import tempfile +import shutil import util.file try: @@ -16,33 +19,36 @@ # Put all tool files in __all__ # allows "from tools import *" to import all tooles for testtools -__all__ = [filename[:-3] # Remove .py - for filename in os.listdir(os.path.dirname(__file__)) # tools directory - if filename.endswith('.py') and filename != '__init__.py' and - filename not in [ # Add any files to exclude here: - # e.g. 'sometool.py', - ] - ] +__all__ = [filename[:-3] # Remove .py + for filename in os.listdir(os.path.dirname(__file__)) # tools directory + if filename.endswith('.py') and filename != '__init__.py' and filename not in [ # Add any files to exclude here: + # e.g. 'sometool.py', + ]] installed_tools = {} log = logging.getLogger(__name__) + def get_tool_by_name(name): if name not in installed_tools: raise NotImplementedError return installed_tools[name] + class Tool(object): ''' Base tool class that includes install machinery. TO DO: add something about dependencies.. 
''' + def __init__(self, install_methods=[]): self.install_methods = install_methods self.installed_method = None self.exec_path = None + def is_installed(self): - return (self.installed_method != None) + return (self.installed_method is not None) + def install(self): if not self.is_installed(): for m in self.install_methods: @@ -53,85 +59,103 @@ def install(self): self.exec_path = m.executable_path() installed_tools[self.__class__.__name__] = self break + def get_install_methods(self): return self.install_methods + def set_install_methods(self, methods): self.install_methods = methods + def version(self): return None + def executable_path(self): return self.exec_path + def execute(self, args): assert not os.system(self.exec_path + ' ' + args) - def install_and_get_path(self) : + + def install_and_get_path(self): self.install() - if self.executable_path()==None: + if self.executable_path() == None: raise NameError("unsuccessful in installing " + type(self).__name__) return self.executable_path() + class InstallMethod(object): ''' Base class for installation methods for a given tool. None of these methods should ever fail/error. attempt_install should return silently regardless of the outcome (is_installed must be called to verify success or failure). ''' + def __init__(self): self.attempts = 0 + def is_attempted(self): return self.attempts - def attempt_install(self): # Override _attempt_install, not this. + + def attempt_install(self): # Override _attempt_install, not this. self.attempts += 1 self._attempt_install() + def _attempt_install(self): raise NotImplementedError + def is_installed(self): raise NotImplementedError + def executable_path(self): raise NotImplementedError + class PrexistingUnixCommand(InstallMethod): ''' This is an install method that tries to find whether an executable binary already exists for free on the unix file system--it doesn't actually try to install anything. ''' - def __init__(self, path, verifycmd=None, verifycode=0, - require_executability=True): + + def __init__(self, path, verifycmd=None, verifycode=0, require_executability=True): self.path = path self.verifycmd = verifycmd self.verifycode = verifycode self.installed = False self.require_executability = require_executability InstallMethod.__init__(self) + def _attempt_install(self): - if os.access(self.path, (os.X_OK | os.R_OK) if - self.require_executability else os.R_OK): + if os.access(self.path, (os.X_OK | os.R_OK) if self.require_executability else os.R_OK): if self.verifycmd: self.installed = (os.system(self.verifycmd) == self.verifycode) else: self.installed = True else: self.installed = False + def is_installed(self): if not self.is_attempted(): self.attempt_install() return self.installed + def executable_path(self): return self.installed and self.path or None + class DownloadPackage(InstallMethod): ''' This is an install method for downloading, unpacking, and post- processing straight from the source. target_rel_path is the executable's path relative to destination_dir destination_dir defaults to the project build directory - post_download_command will be executed if it isn't None, in + post_download_command will be executed if it isn't None, in destination_dir. 
if post_download_ret != None, assert it is returned by post_download_command ''' + def __init__(self, url, target_rel_path, destination_dir=None, verifycmd=None, verifycode=0, require_executability=True, post_download_command=None, post_download_ret=0): - if destination_dir == None : + if destination_dir is None: destination_dir = util.file.get_build_path() self.url = url self.targetpath = os.path.join(destination_dir, target_rel_path) @@ -143,13 +167,15 @@ def __init__(self, url, target_rel_path, destination_dir=None, self.post_download_command = post_download_command self.post_download_ret = post_download_ret InstallMethod.__init__(self) + def is_installed(self): return self.installed + def executable_path(self): return self.installed and self.targetpath or None + def verify_install(self): - if os.access(self.targetpath, (os.X_OK | os.R_OK) if - self.require_executability else os.R_OK): + if os.access(self.targetpath, (os.X_OK | os.R_OK) if self.require_executability else os.R_OK): if self.verifycmd: log.debug("validating") self.installed = (os.system(self.verifycmd) == self.verifycode) @@ -158,54 +184,52 @@ def verify_install(self): else: self.installed = False return self.installed + def _attempt_install(self): if not self.verify_install(): self.pre_download() self.download() self.post_download() self.verify_install() + def pre_download(self): pass + def download(self): download_dir = tempfile.gettempdir() util.file.mkdir_p(download_dir) filepath = urlparse(self.url).path filename = filepath.split('/')[-1] - log.info("Downloading from %s to %s/%s ...", self.url, - download_dir, - filename) - urlretrieve(self.url, os.path.join(download_dir,filename)) + log.info("Downloading from %s to %s/%s ...", self.url, download_dir, filename) + urlretrieve(self.url, os.path.join(download_dir, filename)) self.download_file = filename self.unpack(download_dir) + def post_download(self): if self.post_download_command: - return_code = os.system('cd "{}" && {}'.format( - self.destination_dir, self.post_download_command)) - if self.post_download_ret != None: + return_code = os.system('cd "{}" && {}'.format(self.destination_dir, self.post_download_command)) + if self.post_download_ret is not None: assert return_code == self.post_download_ret + def unpack(self, download_dir): log.debug("unpacking") util.file.mkdir_p(self.destination_dir) if self.download_file.endswith('.zip'): - if os.system("unzip -o %s/%s -d %s > /dev/null" % (download_dir, - self.download_file, self.destination_dir)): + if os.system("unzip -o %s/%s -d %s > /dev/null" % (download_dir, self.download_file, self.destination_dir + )): return else: os.unlink(os.path.join(download_dir, self.download_file)) - elif (self.download_file.endswith('.tar.gz') or - self.download_file.endswith('.tgz') or - self.download_file.endswith('.tar.bz2') or - self.download_file.endswith('.tar')): + elif (self.download_file.endswith('.tar.gz') or self.download_file.endswith('.tgz') or + self.download_file.endswith('.tar.bz2') or self.download_file.endswith('.tar')): if self.download_file.endswith('.tar'): compression_option = '' elif self.download_file.endswith('.tar.bz2'): compression_option = 'j' else: compression_option = 'z' - untar_cmd = "tar -C {} -x{}pf {}/{}".format(self.destination_dir, - compression_option, - download_dir, + untar_cmd = "tar -C {} -x{}pf {}/{}".format(self.destination_dir, compression_option, download_dir, self.download_file) log.debug("Untaring with command: %s", untar_cmd) exitCode = os.system(untar_cmd) @@ -215,6 +239,6 @@ def 
unpack(self, download_dir): else: log.debug("tar returned with exit code 0") os.unlink(os.path.join(download_dir, self.download_file)) - else : + else: shutil.move(os.path.join(download_dir, self.download_file), - os.path.join(self.destination_dir, self.download_file)) + os.path.join(self.destination_dir, self.download_file)) diff --git a/tools/bedtools.py b/tools/bedtools.py index 82db5956c..84307d5b4 100644 --- a/tools/bedtools.py +++ b/tools/bedtools.py @@ -1,3 +1 @@ - - # 'bedtools': '/idi/sabeti-data/software/bedtools/bin', diff --git a/tools/blast.py b/tools/blast.py index 7937e19ca..5d266f220 100644 --- a/tools/blast.py +++ b/tools/blast.py @@ -5,50 +5,42 @@ urlPrefix = 'ftp://ftp.ncbi.nlm.nih.gov/blast/executables' \ '/blast+/2.2.29/ncbi-blast-2.2.29+-' -def get_url() : + +def get_url(): uname = os.uname() - if uname[0] == 'Darwin' : + if uname[0] == 'Darwin': osStr = 'universal-macosx' - elif uname[0] == 'Linux' : - if uname[4].endswith('64') : + elif uname[0] == 'Linux': + if uname[4].endswith('64'): osStr = 'x64-linux' - else : + else: osStr = 'ia32-linux' - else : + else: raise NotImplementedError('OS {} not implemented'.format(uname[0])) return urlPrefix + osStr + '.tar.gz' -class BlastTools(tools.Tool) : + +class BlastTools(tools.Tool): """'Abstract' base class for tools in the blast+ suite. Subclasses must define class member subtoolName.""" - def __init__(self, install_methods = None) : - unwanted = ['blast_formatter', - 'blastdb_aliastool', - 'blastdbcheck', - 'blastdbcmd', - 'convert2blastmask', - 'deltablast', - 'legacy_blast.pl', - 'makembindex', - 'makeprofiledb', - 'psiblast', - 'rpsblast', - 'rpstblastn', - 'segmasker', - 'tblastn', - 'tblastx', - 'update_blastdb.pl', - 'windowmasker'] - if install_methods == None : + + def __init__(self, install_methods=None): + unwanted = ['blast_formatter', 'blastdb_aliastool', 'blastdbcheck', 'blastdbcmd', 'convert2blastmask', + 'deltablast', 'legacy_blast.pl', 'makembindex', 'makeprofiledb', 'psiblast', 'rpsblast', + 'rpstblastn', 'segmasker', 'tblastn', 'tblastx', 'update_blastdb.pl', 'windowmasker'] + if install_methods is None: target_rel_path = 'ncbi-blast-2.2.29+/bin/' + self.subtoolName - install_methods = [tools.DownloadPackage(get_url(), target_rel_path, - post_download_command=' '.join(['rm'] - + ['ncbi-blast-2.2.29+/bin/'+f for f in unwanted]), - post_download_ret=None)] - tools.Tool.__init__(self, install_methods = install_methods) + install_methods = [tools.DownloadPackage(get_url(), + target_rel_path, + post_download_command=' '.join( + ['rm'] + ['ncbi-blast-2.2.29+/bin/' + f for f in unwanted]), + post_download_ret=None)] + tools.Tool.__init__(self, install_methods=install_methods) -class BlastnTool(BlastTools) : + +class BlastnTool(BlastTools): subtoolName = 'blastn' -class MakeblastdbTool(BlastTools) : + +class MakeblastdbTool(BlastTools): subtoolName = 'makeblastdb' diff --git a/tools/bmtagger.py b/tools/bmtagger.py index df5dfdd9e..2e26f32f8 100644 --- a/tools/bmtagger.py +++ b/tools/bmtagger.py @@ -1,71 +1,83 @@ "tools.Tool for bmtagger.sh." -import tools, util.file -import os, logging +import tools +import util.file +import os +import logging from tools import urlretrieve log = logging.getLogger(__name__) -class BmtaggerTools(tools.Tool) : +class BmtaggerTools(tools.Tool): ''' "Abstract" base class for bmtagger.sh, bmfilter, extract_fullseq, srprism. Subclasses must define class member subtoolName. - + Note: bmtagger calls blastn so that must be installed somewhere in $PATH. 
- + WARNING: bmtagger.sh does not work with the version of getopt that ships with Mac OS X. This can be worked around by installing linux getopt using fink and assuring that /sw/bin comes before /usr/bin in $PATH. ''' + # subtoolName must be defined in subclass - - def __init__(self, install_methods = None) : - if install_methods == None : + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] install_methods.append(DownloadBmtagger(self.subtoolName)) - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) -class BmtaggerShTool(BmtaggerTools) : + +class BmtaggerShTool(BmtaggerTools): subtoolName = 'bmtagger.sh' -class BmfilterTool(BmtaggerTools) : + +class BmfilterTool(BmtaggerTools): subtoolName = 'bmfilter' -class Extract_fullseqTool(BmtaggerTools) : + +class Extract_fullseqTool(BmtaggerTools): subtoolName = 'extract_fullseq' -class SrprismTool(BmtaggerTools) : + +class SrprismTool(BmtaggerTools): subtoolName = 'srprism' -class DownloadBmtagger(tools.InstallMethod) : + +class DownloadBmtagger(tools.InstallMethod): executables = ['bmtagger.sh', 'bmfilter', 'extract_fullseq', 'srprism'] - def __init__(self, subtoolName) : + + def __init__(self, subtoolName): self.installed = False self.targetDir = os.path.join(util.file.get_build_path(), 'bmtagger') self.targetpath = os.path.join(self.targetDir, subtoolName) tools.InstallMethod.__init__(self) + def is_installed(self): return self.installed - def executable_path(self) : + + def executable_path(self): return self.installed and self.targetpath or None - def verify_install(self) : - self.installed = all(os.access(os.path.join(self.targetDir, executable), - (os.X_OK | os.R_OK)) + + def verify_install(self): + self.installed = all(os.access(os.path.join(self.targetDir, executable), (os.X_OK | os.R_OK)) for executable in self.executables) return self.installed - def _attempt_install(self) : - if self.verify_install() : + + def _attempt_install(self): + if self.verify_install(): return util.file.mkdir_p(self.targetDir) urlBase = 'ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/bmtagger/' uname = os.uname() - if uname[0] == 'Darwin' : + if uname[0] == 'Darwin': urlBase += 'mac-os/' - elif uname[0] != 'Linux' or not uname[4].endswith('64') : + elif uname[0] != 'Linux' or not uname[4].endswith('64'): log.debug('OS {} not implemented'.format(uname[0])) return - for executable in self.executables : + for executable in self.executables: path = os.path.join(self.targetDir, executable) url = urlBase + executable log.info('Downloading from %s ...', url) diff --git a/tools/bwa.py b/tools/bwa.py index 12f41ebe9..031c1ecbf 100644 --- a/tools/bwa.py +++ b/tools/bwa.py @@ -3,7 +3,9 @@ __author__ = "hlevitin@broadinstitute.org" import tools -import os, os.path, logging +import os +import os.path +import logging log = logging.getLogger(__name__) @@ -12,31 +14,30 @@ # current is lates version as of 8/27/2014 USE_CURRENT = True DOWNLOAD_URL = { - 'legacy': - 'http://sourceforge.net/projects/bio-bwa/files/bwa-0.6.2.tar.bz2', - 'current': - 'http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.10.tar.bz2' - } + 'legacy': 'http://sourceforge.net/projects/bio-bwa/files/bwa-0.6.2.tar.bz2', + 'current': 'http://sourceforge.net/projects/bio-bwa/files/bwa-0.7.10.tar.bz2' +} URL = DOWNLOAD_URL['current'] if USE_CURRENT else DOWNLOAD_URL['legacy'] -BWA_DIR = '.'.join( [ x for x in URL.split("/")[-1].split('.') if - x != "tar" and x != "bz2" and x != "gz"]) +BWA_DIR = 
'.'.join([x for x in URL.split("/")[-1].split('.') if x != "tar" and x != "bz2" and x != "gz"]) -class Bwa(tools.Tool) : - def __init__(self, install_methods = None) : + +class Bwa(tools.Tool): + + def __init__(self, install_methods=None): log.debug("BWA_DIR: %s", BWA_DIR) - if install_methods is None : + if install_methods is None: install_methods = [] - install_methods.append( tools.DownloadPackage( - URL, os.path.join(BWA_DIR, 'bwa'), + install_methods.append(tools.DownloadPackage( + URL, + os.path.join(BWA_DIR, 'bwa'), post_download_command="cd {}; make -s".format(BWA_DIR))) - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) - def version(self) : - return ''.join([c for c in BWA_DIR if c.isdigit() or c=='.']) + def version(self): + return ''.join([c for c in BWA_DIR if c.isdigit() or c == '.']) - def execute(self, subcommand, args=[], options={}, option_string="", - post_cmd=""): + def execute(self, subcommand, args=[], options={}, option_string="", post_cmd=""): """ args are required arguments for the specified bwa subcommand (order matters for bwa execution) @@ -51,10 +52,7 @@ def execute(self, subcommand, args=[], options={}, option_string="", ( "> output.sai") """ arg_str = " ".join(args) - option_str = '{} {}'.format(' '.join([ "{} {}".format(k, v) for k, v in - options.items() ]), - option_string - ) + option_str = '{} {}'.format(' '.join(["{} {}".format(k, v) for k, v in options.items()]), option_string) cmd = "{} {} {} {} {}" \ .format(self.exec_path, subcommand, option_str, arg_str, post_cmd) log.debug("Calling bwa with cmd: {}".format(cmd)) diff --git a/tools/gatk.py b/tools/gatk.py index 94fc0d566..c2fb12905 100644 --- a/tools/gatk.py +++ b/tools/gatk.py @@ -1,18 +1,27 @@ ''' GATK genotyping toolkit from the Broad Institute - + This software has different licenses depending on use cases. As such, we do not have an auto-downloader. The user must have GATK pre-installed on their own and available in $GATK_PATH. 
''' -import tools, tools.picard, tools.samtools, util.file -import logging, os, os.path, subprocess, tempfile +import tools +import tools.picard +import tools.samtools +import util.file +import logging +import os +import os.path +import subprocess +import tempfile log = logging.getLogger(__name__) -class GATKTool(tools.Tool) : + +class GATKTool(tools.Tool): jvmMemDefault = '2g' + def __init__(self, path=None): self.tool_version = None install_methods = [] @@ -21,23 +30,22 @@ def __init__(self, path=None): if not jarpath.endswith('.jar'): jarpath = os.path.join(jarpath, 'GenomeAnalysisTK.jar') install_methods.append(tools.PrexistingUnixCommand( - jarpath, verifycmd='java -jar %s --version' % jarpath, - verifycode=0, require_executability=False)) - tools.Tool.__init__(self, install_methods = install_methods) - - def execute(self, command, gatkOptions=[], JVMmemory=None) : + jarpath, + verifycmd='java -jar %s --version' % jarpath, + verifycode=0, + require_executability=False)) + tools.Tool.__init__(self, install_methods=install_methods) + + def execute(self, command, gatkOptions=[], JVMmemory=None): if JVMmemory is None: JVMmemory = self.jvmMemDefault - toolCmd = ['java', - '-Xmx' + JVMmemory, - '-Djava.io.tmpdir=' + tempfile.tempdir, - '-jar', self.install_and_get_path(), - '-T', command] + list(map(str, gatkOptions)) + toolCmd = ['java', '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.tempdir, '-jar', + self.install_and_get_path(), '-T', command] + list(map(str, gatkOptions)) log.debug(' '.join(toolCmd)) subprocess.check_call(toolCmd) - - def dict_to_gatk_opts(self, options) : - return ["%s=%s" % (k,v) for k,v in options.items()] + + def dict_to_gatk_opts(self, options): + return ["%s=%s" % (k, v) for k, v in options.items()] def version(self): if self.tool_version is None: @@ -47,35 +55,50 @@ def version(self): def _get_tool_version(self): cmd = ['java', '-jar', self.install_and_get_path(), '--version'] self.tool_version = subprocess.check_output(cmd).strip() - + def ug(self, inBam, refFasta, outVcf, options=["--min_base_quality_score", 15, "-ploidy", 4], JVMmemory=None, threads=1): if int(threads) < 1: threads = 1 - opts = ['-I', inBam, '-R', refFasta, '-o', outVcf, - '-glm', 'BOTH', - '--baq', 'OFF', - '--useOriginalQualities', - '-out_mode', 'EMIT_ALL_SITES', - '-dt', 'NONE', - '--num_threads', threads, - '-stand_call_conf', 0, - '-stand_emit_conf', 0, - '-A', 'AlleleBalance', - ] + opts = ['-I', + inBam, + '-R', + refFasta, + '-o', + outVcf, + '-glm', + 'BOTH', + '--baq', + 'OFF', + '--useOriginalQualities', + '-out_mode', + 'EMIT_ALL_SITES', + '-dt', + 'NONE', + '--num_threads', + threads, + '-stand_call_conf', + 0, + '-stand_emit_conf', + 0, + '-A', + 'AlleleBalance',] self.execute('UnifiedGenotyper', opts + options, JVMmemory=JVMmemory) def local_realign(self, inBam, refFasta, outBam, JVMmemory=None, threads=1): intervals = util.file.mkstempfname('.intervals') opts = ['-I', inBam, '-R', refFasta, '-o', intervals] self.execute('RealignerTargetCreator', opts, JVMmemory=JVMmemory) - opts = ['-I', inBam, - '-R', refFasta, - '-targetIntervals', intervals, - '-o', outBam, - #'--num_threads', threads, + opts = ['-I', + inBam, + '-R', + refFasta, + '-targetIntervals', + intervals, + '-o', + outBam, #'--num_threads', threads, ] self.execute('IndelRealigner', opts, JVMmemory=JVMmemory) os.unlink(intervals) diff --git a/tools/last.py b/tools/last.py index a48166d8f..d49a3f8e8 100644 --- a/tools/last.py +++ b/tools/last.py @@ -1,35 +1,42 @@ "Tools in the 'last' suite." 
# built-ins -import os, logging, subprocess +import os +import logging +import subprocess # within this module import tools log = logging.getLogger(__name__) -class LastTools(tools.Tool) : + +class LastTools(tools.Tool): """ "Abstract" base class for tools in the 'last' suite. Subclasses must define class members subtoolName #and subtoolNameOnBroad. """ - def __init__(self, install_methods = None) : - if install_methods == None : + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] install_methods.append(DownloadAndBuildLast(self.subtoolName)) - #Version of last on broad is old, database-incompatible with newer + # Version of last on broad is old, database-incompatible with newer # one, so don't use it and always load the newer version #path = os.path.join(lastBroadUnixPath, self.subtoolNameOnBroad) - #install_methods.append(tools.PrexistingUnixCommand(path)) - tools.Tool.__init__(self, install_methods = install_methods) + # install_methods.append(tools.PrexistingUnixCommand(path)) + tools.Tool.__init__(self, install_methods=install_methods) -class DownloadAndBuildLast(tools.DownloadPackage) : + +class DownloadAndBuildLast(tools.DownloadPackage): lastWithVersion = 'last-490' - def __init__(self, subtoolName) : + + def __init__(self, subtoolName): url = 'http://last.cbrc.jp/{}.zip'.format(self.lastWithVersion) target_rel_path = os.path.join(self.lastWithVersion, 'bin', subtoolName) tools.DownloadPackage.__init__(self, url, target_rel_path) - def post_download(self) : + + def post_download(self): path = os.path.join(self.destination_dir, self.lastWithVersion) os.system('cd {}; make -s; make -s install prefix=.'.format(path)) @@ -42,37 +49,39 @@ def post_download(self) : fileContents = inf.read() fileContents = fileContents.replace('string.maketrans("", "")', 'None') fileContents = fileContents.replace( - '#! /usr/bin/env python', - '#! /usr/bin/env python\nfrom __future__ import print_function') + '#! /usr/bin/env python', '#! 
/usr/bin/env python\nfrom __future__ import print_function') with open(mafConvertPath, 'wt') as outf: outf.write(fileContents) - def verify_install(self) : + + def verify_install(self): 'Default checks + verify python 2.7/3.x compatibility fixes were done' - if not tools.DownloadPackage.verify_install(self) : + if not tools.DownloadPackage.verify_install(self): return False - mafConvertPath = os.path.join(self.destination_dir, - self.lastWithVersion, 'bin', 'maf-convert') - if not os.access(mafConvertPath, os.X_OK | os.R_OK) : + mafConvertPath = os.path.join(self.destination_dir, self.lastWithVersion, 'bin', 'maf-convert') + if not os.access(mafConvertPath, os.X_OK | os.R_OK): return False with open(mafConvertPath, 'rt') as inf: return 'print_function' in inf.read() -class Lastal(LastTools) : + +class Lastal(LastTools): subtoolName = 'lastal' subtoolNameOnBroad = 'lastal' -class MafSort(LastTools) : + +class MafSort(LastTools): subtoolName = 'maf-sort' subtoolNameOnBroad = 'scripts/maf-sort.sh' -class Lastdb(LastTools) : + +class Lastdb(LastTools): subtoolName = 'lastdb' subtoolNameOnBroad = 'lastdb' def execute(self, inputFasta, outputDirectory, outputFilePrefix): # get the path to the binary toolCmd = [self.install_and_get_path()] - + # if the output directory (and its parents) do not exist, create them if not os.path.exists(outputDirectory): os.makedirs(outputDirectory) @@ -86,19 +95,18 @@ def execute(self, inputFasta, outputDirectory, outputFilePrefix): # append the input filepath toolCmd.append(os.path.realpath(inputFasta)) - # lastdb writes files to the current working directory, so we need to set + # lastdb writes files to the current working directory, so we need to set # it to the desired output location os.chdir(os.path.realpath(outputDirectory)) - #execute the lastdb command + # execute the lastdb command log.debug(" ".join(toolCmd)) subprocess.check_call(toolCmd) # restore cwd os.chdir(cwdBeforeLastdb) -class MafConvert(LastTools) : + +class MafConvert(LastTools): subtoolName = 'maf-convert' subtoolNameOnBroad = 'scripts/maf-convert.py' - - diff --git a/tools/mafft.py b/tools/mafft.py index 2eaa17051..4a7a3ce86 100644 --- a/tools/mafft.py +++ b/tools/mafft.py @@ -5,37 +5,48 @@ __author__ = "tomkinsc@broadinstitute.org" - from Bio import SeqIO -import logging, tools, util.file -import os, os.path, subprocess +import logging +import tools +import util.file +import os +import os.path +import subprocess tool_version = '7.221' url = 'http://mafft.cbrc.jp/alignment/software/mafft-{ver}-{os}.{ext}' log = logging.getLogger(__name__) + class MafftTool(tools.Tool): - def __init__(self, install_methods = None): - if install_methods == None: + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] - mafft_os = get_mafft_os() - mafft_bitdepth = get_mafft_bitdepth() + mafft_os = get_mafft_os() + mafft_bitdepth = get_mafft_bitdepth() mafft_archive_extension = get_mafft_archive_extension(mafft_os) - binaryPath = get_mafft_binary_path(mafft_os, mafft_bitdepth) - binaryDir = get_mafft_binary_path(mafft_os, mafft_bitdepth, full=False) - + binaryPath = get_mafft_binary_path(mafft_os, mafft_bitdepth) + binaryDir = get_mafft_binary_path(mafft_os, mafft_bitdepth, full=False) + target_rel_path = '{binPath}'.format(binPath=binaryPath) - verify_command = 'cd {dir}/mafft-{ver}/{binDir} && {dir}/mafft-{ver}/{binPath} --version > /dev/null 2>&1'.format(dir=util.file.get_build_path(), ver=tool_version, binPath=binaryPath, binDir=binaryDir) + verify_command = 'cd 
{dir}/mafft-{ver}/{binDir} && {dir}/mafft-{ver}/{binPath} --version > /dev/null 2>&1'.format( + dir=util.file.get_build_path(), + ver=tool_version, + binPath=binaryPath, + binDir=binaryDir) destination_dir = '{dir}/mafft-{ver}'.format(dir=util.file.get_build_path(), ver=tool_version) install_methods.append( - tools.DownloadPackage( url.format(ver=tool_version, os=mafft_os, ext=mafft_archive_extension), - target_rel_path=target_rel_path, - destination_dir=destination_dir, - verifycmd=verify_command)) + tools.DownloadPackage(url.format(ver=tool_version, + os=mafft_os, + ext=mafft_archive_extension), + target_rel_path=target_rel_path, + destination_dir=destination_dir, + verifycmd=verify_command)) - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) def version(self): return tool_version @@ -48,12 +59,14 @@ def __seqIdsAreAllUnique(self, filePath, inputFormat="fasta"): # collapse like IDs using set() if len(seqIds) > len(set(seqIds)): - raise LookupError("Not all sequence IDs in input are unique for file: {}".format(os.path.basename(filePath))) + raise LookupError( + "Not all sequence IDs in input are unique for file: {}".format( + os.path.basename(filePath))) - def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorder, + def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorder, outputAsClustal, maxiters, gapOpeningPenalty=None, offset=None, threads=-1, verbose=True, retree=None): - inputFileName = "" + inputFileName = "" tempCombinedInputFile = "" # get the full paths of input and output files in case the user has specified relative paths @@ -62,14 +75,14 @@ def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorde inputFiles.append(os.path.abspath(f)) outFile = os.path.abspath(outFile) - # ensure that all sequence IDs in each input file are unique + # ensure that all sequence IDs in each input file are unique # (otherwise the alignment result makes little sense) # we can check before combining to localize duplications to a specific file for filePath in inputFiles: self.__seqIdsAreAllUnique(filePath) # if multiple fasta files are specified for input - if len(inputFiles)>1: + if len(inputFiles) > 1: # combined specified input files into a single temp FASTA file so MAFFT can read them tempFileSuffix = "" for filePath in inputFiles: @@ -79,7 +92,7 @@ def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorde for f in inputFiles: with open(f, "r") as infile: outfile.write(infile.read()) - #outFile.close() + # outFile.close() inputFileName = tempCombinedInputFile # if there is only once file specified, just use it else: @@ -130,7 +143,7 @@ def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorde toolCmd.append("--clustalout") if maxiters: toolCmd.append("--maxiterate {iters}".format(iters=maxiters)) - + toolCmd.append(inputFileName) log.debug(' '.join(toolCmd)) @@ -148,6 +161,7 @@ def execute(self, inFastas, outFile, localpair, globalpair, preservecase, reorde return outFile + def get_mafft_os(): uname = os.uname() if uname[0] == "Darwin": @@ -155,19 +169,22 @@ def get_mafft_os(): if uname[0] == "Linux": return "linux" + def get_mafft_archive_extension(mafft_os): if mafft_os == "mac": return "zip" elif mafft_os == "linux": return "tgz" + def get_mafft_bitdepth(): uname = os.uname() if uname[4] == "x86_64": return "64" - if uname[4] in ['i386','i686',"x86"]: + if uname[4] in ['i386', 'i686', "x86"]: return "32" 
+ def get_mafft_binary_path(mafft_os, bitdepth, full=True): mafftPath = "" @@ -180,7 +197,3 @@ def get_mafft_binary_path(mafft_os, bitdepth, full=True): mafftPath += "mafft.bat" return mafftPath - - - - diff --git a/tools/mosaik.py b/tools/mosaik.py index 8935f8229..ae63b45c9 100644 --- a/tools/mosaik.py +++ b/tools/mosaik.py @@ -2,39 +2,43 @@ The MOSAIK aligner ''' -import logging, os, os.path, subprocess -import tools, util.file +import logging +import os +import os.path +import subprocess +import tools +import util.file tool_version = '2.1.33' url = 'https://mosaik-aligner.googlecode.com/files/MOSAIK-{ver}-{os}.tar' log = logging.getLogger(__name__) -class MosaikTool(tools.Tool) : - def __init__(self) : + +class MosaikTool(tools.Tool): + + def __init__(self): if os.uname()[0] == 'Darwin': if os.uname()[4].endswith('_64'): os.environ['BLD_PLATFORM'] = 'macosx64' else: os.environ['BLD_PLATFORM'] = 'macosx' install_methods = [] - destination_dir = os.path.join( - util.file.get_build_path(), 'mosaik-{}'.format(tool_version)) + destination_dir = os.path.join(util.file.get_build_path(), 'mosaik-{}'.format(tool_version)) install_methods.append( - DownloadAndBuildMosaik(url.format(ver=tool_version, os='source'), - os.path.join(destination_dir, 'bin', 'MosaikAligner'), - destination_dir)) - tools.Tool.__init__(self, install_methods = install_methods) - - def version(self) : + DownloadAndBuildMosaik(url.format(ver=tool_version, + os='source'), + os.path.join(destination_dir, 'bin', 'MosaikAligner'), + destination_dir)) + tools.Tool.__init__(self, install_methods=install_methods) + + def version(self): return tool_version - + def get_networkFile(self): # this is the directory to return - dir = os.path.join(util.file.get_build_path(), - 'mosaik-{}'.format(tool_version), - 'MOSAIK-{}-source'.format(tool_version), - 'networkFile') + dir = os.path.join(util.file.get_build_path(), 'mosaik-{}'.format(tool_version), + 'MOSAIK-{}-source'.format(tool_version), 'networkFile') if not os.path.isdir(dir): # if it doesn't exist, run just the download-unpack portion of the # source installer @@ -42,19 +46,19 @@ def get_networkFile(self): assert os.path.isdir(dir) return dir -class DownloadAndBuildMosaik(tools.DownloadPackage) : - def post_download(self) : - mosaikDir = os.path.join(self.destination_dir, - 'MOSAIK-{}-source'.format(tool_version)) - if tool_version == "2.1.33" : + +class DownloadAndBuildMosaik(tools.DownloadPackage): + + def post_download(self): + mosaikDir = os.path.join(self.destination_dir, 'MOSAIK-{}-source'.format(tool_version)) + if tool_version == "2.1.33": # In this version, obsolete LDFLAGS breaks make. 
Remove it makeFilePath = os.path.join(mosaikDir, 'Makefile') os.rename(makeFilePath, makeFilePath + '.orig') - with open(makeFilePath + '.orig') as inf : + with open(makeFilePath + '.orig') as inf: makeText = inf.read() - with open(makeFilePath, 'wt') as outf : - outf.write(makeText.replace('export LDFLAGS = -Wl', - '#export LDFLAGS = -Wl')) - + with open(makeFilePath, 'wt') as outf: + outf.write(makeText.replace('export LDFLAGS = -Wl', '#export LDFLAGS = -Wl')) + # Now we can make: os.system('cd "{}" && make -s'.format(mosaikDir)) diff --git a/tools/muscle.py b/tools/muscle.py index 8c8c29c61..fa4f79753 100644 --- a/tools/muscle.py +++ b/tools/muscle.py @@ -3,40 +3,50 @@ http://www.drive5.com/muscle ''' -import logging, tools, util.file -import os, os.path, subprocess +import logging +import tools +import util.file +import os +import os.path +import subprocess tool_version = '3.8.31' url = 'http://www.drive5.com/muscle/downloads{ver}/muscle{ver}_{os}.tar.gz' log = logging.getLogger(__name__) -class MuscleTool(tools.Tool) : - def __init__(self, install_methods = None) : - if install_methods == None : + +class MuscleTool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] muscle_os = get_muscle_os() if muscle_os != 'src': install_methods.append( - tools.DownloadPackage(url.format(ver=tool_version, os=muscle_os), - 'muscle{}_{}'.format(tool_version, muscle_os), - verifycmd='{}/muscle{}_{} -version > /dev/null 2>&1'.format(util.file.get_build_path(), tool_version, muscle_os))) + tools.DownloadPackage(url.format(ver=tool_version, + os=muscle_os), + 'muscle{}_{}'.format(tool_version, muscle_os), + verifycmd='{}/muscle{}_{} -version > /dev/null 2>&1'.format( + util.file.get_build_path(), tool_version, muscle_os))) install_methods.append( - tools.DownloadPackage(url.format(ver=tool_version, os=muscle_os), - 'muscle{}/src/muscle'.format(tool_version), - post_download_command='cd muscle{}/src; make -s'.format(tool_version), - verifycmd='{}/muscle{}/src/muscle -version > /dev/null 2>&1'.format(util.file.get_build_path(), tool_version))) - tools.Tool.__init__(self, install_methods = install_methods) - - def version(self) : + tools.DownloadPackage(url.format(ver=tool_version, + os=muscle_os), + 'muscle{}/src/muscle'.format(tool_version), + post_download_command='cd muscle{}/src; make -s'.format(tool_version), + verifycmd='{}/muscle{}/src/muscle -version > /dev/null 2>&1'.format( + util.file.get_build_path(), tool_version))) + tools.Tool.__init__(self, install_methods=install_methods) + + def version(self): return tool_version - + def execute(self, inFasta, outFasta, - maxiters=None, maxhours=None, format='fasta', diags=None, quiet=True, logFile=None): + maxiters=None, maxhours=None, format='fasta', diags=None, quiet=True, logFile=None): toolCmd = [self.install_and_get_path(), '-in', inFasta, '-out', outFasta] - + if format in ('html', 'msf', 'clw', 'clwstrict'): - toolCmd.append('-'+format) + toolCmd.append('-' + format) else: if format != 'fasta': raise Exception() @@ -50,13 +60,13 @@ def execute(self, inFasta, outFasta, toolCmd.append('-maxhours {}'.format(maxhours)) if logFile: toolCmd.append('-log {}'.format(logFile)) - + log.debug(' '.join(toolCmd)) subprocess.check_call(toolCmd) def get_muscle_os(): uname = os.uname() - if uname[4].startswith('x86') and uname[0] in ('Darwin','Linux'): + if uname[4].startswith('x86') and uname[0] in ('Darwin', 'Linux'): return 'i86' + uname[0].lower() + uname[4][-2:] return 'src' diff --git 
a/tools/mvicuna.py b/tools/mvicuna.py index 579562897..7eeee8cc6 100644 --- a/tools/mvicuna.py +++ b/tools/mvicuna.py @@ -1,19 +1,25 @@ "tools.Tool for mvicuna." -import logging, os, subprocess, shutil -import tools, util.file +import logging +import os +import subprocess +import shutil +import tools +import util.file # BroadUnixPath = '/gsap/garage-viral/viral/analysis/xyang/programs'\ # '/M-Vicuna/bin/mvicuna' log = logging.getLogger(__name__) -class MvicunaTool(tools.Tool) : - def __init__(self, install_methods = None): - if install_methods == None : + +class MvicunaTool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: path = _get_mvicuna_path() install_methods = [tools.PrexistingUnixCommand(path)] - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) def rmdup(self, inPair, outPair, outUnpaired=None): """ @@ -36,35 +42,31 @@ def rmdup(self, inPair, outPair, outUnpaired=None): """ if not outUnpaired: outUnpaired = util.file.mkstempfname(suffix='.unpaired.fastq') - tmp1OutPair = ( util.file.mkstempfname(suffix='.tmp1out.1.fastq'), - util.file.mkstempfname(suffix='.tmp1out.2.fastq')) - tmp2OutPair = ( util.file.mkstempfname(suffix='.tmp2out.1.fastq'), - util.file.mkstempfname(suffix='.tmp2out.2.fastq')) - cmdline = [self.install_and_get_path(), - '-ipfq', ','.join(inPair), - '-opfq', ','.join(tmp2OutPair), - '-osfq', outUnpaired, - '-drm_op', ','.join(tmp1OutPair), - '-tasks', 'DupRm'] + tmp1OutPair = (util.file.mkstempfname(suffix='.tmp1out.1.fastq'), + util.file.mkstempfname(suffix='.tmp1out.2.fastq')) + tmp2OutPair = (util.file.mkstempfname(suffix='.tmp2out.1.fastq'), + util.file.mkstempfname(suffix='.tmp2out.2.fastq')) + cmdline = [self.install_and_get_path(), '-ipfq', ','.join(inPair), '-opfq', ','.join(tmp2OutPair), '-osfq', + outUnpaired, '-drm_op', ','.join(tmp1OutPair), '-tasks', 'DupRm'] log.debug(' '.join(cmdline)) subprocess.check_call(cmdline) for tmpfname, outfname in zip(tmp2OutPair, outPair): shutil.copyfile(tmpfname, outfname) -def _get_mvicuna_path() : +def _get_mvicuna_path(): uname = os.uname() - if uname[0] == 'Darwin' : + if uname[0] == 'Darwin': osName = 'MacOSX' - elif uname[0] == 'Linux' and uname[4].endswith('64') : + elif uname[0] == 'Linux' and uname[4].endswith('64'): osName = 'linux64' - else : - log.debug('mvicuna not implemented for OS %s %s', - uname[0], uname[4]) + else: + log.debug('mvicuna not implemented for OS %s %s', uname[0], uname[4]) return '' binariesPath = util.file.get_binaries_path() return os.path.join(binariesPath, 'mvicuna', osName, 'mvicuna') + """ Instructions for building mvicuna on Mac OS X Mavericks: - Install brew diff --git a/tools/novoalign.py b/tools/novoalign.py index 9537d2691..7eefb342d 100644 --- a/tools/novoalign.py +++ b/tools/novoalign.py @@ -1,18 +1,27 @@ ''' Novoalign aligner by Novocraft - + This is commercial software that has different licenses depending on use cases. As such, we do not have an auto-downloader. The user must have Novoalign pre-installed on their own and available either in $PATH or $NOVOALIGN_PATH. 
''' -import tools, tools.picard, tools.samtools, util.file -import logging, os, os.path, subprocess, stat +import tools +import tools.picard +import tools.samtools +import util.file +import logging +import os +import os.path +import subprocess +import stat log = logging.getLogger(__name__) -class NovoalignTool(tools.Tool) : + +class NovoalignTool(tools.Tool): + def __init__(self, path=None): self.tool_version = None install_methods = [] @@ -21,8 +30,8 @@ def __init__(self, path=None): install_methods.append(tools.PrexistingUnixCommand( os.path.join(novopath, 'novoalign'), require_executability=True)) - tools.Tool.__init__(self, install_methods = install_methods) - + tools.Tool.__init__(self, install_methods=install_methods) + def version(self): if self.tool_version is None: self._get_tool_version() @@ -35,14 +44,13 @@ def _get_tool_version(self): with open(tmpf, 'rt') as inf: self.tool_version = inf.readline().strip().split()[1] os.unlink(tmpf) - + def _fasta_to_idx_name(self, fasta): if not fasta.endswith('.fasta'): raise ValueError('input file %s must end with .fasta' % fasta) return fasta[:-6] + '.nix' - - def execute(self, inBam, refFasta, outBam, - options=["-r", "Random"], min_qual=0, JVMmemory=None): + + def execute(self, inBam, refFasta, outBam, options=["-r", "Random"], min_qual=0, JVMmemory=None): ''' Execute Novoalign on BAM inputs and outputs. If the BAM contains multiple read groups, break up the input and perform Novoalign separately on each one @@ -51,89 +59,88 @@ def execute(self, inBam, refFasta, outBam, If min_qual>0, use Samtools to filter on mapping quality. ''' samtools = tools.samtools.SamtoolsTool() - + # fetch list of RGs rgs = list(samtools.getReadGroups(inBam).keys()) - - if len(rgs)==0: + + if len(rgs) == 0: # Can't do this raise InvalidBamHeaderError("{} lacks read groups".format(inBam)) - + elif len(rgs) == 1: # Only one RG, keep it simple - self.align_one_rg_bam(inBam, refFasta, outBam, - options=options, min_qual=min_qual, JVMmemory=JVMmemory) - + self.align_one_rg_bam(inBam, refFasta, outBam, options=options, min_qual=min_qual, JVMmemory=JVMmemory) + else: # Multiple RGs, align one at a time and merge align_bams = [] for rg in rgs: tmp_bam = util.file.mkstempfname('.{}.bam'.format(rg)) - self.align_one_rg_bam(inBam, refFasta, tmp_bam, - rgid=rg, - options=options, min_qual=min_qual, JVMmemory=JVMmemory) - if os.path.getsize(tmp_bam)>0: + self.align_one_rg_bam(inBam, + refFasta, + tmp_bam, + rgid=rg, + options=options, + min_qual=min_qual, + JVMmemory=JVMmemory) + if os.path.getsize(tmp_bam) > 0: align_bams.append(tmp_bam) - + # Merge BAMs, sort, and index tools.picard.MergeSamFilesTool().execute( - align_bams, outBam, - picardOptions=['SORT_ORDER=coordinate', - 'USE_THREADING=true', 'CREATE_INDEX=true'], + align_bams, + outBam, + picardOptions=['SORT_ORDER=coordinate', 'USE_THREADING=true', 'CREATE_INDEX=true'], JVMmemory=JVMmemory) for bam in align_bams: os.unlink(bam) - + def align_one_rg_bam(self, inBam, refFasta, outBam, - rgid=None, options=["-r", "Random"], min_qual=0, JVMmemory=None): + rgid=None, options=["-r", "Random"], min_qual=0, JVMmemory=None): ''' Execute Novoalign on BAM inputs and outputs. Requires that only one RG exists (will error otherwise). Use Picard to sort and index the output BAM. If min_qual>0, use Samtools to filter on mapping quality. 
''' samtools = tools.samtools.SamtoolsTool() - + # Require exactly one RG rgs = samtools.getReadGroups(inBam) - if len(rgs)==0: + if len(rgs) == 0: raise InvalidBamHeaderError("{} lacks read groups".format(inBam)) elif len(rgs) == 1: if not rgid: rgid = list(rgs.keys())[0] elif not rgid: - raise InvalidBamHeaderError( - "{} has {} read groups, but we require exactly one".format( - inBam, len(rgs))) + raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs))) if rgid not in rgs: - raise InvalidBamHeaderError( - "{} has read groups, but not {}".format( - inBam, rgid)) + raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid)) rg = rgs[rgid] - + # Strip inBam to just one RG (if necessary) - if len(rgs)==1: + if len(rgs) == 1: one_rg_inBam = inBam else: # strip inBam to one read group tmp_bam = util.file.mkstempfname('.onebam.bam') samtools.view(['-b', '-r', rgid], inBam, tmp_bam) # special exit if this file is empty - if samtools.count(tmp_bam)==0: + if samtools.count(tmp_bam) == 0: return # simplify BAM header otherwise Novoalign gets confused one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid)) headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid)) with open(headerFile, 'wt') as outf: for row in samtools.getHeader(inBam): - if len(row)>0 and row[0]=='@RG': + if len(row) > 0 and row[0] == '@RG': if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]: # skip all read groups that are not rgid continue - outf.write('\t'.join(row)+'\n') + outf.write('\t'.join(row) + '\n') samtools.reheader(tmp_bam, headerFile, one_rg_inBam) os.unlink(tmp_bam) os.unlink(headerFile) - + # Novoalign tmp_sam = util.file.mkstempfname('.novoalign.sam') cmd = [self.install_and_get_path(), '-f', one_rg_inBam] + list(map(str, options)) @@ -141,7 +148,7 @@ def align_one_rg_bam(self, inBam, refFasta, outBam, log.debug(' '.join(cmd)) with open(tmp_sam, 'wt') as outf: subprocess.check_call(cmd, stdout=outf) - + # Samtools filter (optional) if min_qual: tmp_bam2 = util.file.mkstempfname('.filtered.bam') @@ -151,13 +158,14 @@ def align_one_rg_bam(self, inBam, refFasta, outBam, subprocess.check_call(cmd, stdout=outf) os.unlink(tmp_sam) tmp_sam = tmp_bam2 - + # Picard SortSam sorter = tools.picard.SortSamTool() - sorter.execute(tmp_sam, outBam, sort_order='coordinate', - picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'], - JVMmemory=JVMmemory) - + sorter.execute(tmp_sam, + outBam, + sort_order='coordinate', + picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'], + JVMmemory=JVMmemory) def index_fasta(self, refFasta): ''' Index a FASTA file (reference genome) for use with Novoalign. diff --git a/tools/picard.py b/tools/picard.py index 6898a8b0e..82d49fbcd 100644 --- a/tools/picard.py +++ b/tools/picard.py @@ -2,106 +2,118 @@ Tools in the Picard suite ''' -import logging, os, os.path, subprocess, tempfile, shutil +import logging +import os +import os.path +import subprocess +import tempfile +import shutil import pysam -import tools, util.file +import tools +import util.file tool_version = '1.126' url = 'https://github.com/broadinstitute/picard/releases/download/' \ - + '{ver}/picard-tools-{ver}.zip'.format(ver=tool_version) + + '{ver}/picard-tools-{ver}.zip'.format(ver=tool_version) # Note: Version 1.126 is latest as of 2014-12-02 # Note: /seq/software/picard/{versionnumber}/ does not correspond with github release numbers! 
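# Illustrative expansion of the pinned Picard download URL above:
ver = '1.126'
picard_zip_url = ('https://github.com/broadinstitute/picard/releases/download/'
                  '{ver}/picard-tools-{ver}.zip'.format(ver=ver))
# -> 'https://github.com/broadinstitute/picard/releases/download/1.126/picard-tools-1.126.zip'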
log = logging.getLogger(__name__) -class PicardTools(tools.Tool) : + +class PicardTools(tools.Tool): """Base class for tools in the picard suite.""" jvmMemDefault = '2g' - def __init__(self, install_methods = None) : - if install_methods == None : + + def __init__(self, install_methods=None): + if install_methods is None: target_rel_path = 'picard-tools-{}/picard.jar'.format(tool_version) - install_methods = [ - tools.DownloadPackage(url, target_rel_path, require_executability=False)] - tools.Tool.__init__(self, install_methods = install_methods) - def version(self) : + install_methods = [tools.DownloadPackage(url, target_rel_path, require_executability=False)] + tools.Tool.__init__(self, install_methods=install_methods) + + def version(self): return tool_version - def execute(self, command, picardOptions=[], JVMmemory=None) : - if JVMmemory==None: + + def execute(self, command, picardOptions=[], JVMmemory=None): + if JVMmemory is None: JVMmemory = self.jvmMemDefault - toolCmd = ['java', - '-Xmx' + JVMmemory, - '-Djava.io.tmpdir=' + tempfile.tempdir, - '-jar', self.install_and_get_path(), - command] + picardOptions + toolCmd = ['java', '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.tempdir, '-jar', + self.install_and_get_path(), command] + picardOptions log.debug(' '.join(toolCmd)) subprocess.check_call(toolCmd) - def dict_to_picard_opts(self, options) : - return ["%s=%s" % (k,v) for k,v in options.items()] -class RevertSamTool(PicardTools) : + def dict_to_picard_opts(self, options): + return ["%s=%s" % (k, v) for k, v in options.items()] + + +class RevertSamTool(PicardTools): subtoolName = 'RevertSam' - def execute(self, inBam, outBam, - picardOptions=[], JVMmemory=None) : - opts = ['INPUT='+inBam, 'OUTPUT='+outBam] + + def execute(self, inBam, outBam, picardOptions=[], JVMmemory=None): + opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class MarkDuplicatesTool(PicardTools) : + +class MarkDuplicatesTool(PicardTools): subtoolName = 'MarkDuplicates' - def execute(self, inBams, outBam, outMetrics=None, - picardOptions=[], JVMmemory=None) : - if not outMetrics : + + def execute(self, inBams, outBam, outMetrics=None, picardOptions=[], JVMmemory=None): + if not outMetrics: outMetrics = util.file.mkstempfname('.metrics') - opts = ['INPUT='+bam for bam in inBams] + [ - 'OUTPUT='+outBam, 'METRICS_FILE='+outMetrics] + opts = ['INPUT=' + bam for bam in inBams] + ['OUTPUT=' + outBam, 'METRICS_FILE=' + outMetrics] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class SamToFastqTool(PicardTools) : + +class SamToFastqTool(PicardTools): subtoolName = 'SamToFastq' - def execute(self, inBam, outFastq1, outFastq2, - picardOptions=[], JVMmemory=None) : - opts = ['INPUT='+inBam, - 'FASTQ='+outFastq1, 'SECOND_END_FASTQ='+outFastq2, - 'VALIDATION_STRINGENCY=SILENT'] + + def execute(self, inBam, outFastq1, outFastq2, picardOptions=[], JVMmemory=None): + opts = ['INPUT=' + inBam, 'FASTQ=' + outFastq1, 'SECOND_END_FASTQ=' + outFastq2, 'VALIDATION_STRINGENCY=SILENT' + ] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) - def per_read_group(self, inBam, outDir, picardOptions=[], JVMmemory=None) : - opts = ['INPUT='+inBam, 'OUTPUT_DIR='+outDir, 'OUTPUT_PER_RG=true'] + + def per_read_group(self, inBam, outDir, picardOptions=[], JVMmemory=None): + opts = ['INPUT=' + inBam, 'OUTPUT_DIR=' + outDir, 'OUTPUT_PER_RG=true'] PicardTools.execute(self, self.subtoolName, opts + picardOptions, 
JVMmemory) -class FastqToSamTool(PicardTools) : + +class FastqToSamTool(PicardTools): subtoolName = 'FastqToSam' - def execute(self, inFastq1, inFastq2, sampleName, outBam, - picardOptions=[], JVMmemory=None) : - opts = ['FASTQ='+inFastq1, 'FASTQ2='+inFastq2, - 'OUTPUT='+outBam, 'SAMPLE_NAME='+sampleName] + + def execute(self, inFastq1, inFastq2, sampleName, outBam, picardOptions=[], JVMmemory=None): + opts = ['FASTQ=' + inFastq1, 'FASTQ2=' + inFastq2, 'OUTPUT=' + outBam, 'SAMPLE_NAME=' + sampleName] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class SortSamTool(PicardTools) : + +class SortSamTool(PicardTools): subtoolName = 'SortSam' valid_sort_orders = ['unsorted', 'queryname', 'coordinate'] default_sort_order = 'coordinate' - def execute(self, inBam, outBam, sort_order = default_sort_order, - picardOptions=[], JVMmemory=None) : - if sort_order not in self.valid_sort_orders : + + def execute(self, inBam, outBam, sort_order=default_sort_order, picardOptions=[], JVMmemory=None): + if sort_order not in self.valid_sort_orders: raise Exception("invalid sort order") - opts = ['INPUT='+inBam, 'OUTPUT='+outBam, 'SORT_ORDER='+sort_order] + opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam, 'SORT_ORDER=' + sort_order] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class MergeSamFilesTool(PicardTools) : + +class MergeSamFilesTool(PicardTools): subtoolName = 'MergeSamFiles' - def execute(self, inBams, outBam, - picardOptions=[], JVMmemory=None) : - opts = ['INPUT='+bam for bam in inBams] + ['OUTPUT='+outBam] + + def execute(self, inBams, outBam, picardOptions=[], JVMmemory=None): + opts = ['INPUT=' + bam for bam in inBams] + ['OUTPUT=' + outBam] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class FilterSamReadsTool(PicardTools) : + +class FilterSamReadsTool(PicardTools): ''' TO DO: it might be desirable to replace this tool with a non-Picard/non-Java approach that uses samtools/pysam, sqlite, and O(1) memory. 
''' subtoolName = 'FilterSamReads' jvmMemDefault = '4g' - def execute(self, inBam, exclude, readList, outBam, - picardOptions=[], JVMmemory=None) : + + def execute(self, inBam, exclude, readList, outBam, picardOptions=[], JVMmemory=None): if os.path.getsize(readList) == 0: # Picard FilterSamReads cannot deal with an empty READ_LIST_FILE if exclude: @@ -118,19 +130,19 @@ def execute(self, inBam, exclude, readList, outBam, # pysam.AlignmentFile cannot write an empty file # samtools cannot convert SAM -> BAM on an empty file # but Picard SamFormatConverter can deal with empty files - opts = ['INPUT='+tmpf, 'OUTPUT='+outBam, 'VERBOSITY=ERROR'] + opts = ['INPUT=' + tmpf, 'OUTPUT=' + outBam, 'VERBOSITY=ERROR'] PicardTools.execute(self, 'SamFormatConverter', opts, JVMmemory='50m') else: - opts = ['INPUT='+inBam, 'OUTPUT='+outBam, 'READ_LIST_FILE='+readList, - 'FILTER='+(exclude and 'excludeReadList' or 'includeReadList'), - 'WRITE_READS_FILES=false'] + opts = ['INPUT=' + inBam, 'OUTPUT=' + outBam, 'READ_LIST_FILE=' + readList, 'FILTER=' + + (exclude and 'excludeReadList' or 'includeReadList'), 'WRITE_READS_FILES=false'] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class CreateSequenceDictionaryTool(PicardTools) : + +class CreateSequenceDictionaryTool(PicardTools): subtoolName = 'CreateSequenceDictionary' jvmMemDefault = '512m' - def execute(self, inFasta, outDict=None, overwrite=False, - picardOptions=[], JVMmemory=None) : + + def execute(self, inFasta, outDict=None, overwrite=False, picardOptions=[], JVMmemory=None): if not outDict: if inFasta.lower().endswith('.fa'): outDict = inFasta[:-3] + '.dict' @@ -143,76 +155,75 @@ def execute(self, inFasta, outDict=None, overwrite=False, os.unlink(outDict) else: return - opts = ['REFERENCE='+inFasta, 'OUTPUT='+outDict] + opts = ['REFERENCE=' + inFasta, 'OUTPUT=' + outDict] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class BuildBamIndexTool(PicardTools) : + +class BuildBamIndexTool(PicardTools): subtoolName = 'BuildBamIndex' jvmMemDefault = '512m' - def execute(self, inBam, picardOptions=[], JVMmemory=None) : - opts = ['INPUT='+inBam] + + def execute(self, inBam, picardOptions=[], JVMmemory=None): + opts = ['INPUT=' + inBam] PicardTools.execute(self, self.subtoolName, opts + picardOptions, JVMmemory) -class ExtractIlluminaBarcodesTool(PicardTools) : + +class ExtractIlluminaBarcodesTool(PicardTools): subtoolName = 'ExtractIlluminaBarcodes' jvmMemDefault = '8g' - defaults = {'read_structure':'101T8B8B101T', - 'max_mismatches':0, 'minimum_base_quality':25, - 'num_processors':8} - option_list = ('read_structure', 'max_mismatches', 'minimum_base_quality', - 'min_mismatch_delta', 'max_no_calls', 'minimum_quality', - 'compress_outputs', 'num_processors') - def execute(self, basecalls_dir, lane, barcode_file, - output_dir, metrics, - picardOptions={}, JVMmemory=None) : + defaults = {'read_structure': '101T8B8B101T', 'max_mismatches': 0, 'minimum_base_quality': 25, 'num_processors': 8} + option_list = ('read_structure', 'max_mismatches', 'minimum_base_quality', 'min_mismatch_delta', 'max_no_calls', + 'minimum_quality', 'compress_outputs', 'num_processors') + + def execute(self, basecalls_dir, lane, barcode_file, output_dir, metrics, picardOptions={}, JVMmemory=None): opts_dict = self.defaults.copy() - for k,v in picardOptions.items(): + for k, v in picardOptions.items(): opts_dict[k] = v opts = [] - for k,v in opts_dict.items(): - if v != None: + for k, v in opts_dict.items(): + if v is not None: 
if type(v) in (list, tuple): for x in v: opts.append('='.join((k.upper(), str(x)))) else: opts.append('='.join((k.upper(), str(v)))) - opts += ['BASECALLS_DIR='+basecalls_dir, - 'LANE='+str(lane), - 'BARCODE_FILE='+barcode_file, - 'METRICS_FILE='+metrics] - if output_dir != None: - opts += ['OUTPUT_DIR='+output_dir] + opts += ['BASECALLS_DIR=' + basecalls_dir, 'LANE=' + str(lane), 'BARCODE_FILE=' + barcode_file, + 'METRICS_FILE=' + metrics] + if output_dir is not None: + opts += ['OUTPUT_DIR=' + output_dir] PicardTools.execute(self, self.subtoolName, opts, JVMmemory) -class IlluminaBasecallsToSamTool(PicardTools) : + +class IlluminaBasecallsToSamTool(PicardTools): subtoolName = 'IlluminaBasecallsToSam' jvmMemDefault = '54g' - defaults = {'read_structure':'101T8B8B101T', 'sequencing_center':'BI', + defaults = { + 'read_structure': '101T8B8B101T', + 'sequencing_center': 'BI', 'adapters_to_check': ('PAIRED_END', 'NEXTERA_V1', 'NEXTERA_V2'), - 'max_reads_in_ram_per_tile':100000, 'max_records_in_ram':100000, - 'num_processors':8, 'force_gc':False} - option_list = ('read_structure', 'sequencing_center', 'adapters_to_check', - 'platform', 'max_reads_in_ram_per_tile', 'max_records_in_ram', 'num_processors', - 'apply_eamss_filter', 'force_gc', 'first_tile', 'tile_limit', - 'include_non_pf_reads', 'run_start_date', 'read_group_id') + 'max_reads_in_ram_per_tile': 100000, + 'max_records_in_ram': 100000, + 'num_processors': 8, + 'force_gc': False + } + option_list = ('read_structure', 'sequencing_center', 'adapters_to_check', 'platform', 'max_reads_in_ram_per_tile', + 'max_records_in_ram', 'num_processors', 'apply_eamss_filter', 'force_gc', 'first_tile', + 'tile_limit', 'include_non_pf_reads', 'run_start_date', 'read_group_id') + def execute(self, basecalls_dir, barcodes_dir, run_barcode, lane, library_params, - picardOptions={}, JVMmemory=None) : + picardOptions={}, JVMmemory=None): opts_dict = self.defaults.copy() - for k,v in picardOptions.items(): + for k, v in picardOptions.items(): opts_dict[k] = v opts = [] - for k,v in opts_dict.items(): - if v != None: + for k, v in opts_dict.items(): + if v is not None: if type(v) in (list, tuple): for x in v: opts.append('='.join((k.upper(), str(x)))) else: opts.append('='.join((k.upper(), str(v)))) - opts += ['BASECALLS_DIR='+basecalls_dir, - 'BARCODES_DIR='+barcodes_dir, - 'LANE='+str(lane), - 'RUN_BARCODE='+run_barcode, - 'LIBRARY_PARAMS='+library_params] + opts += ['BASECALLS_DIR=' + basecalls_dir, 'BARCODES_DIR=' + barcodes_dir, 'LANE=' + str(lane), + 'RUN_BARCODE=' + run_barcode, 'LIBRARY_PARAMS=' + library_params] PicardTools.execute(self, self.subtoolName, opts, JVMmemory) - diff --git a/tools/prinseq.py b/tools/prinseq.py index 026a3d980..b1af29381 100644 --- a/tools/prinseq.py +++ b/tools/prinseq.py @@ -1,20 +1,21 @@ "tools.Tool for prinseq." 
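# Standalone sketch of the option-building loop used by the Illumina Picard wrappers
# above: an options dict becomes Picard-style KEY=VALUE strings, and list/tuple values
# repeat the key once per element. The dict below is a subset of the
# ExtractIlluminaBarcodesTool defaults, shown only for illustration.
defaults = {'read_structure': '101T8B8B101T', 'max_mismatches': 0, 'num_processors': 8}
opts = []
for k, v in defaults.items():
    if v is not None:
        if type(v) in (list, tuple):
            for x in v:
                opts.append('='.join((k.upper(), str(x))))
        else:
            opts.append('='.join((k.upper(), str(v))))
# e.g. (order may vary): ['READ_STRUCTURE=101T8B8B101T', 'MAX_MISMATCHES=0', 'NUM_PROCESSORS=8']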
-import tools, util.file +import tools +import util.file url = 'http://sourceforge.net/projects/prinseq/files/standalone/' \ 'prinseq-lite-0.19.3.tar.gz' -class PrinseqTool(tools.Tool) : - def __init__(self, install_methods = None): - if install_methods == None: + +class PrinseqTool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] target_rel_path = 'prinseq-lite-0.19.3/prinseq-lite.pl' install_methods.append( - tools.DownloadPackage(url, target_rel_path, - post_download_command='chmod +x {}'.format(target_rel_path), - require_executability = True)) - tools.Tool.__init__(self, install_methods = install_methods) - - - + tools.DownloadPackage(url, + target_rel_path, + post_download_command='chmod +x {}'.format(target_rel_path), + require_executability=True)) + tools.Tool.__init__(self, install_methods=install_methods) diff --git a/tools/samtools.py b/tools/samtools.py index fda6fcac4..d535e6785 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -1,19 +1,23 @@ ''' The Samtools package. - + TO DO: much of this stuff can be all eliminated by using pysam instead, as pysam (in its current versions) is meant to be the complete python implementation of htslib/samtools. - + http://pysam.readthedocs.org/en/latest/usage.html#using-samtools-commands-within-python - + Current bug with pysam 0.8.1: nosetests does not work unless you use --nocapture. python -m unittest works. Something about redirecting stdout. Actually, Travis CI still has issues with pysam and stdout even with --nocapture. ''' -import logging, tools, util.file -import os, os.path, subprocess +import logging +import tools +import util.file +import os +import os.path +import subprocess from collections import OrderedDict #import pysam @@ -23,17 +27,21 @@ log = logging.getLogger(__name__) -class SamtoolsTool(tools.Tool) : - def __init__(self, install_methods = None) : - if install_methods is None : + +class SamtoolsTool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [ - tools.DownloadPackage(url, 'samtools-{}/samtools'.format(tool_version), - post_download_command='cd samtools-{}; make -s'.format(tool_version))] - tools.Tool.__init__(self, install_methods = install_methods) - - def version(self) : + tools.DownloadPackage(url, + 'samtools-{}/samtools'.format(tool_version), + post_download_command='cd samtools-{}; make -s'.format(tool_version)) + ] + tools.Tool.__init__(self, install_methods=install_methods) + + def version(self): return tool_version - + def execute(self, command, args, stdin=None, stdout=None, stderr=None): toolCmd = [self.install_and_get_path(), command] + args log.debug(' '.join(toolCmd)) @@ -53,17 +61,17 @@ def execute(self, command, args, stdin=None, stdout=None, stderr=None): def view(self, args, inFile, outFile, regions=[]): self.execute('view', args + ['-o', outFile, inFile] + regions) - + def merge(self, inFiles, outFile, options=['-f']): "Merge a list of inFiles to create outFile." # We are using -f for now because mkstempfname actually makes an empty # file, and merge fails with that as output target without the -f. # When mkstempfname is fixed, we should remove the -f. 
self.execute('merge', options + [outFile] + inFiles) - + def index(self, inBam): self.execute('index', [inBam]) - + def faidx(self, inFasta, overwrite=False): ''' Index reference genome for samtools ''' outfname = inFasta + '.fai' @@ -72,13 +80,13 @@ def faidx(self, inFasta, overwrite=False): os.unlink(outfname) else: return - #pysam.faidx(inFasta) + # pysam.faidx(inFasta) self.execute('faidx', [inFasta]) - + def reheader(self, inBam, headerFile, outBam): self.execute('reheader', [headerFile, inBam], stdout=outBam) - - def dumpHeader(self, inBam, outHeader) : + + def dumpHeader(self, inBam, outHeader): if inBam.endswith('.bam'): opts = ['-H'] elif inBam.endswith('.sam'): @@ -90,7 +98,7 @@ def removeDoublyMappedReads(self, inBam, outBam): #opts = ['-b', '-f 2'] opts = ['-b', '-F' '1028', '-f', '2'] self.view(opts, inBam, outBam) - + def getHeader(self, inBam): ''' fetch BAM header as a list of tuples (already split on tabs) ''' tmpf = util.file.mkstempfname('.txt') @@ -99,7 +107,7 @@ def getHeader(self, inBam): header = list(line.rstrip('\n').split('\t') for line in inf) os.unlink(tmpf) return header - + def getReadGroups(self, inBam): ''' fetch all read groups from the BAM header as an OrderedDict of RG ID -> RG dict. The RG dict is a mapping of read group keyword @@ -107,17 +115,14 @@ def getReadGroups(self, inBam): and not stripped out. ID is required for all read groups. Resulting keys are in same order as @RG lines in bam file. ''' - rgs = [dict(x.split(':', 1) for x in row[1:]) - for row in self.getHeader(inBam) - if len(row)>0 and row[0]=='@RG'] + rgs = [dict(x.split(':', 1) for x in row[1:]) for row in self.getHeader(inBam) + if len(row) > 0 and row[0] == '@RG'] return OrderedDict((rg['ID'], rg) for rg in rgs) - + def count(self, inBam, opts=[], regions=[]): cmd = [self.install_and_get_path(), 'view', '-c'] + opts + [inBam] + regions - #return int(pysam.view(*cmd)[0].strip()) + # return int(pysam.view(*cmd)[0].strip()) return int(subprocess.check_output(cmd).strip()) - def mpileup(self, inBam, outPileup, opts = []): - self.execute('mpileup', opts + [inBam], stdout = outPileup, - stderr = '/dev/null') # Suppress info messages - + def mpileup(self, inBam, outPileup, opts=[]): + self.execute('mpileup', opts + [inBam], stdout=outPileup, stderr='/dev/null') # Suppress info messages diff --git a/tools/scripts/noBlastHits_v3.py b/tools/scripts/noBlastHits_v3.py index 62bba1773..6d80c9c1b 100755 --- a/tools/scripts/noBlastHits_v3.py +++ b/tools/scripts/noBlastHits_v3.py @@ -2,39 +2,40 @@ import argparse import sys -parser = argparse.ArgumentParser(description=\ - 'This program outputs to stdout reads that have no blast hits') -parser.add_argument('-b',action="store",dest="blastPath",required=True,\ - help="path to the blast hits file") -parser.add_argument('-r',action="store",dest="readsPath",required=True,\ - help="path to the reads file") -parser.add_argument('-m',action="store",dest="hit",required=True,\ - help="hit => output reads with hits, nohit => output reads with no hits",choices=['hit','nohit']) +parser = argparse.ArgumentParser(description='This program outputs to stdout reads that have no blast hits') +parser.add_argument('-b', action="store", dest="blastPath", required=True, help="path to the blast hits file") +parser.add_argument('-r', action="store", dest="readsPath", required=True, help="path to the reads file") +parser.add_argument('-m', + action="store", + dest="hit", + required=True, + help="hit => output reads with hits, nohit => output reads with no hits", + 
choices=['hit', 'nohit']) args = parser.parse_args() blastReads = {} blastFile = open(args.blastPath, 'r') for line in blastFile: - blastReads[(line[0:line.find('\t')])] = True + blastReads[(line[0:line.find('\t')])] = True blastFile.close() readsFile = open(args.readsPath, 'r') -nohit = args.hit == "nohit" -isFastq = args.readsPath.endswith( '.fastq' ) +nohit = args.hit == "nohit" +isFastq = args.readsPath.endswith('.fastq') while True: - line1 = readsFile.readline() - line2 = readsFile.readline() - if not line2: - break - line3 = '' - line4 = '' - if isFastq: - line3 = readsFile.readline() - if not line3: - break - line4 = readsFile.readline() - if not line4: - break - if nohit != (line1[1:line1.find('\n')] in blastReads): - sys.stdout.write(line1+line2+line3+line4) + line1 = readsFile.readline() + line2 = readsFile.readline() + if not line2: + break + line3 = '' + line4 = '' + if isFastq: + line3 = readsFile.readline() + if not line3: + break + line4 = readsFile.readline() + if not line4: + break + if nohit != (line1[1:line1.find('\n')] in blastReads): + sys.stdout.write(line1 + line2 + line3 + line4) readsFile.close() diff --git a/tools/scripts/noBlastLikeHits.py b/tools/scripts/noBlastLikeHits.py index d5e493700..d974756d9 100755 --- a/tools/scripts/noBlastLikeHits.py +++ b/tools/scripts/noBlastLikeHits.py @@ -4,35 +4,39 @@ import argparse import sys -parser = argparse.ArgumentParser(description=\ - 'This program outputs to stdout reads that have no blast hits') -parser.add_argument('-b',action="store",dest="blastPath",required=True,\ - help="path to the blast-like hits file") -parser.add_argument('-r',action="store",dest="readsPath",required=True,\ - help="path to the reads file") -parser.add_argument('-m',action="store",dest="hit",required=True,\ - help="hit => output reads with hits, nohit => output reads with no hits",choices=['hit','nohit']) +parser = argparse.ArgumentParser(description='This program outputs to stdout reads that have no blast hits') +parser.add_argument('-b', action="store", dest="blastPath", required=True, help="path to the blast-like hits file") +parser.add_argument('-r', action="store", dest="readsPath", required=True, help="path to the reads file") +parser.add_argument('-m', + action="store", + dest="hit", + required=True, + help="hit => output reads with hits, nohit => output reads with no hits", + choices=['hit', 'nohit']) args = parser.parse_args() -#finds the nth and n+1th occurrence of a substring +# finds the nth and n+1th occurrence of a substring + + def find_nth(str, substr, n): - pos = list() - i = 0 - for j in range(n+1): - i = str.find(substr, i + len(substr)) - if(j == n-1): - pos.append(i) - elif(j == n): - pos.append(i) - break - return pos - + pos = list() + i = 0 + for j in range(n + 1): + i = str.find(substr, i + len(substr)) + if (j == n - 1): + pos.append(i) + elif (j == n): + pos.append(i) + break + return pos + + blastReads = {} blastFile = open(args.blastPath, 'r') for line in blastFile: - if line.find('#') != 0: - pos = find_nth(line,'\t',6) - blastReads[line[pos.pop(0)+1:pos.pop(0)]] = True + if line.find('#') != 0: + pos = find_nth(line, '\t', 6) + blastReads[line[pos.pop(0) + 1:pos.pop(0)]] = True blastFile.close() FILE = open(args.readsPath, 'r') @@ -40,71 +44,71 @@ def find_nth(str, substr, n): char = FILE.readline()[0:1] FILE.seek(0) offsets = list() -#fasta format +# fasta format if char == '>': - while True: - line = FILE.readline() - if not line: - break - if line[0:1] == char: - count += 1 - 
offsets.append(FILE.tell()-len(line)) -#fastq format + while True: + line = FILE.readline() + if not line: + break + if line[0:1] == char: + count += 1 + offsets.append(FILE.tell() - len(line)) +# fastq format elif char == '@': - while True: - offsets.append(FILE.tell()) - line = FILE.readline() - if not line: - offsets.pop(count) - break - count += 1 - linesToPlus = 0 - while FILE.readline()[0:1] != '+': - linesToPlus += 1 - for i in range(0, linesToPlus): - FILE.readline() + while True: + offsets.append(FILE.tell()) + line = FILE.readline() + if not line: + offsets.pop(count) + break + count += 1 + linesToPlus = 0 + while FILE.readline()[0:1] != '+': + linesToPlus += 1 + for i in range(0, linesToPlus): + FILE.readline() else: - print("Your file does not appear to be fasta or fastq") - sys.exit() + print("Your file does not appear to be fasta or fastq") + sys.exit() if args.hit == "nohit": - for i in range(0, len(offsets)): - FILE.seek(offsets[i]) - line = FILE.readline() - if not(line.split()[0][1:] in blastReads): - if i == count-1: - FILE.seek(offsets[i]) - while True: - line = FILE.readline() - if not line: - break - sys.stdout.write(line) - else: - curOffset = 0 - targetOffset = offsets[i+1]-offsets[i] + for i in range(0, len(offsets)): FILE.seek(offsets[i]) - while curOffset != targetOffset: - line = FILE.readline() - curOffset += len(line) - sys.stdout.write(line) + line = FILE.readline() + if not (line.split()[0][1:] in blastReads): + if i == count - 1: + FILE.seek(offsets[i]) + while True: + line = FILE.readline() + if not line: + break + sys.stdout.write(line) + else: + curOffset = 0 + targetOffset = offsets[i + 1] - offsets[i] + FILE.seek(offsets[i]) + while curOffset != targetOffset: + line = FILE.readline() + curOffset += len(line) + sys.stdout.write(line) else: - for i in range(0, len(offsets)): - FILE.seek(offsets[i]) - line = FILE.readline() - if line.split()[0][1:] in blastReads: - if i == count-1: - FILE.seek(offsets[i]) - while True: - line = FILE.readline() - if not line: - break - sys.stdout.write(line) - else: - curOffset = 0 - targetOffset = offsets[i+1]-offsets[i] + for i in range(0, len(offsets)): FILE.seek(offsets[i]) - while curOffset != targetOffset: - line = FILE.readline() - curOffset += len(line) - sys.stdout.write(line) + line = FILE.readline() + if line.split()[0][1:] in blastReads: + if i == count - 1: + FILE.seek(offsets[i]) + while True: + line = FILE.readline() + if not line: + break + sys.stdout.write(line) + else: + curOffset = 0 + targetOffset = offsets[i + 1] - offsets[i] + FILE.seek(offsets[i]) + while curOffset != targetOffset: + line = FILE.readline() + curOffset += len(line) + sys.stdout.write(line) FILE.close() diff --git a/tools/scripts/subsampler.py b/tools/scripts/subsampler.py index 1798c8c81..55d3eaa97 100755 --- a/tools/scripts/subsampler.py +++ b/tools/scripts/subsampler.py @@ -4,47 +4,58 @@ import sys import subprocess -parser = argparse.ArgumentParser(description=\ - 'This program outputs to stdout a user-defined number of reads from given reads file[s]') -parser.add_argument('-mode',action="store",dest="mode",required=True,\ - help="s => single-end, p => paired-end") -#parser.add_argument('-format',action="store",dest="format",required=True,\ +parser = argparse.ArgumentParser( + description='This program outputs to stdout a user-defined number of reads from given reads file[s]') +parser.add_argument('-mode', action="store", dest="mode", required=True, help="s => single-end, p => paired-end") +# 
parser.add_argument('-format',action="store",dest="format",required=True,\ # help="fasta => fasta, fastq => fastq",choices=['fasta','fastq']) -parser.add_argument('-n',action="store",type=int,dest="numSeq",required=True,\ - help="specifies number of reads to output") -parser.add_argument('-in',action="store",dest="inputs",nargs='+',required=True,\ - help="specifies input files (put both input files separated by a space for paired end)") -parser.add_argument('-out',action="store",dest="outputs",nargs='*',required=False,\ - default=['/dev/stdout'],help="specifies output files (default for single end is stdout)") +parser.add_argument('-n', + action="store", + type=int, + dest="numSeq", + required=True, + help="specifies number of reads to output") +parser.add_argument('-in', + action="store", + dest="inputs", + nargs='+', + required=True, + help="specifies input files (put both input files separated by a space for paired end)") +parser.add_argument('-out', + action="store", + dest="outputs", + nargs='*', + required=False, + default=['/dev/stdout'], + help="specifies output files (default for single end is stdout)") args = parser.parse_args() #format = args.format numSeq = args.numSeq mode = args.mode reads = args.inputs outs = args.outputs -if mode=='s': - if len(reads) != 1 or len(outs) > 1: - print("please specify only one read file and at most one output file (if paired end, specify with -pe)") - sys.exit() -elif mode=='p': - if len(reads) != 2 or len(outs) != 2: - print("please specify both read files and two output files") - sys.exit() +if mode == 's': + if len(reads) != 1 or len(outs) > 1: + print("please specify only one read file and at most one output file (if paired end, specify with -pe)") + sys.exit() +elif mode == 'p': + if len(reads) != 2 or len(outs) != 2: + print("please specify both read files and two output files") + sys.exit() fileName1 = reads[0] if fileName1.find(".gz") != -1: - subprocess.call("gunzip "+fileName1, shell=True) - fileName1 = fileName1[0: -3] + subprocess.call("gunzip " + fileName1, shell=True) + fileName1 = fileName1[0:-3] FILE = open(fileName1, 'r') if mode == 'p': - fileName2 = reads[1] - if fileName2.find(".gz") != -1: - subprocess.call("gunzip "+fileName2, shell=True) - fileName2 = fileName1[0: -3] - FILE2 = open(fileName2,'r') + fileName2 = reads[1] + if fileName2.find(".gz") != -1: + subprocess.call("gunzip " + fileName2, shell=True) + fileName2 = fileName1[0:-3] + FILE2 = open(fileName2, 'r') random.seed() -#name of the input file (fasta or fastq) -#assumes input file is standard fasta/fastq format - +# name of the input file (fasta or fastq) +# assumes input file is standard fasta/fastq format """ calculate number of total reads pseudorandomly determine which reads will be extracted @@ -53,110 +64,109 @@ char = FILE.readline()[0:1] FILE.seek(0) offsets = list() -#fasta format +# fasta format if char == '>': - while True: - line = FILE.readline() - if not line: - break - if line[0:1] == char: - count += 1 - offsets.append(FILE.tell()-len(line)) - if mode == 'p': - offsets2 = list() while True: - line = FILE2.readline() - if not line: - break - if line[0:1] == char: - offsets2.append(FILE2.tell()-len(line)) -#fastq format + line = FILE.readline() + if not line: + break + if line[0:1] == char: + count += 1 + offsets.append(FILE.tell() - len(line)) + if mode == 'p': + offsets2 = list() + while True: + line = FILE2.readline() + if not line: + break + if line[0:1] == char: + offsets2.append(FILE2.tell() - len(line)) +# fastq format elif char == '@': - 
while True: - offsets.append(FILE.tell()) - line = FILE.readline() - if not line: - offsets.pop(count) - break - count += 1 - linesToPlus = 0 - while FILE.readline()[0:1] != '+': - linesToPlus += 1 - for i in range(0, linesToPlus): - FILE.readline() - if mode == 'p': - offsets2 = list() while True: - offsets2.append(FILE2.tell()) - line = FILE2.readline() - if not line: - offsets2.pop(count) - break - linesToPlus = 0 - while FILE2.readline()[0:1] != '+': - linesToPlus += 1 - for i in range(0, linesToPlus): - FILE2.readline() + offsets.append(FILE.tell()) + line = FILE.readline() + if not line: + offsets.pop(count) + break + count += 1 + linesToPlus = 0 + while FILE.readline()[0:1] != '+': + linesToPlus += 1 + for i in range(0, linesToPlus): + FILE.readline() + if mode == 'p': + offsets2 = list() + while True: + offsets2.append(FILE2.tell()) + line = FILE2.readline() + if not line: + offsets2.pop(count) + break + linesToPlus = 0 + while FILE2.readline()[0:1] != '+': + linesToPlus += 1 + for i in range(0, linesToPlus): + FILE2.readline() else: - print("Your file does not appear to be a valid format") - sys.exit() + print("Your file does not appear to be a valid format") + sys.exit() if count < numSeq: - numSeq = count + numSeq = count selected = list(range(0, count)) random.shuffle(selected) -selected = selected[0: numSeq] -selected.sort() +selected = sorted(selected[0:numSeq]) if mode == 's': - out = open(outs[0],'w') - for i in range(0, numSeq): - if selected[i] == count-1: - FILE.seek(offsets[selected[i]]) - while True: - line = FILE.readline() - if not line: - break - out.write(line) - else: - curOffset = 0 - targetOffset = offsets[selected[i]+1]-offsets[selected[i]] - FILE.seek(offsets[selected[i]]) - while curOffset != targetOffset: - line = FILE.readline() - curOffset += len(line) - out.write(line) - FILE.close() - out.close() - sys.exit() -#to avoid incessant checking + out = open(outs[0], 'w') + for i in range(0, numSeq): + if selected[i] == count - 1: + FILE.seek(offsets[selected[i]]) + while True: + line = FILE.readline() + if not line: + break + out.write(line) + else: + curOffset = 0 + targetOffset = offsets[selected[i] + 1] - offsets[selected[i]] + FILE.seek(offsets[selected[i]]) + while curOffset != targetOffset: + line = FILE.readline() + curOffset += len(line) + out.write(line) + FILE.close() + out.close() + sys.exit() +# to avoid incessant checking elif mode == 'p': - out1 = open(outs[0],'w') - out2 = open(outs[1],'w') - for i in range(0, numSeq): - if selected[i] == count-1: - FILE.seek(offsets[selected[i]]) - FILE2.seek(offsets2[selected[i]]) - while True: - line = FILE.readline() - line2 = FILE2.readline() - if not line: - break - out1.write(line) - out2.write(line2) - else: - curOffset = 0 - targetOffset = offsets[selected[i]+1]-offsets[selected[i]] - FILE.seek(offsets[selected[i]]) - FILE2.seek(offsets2[selected[i]]) - while curOffset != targetOffset: - line = FILE.readline() - line2 = FILE2.readline() - curOffset += len(line) - out1.write(line) - out2.write(line2) - FILE.close() - FILE2.close() - out1.close() - out2.close() - sys.exit() + out1 = open(outs[0], 'w') + out2 = open(outs[1], 'w') + for i in range(0, numSeq): + if selected[i] == count - 1: + FILE.seek(offsets[selected[i]]) + FILE2.seek(offsets2[selected[i]]) + while True: + line = FILE.readline() + line2 = FILE2.readline() + if not line: + break + out1.write(line) + out2.write(line2) + else: + curOffset = 0 + targetOffset = offsets[selected[i] + 1] - offsets[selected[i]] + FILE.seek(offsets[selected[i]]) + 
FILE2.seek(offsets2[selected[i]]) + while curOffset != targetOffset: + line = FILE.readline() + line2 = FILE2.readline() + curOffset += len(line) + out1.write(line) + out2.write(line2) + FILE.close() + FILE2.close() + out1.close() + out2.close() + sys.exit() diff --git a/tools/snpeff.py b/tools/snpeff.py index 9f801bc25..3d38cb628 100644 --- a/tools/snpeff.py +++ b/tools/snpeff.py @@ -5,18 +5,24 @@ # built-ins import hashlib -import os, tempfile, logging, subprocess +import os +import tempfile +import logging +import subprocess # third-party import pysam # module-specific -import tools, util.file, util.genbank +import tools +import util.file +import util.genbank log = logging.getLogger(__name__) URL = 'http://downloads.sourceforge.net/project/snpeff/snpEff_v4_1i_core.zip' + class SnpEff(tools.Tool): jvmMemDefault = '4g' @@ -25,7 +31,7 @@ def __init__(self, install_methods=None, extra_genomes=['KJ660346.2']): install_methods = [DownloadAndTweakSnpEff(URL, extra_genomes)] self.known_dbs = set() self.installed_dbs = set() - super(SnpEff, self).__init__(install_methods = install_methods) + super(SnpEff, self).__init__(install_methods=install_methods) def version(self): return "4.1" @@ -33,11 +39,8 @@ def version(self): def execute(self, command, args, JVMmemory=None, stdin=None, stdout=None): if JVMmemory is None: JVMmemory = self.jvmMemDefault - toolCmd = ['java', - '-Xmx' + JVMmemory, - '-Djava.io.tmpdir=' + tempfile.tempdir, - '-jar', self.install_and_get_path(), - command] + args + toolCmd = ['java', '-Xmx' + JVMmemory, '-Djava.io.tmpdir=' + tempfile.tempdir, '-jar', + self.install_and_get_path(), command] + args log.debug(' '.join(toolCmd)) subprocess.check_call(toolCmd, stdin=stdin, stdout=stdout) @@ -73,23 +76,23 @@ def create_db(self, accessions, emailAddress, JVMmemory): outputDir = os.path.realpath(os.path.join(os.path.dirname(config_file), dataDir, databaseId)) #tempDir = tempfile.gettempdir() - records = util.genbank.fetch_full_records_from_genbank(accessions, - outputDir, - emailAddress, - forceOverwrite=True, - combinedFilePrefix="genes", - removeSeparateFiles=False) + records = util.genbank.fetch_full_records_from_genbank(accessions, + outputDir, + emailAddress, + forceOverwrite=True, + combinedFilePrefix="genes", + removeSeparateFiles=False) combinedGenbankFilepath = records[0] - add_genomes_to_snpeff_config_file(config_file, [(databaseId, sortedAccessionString, sortedAccessionString)]) + add_genomes_to_snpeff_config_file( + config_file, [ + (databaseId, sortedAccessionString, sortedAccessionString) + ]) self.known_dbs.add(databaseId) self.installed_dbs.add(databaseId) - args = [ - '-genbank', - '-v', databaseId - ] - self.execute('build', args, JVMmemory=JVMmemory) + args = ['-genbank', '-v', databaseId] + self.execute('build', args, JVMmemory=JVMmemory) def available_databases(self): toolCmd = ['java', '-jar', self.install_and_get_path(), 'databases'] @@ -105,9 +108,9 @@ def available_databases(self): split_points = list(line.index(key) for key in keys) elif not line.startswith('----'): indexes = split_points + [len(line)] - row = dict((keys[i], line[indexes[i]:indexes[i+1]].strip()) for i in range(len(split_points))) + row = dict((keys[i], line[indexes[i]:indexes[i + 1]].strip()) for i in range(len(split_points))) self.known_dbs.add(row['Genome']) - if row.get('Status')=='OK': + if row.get('Status') == 'OK': self.installed_dbs.add(row['Genome']) yield row @@ -128,14 +131,14 @@ def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None): genomeToUse = "" # if 
we don't have the genome, by name (snpEff official) or by hash (custom) - if (not self.has_genome(databaseId)): + if (not self.has_genome(databaseId)): if (not self.has_genome(genomes[0])): log.info("Checking for snpEff database online...") # check to see if it is available for download, and if so install it for row in self.available_databases(): if (genomes[0].lower() in row['Genome'].lower()) or ( - genomes[0].lower() in row['Bundle'].lower()) or ( - genomes[0].lower() in row['Organism'].lower()): + genomes[0].lower() in row['Bundle'].lower()) or ( + genomes[0].lower() in row['Organism'].lower()): self.download_db(row['Genome']) # backward compatability for where a single genome name is provided @@ -151,21 +154,15 @@ def annotate_vcf(self, inVcf, genomes, outVcf, emailAddress, JVMmemory=None): if not genomeToUse: raise Exception() - + args = [ - '-treatAllAsProteinCoding', 'false', - '-t', - '-noLog', - '-ud', '0', - '-noStats', - '-noShiftHgvs', - genomeToUse, + '-treatAllAsProteinCoding', 'false', '-t', '-noLog', '-ud', '0', '-noStats', '-noShiftHgvs', genomeToUse, os.path.realpath(inVcf) - ] + ] with open(tmpVcf, 'wt') as outf: self.execute('ann', args, JVMmemory=JVMmemory, stdout=outf) - + if outVcf.endswith('.vcf.gz'): pysam.tabix_compress(tmpVcf, outVcf, force=True) pysam.tabix_index(outVcf, force=True, preset='vcf') @@ -177,10 +174,11 @@ def get_data_dir(config_file): with open(config_file, 'rt') as inf: for line in inf: if line.strip().startswith('data.dir'): - dataDir = line[line.find("=")+1:].strip() + dataDir = line[line.find("=") + 1:].strip() break return dataDir + def add_genomes_to_snpeff_config_file(config_file, new_genomes): """ new_genomes is a 3-tuple (g,d,c): @@ -193,7 +191,7 @@ def add_genomes_to_snpeff_config_file(config_file, new_genomes): for line in inf: if not line.startswith('#') and line.strip(): i = line.find('.genome : ') - if i>=0: + if i >= 0: genomes.add(line[:i]) with open(config_file, 'at') as outf: for (g, d, c) in new_genomes: @@ -202,11 +200,13 @@ def add_genomes_to_snpeff_config_file(config_file, new_genomes): if g != c: outf.write("\t{}.chromosomes : {}\n".format(g, c)) + class DownloadAndTweakSnpEff(tools.DownloadPackage): + def __init__(self, url, extra_genomes=[]): self.extra_genomes = extra_genomes - super(DownloadAndTweakSnpEff, self).__init__( - url, 'snpEff/snpEff.jar', require_executability=False) + super(DownloadAndTweakSnpEff, self).__init__(url, 'snpEff/snpEff.jar', require_executability=False) + def post_download(self): config_file = os.path.join(self.destination_dir, 'snpEff', 'snpEff.config') add_genomes_to_snpeff_config_file(config_file, zip(self.extra_genomes, self.extra_genomes, self.extra_genomes)) diff --git a/tools/tango.py b/tools/tango.py index e69de29bb..8b1378917 100644 --- a/tools/tango.py +++ b/tools/tango.py @@ -0,0 +1 @@ + diff --git a/tools/tbl2asn.py b/tools/tbl2asn.py index a3d8f8dd0..9e2af86b1 100644 --- a/tools/tbl2asn.py +++ b/tools/tbl2asn.py @@ -5,36 +5,43 @@ __author__ = "dpark@broadinstitute.org" -import logging, tools, util.file -import os, os.path, subprocess, gzip +import logging +import tools +import util.file +import os +import os.path +import subprocess +import gzip url = 'ftp://ftp.ncbi.nih.gov/toolbox/ncbi_tools/converters/by_program/tbl2asn/{os}.tbl2asn.gz' log = logging.getLogger(__name__) + class Tbl2AsnTool(tools.Tool): - def __init__(self, install_methods = None): + + def __init__(self, install_methods=None): if install_methods is None: install_methods = 
[DownloadGzipBinary(url.format(os=get_bintype()), 'tbl2asn')] - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) def version(self): return None def execute(self, templateFile, inputDir, outputDir=None, - source_quals=[], comment=None, verification='vb', - file_type='s', structured_comment_file=None, - per_genome_comment=False): - + source_quals=[], comment=None, verification='vb', + file_type='s', structured_comment_file=None, + per_genome_comment=False): + toolCmd = [self.install_and_get_path(), '-t', templateFile] - + if inputDir: toolCmd += ['-p', inputDir] if outputDir: toolCmd += ['-r', outputDir] if source_quals: toolCmd.append('-j') - toolCmd.append(' '.join("[{}={}]".format(k,v) for k,v in source_quals)) + toolCmd.append(' '.join("[{}={}]".format(k, v) for k, v in source_quals)) if comment: toolCmd += ['-y', comment] if structured_comment_file: @@ -45,7 +52,7 @@ def execute(self, templateFile, inputDir, outputDir=None, toolCmd += ['-a', file_type] if per_genome_comment: toolCmd += ['-X', 'C'] - + log.debug(' '.join(toolCmd)) subprocess.check_call(toolCmd) @@ -62,11 +69,12 @@ def get_bintype(): else: raise Exception('unsupported OS') + class DownloadGzipBinary(tools.DownloadPackage): + def unpack(self, download_dir): util.file.mkdir_p(self.destination_dir) - if (self.download_file.endswith('.gz') and - not self.download_file.endswith('.tar.gz')): + if (self.download_file.endswith('.gz') and not self.download_file.endswith('.tar.gz')): with gzip.open(os.path.join(download_dir, self.download_file)) as inf: with open(self.targetpath, 'wb') as outf: outf.write(inf.read()) diff --git a/tools/trimmomatic.py b/tools/trimmomatic.py index 0a5212b8a..0eaaafe72 100644 --- a/tools/trimmomatic.py +++ b/tools/trimmomatic.py @@ -5,11 +5,13 @@ trimmomaticURL = 'http://www.usadellab.org/cms/uploads/supplementary/' \ 'Trimmomatic/Trimmomatic-0.32.zip' -class TrimmomaticTool(tools.Tool) : - def __init__(self, install_methods = None) : - if install_methods == None : + +class TrimmomaticTool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: install_methods = [] install_methods.append(tools.DownloadPackage(trimmomaticURL, - 'Trimmomatic-0.32/trimmomatic-0.32.jar', - require_executability=False)) - tools.Tool.__init__(self, install_methods = install_methods) + 'Trimmomatic-0.32/trimmomatic-0.32.jar', + require_executability=False)) + tools.Tool.__init__(self, install_methods=install_methods) diff --git a/tools/trinity.py b/tools/trinity.py index a2e3dfbff..8846bae0c 100644 --- a/tools/trinity.py +++ b/tools/trinity.py @@ -1,94 +1,92 @@ ''' The Trinity RNA-SEQ assembler - + This uses an older version of Trinity that uses an older assembly algorithm that works better with highly diverse viral genomes. 
''' -import logging, os, os.path, subprocess, tempfile, shutil +import logging +import os +import os.path +import subprocess +import tempfile +import shutil import tools tool_version = "2011-11-26" trinityVersion = "trinityrnaseq_r{}".format(tool_version) -url = "http://sourceforge.net/projects/trinityrnaseq/files/{}.tgz".format( - trinityVersion) +url = "http://sourceforge.net/projects/trinityrnaseq/files/{}.tgz".format(trinityVersion) log = logging.getLogger(__name__) -class TrinityTool(tools.Tool) : +class TrinityTool(tools.Tool): jvmMemDefault = '4g' - def __init__(self, install_methods = None) : - if install_methods == None : - install_methods = [ - DownloadAndBuildTrinity(url, trinityVersion + '/Trinity.pl') - ] - tools.Tool.__init__(self, install_methods = install_methods) - - def version(self) : + + def __init__(self, install_methods=None): + if install_methods is None: + install_methods = [DownloadAndBuildTrinity(url, trinityVersion + '/Trinity.pl')] + tools.Tool.__init__(self, install_methods=install_methods) + + def version(self): return tool_version - - def execute(self, inFastq1, inFastq2, outFasta, min_contig_length=300, - JVMmemory=None, threads=1): - if JVMmemory==None: + + def execute(self, inFastq1, inFastq2, outFasta, min_contig_length=300, JVMmemory=None, threads=1): + if JVMmemory is None: JVMmemory = self.jvmMemDefault outdir = tempfile.mkdtemp(prefix='trinity-') if int(threads) < 1: threads = 1 - cmd = [self.install_and_get_path(), - '--CPU', '{}'.format(int(threads)), - '--bflyHeapSpace', JVMmemory.upper(), - '--min_contig_length', str(min_contig_length), - '--seqType', 'fq', - '--left', inFastq1, - '--right', inFastq2, - '--output', outdir] + cmd = [self.install_and_get_path(), '--CPU', '{}'.format(int(threads)), '--bflyHeapSpace', JVMmemory.upper(), + '--min_contig_length', str(min_contig_length), '--seqType', 'fq', '--left', inFastq1, '--right', + inFastq2, '--output', outdir] log.debug(' '.join(cmd)) subprocess.check_call(cmd) shutil.copyfile(os.path.join(outdir, 'Trinity.fasta'), outFasta) shutil.rmtree(outdir, ignore_errors=True) -class DownloadAndBuildTrinity(tools.DownloadPackage) : - def post_download(self) : + +class DownloadAndBuildTrinity(tools.DownloadPackage): + + def post_download(self): trinityDir = os.path.join(self.destination_dir, trinityVersion) - if tool_version == "2011-11-26" : + if tool_version == "2011-11-26": # Chrysalis doesn't compile. Need to add an include file. 
- badFilePath = os.path.join(trinityDir, 'Chrysalis', 'analysis', - 'RunButterfly.cc') + badFilePath = os.path.join(trinityDir, 'Chrysalis', 'analysis', 'RunButterfly.cc') os.rename(badFilePath, badFilePath + '.orig') with open(badFilePath, 'wt') as outf: outf.write('#include \n') - with open(badFilePath+'.orig', 'rt') as inf: + with open(badFilePath + '.orig', 'rt') as inf: for line in inf: outf.write(line) - + # Trinity.pl insists on Java 1.6, but Java >= 1.6 is fine badFilePath = os.path.join(trinityDir, 'Trinity.pl') os.rename(badFilePath, badFilePath + '.orig') with open(badFilePath, 'wt') as outf: - with open(badFilePath+'.orig', 'rt') as inf: + with open(badFilePath + '.orig', 'rt') as inf: for line in inf: if line.startswith('unless ($java_version =~ /java version'): outf.write('$java_version =~ /java version \"1\.(\d+)\./;\n') outf.write('unless ($1 >= 6) {\n') else: outf.write(line) - shutil.copymode(badFilePath+'.orig', badFilePath) - + shutil.copymode(badFilePath + '.orig', badFilePath) + # Now we can make: os.system('cd "{}" && make -s'.format(trinityDir)) shutil.rmtree(os.path.join(trinityDir, 'sample_data'), ignore_errors=True) - - def verify_install(self) : - if not tools.DownloadPackage.verify_install(self) : + + def verify_install(self): + if not tools.DownloadPackage.verify_install(self): return False # Verify that chrysalis and inchworm were built trinityDir = os.path.join(self.destination_dir, trinityVersion) chrysalisPath = os.path.join(trinityDir, 'Chrysalis', 'Chrysalis') - inchwormPath = os.path.join(trinityDir, 'Inchworm', 'src', 'inchworm') - for path in [chrysalisPath, inchwormPath] : - if not os.access(path, (os.X_OK | os.R_OK)) : + inchwormPath = os.path.join(trinityDir, 'Inchworm', 'src', 'inchworm') + for path in [chrysalisPath, inchwormPath]: + if not os.access(path, (os.X_OK | os.R_OK)): log.debug('%s was not built.', path) self.installed = False self.installed = True diff --git a/tools/vphaser2.py b/tools/vphaser2.py index d055fd946..2273951d9 100644 --- a/tools/vphaser2.py +++ b/tools/vphaser2.py @@ -2,48 +2,52 @@ V-Phaser 2 variant caller ''' -import logging, subprocess, os, tempfile, shutil +import logging +import subprocess +import os +import tempfile +import shutil import pysam -import tools, util.file +import tools +import util.file log = logging.getLogger(__name__) -class Vphaser2Tool(tools.Tool) : - def __init__(self, install_methods = None) : - if install_methods == None : + +class Vphaser2Tool(tools.Tool): + + def __init__(self, install_methods=None): + if install_methods is None: path = _get_vphaser2_path() install_methods = [tools.PrexistingUnixCommand(path)] - tools.Tool.__init__(self, install_methods = install_methods) + tools.Tool.__init__(self, install_methods=install_methods) - def execute(self, inBam, outDir, numThreads = None) : - cmd = [self.install_and_get_path(), - '-i', inBam, - '-o', outDir - ] + def execute(self, inBam, outDir, numThreads=None): + cmd = [self.install_and_get_path(), '-i', inBam, '-o', outDir] cmdStr = ' '.join(cmd) envCopy = os.environ.copy() - if numThreads != None : + if numThreads is not None: envCopy['OMP_NUM_THREADS'] = str(numThreads) cmdStr = 'OMP_NUM_THREADS=%d ' % numThreads + cmdStr log.debug(cmdStr) - + # Use check_output instead of check_call so that we get error information # if the executable can't run on travis. # Also has the effect of suppressing informational messages from vphaser, # which is probably a good thing. 
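The threading and error handling above follow a small, reusable pattern: copy the parent environment, override one variable for the child process only, and capture combined stdout and stderr so a failing run still leaves something to read. A standalone sketch of that pattern (the echo command is only a placeholder):

import os
import subprocess

env_copy = os.environ.copy()
env_copy['OMP_NUM_THREADS'] = '4'          # limits threading in the child process only
try:
    out = subprocess.check_output(['echo', 'hello'], env=env_copy, stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as ex:
    print(ex.output)                       # surfaces the child's combined stdout/stderr on failure
    raise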
- try : - subprocess.check_output(cmd, env = envCopy, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as ex : - print(ex.output) # Useful in case of no log handler. + try: + subprocess.check_output(cmd, env=envCopy, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as ex: + print(ex.output) # Useful in case of no log handler. log.error(ex.output) raise - def iterate(self, inBam, numThreads = None) : + def iterate(self, inBam, numThreads=None): """ - Run V-Phaser 2 on inBam. Interate through lines in files + Run V-Phaser 2 on inBam. Interate through lines in files CHROM.var.raw.txt in order of chroms in the inBam header. For each line yield: - [CHROM, Ref_Pos, Var, Cons, Strd_bias_pval, Type, Var_perc, + [CHROM, Ref_Pos, Var, Cons, Strd_bias_pval, Type, Var_perc, SNP_or_LP_Profile1, SNP_or_LP_Profile2, ...] """ outdir = tempfile.mkdtemp('vphaser2') @@ -55,25 +59,25 @@ def iterate(self, inBam, numThreads = None) : if os.path.isfile(bti): os.unlink(bti) chromNames = pysam.Samfile(inBam).references - for chromName in chromNames : + for chromName in chromNames: outfile = os.path.join(outdir, chromName + '.var.raw.txt') - if not os.path.exists(outfile) : + if not os.path.exists(outfile): continue - with open(outfile, 'rt') as inf : - for line in inf : - if not line.startswith('#') : + with open(outfile, 'rt') as inf: + for line in inf: + if not line.startswith('#'): yield [chromName] + line.strip().split() shutil.rmtree(outdir) -def _get_vphaser2_path() : + +def _get_vphaser2_path(): uname = os.uname() - if uname[0] == 'Darwin' : + if uname[0] == 'Darwin': osName = 'MacOSX' - elif uname[0] == 'Linux' and uname[4].endswith('64') : + elif uname[0] == 'Linux' and uname[4].endswith('64'): osName = 'linux64' - else : - log.debug('V-Phaser 2 not implemented for OS %s %s', - uname[0], uname[4]) + else: + log.debug('V-Phaser 2 not implemented for OS %s %s', uname[0], uname[4]) return '' binariesPath = util.file.get_binaries_path() return os.path.join(binariesPath, 'V-Phaser-2.0', osName, 'variant_caller') diff --git a/util/__init__.py b/util/__init__.py index e69de29bb..8b1378917 100644 --- a/util/__init__.py +++ b/util/__init__.py @@ -0,0 +1 @@ + diff --git a/util/annot.py b/util/annot.py index 0225c33c3..06e0e5b69 100644 --- a/util/annot.py +++ b/util/annot.py @@ -4,14 +4,22 @@ __version__ = "PLACEHOLDER" __date__ = "PLACEHOLDER" -import sqlite3, itertools, urllib, logging, re, os -import util.file, util.misc +import sqlite3 +import itertools +import urllib +import logging +import re +import os +import util.file +import util.misc log = logging.getLogger(__name__) + class SnpAnnotater(object): ''' Add annotations to snps based on a snpEff-annotated VCF file. 
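A sketch of how this class might be driven, based only on the constructor, iterator protocol, and annotate() defined below; the file name and positions are illustrative, and the VCF is assumed to carry snpEff EFF annotations:

from util.annot import SnpAnnotater

rows = [{'chr': 'chr1', 'pos': 1234}, {'chr': 'chr1', 'pos': 5678}]  # any iterator of dicts with 'chr' and 'pos'
with SnpAnnotater(snpEffVcf='variants.snpeff.vcf.gz', snpIterator=iter(rows)) as annotater:
    for row in annotater:
        # Every row comes back with 'effect' and 'impact' (set to 'UNKNOWN' when the position is not annotated).
        print(row['chr'], row['pos'], row['effect'], row['impact'])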
''' + def __init__(self, snpEffVcf=None, snpIterator=None): self.snpIterator = snpIterator self.dbFile = util.file.mkstempfname(prefix='SnpAnnotater-', suffix='.db') @@ -33,68 +41,75 @@ def __init__(self, snpEffVcf=None, snpIterator=None): self.cur.execute("create index idx_annot on annot(chr,pos)") if snpEffVcf: self.loadVcf(snpEffVcf) + def loadVcf(self, snpEffVcf): #log.info("reading in snpEff VCF file: %s" % snpEffVcf) with util.file.open_or_gzopen(snpEffVcf, 'rt') as inf: ffp = util.file.FlatFileParser(inf) try: - imap = hasattr(itertools, 'imap') and itertools.imap or map #py2 & py3 compatibility - ifilter = hasattr(itertools, 'ifilter') and itertools.ifilter or filter #py2 & py3 compatibility + imap = hasattr(itertools, 'imap') and itertools.imap or map # py2 & py3 compatibility + ifilter = hasattr(itertools, 'ifilter') and itertools.ifilter or filter # py2 & py3 compatibility self.cur.executemany("""insert into annot (chr,pos,allele_ref,allele_alt, effect,impact,gene_id,gene_name,protein_pos,residue_ref,residue_alt) - values (?,?,?,?,?,?,?,?,?,?,?)""", - imap(lambda row: - [row['CHROM'], int(row['POS']), row['REF'], row['ALT']] - + parse_eff(row['CHROM'], row['POS'], row['INFO']), - ifilter(lambda r: r['ALT'] != '.', ffp))) + values (?,?,?,?,?,?,?,?,?,?,?)""", imap( + lambda row: [row['CHROM'], int(row['POS']), row['REF'], row['ALT']] + parse_eff(row['CHROM'], row['POS'], row['INFO']), + ifilter(lambda r: r['ALT'] != '.', ffp))) except Exception as e: log.exception("exception processing file %s line %s", snpEffVcf, ffp.line_num) raise self.cur.execute("select chr,pos from annot group by chr,pos having count(*)>1") - dupes = [(c,p) for c,p in self.cur] + dupes = [(c, p) for c, p in self.cur] if dupes: - log.info("deleting annotation for %d duplicate positions: %s", len(dupes), ', '.join(['%s:%s'%(c,p) for c,p in dupes])) + log.info("deleting annotation for %d duplicate positions: %s", len(dupes), + ', '.join(['%s:%s' % (c, p) for c, p in dupes])) self.cur.executemany("delete from annot where chr=? and pos=?", dupes) self.conn.commit() + def __iter__(self): assert self.snpIterator for snp in self.snpIterator: yield self.annotate(snp) + def annotate(self, row): self.cur.execute("""select effect,impact,gene_id,gene_name,protein_pos, allele_ref,allele_alt,residue_ref,residue_alt from annot where chr=? 
and pos=?""", [row['chr'], int(row['pos'])]) x = self.cur.fetchone() - if x != None: - row['effect'],row['impact'],row['gene_id'],row['gene_name'],row['protein_pos'],row['allele_ref'],row['allele_alt'],row['residue_ref'],row['residue_alt'] = x - row['alleles'] = '/'.join((row['allele_ref'],row['allele_alt'])) + if x is not None: + row['effect'], row['impact'], row['gene_id'], row['gene_name'], row['protein_pos'], row[ + 'allele_ref' + ], row['allele_alt'], row['residue_ref'], row['residue_alt'] = x + row['alleles'] = '/'.join((row['allele_ref'], row['allele_alt'])) if row['residue_alt']: - row['residues'] = '/'.join((row['residue_ref'],row['residue_alt'])) + row['residues'] = '/'.join((row['residue_ref'], row['residue_alt'])) else: row['residues'] = row['residue_ref'] else: row['effect'] = 'UNKNOWN' row['impact'] = 'UNKNOWN' return row + def new_fields(self): return ('effect', 'impact', 'gene_id', 'gene_name', 'protein_pos', 'alleles', 'residues') + def __enter__(self): return self + def __exit__(self, exc_type, exc_val, exc_tb): self.close() return 0 + def close(self): self.cur.close() self.conn.close() os.unlink(self.dbFile) - def parse_eff(chrom, pos, info, required=True): try: - impact_rank = {'HIGH':0,'MODERATE':1,'LOW':2,'MODIFIER':3} + impact_rank = {'HIGH': 0, 'MODERATE': 1, 'LOW': 2, 'MODIFIER': 3} infos = [x for x in info.split(';') if x.startswith('EFF=')] - assert len(infos)<=1 + assert len(infos) <= 1 if not infos: assert not required return ['', '', '', '', '', '', ''] @@ -105,13 +120,13 @@ def parse_eff(chrom, pos, info, required=True): assert eff.endswith(')') effect, other = eff[:-1].split('(') other = other.split('|') - assert len(other)>=10 + assert len(other) >= 10 impact = other[0] gene_id = other[8] assert not gene_id or (gene_id.endswith('-1') and gene_id.startswith('rna_')) if gene_id: gene_id = gene_id[4:-2] - if gene_id=='PF14_0620' or gene_id=='PF3D7_1465300': + if gene_id == 'PF14_0620' or gene_id == 'PF3D7_1465300': gene_name = 'tRNA 3-trailer sequence RNase, putative' else: try: @@ -124,25 +139,25 @@ def parse_eff(chrom, pos, info, required=True): if effect.startswith('SYNON'): aas = (aa_chg[0], '') codon = int(aa_chg[1:]) - elif effect.startswith('NON_SYNON') or effect.startswith('START_') or effect.startswith('STOP_') or effect.startswith('CODON_'): + elif effect.startswith('NON_SYNON') or effect.startswith('START_') or effect.startswith( + 'STOP_') or effect.startswith('CODON_'): mo = re.match(r'^(\D+)(\d+)(\D+)$', aa_chg) assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg) aas = (mo.group(1), mo.group(3)) codon = int(mo.group(2)) - elif effect=='FRAME_SHIFT': + elif effect == 'FRAME_SHIFT': mo = re.match(r'^(\D*)(\d+)(\D*)$', aa_chg) assert mo, "unrecognized coding change format for %s (%s)" % (effect, aa_chg) - aas = ('','') + aas = ('', '') codon = int(mo.group(2)) else: assert 0, "unrecognized effect type (%s) for variant with coding change (%s)" % (effect, aa_chg) else: - aas = ('','') + aas = ('', '') codon = '' - out.append([impact_rank[impact], effect, impact, gene_id, gene_name, - codon, aas[0], aas[1]]) + out.append([impact_rank[impact], effect, impact, gene_id, gene_name, codon, aas[0], aas[1]]) - if len(out)>1: + if len(out) > 1: out.sort() if out[0][0] == out[1][0]: #log.debug("SNP found with multiple effects of the same impact level: %s:%s - %s" % (chrom, pos, info)) @@ -154,5 +169,3 @@ def parse_eff(chrom, pos, info, required=True): except Exception as e: log.exception("exception parsing snpEff on row %s:%s - %s", 
chrom, pos, info) raise - - diff --git a/util/cmd.py b/util/cmd.py index 5e8210ff3..b43f07d31 100644 --- a/util/cmd.py +++ b/util/cmd.py @@ -3,7 +3,13 @@ command-line functions from a single python script. ''' -import os, os.path, tempfile, sys, shutil, logging, argparse +import os +import os.path +import tempfile +import sys +import shutil +import logging +import argparse import util.version __author__ = "dpark@broadinstitute.org" @@ -12,6 +18,7 @@ log = logging.getLogger() tmpDir = None + def setup_logger(log_level): loglevel = getattr(logging, log_level.upper(), None) assert loglevel, "unrecognized log level: %s" % log_level @@ -20,49 +27,59 @@ def setup_logger(log_level): h.setFormatter(logging.Formatter("%(asctime)s - %(module)s:%(lineno)d:%(funcName)s - %(levelname)s - %(message)s")) log.addHandler(h) + def script_name(): - return os.path.basename(sys.argv[0]).rsplit('.',1)[0] + return os.path.basename(sys.argv[0]).rsplit('.', 1)[0] -def common_args(parser, arglist=(('tmpDir',None), ('loglevel',None))): - for k,v in arglist: - if k=='loglevel': + +def common_args(parser, arglist=(('tmpDir', None), ('loglevel', None))): + for k, v in arglist: + if k == 'loglevel': if not v: v = 'DEBUG' - parser.add_argument("--loglevel", dest="loglevel", - help="Verboseness of output. [default: %(default)s]", - default=v, - choices=('DEBUG','INFO','WARNING','ERROR','CRITICAL','EXCEPTION')) - elif k=='tmpDir': + parser.add_argument("--loglevel", + dest="loglevel", + help="Verboseness of output. [default: %(default)s]", + default=v, + choices=('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', 'EXCEPTION')) + elif k == 'tmpDir': if not v: v = find_tmpDir() - parser.add_argument("--tmpDir", dest="tmpDir", - help="Base directory for temp files. [default: %(default)s]", - default=v) + parser.add_argument("--tmpDir", + dest="tmpDir", + help="Base directory for temp files. [default: %(default)s]", + default=v) parser.add_argument("--tmpDirKeep", - action="store_true", dest="tmpDirKeep", - help="""Keep the tmpDir if an exception occurs while + action="store_true", + dest="tmpDirKeep", + help="""Keep the tmpDir if an exception occurs while running. Default is to delete all temp files at the end, even if there's a failure.""", - default=False) - elif k=='version': + default=False) + elif k == 'version': if not v: - v=__version__ + v = __version__ parser.add_argument('--version', '-V', action='version', version=v) else: raise Exception("unrecognized argument %s" % k) return parser + def main_command(mainfunc): ''' This wraps a python method in another method that can be called with an argparse.Namespace object. When called, it will pass all the values of the object on as parameters to the function call. ''' + def _main(args): - args2 = dict((k,v) for k,v in vars(args).items() if k not in ('loglevel','tmpDir','tmpDirKeep','version','func_main','command')) + args2 = dict((k, v) for k, v in vars(args).items() if k not in ( + 'loglevel', 'tmpDir', 'tmpDirKeep', 'version', 'func_main', 'command')) mainfunc(**args2) + _main.__doc__ = mainfunc.__doc__ return _main + def attach_main(parser, cmd_main, split_args=False): ''' This attaches the main function call to a parser object. ''' @@ -72,6 +89,7 @@ def attach_main(parser, cmd_main, split_args=False): parser.set_defaults(func_main=cmd_main) return parser + def make_parser(commands, description): ''' commands: a list of pairs containing the following: 1. 
name of command (string, no whitespace) @@ -82,14 +100,13 @@ def make_parser(commands, description): description: a long string to present as a description of your script as a whole if the script is run with no arguments ''' - if len(commands)==1 and commands[0][0]==None: + if len(commands) == 1 and commands[0][0] == None: # only one (nameless) command in this script, simplify parser = commands[0][1]() parser.set_defaults(command='') else: # multiple commands available - parser = argparse.ArgumentParser(description=description, - usage='%(prog)s subcommand', add_help=False) + parser = argparse.ArgumentParser(description=description, usage='%(prog)s subcommand', add_help=False) parser.add_argument('--help', '-h', action='help', help=argparse.SUPPRESS) parser.add_argument('--version', '-V', action='version', version=__version__, help=argparse.SUPPRESS) subparsers = parser.add_subparsers(title='subcommands', dest='command') @@ -98,38 +115,41 @@ def make_parser(commands, description): cmd_parser(p) return parser + def main_argparse(commands, description): parser = make_parser(commands, description) - + # if called with no arguments, print help - if len(sys.argv)==1: + if len(sys.argv) == 1: parser.parse_args(['--help']) - elif len(sys.argv)==2 and (len(commands)>1 or commands[0][0]!=None): + elif len(sys.argv) == 2 and (len(commands) > 1 or commands[0][0] != None): parser.parse_args([sys.argv[1], '--help']) args = parser.parse_args() - + setup_logger(not hasattr(args, 'loglevel') and 'DEBUG' or args.loglevel) log.info("software version: %s, python version: %s", __version__, sys.version) - log.info("command: %s %s %s", - sys.argv[0], sys.argv[1], - ' '.join(["%s=%s" % (k,v) for k,v in vars(args).items() if k not in ('command', 'func_main')])) - + log.info("command: %s %s %s", sys.argv[0], sys.argv[1], + ' '.join(["%s=%s" % (k, v) for k, v in vars(args).items() if k not in ('command', 'func_main')])) + if hasattr(args, 'tmpDir'): ''' If this command has a tmpDir option, use that as a base directory and create a subdirectory within it which we will then destroy at the end of execution. 
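Taken together, common_args, attach_main, make_parser and main_argparse define the command-registration pattern that scripts in this repository are built on. A minimal, self-contained sketch of a script using it; the command name and function are invented for illustration:

import argparse
import util.cmd

__commands__ = []

def greet(name, times=1):
    '''Print a greeting.'''
    for _ in range(times):
        print("Hello, %s" % name)

def parser_greet(parser=argparse.ArgumentParser()):
    parser.add_argument('name', help='Who to greet.')
    parser.add_argument('--times', type=int, default=1, help='Repeat count. (default %(default)s)')
    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmpDir', None)))
    util.cmd.attach_main(parser, greet, split_args=True)
    return parser

__commands__.append(('greet', parser_greet))

if __name__ == '__main__':
    util.cmd.main_argparse(__commands__, 'An illustrative demo script.')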
''' - proposed_dir = 'tmp-%s-%s' % (script_name(),args.command) + proposed_dir = 'tmp-%s-%s' % (script_name(), args.command) if 'LSB_JOBID' in os.environ: - proposed_dir = 'tmp-%s-%s-%s-%s' % (script_name(),args.command,os.environ['LSB_JOBID'],os.environ['LSB_JOBINDEX']) - tempfile.tempdir = tempfile.mkdtemp(prefix='%s-'%proposed_dir, dir=args.tmpDir) + proposed_dir = 'tmp-%s-%s-%s-%s' % (script_name(), args.command, os.environ['LSB_JOBID'], + os.environ['LSB_JOBINDEX']) + tempfile.tempdir = tempfile.mkdtemp(prefix='%s-' % proposed_dir, dir=args.tmpDir) log.debug("using tempDir: %s", tempfile.tempdir) - os.environ['TMPDIR'] = tempfile.tempdir # this is for running R + os.environ['TMPDIR'] = tempfile.tempdir # this is for running R try: ret = args.func_main(args) except: - if hasattr(args, 'tmpDirKeep') and args.tmpDirKeep and not (tempfile.tempdir.startswith('/tmp') or tempfile.tempdir.startswith('/local')): - log.exception("Exception occurred while running %s, saving tmpDir at %s", args.command, tempfile.tempdir) + if hasattr(args, 'tmpDirKeep') and args.tmpDirKeep and not ( + tempfile.tempdir.startswith('/tmp') or tempfile.tempdir.startswith('/local')): + log.exception( + "Exception occurred while running %s, saving tmpDir at %s", args.command, tempfile.tempdir) else: shutil.rmtree(tempfile.tempdir) raise @@ -138,10 +158,11 @@ def main_argparse(commands, description): else: # otherwise just run the command ret = args.func_main(args) - if ret==None: + if ret is None: ret = 0 return ret + def find_tmpDir(): ''' This provides a suggested base directory for a temp dir for use in your argparse-based tmpDir option. diff --git a/util/file.py b/util/file.py index bd5b278ea..28d8952ac 100644 --- a/util/file.py +++ b/util/file.py @@ -6,7 +6,13 @@ __version__ = "PLACEHOLDER" __date__ = "PLACEHOLDER" -import os, gzip, tempfile, shutil, errno, logging, json +import os +import gzip +import tempfile +import shutil +import errno +import logging +import json import util.cmd # imports needed for download_file() and webfile_readlines() @@ -19,50 +25,57 @@ log = logging.getLogger(__name__) -def get_project_path() : + +def get_project_path(): '''Return the absolute path of the top-level project, assumed to be the parent of the directory containing this script.''' # abspath converts relative to absolute path; expanduser interprets ~ - path = __file__ # path to this script + path = __file__ # path to this script path = os.path.expanduser(path) # interpret ~ - path = os.path.abspath(path) # convert to absolute path - path = os.path.dirname(path) # containing directory: util - path = os.path.dirname(path) # containing directory: main project dir + path = os.path.abspath(path) # convert to absolute path + path = os.path.dirname(path) # containing directory: util + path = os.path.dirname(path) # containing directory: main project dir return path -def get_build_path() : + +def get_build_path(): '''Return absolute path of "build" directory''' return os.path.join(get_project_path(), 'tools', 'build') -def get_scripts_path() : + +def get_scripts_path(): '''Return absolute path of "scripts" directory''' return os.path.join(get_project_path(), 'tools', 'scripts') -def get_binaries_path() : + +def get_binaries_path(): '''Return absolute path of "binaries" directory''' return os.path.join(get_project_path(), 'tools', 'binaries') -def get_test_path() : + +def get_test_path(): '''Return absolute path of "test" directory''' return os.path.join(get_project_path(), 'test') -def get_test_input_path(testClassInstance=None) : + 
+def get_test_input_path(testClassInstance=None): '''Return the path to the directory containing input files for the specified test class ''' - if testClassInstance is not None : - return os.path.join(get_test_path(), 'input', - type(testClassInstance).__name__) + if testClassInstance is not None: + return os.path.join(get_test_path(), 'input', type(testClassInstance).__name__) else: return os.path.join(get_test_path(), 'input') -def get_resources() : + +def get_resources(): ''' Return the project resources dictionary ''' jsonfile = os.path.join(get_project_path(), 'resources.json') with open(jsonfile, 'rt') as inf: resources = json.load(inf) return resources + def mkstempfname(suffix='', prefix='tmp', dir=None, text=False): ''' There's no other one-liner way to securely ask for a temp file by filename only. This calls mkstemp, which does what we want, except @@ -73,34 +86,39 @@ def mkstempfname(suffix='', prefix='tmp', dir=None, text=False): os.close(fd) return fn + def set_tmpDir(name): proposed_prefix = ['tmp'] if name: proposed_prefix.append(name) - for e in ('LSB_JOBID','LSB_JOBINDEX'): + for e in ('LSB_JOBID', 'LSB_JOBINDEX'): if e in os.environ: proposed_prefix.append(os.environ[e]) - tempfile.tempdir = tempfile.mkdtemp(prefix='-'.join(proposed_prefix)+'-', - dir=util.cmd.find_tmpDir()) + tempfile.tempdir = tempfile.mkdtemp(prefix='-'.join(proposed_prefix) + '-', dir=util.cmd.find_tmpDir()) + def destroy_tmpDir(): if tempfile.tempdir: shutil.rmtree(tempfile.tempdir) tempfile.tempdir = None + def mkdir_p(dirpath): ''' Verify that the directory given exists, and if not, create it. ''' try: os.makedirs(dirpath) - except OSError as exc: # Python >2.5 + except OSError as exc: # Python >2.5 if exc.errno == errno.EEXIST and os.path.isdir(dirpath): pass - else: raise + else: + raise + def open_or_gzopen(fname, *opts): return fname.endswith('.gz') and gzip.open(fname, *opts) or open(fname, *opts) + def read_tabfile_dict(inFile): ''' Read a tab text file (possibly gzipped) and return contents as an iterator of dicts. 
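As a small illustration of how these helpers compose, the sketch below reads an optionally gzipped, tab-delimited table whose header line starts with '#'; the file name and column names are made up:

import util.file

# Each yielded row is a dict keyed by the header fields; empty cells are omitted.
for row in util.file.read_tabfile_dict('samples.txt.gz'):
    print(row.get('sample'), row.get('library'))

# The same open-or-gzopen convention works for plain reading too.
with util.file.open_or_gzopen('samples.txt.gz', 'rt') as inf:
    header = inf.readline()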
@@ -112,11 +130,12 @@ def read_tabfile_dict(inFile): if line.startswith('#'): row[0] = row[0][1:] header = row - elif header==None: + elif header is None: header = row else: - assert len(header)==len(row) - yield dict((k,v) for k,v in zip(header, row) if v) + assert len(header) == len(row) + yield dict((k, v) for k, v in zip(header, row) if v) + def read_tabfile(inFile): ''' Read a tab text file (possibly gzipped) and return contents as an @@ -127,6 +146,7 @@ def read_tabfile(inFile): if not line.startswith('#'): yield line.rstrip('\n').split('\t') + def readFlatFileHeader(filename, headerPrefix='#', delim='\t'): with open_or_gzopen(filename, 'rt') as inf: header = inf.readline().rstrip('\n').split(delim) @@ -134,12 +154,14 @@ def readFlatFileHeader(filename, headerPrefix='#', delim='\t'): header[0] = header[0][len(headerPrefix):] return header + class FlatFileParser(object): ''' Generic flat file parser that parses tabular text input ''' + def __init__(self, lineIter=None, name=None, outType='dict', - readHeader=True, headerPrefix='#', delim='\t', - requireUniqueHeader=False): + readHeader=True, headerPrefix='#', delim='\t', + requireUniqueHeader=False): self.lineIter = lineIter self.header = None self.name = name @@ -147,9 +169,9 @@ def __init__(self, lineIter=None, name=None, outType='dict', self.readHeader = readHeader self.delim = delim self.requireUniqueHeader = requireUniqueHeader - self.line_num=0 - assert outType in ('dict','arrayStrict', 'arrayLoose','both') - self.outType=outType + self.line_num = 0 + assert outType in ('dict', 'arrayStrict', 'arrayLoose', 'both') + self.outType = outType assert readHeader or outType in ('arrayStrict', 'arrayLoose') def __enter__(self): @@ -162,7 +184,7 @@ def __iter__(self): assert self.lineIter for row in self.lineIter: out = self.parse(row) - if out != None: + if out is not None: yield out def parse(self, row): @@ -195,16 +217,15 @@ def parseHeader(self, row): assert row self.header = row if self.outType != 'arrayLoose': - assert len(row) == len(dict([(x,0) for x in row])) + assert len(row) == len(dict([(x, 0) for x in row])) def parseRow(self, row): - assert self.outType == 'arrayLoose' or ( self.header and - len(self.header) == len(row) ) + assert self.outType == 'arrayLoose' or (self.header and len(self.header) == len(row)) - if self.outType =='arrayLoose' or self.outType == 'arrayStrict' : + if self.outType == 'arrayLoose' or self.outType == 'arrayStrict': return row - out = { self.header[i]: row[i] for i in range( len(self.header) ) } - if self.outType=='both': + out = {self.header[i]: row[i] for i in range(len(self.header))} + if self.outType == 'both': for i in range(len(self.header)): out[i] = row[i] return out @@ -222,7 +243,8 @@ def fastaMaker(seqs, linewidth=60): yield "{}\n".format(line) if seq: - yield seq+"\n" + yield seq + "\n" + def makeFastaFile(seqs, outFasta): with open(outFasta, 'wt') as outf: @@ -231,6 +253,7 @@ def makeFastaFile(seqs, outFasta): return outFasta + def concat(inputFilePaths, outputFilePath): ''' This function creates an output file containing the @@ -243,6 +266,7 @@ def concat(inputFilePaths, outputFilePath): for line in infile: outfile.write(line) + def download_file(uriToGet, dest, destFileName=None): destDir = os.path.realpath(os.path.expanduser(dest)) @@ -260,15 +284,17 @@ def download_file(uriToGet, dest, destFileName=None): with open(destPath, "wb") as outf: while True: - chunk = req.read(1024) - if not chunk: break - outf.write(chunk) + chunk = req.read(1024) + if not chunk: + break + 
outf.write(chunk) return destPath + def webfile_readlines(uriToGet): - for line in urlopen(uriToGet):#.readlines(): + for line in urlopen(uriToGet): # .readlines(): cleanedLine = line.decode("utf-8").strip() if len(cleanedLine) > 0: - yield cleanedLine \ No newline at end of file + yield cleanedLine diff --git a/util/genbank.py b/util/genbank.py index 3c62f70f2..d01e84205 100644 --- a/util/genbank.py +++ b/util/genbank.py @@ -1,7 +1,9 @@ #!/usr/bin/python # built-ins -import time, os, logging +import time +import os +import logging # third-party from Bio import Entrez @@ -9,6 +11,7 @@ log = logging.getLogger(__name__) + def get_feature_table_id(featureTableFile): seqid = "" with open(featureTableFile, 'rt') as inf: @@ -21,20 +24,22 @@ def get_feature_table_id(featureTableFile): if not line.startswith('>Feature '): raise Exception("not sure how to handle a non-Feature record") seqid = line[len('>Feature '):].strip() - if not ((seqid.startswith('gb|') or seqid.startswith('ref|')) and seqid.endswith('|') and len(seqid)>4): + if not ( + (seqid.startswith('gb|') or seqid.startswith('ref|')) and seqid.endswith('|') and len(seqid) > 4): raise Exception("reference annotation does not refer to a GenBank or RefSeq accession") - seqid = seqid[seqid.find("|")+1:-1] + seqid = seqid[seqid.find("|") + 1:-1] if len(seqid) > 0: return seqid - + + def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, forceOverwrite=False, rettype="fasta", retmode="text", fileExt=None, combinedFilePrefix=None, removeSeparateFiles=False, chunkSize=1): - """ + """ This function downloads and saves files from NCBI nuccore. """ - db = "nuccore" + db = "nuccore" Entrez.email = emailAddress Entrez.tool = "https://github.com/broadinstitute/viral-ngs" @@ -46,13 +51,10 @@ def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, if chunkSize > maxChunkSize or (len(accessionList) > maxChunkSize and chunkSize == 1): chunkSize = maxChunkSize - outEx = { - "fasta": "fasta", - "ft":"tbl", - "gb":"gbk" - } + outEx = {"fasta": "fasta", "ft": "tbl", "gb": "gbk"} - assert rettype in outEx.keys(), "The return type requested, %s, is not compatible with the nuccore fetch." % rettype + assert rettype in outEx.keys( + ), "The return type requested, %s, is not compatible with the nuccore fetch." % rettype outputDirectory = os.path.abspath(os.path.expanduser(destinationDir)) @@ -60,42 +62,42 @@ def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, os.makedirs(outputDirectory) # if the file extension to use is specified as the fileExt arg, use it - # otherwise, use the default for the rettype according to the outEx dict, + # otherwise, use the default for the rettype according to the outEx dict, # falling back to the retmode if there is no match if not fileExt: - outputExtension = outEx.get(rettype, retmode) + outputExtension = outEx.get(rettype, retmode) else: outputExtension = str(fileExt) - # ensure the extension starts with a ".", also allowing for passed-in + # ensure the extension starts with a ".", also allowing for passed-in # extensions that already have it if outputExtension[:1] != ".": outputExtension = "." 
+ outputExtension - log.info( "Fetching %s entries from GenBank: %s\n", str(len(accessionList)), ", ".join(accessionList[:10])) + log.info("Fetching %s entries from GenBank: %s\n", str(len(accessionList)), ", ".join(accessionList[:10])) outputFiles = [] for chunkNum, chunk in enumerate(boltons.iterutils.chunked_iter(accessionList, chunkSize)): - # for i,acc in enumerate(chunk): + # for i,acc in enumerate(chunk): accString = ",".join(chunk) # if the filename would be longer than Linux allows, simply say "chunk-chunkNum" - if len(accString)+len(outputExtension) <= 254: - outputFilePath = os.path.join(outputDirectory, accString+outputExtension) + if len(accString) + len(outputExtension) <= 254: + outputFilePath = os.path.join(outputDirectory, accString + outputExtension) else: outputFilePath = os.path.join(outputDirectory, "chunk-{}".format(chunkNum) + outputExtension) if not forceOverwrite: log.info("not overwriting, checking for existence") - assert not os.path.exists(outputFilePath), """File %s already exists. Consider removing - this file or specifying a different output directory. The files for the accessions specified + assert not os.path.exists(outputFilePath), """File %s already exists. Consider removing + this file or specifying a different output directory. The files for the accessions specified can be overwritten if you add --forceOverwrite flag. Processing aborted.""" % outputFilePath tryCount = 1 while True: try: - log.info("Fetching file %s: %s, try #%s", chunkNum+1, accString, tryCount) + log.info("Fetching file %s: %s, try #%s", chunkNum + 1, accString, tryCount) handle = Entrez.efetch(db=db, rettype=rettype, id=accString) with open(outputFilePath, "w") as outf: @@ -104,8 +106,10 @@ def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, outputFiles.append(outputFilePath) except IOError as e: - log.warning("Error fetching file %s: %s, try #%s probably because NCBI is too busy.", chunkNum+1, accString, tryCount) - + log.warning( + "Error fetching file %s: %s, try #%s probably because NCBI is too busy.", chunkNum + 1, accString, + tryCount) + tryCount += 1 if tryCount > 4: log.warning("Tried too many times. Aborting.") @@ -120,16 +124,16 @@ def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, # assert that we are not trying to remove the intermediate files without writing a combined file if removeSeparateFiles: - assert combinedFilePrefix, """The intermediate files + assert combinedFilePrefix, """The intermediate files can only be removed if a combined file is written via --combinedFilePrefix""" # build a path to the combined genome file if combinedFilePrefix: - concatenatedGenomeFilepath = os.path.join(outputDirectory, combinedFilePrefix+outputExtension) + concatenatedGenomeFilepath = os.path.join(outputDirectory, combinedFilePrefix + outputExtension) if not forceOverwrite: - assert not os.path.exists(concatenatedGenomeFilepath), """File %s already exists. Consider removing - this file or specifying a different output directory. The files for the accessions specified + assert not os.path.exists(concatenatedGenomeFilepath), """File %s already exists. Consider removing + this file or specifying a different output directory. The files for the accessions specified can be overwritten if you add --forceOverwrite flag. 
Processing aborted.""" % outputFilePath # concatenate the files together into one genome file @@ -147,17 +151,53 @@ def _fetch_from_nuccore(accessionList, destinationDir, emailAddress, # add the combined file to the list of files returned outputFiles.append(concatenatedGenomeFilepath) - # return list of files return outputFiles -def fetch_fastas_from_genbank(accessionList, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, removeSeparateFiles, fileExt=None, rettype="fasta", retmode="text", chunkSize=1): - return _fetch_from_nuccore(accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, removeSeparateFiles, chunkSize) - -def fetch_feature_tables_from_genbank(accessionList, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, removeSeparateFiles, fileExt=None, rettype="ft", retmode="text", chunkSize=1): - return _fetch_from_nuccore(accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, removeSeparateFiles, chunkSize) - -def fetch_full_records_from_genbank(accessionList, destinationDir, emailAddress, forceOverwrite, combinedFilePrefix, removeSeparateFiles, fileExt=None, rettype="gb", retmode="text", chunkSize=1): - return _fetch_from_nuccore(accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, removeSeparateFiles, chunkSize) - +def fetch_fastas_from_genbank( + accessionList, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt=None, + rettype="fasta", + retmode="text", + chunkSize=1): + return _fetch_from_nuccore( + accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, + removeSeparateFiles, chunkSize) + + +def fetch_feature_tables_from_genbank( + accessionList, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt=None, + rettype="ft", + retmode="text", + chunkSize=1): + return _fetch_from_nuccore( + accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, + removeSeparateFiles, chunkSize) + + +def fetch_full_records_from_genbank( + accessionList, + destinationDir, + emailAddress, + forceOverwrite, + combinedFilePrefix, + removeSeparateFiles, + fileExt=None, + rettype="gb", + retmode="text", + chunkSize=1): + return _fetch_from_nuccore( + accessionList, destinationDir, emailAddress, forceOverwrite, rettype, retmode, fileExt, combinedFilePrefix, + removeSeparateFiles, chunkSize) diff --git a/util/misc.py b/util/misc.py index 8052bea66..cfd8e2afa 100644 --- a/util/misc.py +++ b/util/misc.py @@ -1,9 +1,10 @@ '''A few miscellaneous tools. ''' -from __future__ import division # Division of integers with / should never round! +from __future__ import division # Division of integers with / should never round! import itertools __author__ = "dpark@broadinstitute.org" + def unique(items): ''' Return unique items in the same order as seen in the input. ''' seen = set() @@ -12,6 +13,7 @@ def unique(items): seen.add(i) yield i + def histogram(items): ''' I count the number of times I see stuff and return a dict of counts. ''' out = {} @@ -20,7 +22,8 @@ def histogram(items): out[i] += 1 return out -def freqs(items, zero_checks = set()): + +def freqs(items, zero_checks=set()): ''' Given a list of comparable, non-unique items, produce an iterator of (item, count, freq) tuples. 
item is a unique instance of one of the items seen on input @@ -37,11 +40,12 @@ def freqs(items, zero_checks = set()): out.setdefault(i, 0) out[i] += 1 tot += 1 - for k,v in out.items(): - yield (k,v,float(v)/tot) + for k, v in out.items(): + yield (k, v, float(v) / tot) for i in zero_checks: if i not in out: - yield (i,0,0.0) + yield (i, 0, 0.0) + def intervals(i, n, l): ''' Divide something of length l into n equally sized parts and return the @@ -49,23 +53,25 @@ def intervals(i, n, l): will be adjacent and non-overlapping with the next part. i must be a number from 1 to n. ''' - assert 1 <= i <= n and l>=n - part_size = l//n - start = 1 + part_size * (i-1) + assert 1 <= i <= n and l >= n + part_size = l // n + start = 1 + part_size * (i - 1) stop = part_size * i - if i==n: + if i == n: stop = l - return (start,stop) + return (start, stop) # from http://stackoverflow.com/a/312467 -def batch_iterator(iterator, batch_size) : + + +def batch_iterator(iterator, batch_size): """Returns lists of length batch_size. - + This can be used on any iterator, for example to batch up SeqRecord objects from Bio.SeqIO.parse(...), or to batch Alignment objects from Bio.AlignIO.parse(...), or simply lines from a file handle. - + This is a generator function, and it returns lists of the entries from the supplied iterator. Each list will have batch_size entries, although the final list may be shorter. @@ -75,4 +81,3 @@ def batch_iterator(iterator, batch_size) : while item: yield item item = list(itertools.islice(it, batch_size)) - diff --git a/util/stats.py b/util/stats.py index 98f1120c8..239867712 100644 --- a/util/stats.py +++ b/util/stats.py @@ -1,5 +1,5 @@ '''A few pure-python statistical tools to avoid the need to install scipy. ''' -from __future__ import division # Division of integers with / should never round! +from __future__ import division # Division of integers with / should never round! from math import exp, log, pi, sqrt, gamma, lgamma, erf import itertools @@ -11,33 +11,36 @@ except ImportError: # Python <3.4, avoid numpy if these two methods are all we really need def mean(l): - if len(l)>0: - return float(sum(l))/len(l) + if len(l) > 0: + return float(sum(l)) / len(l) else: raise Exception("empty list for mean") + def median(l): - if len(l)>0: + if len(l) > 0: half = len(l) // 2 l.sort() if len(l) % 2 == 0: - return (l[half-1] + l[half]) / 2.0 + return (l[half - 1] + l[half]) / 2.0 else: return l[half] else: raise Exception("empty list for median") -def product(iter) : + +def product(iter): prod = 1 - for x in iter : + for x in iter: prod *= x return prod -def chi2_contingency(contingencyTable, correction = True) : + +def chi2_contingency(contingencyTable, correction=True): """ contingencyTable is a sequence of m sequences, each of length n. - Return an estimate using the chi-square distribution of the two-tailed + Return an estimate using the chi-square distribution of the two-tailed p-value for an m x n contingency table against the null hypothesis - that the row and column criteria are independent. This is not as - accurate as fisher_exact, but is faster (and is implemented for all + that the row and column criteria are independent. This is not as + accurate as fisher_exact, but is faster (and is implemented for all m and n). If correction is True and there is 1 degree of freedom, apply Yates's correction for continuity, i.e., adjust each observed value @@ -47,19 +50,17 @@ def chi2_contingency(contingencyTable, correction = True) : less than 5. 
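For orientation, a concrete call with invented counts; rows are two groups, columns are success and failure:

from util.stats import chi2_contingency

p = chi2_contingency([[12, 8],
                      [5, 15]])
print(p)   # two-tailed p-value; Yates's continuity correction applies because df == 1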
""" # scipy equivalent: scipy.stats.chi2_contingency(contingencyTable)[1] - - if len(contingencyTable) == 0 : + + if len(contingencyTable) == 0: return 1.0 - if len(set(map(len, contingencyTable))) != 1 : + if len(set(map(len, contingencyTable))) != 1: raise ValueError('Not all rows have the same length') - + # Eliminate rows and columns with 0 sum - colSums = [sum(row[col] for row in contingencyTable) - for col in range(len(contingencyTable[0]))] - table = [[x for x, colSum in zip(row, colSums) if colSum != 0] - for row in contingencyTable if sum(row) != 0] + colSums = [sum(row[col] for row in contingencyTable) for col in range(len(contingencyTable[0]))] + table = [[x for x, colSum in zip(row, colSums) if colSum != 0] for row in contingencyTable if sum(row) != 0] - if len(table) < 2 or len(table[0]) < 2 : + if len(table) < 2 or len(table[0]) < 2: return 1.0 m = len(table) @@ -68,20 +69,21 @@ def chi2_contingency(contingencyTable, correction = True) : colSums = [sum(row[col] for row in table) for col in range(n)] N = sum(rowSums) expect = [[rowSums[i] * colSums[j] / N for j in range(n)] for i in range(m)] - if correction and m == n == 2 : - def corr(i, j) : - if expect[i][j] > table[i][j] : + if correction and m == n == 2: + + def corr(i, j): + if expect[i][j] > table[i][j]: return min(table[i][j] + 0.5, expect[i][j]) - else : + else: return max(table[i][j] - 0.5, expect[i][j]) + table = [[corr(i, j) for j in range(n)] for i in range(m)] - chisq = sum((table[i][j] - expect[i][j]) ** 2 / expect[i][j] - for j in range(n) - for i in range(m)) + chisq = sum((table[i][j] - expect[i][j]) ** 2 / expect[i][j] for j in range(n) for i in range(m)) pval = 1 - pchisq(chisq, (m - 1) * (n - 1)) return pval -def fisher_exact(contingencyTable) : + +def fisher_exact(contingencyTable): """ Fisher exact test for the 2 x n case. contingencyTable is a sequence of 2 length-n sequences of integers. Return the two-tailed p-value against the null hypothesis that the row @@ -93,38 +95,36 @@ def fisher_exact(contingencyTable) : 2 x n case by transposing or by removing rows that are all 0s. Also handles degenerate cases of 0 or 1 row by returning 1.0. 
""" - if len(contingencyTable) == 0 : + if len(contingencyTable) == 0: return 1.0 - if len(set(map(len, contingencyTable))) != 1 : + if len(set(map(len, contingencyTable))) != 1: raise ValueError('Not all rows have the same length') - if any(x != int(x) for row in contingencyTable for x in row) : + if any(x != int(x) for row in contingencyTable for x in row): raise ValueError('Some table entry is not an integer') - if any(x < 0 for row in contingencyTable for x in row) : + if any(x < 0 for row in contingencyTable for x in row): raise ValueError('Some table entry is negative') # Eliminate rows and columns with 0 sum - colSums = [sum(row[col] for row in contingencyTable) - for col in range(len(contingencyTable[0]))] - table = [[x for x, colSum in zip(row, colSums) if colSum != 0] - for row in contingencyTable if sum(row) != 0] + colSums = [sum(row[col] for row in contingencyTable) for col in range(len(contingencyTable[0]))] + table = [[x for x, colSum in zip(row, colSums) if colSum != 0] for row in contingencyTable if sum(row) != 0] - if len(table) < 2 or len(table[0]) < 2 : + if len(table) < 2 or len(table[0]) < 2: return 1.0 - if len(table) > len(table[0]) : - table = list(zip(*table)) # Transpose + if len(table) > len(table[0]): + table = list(zip(*table)) # Transpose m = len(table) n = len(table[0]) - if m != 2 : + if m != 2: raise NotImplementedError('More than 2 non-zero rows and columns.') # Put row with smaller sum first. Makes the loop iterations simpler. - table.sort(key = sum) + table.sort(key=sum) # Put column with largest sum last. Makes loop quick rejection faster. - table = list(zip(*table)) # Transpose - table.sort(key = sum) - table = list(zip(*table)) # Transpose back + table = list(zip(*table)) # Transpose + table.sort(key=sum) + table = list(zip(*table)) # Transpose back # There are many optimizations possible for the following code, but it would # still be O(S^(n-1)) so it would still be too slow for anything @@ -135,68 +135,68 @@ def fisher_exact(contingencyTable) : logChooseNrowSum = log_choose(sum(rowSums), rowSums[0]) - def prob_of_table(firstRow) : - return exp(sum(log_choose(cs, a) for cs, a in zip(colSums, firstRow)) - - logChooseNrowSum) + def prob_of_table(firstRow): + return exp(sum(log_choose(cs, a) for cs, a in zip(colSums, firstRow)) - logChooseNrowSum) p0 = prob_of_table(table[0]) result = 0 - for firstRowM1 in itertools.product(*[range(min(rowSums[0], colSums[i]) + 1) - for i in range(n - 1)]) : + for firstRowM1 in itertools.product(*[range(min(rowSums[0], colSums[i]) + 1) for i in range(n - 1)]): lastElmt = rowSums[0] - sum(firstRowM1) - if lastElmt < 0 or lastElmt > colSums[-1] : + if lastElmt < 0 or lastElmt > colSums[-1]: continue prob = prob_of_table(firstRowM1 + (lastElmt,)) - if prob <= p0 + 1e-9 : # (1e-9 handles floating point round off) + if prob <= p0 + 1e-9: # (1e-9 handles floating point round off) result += prob return result -def log_choose(n, k) : + +def log_choose(n, k): # Return log(n choose k). Compute using lgamma(x + 1) = log(x!) - if not (0 <= k <=n) : + if not (0 <= k <= n): raise ValueError('%d is negative or more than %d' % (k, n)) return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1) -def gammainc_halfint(s, x) : - """ Lower incomplete gamma function = + +def gammainc_halfint(s, x): + """ Lower incomplete gamma function = integral from 0 to x of t ** (s-1) exp(-t) dt divided by gamma(s), i.e., the fraction of gamma that you get if you integrate only until x instead of all the way to infinity. 
Implemented here only if s is a positive multiple of 0.5. """ # scipy equivalent: scipy.special.gammainc(s,x) - - if s <= 0 : + + if s <= 0: raise ValueError('%s is not positive' % s) - if x < 0 : + if x < 0: raise ValueError('%s < 0' % x) - if s * 2 != int(s * 2) : + if s * 2 != int(s * 2): raise NotImplementedError('%s is not a multiple of 0.5' % s) - + # Handle integers analytically - if s == int(s) : + if s == int(s): term = 1 total = 1 - for k in range(1, int(s)) : - term *= x / k + for k in range(1, int(s)): + term *= x / k total += term return 1 - exp(-x) * total # Otherwise s is integer + 0.5. Decrease to 0.5 using recursion formula: result = 0.0 - while s > 1 : + while s > 1: result -= x ** (s - 1) * exp(-x) / gamma(s) s = s - 1 # Then use gammainc(0.5, x) = erf(sqrt(x)) result += erf(sqrt(x)) return result -def pchisq(x, k) : + +def pchisq(x, k): "Cumulative distribution function of chi squared with k degrees of freedom." - if k < 1 or k != int(k) : + if k < 1 or k != int(k): raise ValueError('%s is not a positive integer' % k) - if x < 0 : + if x < 0: raise ValueError('%s < 0' % x) return gammainc_halfint(k / 2, x / 2) - diff --git a/util/vcf.py b/util/vcf.py index d23a8c96e..352cb41ea 100644 --- a/util/vcf.py +++ b/util/vcf.py @@ -5,12 +5,18 @@ __version__ = "PLACEHOLDER" __date__ = "PLACEHOLDER" -import os, shutil, logging, itertools, sqlite3 +import os +import shutil +import logging +import itertools +import sqlite3 import pysam -import util.file, util.misc +import util.file +import util.misc log = logging.getLogger(__name__) + def make_intervals(i, n, fasta, chr_prefix='', verbose=False): ''' Divide a sorted genome into n equally sized parts and return the i'th part. We will return a list of intervals: chr, start, stop. It may @@ -23,16 +29,16 @@ def make_intervals(i, n, fasta, chr_prefix='', verbose=False): # read genome dict file tot = 0 chrlens = [] - for c,c_len in get_chrlens(fasta): + for c, c_len in get_chrlens(fasta): if c.startswith(chr_prefix): - chrlens.append((c,c_len,tot)) + chrlens.append((c, c_len, tot)) tot += c_len # define our chunk by gpos: - part_size = tot//n - g_start = 1 + part_size * (i-1) + part_size = tot // n + g_start = 1 + part_size * (i - 1) g_stop = part_size * i - if i==n: + if i == n: g_stop = tot # find the genomic intervals that correspond to our gpos window @@ -42,12 +48,14 @@ def make_intervals(i, n, fasta, chr_prefix='', verbose=False): c_g_start += 1 if c_g_stop >= g_start and c_g_start <= g_stop: start = max(g_start, c_g_start) - c_g_start + 1 - stop = min(g_stop, c_g_stop) - c_g_start + 1 + stop = min(g_stop, c_g_stop) - c_g_start + 1 out.append((c, start, stop)) if verbose: - log.info("Dividing the %d bp genome into %d chunks of %d bp each. The %dth chunk contains the following %d intervals: %s" % ( - tot, n, part_size, i, len(out), ', '.join(["%s:%d-%d"%x for x in out]))) + log.info( + "Dividing the %d bp genome into %d chunks of %d bp each. The %dth chunk contains the following %d intervals: %s" + % ( + tot, n, part_size, i, len(out), ', '.join(["%s:%d-%d" % x for x in out]))) return out @@ -57,8 +65,8 @@ def sliding_windows(fasta, width, offset, chr_prefix=''): (except maybe the last window on each chromosome) and may overlap (offsetwidth). 
''' - assert width>0 and offset>0 - for c,c_len in get_chrlens(fasta): + assert width > 0 and offset > 0 + for c, c_len in get_chrlens(fasta): if c.startswith(chr_prefix): start = 1 while start <= c_len: @@ -72,31 +80,35 @@ class GenomePosition: Read chromosome lengths and order from either a Picard/GATK-index for a FASTA file (a .dict file) or from a VCF header. ''' + def __init__(self, seqDb): self.gpos_map = {} self.clen_map = {} self.chrs = [] totlen = 0 - for c,clen in get_chrlens(seqDb): - self.chrs.append((c,clen)) + for c, clen in get_chrlens(seqDb): + self.chrs.append((c, clen)) self.gpos_map[c] = totlen self.clen_map[c] = clen totlen += clen self.total = totlen + def get_gpos(self, c, p): assert isinstance(p, int) assert c in self.gpos_map assert 1 <= p <= self.clen_map[c] return p + self.gpos_map[c] + def get_chr_pos(self, gpos): assert isinstance(gpos, int) assert 1 <= gpos <= self.total totlen = 0 - for c,clen in self.chrs: - if gpos <= totlen+clen: + for c, clen in self.chrs: + if gpos <= totlen + clen: break totlen += clen - return (c,gpos-totlen) + return (c, gpos - totlen) + def get_chrlens(inFile): ''' Read chromosome lengths and order from either a Picard/GATK-index for @@ -114,11 +126,11 @@ def get_chrlens(inFile): with open(inFile, 'rt') as inf: for line in inf: row = line.rstrip('\n').split('\t') - if row[0]=='@SQ': + if row[0] == '@SQ': assert row[1].startswith('SN:') and row[2].startswith('LN:') c = row[1][3:] c_len = int(row[2][3:]) - chrlens.append((c,c_len)) + chrlens.append((c, c_len)) elif inFile.endswith('.vcf') or inFile.endswith('.vcf.gz'): with util.file.open_or_gzopen(inFile, 'rt') as inf: for line in inf: @@ -127,7 +139,7 @@ def get_chrlens(inFile): line = line[13:-1] c = line.split(',')[0] clen = int(line.split('=')[1]) - chrlens.append((c,clen)) + chrlens.append((c, clen)) elif line.startswith('#CHROM'): break else: @@ -135,36 +147,37 @@ def get_chrlens(inFile): assert chrlens, "no sequence data found in %s % inFile" return chrlens + def calc_maf(genos, ancestral=None, ploidy=1): # get list of alleles - if ploidy==1: + if ploidy == 1: alleles = genos else: alleles = [] for g in genos: g = g.split('/') - assert len(g)==ploidy + assert len(g) == ploidy alleles += g # count up - out = {'n_tot':len(alleles)} + out = {'n_tot': len(alleles)} acounts = util.misc.histogram(alleles) - alist = sorted([(n,a) for a,n in acounts.items()]) + alist = sorted([(n, a) for a, n in acounts.items()]) # daf - if ancestral != None: + if ancestral is not None: out['a_ancestral'] = ancestral - derived = list(sorted([a for a in acounts.keys() if a!=ancestral])) + derived = list(sorted([a for a in acounts.keys() if a != ancestral])) out['a_derived'] = ','.join(derived) out['dac'] = sum(acounts[a] for a in derived) - out['daf'] = out['n_tot'] and float(out['dac'])/out['n_tot'] or None + out['daf'] = out['n_tot'] and float(out['dac']) / out['n_tot'] or None # maf if out['n_tot']: out['a_major'] = alist[-1][1] - out['a_minor'] = ','.join([a for n,a in alist[:-1]]) + out['a_minor'] = ','.join([a for n, a in alist[:-1]]) out['mac'] = out['n_tot'] - alist[-1][0] - out['maf'] = float(out['mac'])/out['n_tot'] + out['maf'] = float(out['mac']) / out['n_tot'] else: out['a_major'] = None out['a_minor'] = None @@ -182,35 +195,42 @@ class TabixReader(pysam.Tabixfile): methods __reversed__() and __len__() in pysam.TabixIterator. __getitem__ could be a bonus, but prob unnecessary. 
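The calc_maf helper above can be sanity-checked with a tiny haploid example:

from util.vcf import calc_maf

stats = calc_maf(['A', 'A', 'T', 'A'], ploidy=1)
# Four alleles seen: 'A' is the major allele and 'T' the minor, so mac == 1 and maf == 0.25.
print(stats['a_major'], stats['a_minor'], stats['mac'], stats['maf'])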
''' + def __init__(self, inFile, parser=pysam.asTuple()): # because of odd Cython weirdness, we don't actually want to call super.__init__ here.. #super(TabixReader, self).__init__(inFile, parser=parser) self.parser = parser + def __enter__(self): return self + def __exit__(self, exc_type, exc_val, exc_tb): self.close() return 0 + def close(self): super(TabixReader, self).close() + def chroms(self): return self.contigs + def get(self, chrom=None, start=None, stop=None, region=None): - if start!=None: + if start is not None: start -= 1 - return self.fetch(reference=chrom, start=start, end=stop, - region=region, parser=self.parser) + return self.fetch(reference=chrom, start=start, end=stop, region=region, parser=self.parser) def get_pos_from_vcf_record(vcfrec): # new versions of pysam return a zero-based position here return vcfrec.pos + 1 + def bytes_to_string(o): - if type(o) == bytes: + if isinstance(o, bytes): o = o.decode('utf-8') return o + class VcfReader(TabixReader): ''' Same as TabixReader with a few more perks for VCF files: - emit results parsed as pysam VCF rows @@ -218,9 +238,10 @@ class VcfReader(TabixReader): - provide self.samples(), a list of sample names in order of appearance - provide get_range(c,start,stop) and get_snp_genos(c,pos) ''' + def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()): super(VcfReader, self).__init__(inFile, parser=parser) - assert ploidy in (1,2) + assert ploidy in (1, 2) self.ploidy = ploidy self.clens = [] self.sample_names = None @@ -230,19 +251,24 @@ def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()): line = line[13:-1] c = line.split(',')[0] clen = int(line.split('=')[1]) - self.clens.append((c,clen)) + self.clens.append((c, clen)) elif line.startswith('#CHROM'): row = line.split('\t') self.sample_names = row[9:] self.clens = dict(self.clens) assert self.sample_names + def samples(self): return self.sample_names + def chrlens(self): return self.clens + def get_positions(self, c=None, start=None, stop=None, region=None): - for snp in self.get(c,start,stop,region): - yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), get_pos_from_vcf_record(snp)+len(snp.ref)-1) + for snp in self.get(c, start, stop, region): + yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), + get_pos_from_vcf_record(snp) + len(snp.ref) - 1) + def get_range(self, c=None, start=None, stop=None, region=None, as_strings=True, more=False): ''' Read a VCF file (optionally just a piece of it) and return contents as an iterator with parsed contents. Each row is returned as @@ -257,35 +283,35 @@ def get_range(self, c=None, start=None, stop=None, region=None, as_strings=True, alleles will be integers. If more is true, a fifth column will be emitted with the pysam VCF object. 
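A sketch of a typical consumer of get_range, assuming a bgzipped, tabix-indexed VCF; the path, contig and coordinates are illustrative:

from util.vcf import VcfReader

with VcfReader('calls.vcf.gz') as vcf:
    for chrom, pos, alleles, genos in vcf.get_range('chr1', 1, 10000):
        # alleles is [ref, alt1, ...]; with the default haploid setting, genos is a list of (sample, allele) pairs.
        print(chrom, pos, alleles, genos)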
''' - for snp in self.get(c,start,stop,region): + for snp in self.get(c, start, stop, region): alleles = [bytes_to_string(snp.ref)] + bytes_to_string(snp.alt).split(',') alleles = [a for a in alleles if a != '.'] - if self.ploidy==1: - genos = [(self.sample_names[i], int(bytes_to_string(snp[i])[0])) - for i in range(len(self.sample_names)) - if bytes_to_string(snp[i])[0] != '.'] + if self.ploidy == 1: + genos = [(self.sample_names[i], int(bytes_to_string(snp[i])[0])) for i in range(len(self.sample_names)) + if bytes_to_string(snp[i])[0] != '.'] if as_strings: - genos = [(s,alleles[a]) for s,a in genos] + genos = [(s, alleles[a]) for s, a in genos] else: - genos = [(self.sample_names[i], [int(bytes_to_string(snp[i])[j*2]) for j in range(self.ploidy)]) - for i in range(len(self.sample_names)) - if bytes_to_string(snp[i])[0] != '.'] + genos = [(self.sample_names[i], [int(bytes_to_string(snp[i])[j * 2]) for j in range(self.ploidy)]) + for i in range(len(self.sample_names)) if bytes_to_string(snp[i])[0] != '.'] if as_strings: - genos = [(s,[alleles[a] for a in g]) for s,g in genos] + genos = [(s, [alleles[a] for a in g]) for s, g in genos] if more: yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos, snp) else: yield (bytes_to_string(snp.contig), get_pos_from_vcf_record(snp), alleles, genos) + def get_snp_genos(self, c, p, as_strings=True): ''' Read a single position from a VCF file and return the genotypes as a sample -> allele map. If there is not exactly one matching row in the VCF file at this position (if there are none or multiple) then we return an empty map: {}. ''' - snps = [x for x in self.get_range(c,p,p,as_strings=as_strings)] - return len(snps)==1 and dict(snps[0][3]) or {} + snps = [x for x in self.get_range(c, p, p, as_strings=as_strings)] + return len(snps) == 1 and dict(snps[0][3]) or {} + def getFullSequences(self, c, start, stop, samples, - na='-', refSeq=None, refInvariantsOnly=False, ignoreIndels=False): + na='-', refSeq=None, refInvariantsOnly=False, ignoreIndels=False): ''' chr - chromosome name start - start position stop - default = start @@ -296,13 +322,16 @@ def getFullSequences(self, c, start, stop, samples, sites or sites with no entries for any samples. if False, refSeq is used for all missing data. ''' - assert 1<=start<=stop - assert len(na)==1 + assert 1 <= start <= stop + assert len(na) == 1 # get all the VCF records - vcf_records = [(p-start,alleles,dict(genos)) for chrom,p,alleles,genos - in self.get_range(c, start, stop, as_strings=True) - if not ignoreIndels or set(map(len, alleles)) == set([1])] + vcf_records = [(p - start, alleles, dict(genos)) + for chrom, p, alleles, genos in self.get_range(c, + start, + stop, + as_strings=True) + if not ignoreIndels or set(map(len, alleles)) == set([1])] # Construct a list called "seq" into which we will replace alleles as # we discover them. This is a list, not a string, because each position @@ -311,21 +340,21 @@ def getFullSequences(self, c, start, stop, samples, # converted to a string at the end. 
if refSeq: # assume refSeq alleles as a baseline for missing data - assert len(refSeq)==(stop-start+1) + assert len(refSeq) == (stop - start + 1) seq = list(refSeq) if refInvariantsOnly: # but don't assume refSeq alleles for any known variant sites - for i,alleles,genos in vcf_records: - if len(set(genos[s] for s in samples if s in genos))>1: + for i, alleles, genos in vcf_records: + if len(set(genos[s] for s in samples if s in genos)) > 1: for j in range(len(alleles[0])): - if i+j < len(seq): - seq[i+j] = na + if i + j < len(seq): + seq[i + j] = na else: # assume nothing when data is missing - seq = list(na * (stop-start+1)) + seq = list(na * (stop - start + 1)) for sample in samples: - assert sample==None or sample in self.samples() + assert sample is None or sample in self.samples() # Make copy of reference sequence newseq = [s for s in seq] @@ -337,16 +366,16 @@ def getFullSequences(self, c, start, stop, samples, # length as the original (if ignoreIndels is True, it must be the same # length as the original). newseq = ''.join(newseq) - assert len(newseq)==(stop-start+1) or not ignoreIndels + assert len(newseq) == (stop - start + 1) or not ignoreIndels yield (sample, newseq) -def replaceAlleles(sample,seq,vcf_records): +def replaceAlleles(sample, seq, vcf_records): ''' Replace alleles, one site at a time. ''' - for i,alleles,genos in vcf_records: + for i, alleles, genos in vcf_records: # set allele to the DNA sequence we will replace at positions i through i+len(refallele)-1 - if sample==None: + if sample is None: # caller is asking for the reference sample's sequence allele = alleles[0] alleles = [allele] @@ -356,31 +385,32 @@ def replaceAlleles(sample,seq,vcf_records): if not samp_geno: continue if isinstance(samp_geno, list): - log.warn("TO DO: add code to turn hets into IUPAC ambiguity codes (%s %s = %s)." % (i,sample, '/'.join(samp_geno))) + log.warn( + "TO DO: add code to turn hets into IUPAC ambiguity codes (%s %s = %s)." % + (i, sample, '/'.join(samp_geno))) continue allele = samp_geno # replace portion of sequence with allele # NEED BUGFIXES HERE for overlapping VCF records - if allele == None: + if allele is None: # nothing here ... is this even possible anymore? pass - elif len(alleles[0])==1: + elif len(alleles[0]) == 1: # the most common cases: SNP, novar, or indel replacing one ref base (with zero or more bases) seq[i] = allele else: # most complicated case: something replacing multiple ref bases # TODO: beware--this is all untested! 
             for j in range(max(len(alleles[0]), len(allele))):
-                if i+j < len(seq):
-                    if j<len(allele):
-                        if j==len(alleles[0])-1:
+                if i + j < len(seq):
+                    if j < len(allele):
+                        if j == len(alleles[0]) - 1:
                             # new allele is >= ref length, fill out the rest of the bases
-                            seq[i+j] = allele[j:]
+                            seq[i + j] = allele[j:]
                         else:
-                            seq[i+j] = allele[j]
+                            seq[i + j] = allele[j]
                     else:
                         # new allele is shorter than ref, so delete extra bases
-                        seq[i+j] = ''
+                        seq[i + j] = ''
     return seq
-
diff --git a/util/version.py b/util/version.py
index eb55cb66a..e54df16f8 100644
--- a/util/version.py
+++ b/util/version.py
@@ -4,26 +4,30 @@
 __author__ = "dpark@broadinstitute.org"
 __version__ = None
 
-import subprocess, os, os.path
-
-def get_project_path() :
+import subprocess
+import os
+import os.path
+
+
+def get_project_path():
     '''Return the absolute path of the top-level project, assumed to be the
        parent of the directory containing this script.'''
     # abspath converts relative to absolute path; expanduser interprets ~
-    path = __file__                  # path to this script
+    path = __file__  # path to this script
     path = os.path.expanduser(path)  # interpret ~
-    path = os.path.abspath(path)     # convert to absolute path
-    path = os.path.dirname(path)     # containing directory: util
-    path = os.path.dirname(path)     # containing directory: main project dir
+    path = os.path.abspath(path)  # convert to absolute path
+    path = os.path.dirname(path)  # containing directory: util
+    path = os.path.dirname(path)  # containing directory: main project dir
     return path
 
+
 def call_git_describe():
     cwd = os.getcwd()
     try:
         os.chdir(get_project_path())
         cmd = ['git', 'describe', '--tags', '--always', '--dirty']
         out = subprocess.check_output(cmd)
-        if type(out) != str:
+        if not isinstance(out, str):
             out = out.decode('utf-8')
         ver = out.strip()
     except:
@@ -31,9 +35,11 @@ def call_git_describe():
     os.chdir(cwd)
     return ver
 
+
 def release_file():
     return os.path.join(get_project_path(), 'VERSION')
-
+
+
 def read_release_version():
     try:
         with open(release_file(), 'rt') as inf:
@@ -41,29 +47,31 @@ def read_release_version():
     except:
         version = None
     return version
-
+
+
 def write_release_version(version):
     with open(release_file(), 'wt') as outf:
-        outf.write(version+'\n')
+        outf.write(version + '\n')
+
 
 def get_version():
     global __version__
-    if __version__ == None:
-        from_git  = call_git_describe()
+    if __version__ is None:
+        from_git = call_git_describe()
         from_file = read_release_version()
-
+
         if from_git:
             if from_file != from_git:
                 write_release_version(from_git)
             __version__ = from_git
         else:
             __version__ = from_file
-
-    if __version__ == None:
+
+    if __version__ is None:
         raise ValueError("Cannot find the version number!")
-
+
     return __version__
-
-
+
+
 if __name__ == "__main__":
     print(get_version())
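
Reviewer note (not part of the patch): a minimal usage sketch of the VcfReader interface reformatted above, assuming a bgzip-compressed, tabix-indexed VCF. The file path, chromosome name, and coordinates are hypothetical and only illustrate the calling convention.

    # illustrative only -- filenames and coordinates below are made up
    import util.vcf

    with util.vcf.VcfReader('isolates.vcf.gz', ploidy=1) as vcf:
        print(vcf.samples())   # sample names taken from the #CHROM header row
        print(vcf.chrlens())   # {chrom: length} parsed from ##contig header lines
        # get_range() yields (chrom, pos, alleles, genos) over a 1-based, inclusive window
        for chrom, pos, alleles, genos in vcf.get_range('chr1', 1, 500):
            print(chrom, pos, alleles, genos)
        # get_snp_genos() returns {} unless exactly one VCF row matches the position
        print(vcf.get_snp_genos('chr1', 100))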