diff --git a/assembly.py b/assembly.py
index 5ae556724..62ae45729 100755
--- a/assembly.py
+++ b/assembly.py
@@ -44,6 +44,11 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
     read_utils.purge_unmated(rmdupfq[0], rmdupfq[1], purgefq[0], purgefq[1])
     os.unlink(rmdupfq[0])
     os.unlink(rmdupfq[1])
+
+    # Log count
+    with open(purgefq[0], 'rt') as inf:
+        n = int(sum(1 for line in inf)/4)
+        log.info("PRE-SUBSAMPLE COUNT: {} read pairs".format(n))
 
     # Subsample
     subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
@@ -58,19 +63,38 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
     os.unlink(purgefq[1])
 
     # Fastq -> BAM
-    # Note: this destroys RG IDs! We should instead just pull the list
-    # of IDs out of purge_unmated, random.sample them ourselves, and
-    # use FilterSamReadsTool to go straight from inBam -> outBam
+    # Note: this destroys RG IDs! We should instead run the BAM->fastq step in a way
+    # that breaks out the read groups, and perform the above steps in a way that
+    # preserves the RG IDs.
    tmp_bam = util.file.mkstempfname('.subsamp.bam')
    tmp_header = util.file.mkstempfname('.header.sam')
-    tools.picard.FastqToSamTool().execute(
-        subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
    tools.samtools.SamtoolsTool().dumpHeader(inBam, tmp_header)
-    tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
+    if n == 0:
+        # FastqToSam cannot deal with empty input
+        # but Picard SamFormatConverter can deal with empty files
+        opts = ['INPUT='+tmp_header, 'OUTPUT='+outBam, 'VERBOSITY=ERROR']
+        tools.picard.PicardTools().execute('SamFormatConverter', opts, JVMmemory='50m')
+    else:
+        tools.picard.FastqToSamTool().execute(
+            subsampfq[0], subsampfq[1], 'Dummy', tmp_bam)
+        tools.samtools.SamtoolsTool().reheader(tmp_bam, tmp_header, outBam)
    os.unlink(tmp_bam)
    os.unlink(tmp_header)
    os.unlink(subsampfq[0])
    os.unlink(subsampfq[1])
+def parser_trim_rmdup_subsamp(parser=argparse.ArgumentParser()):
+    parser.add_argument('inBam',
+        help='Input reads, unaligned BAM format.')
+    parser.add_argument('clipDb',
+        help='Trimmomatic clip DB.')
+    parser.add_argument('outBam',
+        help='Output reads, unaligned BAM format (currently, read groups and other header information are destroyed in this process).')
+    parser.add_argument('--n_reads', default=100000, type=int,
+        help='Subsample reads to no more than this many pairs. (default %(default)s)')
+    util.cmd.common_args(parser, (('loglevel',None), ('version',None), ('tmpDir',None)))
+    util.cmd.attach_main(parser, trim_rmdup_subsamp_reads, split_args=True)
+    return parser
+__commands__.append(('trim_rmdup_subsamp', parser_trim_rmdup_subsamp))
 
 
 def assemble_trinity(inBam, outFasta, clipDb, n_reads=100000, outReads=None):
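The pre-subsample count above is just a FASTQ line count divided by four, and the n == 0 branch exists only because Picard FastqToSam rejects empty input. A rough standalone sketch of the same idea, with illustrative helper names that are not part of this patch:

    import logging

    log = logging.getLogger(__name__)

    def count_fastq_pairs(fastq_path):
        # A FASTQ record is exactly 4 lines; counting records in the first-mate
        # file of a purged, properly paired set gives the pair count.
        with open(fastq_path, 'rt') as inf:
            n_lines = sum(1 for line in inf)
        return n_lines // 4

    def log_presubsample_count(fastq_path):
        n = count_fastq_pairs(fastq_path)
        log.info("PRE-SUBSAMPLE COUNT: %d read pairs", n)
        # Callers can branch on n == 0 before invoking Picard FastqToSam.
        return n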
diff --git a/intrahost.py b/intrahost.py
index 9f4438660..2bfb7954c 100755
--- a/intrahost.py
+++ b/intrahost.py
@@ -76,12 +76,14 @@ def vphaser_to_vcf(inFile, refFasta, multiAlignment, outVcf):
     '''
 
     # read in multiple alignments of consensus sequences
-    aln = Bio.AlignIO.read(open(multiAlignment, 'rt'), 'fasta')
+    with open(multiAlignment, 'rt') as inf:
+        aln = Bio.AlignIO.read(inf, 'fasta')
 
     # open reference genome and set ref as a BioPython SeqRecord
-    ref = list(Bio.SeqIO.parse(open(refFasta, 'rt'), 'fasta'))
-    assert len(ref)==1
-    ref = ref[0]
+    with open(refFasta, 'rt') as inf:
+        ref = list(Bio.SeqIO.parse(inf, 'fasta'))
+        assert len(ref)==1
+        ref = ref[0]
 
     # prepare sample list
     samples = list(util.misc.unique(row['patient'] for row in util.file.read_tabfile_dict(inFile)))
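The intrahost.py change swaps bare open() calls for context-managed handles around Bio.AlignIO/Bio.SeqIO. The same pattern as a small, self-contained sketch (load_single_reference is an illustrative name, not a function in this codebase):

    import Bio.SeqIO

    def load_single_reference(ref_fasta):
        # Parse a FASTA expected to hold exactly one record; the handle is
        # closed promptly even if parsing raises.
        with open(ref_fasta, 'rt') as inf:
            records = list(Bio.SeqIO.parse(inf, 'fasta'))
        if len(records) != 1:
            raise ValueError("expected exactly one sequence in %s" % ref_fasta)
        return records[0]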
diff --git a/pipes/rules/assembly.rules b/pipes/rules/assembly.rules
index beba6f699..b56cf512d 100644
--- a/pipes/rules/assembly.rules
+++ b/pipes/rules/assembly.rules
@@ -2,11 +2,6 @@
     This is a basic framework for assembly of viral genomes, currently
     tailored for EBOV. Some generalization work needed to expand this
     to generic viral genomes with an arbitrary number of segments/chromosomes.
-
-    note: all this runs in ~3hrs, no step takes more than 4GB RAM, and the
-    DAG is linear per sample, so we could conslidate most of this into a
-    single Bellini-like script (once we actually Tool-ify all of the critical
-    pieces like Trinity, MOSIAK, VFAT, Novoalign, GATK)
 """
 
 __author__ = 'Kristian Andersen , Daniel Park '
@@ -24,6 +19,10 @@ rule all_assemble:
         # create BAMs of aligned reads to own consensus
         expand("{dataDir}/{subdir}/{sample}.bam",
             dataDir=config["dataDir"], subdir=config["subdirs"]["align_self"],
+            sample=read_samples_file(config["samples_assembly"])),
+        # assembly reports too
+        expand("{reportsDir}/assembly/{sample}.txt",
+            reportsDir=config["reportsDir"],
             sample=read_samples_file(config["samples_assembly"]))
     params: LSF="-N"
 
@@ -32,6 +31,9 @@ rule all_assemble_failures:
         expand("{dataDir}/{subdir}/{sample}.fasta",
             dataDir=config["dataDir"], subdir=config["subdirs"]["assembly"],
             sample=read_samples_file(config.get("samples_assembly_failures"))),
+        expand("{reportsDir}/assembly/{sample}.txt",
+            reportsDir=config["reportsDir"],
+            sample=read_samples_file(config["samples_assembly"]))
     params: LSF="-N"
 
@@ -42,38 +44,42 @@ rule assemble_trinity:
         and random subsample to no more than 100k reads.
    '''
    input:  config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.taxfilt.bam'
-    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
-            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
+    output: config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    resources: mem=3
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            n_reads="100000",
            logid="{sample}",
-            clipDb=config["trim_clipDb"]
+            clipDb=config["trim_clipDb"],
+            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
            makedirs(expand("{dir}/{subdir}", dir=[config["dataDir"],config["tmpDir"]], subdir=config["subdirs"]["assembly"]))
-            shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output[0]} --n_reads={params.n_reads} --outReads {output[1]}")
+            shell("{config[binDir]}/assembly.py assemble_trinity {input} {params.clipDb} {output} --n_reads={params.n_reads} --outReads {params.subsamp_bam}")
 
 rule orient_and_impute:
    ''' This step cleans up the Trinity assembly with a known reference genome.
-        VFAT (maybe Bellini later): take the Trinity contigs, align them to
-        the known reference genome, switch it to the same strand as the
-        reference, and produce chromosome-level assemblies (with runs of
-        N's in between the Trinity contigs).
-        filter_short_seqs: We then toss out all assemblies that come out to
-        < 15kb or < 95% unambiguous and fail otherwise.
-        modify_contig: Finally, we trim off anything at the end that exceeds
+        order_and_orient (VFAT, maybe Bellini later): take the de novo contigs,
+        align them to the known reference genome, switch them to the same
+        strand as the reference, and produce chromosome-level assemblies
+        (with runs of N's in between the de novo contigs).
+        filter_short_seqs: Fail on assemblies that come out to
+        < 15kb or < 95% unambiguous.
+        impute_from_reference: trim off anything at the end that exceeds
        the length of the known reference assembly. We also replace all
        Ns and everything within 55bp of the chromosome ends with the
        reference sequence. This is clearly incorrect consensus sequence,
        but it allows downstream steps to map reads in parts of the genome
        that would otherwise be Ns, and we will correct all of the inferred
        positions with two steps of read-based refinement (below), and
-        revert positions back to Ns where read support is lacking.
+        revert positions back to Ns where read support is lacking. The
+        de novo step is still important because it allows for significant
+        structural / indel changes (or significant substitution distances)
+        which will be captured in this output.
+        This is the only step in the pipeline that needs generalization to
+        multi-chromosome genomes.
    '''
-    input:  config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta',
-            config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
+    input:  config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly1-trinity.fasta'
    output: config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly2-vfat.fasta',
            config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.assembly3-modify.fasta'
    resources: mem=3
@@ -83,9 +89,10 @@ rule orient_and_impute:
            min_unambig = str(config["assembly_min_unambig"]),
            renamed_prefix="",
            replace_length="55",
-            logid="{sample}"
+            logid="{sample}",
+            subsamp_bam=config["tmpDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.subsamp.bam'
    run:
-            shell("{config[binDir]}/assembly.py order_and_orient {input[0]} {params.refGenome} {output[0]} --inReads {input[1]}")
+            shell("{config[binDir]}/assembly.py order_and_orient {input} {params.refGenome} {output[0]} --inReads {params.subsamp_bam}")
            shell("{config[binDir]}/assembly.py impute_from_reference {output[0]} {params.refGenome} {output[1]} --newName {params.renamed_prefix}{wildcards.sample} --replaceLength {params.replace_length} --minLength {params.length} --minUnambig {params.min_unambig}")
 
 rule refine_assembly_1:
@@ -115,15 +122,13 @@ rule refine_assembly_2:
    ''' This a second pass refinement step very similar to the first.
        The only differences are that Novoalign mapping parameters are
        more conservative and the input consensus sequence has already
-        been refined once and the input reads are the full raw read
-        set (instead of the cleaned read set). We also require higher
-        minimum read coverage (3 instead of 2) in order to call a
-        non-ambiguous base.
+        been refined once. We also require higher minimum read coverage
+        (3 instead of 2) in order to call a non-ambiguous base.
        The output of this step is the final assembly for this sample.
        Final FASTA file is indexed for Picard, Samtools, and Novoalign.
    '''
    input:  config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4-refined.fasta',
-            config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.raw.bam'
+            config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta',
            config["tmpDir"] +'/'+config["subdirs"]["assembly"]+'/{sample}.assembly4.vcf.gz'
    resources: mem=4
@@ -136,19 +141,21 @@ rule map_reads_to_self:
    ''' After the final assembly is produced, we also produce BAM files
        with all reads mapped back to its own consensus.
        Outputs several BAM files, sorted and indexed:
-          {sample}.bam - all raw reads
-          {sample}.mapped.bam - only mapped reads
-          {sample}.rmdup.bam - only mapped reads, with duplicates removed by Picard
-          {sample}.realigned.bam - mapped/rmdup reads, realigned with GATK IndelRealigner
+          {sample}.bam - all raw reads aligned to self
+          {sample}.mapped.bam - only mapped reads, duplicates removed by Picard,
+                                realigned with GATK IndelRealigner
    '''
    input:  config["dataDir"]+'/'+config["subdirs"]["per_sample"]+'/{sample}.cleaned.bam',
            config["dataDir"]+'/'+config["subdirs"]["assembly"]+'/{sample}.fasta'
    output: config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.bam',
-            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam'
+            config["dataDir"]+'/'+config["subdirs"]["align_self"]+'/{sample}.mapped.bam',
+            config["reportsDir"]+'/assembly/{sample}.txt'
    resources: mem=4
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}",
            novoalign_options = "-r Random -l 40 -g 40 -x 20 -t 100 -k -c 3"
    run:
-            makedirs(os.path.join(config["dataDir"], config["subdirs"]["align_self"]))
+            makedirs([os.path.join(config["dataDir"], config["subdirs"]["align_self"]),
+                os.path.join(config["reportsDir"], "assembly")])
            shell("{config[binDir]}/read_utils.py align_and_fix {input} --outBamAll {output[0]} --outBamFiltered {output[1]} --novoalign_options '{params.novoalign_options}'")
+            shell("{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output[2]} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}")
diff --git a/pipes/rules/hs_deplete.rules b/pipes/rules/hs_deplete.rules
index 55fb562aa..d7fcabc16 100644
--- a/pipes/rules/hs_deplete.rules
+++ b/pipes/rules/hs_deplete.rules
@@ -16,7 +16,7 @@ rule all_deplete:
        expand("{dataDir}/{subdir}/{sample}.{adjective}.bam",
            dataDir=config["dataDir"], subdir=config["subdirs"]["per_sample"],
-            adjective=['raw','cleaned','taxfilt'],
+            adjective=['cleaned','taxfilt'],
            sample=read_samples_file(config["samples_depletion"])),
    params: LSF="-N"
 
@@ -24,7 +24,7 @@ rule all_deplete:
 """
 rule revert_bam:
    input:  config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
+    output: config["tmpDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
    resources: mem=3
    params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'),
            logid="{sample}"
@@ -38,7 +38,7 @@ rule deplete_bmtagger:
    ''' This step depletes human reads using BMTagger
        This sometimes takes just over 4 hrs for highly dense lanes
    '''
-    input:  config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
+    input:  config["tmpDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
            # expand("{dbdir}/{db}.{suffix}",
            #     dbdir=config["bmTaggerDbDir"],
            #     db=config["bmTaggerDbs_remove"],
@@ -82,20 +82,21 @@ rule depletion:
    ''' Runs a full human read depletion pipeline and removes PCR duplicates
    '''
    input:  config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
-    output: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam',
-            config["tmpDir"]+ '/'+config["subdirs"]["depletion"]+'/{sample}.bmtagger_depleted.bam',
+    output: config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.bmtagger_depleted.bam',
            config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.rmdup.bam',
            config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.cleaned.bam'
    resources: mem=15
    params: LSF=config.get('LSF_queues', {}).get('long', '-q forest'),
            bmtaggerDbs = expand("{dbdir}/{db}", dbdir=config["bmTaggerDbDir"], db=config["bmTaggerDbs_remove"]),
            blastDbs = expand("{dbdir}/{db}", dbdir=config["blastDbDir"], db=config["blastDb_remove"]),
+            revert_bam = config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam',
            logid="{sample}"
    run:
            makedirs(expand("{dir}/{subdir}", dir=[config["dataDir"],config["tmpDir"]], subdir=config["subdirs"]["depletion"]))
-            shell("{config[binDir]}/taxon_filter.py deplete_human {input} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}")
+            shell("{config[binDir]}/taxon_filter.py deplete_human {input} {params.revert_bam} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}")
+            os.unlink(params.revert_bam)
 
 rule filter_to_taxon:
    ''' This step reduces the read set to a specific taxon (usually the genus
diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules
index 597727a2f..232a44fe5 100644
--- a/pipes/rules/interhost.rules
+++ b/pipes/rules/interhost.rules
@@ -37,7 +37,7 @@ rule all_ref_guided:
            config["reportsDir"]+'/summary.coverage_ref.txt.gz'
 
 rule ref_guided_consensus:
-    input:  config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam'
+    input:  config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam'
    output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam',
            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz',
            config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta'
config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.cleaned.bam' resources: mem=15 params: LSF=config.get('LSF_queues', {}).get('long', '-q forest'), bmtaggerDbs = expand("{dbdir}/{db}", dbdir=config["bmTaggerDbDir"], db=config["bmTaggerDbs_remove"]), blastDbs = expand("{dbdir}/{db}", dbdir=config["blastDbDir"], db=config["blastDb_remove"]), + revert_bam = config["tmpDir"] +'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam', logid="{sample}" run: makedirs(expand("{dir}/{subdir}", dir=[config["dataDir"],config["tmpDir"]], subdir=config["subdirs"]["depletion"])) - shell("{config[binDir]}/taxon_filter.py deplete_human {input} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}") + shell("{config[binDir]}/taxon_filter.py deplete_human {input} {params.revert_bam} {output} --bmtaggerDbs {params.bmtaggerDbs} --blastDbs {params.blastDbs}") + os.unlink(params.revert_bam) rule filter_to_taxon: ''' This step reduces the read set to a specific taxon (usually the genus diff --git a/pipes/rules/interhost.rules b/pipes/rules/interhost.rules index 597727a2f..232a44fe5 100644 --- a/pipes/rules/interhost.rules +++ b/pipes/rules/interhost.rules @@ -37,7 +37,7 @@ rule all_ref_guided: config["reportsDir"]+'/summary.coverage_ref.txt.gz' rule ref_guided_consensus: - input: config["dataDir"]+'/'+config["subdirs"]["depletion"]+'/{sample}.raw.bam' + input: config["dataDir"]+'/'+config["subdirs"]["source"]+'/{sample}.bam' output: config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.realigned.bam', config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.vcf.gz', config["dataDir"]+'/'+config["subdirs"]["align_ref"]+'/{sample}.fasta' diff --git a/pipes/rules/reports_only b/pipes/rules/reports_only new file mode 100644 index 000000000..6e4997703 --- /dev/null +++ b/pipes/rules/reports_only @@ -0,0 +1,56 @@ +""" + Just the assembly reports and nothing else +""" + +__author__ = 'Daniel Park ' + +import os + +configfile: "config.json" + +def read_file(fname): + with open(fname, 'rt') as inf: + for line in inf: + yield line.strip() + +def cat_files_with_header(inFiles, outFile): + with open(outFile, 'wt') as outf: + header = None + for f in inFiles: + with open(f, 'rt') as inf: + h = None + for line in inf: + if h == None: + # this is the header line + h = line + if header == None: + # this is the first file + outf.write(line) + header = h + else: + # this is not the first file + if h != header: + raise Exception("headers do not match") + else: + outf.write(line) + + +rule all_assembly_reports: + input: + expand("{reportsDir}/assembly/{sample}.txt", + reportsDir=config["reportsDir"], + sample=read_file(config["samples_depletion"])) + output: + config["reportsDir"]+"/summary.assembly.txt" + params: LSF="-N" + run: + cat_files_with_header(input, output[0]) + + +rule assembly_report: + output: config["reportsDir"]+'/assembly/{sample}.txt' + resources: mem=2 + params: LSF=config.get('LSF_queues', {}).get('short', '-W 4:00'), + logid="{sample}" + shell: "{config[binDir]}/reports.py assembly_stats {wildcards.sample} {output} --assembly_dir {config[dataDir]}/{config[subdirs][assembly]} --assembly_tmp {config[tmpDir]}/{config[subdirs][assembly]} --align_dir {config[dataDir]}/{config[subdirs][align_self]}" + diff --git a/reports.py b/reports.py index 2df08af74..bb3b77e32 100755 --- a/reports.py +++ b/reports.py @@ -6,7 +6,10 @@ __commands__ = [] import argparse, logging, subprocess, glob, os, os.path, time +import pysam, Bio.SeqIO import util.cmd, util.file, util.misc +import 
diff --git a/reports.py b/reports.py
index 2df08af74..bb3b77e32 100755
--- a/reports.py
+++ b/reports.py
@@ -6,7 +6,10 @@ __commands__ = []
 
 import argparse, logging, subprocess, glob, os, os.path, time
+import pysam, Bio.SeqIO
 import util.cmd, util.file, util.misc
+import tools.samtools
+import assembly
 
 log = logging.getLogger(__name__)
 
@@ -17,6 +20,96 @@
 from numpy import mean, median
 
 
+def get_assembly_stats(sample,
+    cov_thresholds=(1,5,20,100),
+    assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly',
+    align_dir='data/02_align_to_self'):
+    ''' Fetch assembly-level statistics for a given sample '''
+    out = {'sample':sample}
+    samtools = tools.samtools.SamtoolsTool()
+    header = ['sample', 'assembled_trinity', 'trinity_in_reads',
+        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig',
+        'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup', 'aln2self_pct_nondup',
+        'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0',
+        ] + ['aln2self_cov_%dX'%t for t in cov_thresholds]
+
+    # pre-assembly stats
+    out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp,
+        sample + '.assembly1-trinity.fasta')) and 1 or 0
+    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
+    if os.path.isfile(sub_bam):
+        out['trinity_in_reads'] = samtools.count(sub_bam)
+
+    # assembly stats
+    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
+    if not os.path.isfile(assembly_fname):
+        out['n_contigs'] = 0
+        return (header, out)
+    with open(assembly_fname, 'rt') as inf:
+        counts = [(len(s), assembly.unambig_count(s.seq))
+            for s in Bio.SeqIO.parse(inf, 'fasta')]
+    out['n_contigs'] = len(counts)
+    out['contig_len'] = ','.join(str(x) for x,y in counts)
+    out['unambig_bases'] = ','.join(str(y) for x,y in counts)
+    out['pct_unambig'] = ','.join(str(float(y)/x) for x,y in counts)
+
+    # read counts from align-to-self
+    bam_fname = os.path.join(align_dir, sample + '.bam')
+    if not os.path.isfile(bam_fname):
+        return (header, out)
+    out['aln2self_reads_tot'] = samtools.count(bam_fname)
+    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
+    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
+    if out['aln2self_reads_aln']:
+        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']
+
+    # genome coverage stats
+    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
+    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
+        coverages = list([pcol.nsegments for pcol in bam.pileup()])
+    out['aln2self_cov_median'] = median(coverages)
+    out['aln2self_cov_mean'] = "%0.3f"%mean(coverages)
+    out['aln2self_cov_mean_non0'] = "%0.3f"%mean([n for n in coverages if n>0])
+    for thresh in cov_thresholds:
+        out['aln2self_cov_%dX'%thresh] = sum(1 for n in coverages if n>=thresh)
+
+    return (header, out)
+
+def assembly_stats(samples, outFile,
+    cov_thresholds, assembly_dir, assembly_tmp, align_dir):
+    ''' Fetch assembly-level statistics for a set of samples and write a tab-delimited report '''
+    header_written = False
+    with open(outFile, 'wt') as outf:
+        for sample in samples:
+            log.info("fetching stats on "+sample)
+            header, out = get_assembly_stats(sample, cov_thresholds, assembly_dir, assembly_tmp, align_dir)
+            if not header_written:
+                outf.write('\t'.join(map(str, header))+'\n')
+                header_written = True
+            outf.write('\t'.join([str(out.get(h,'')) for h in header])+'\n')
+            outf.flush()
+def parser_assembly_stats(parser=argparse.ArgumentParser()):
+    parser.add_argument('samples', nargs='+', help='Sample names.')
+    parser.add_argument('outFile', help='Output report file.')
+    parser.add_argument('--cov_thresholds', nargs='+', type=int,
+        default=(1,5,20,100),
+        help='Genome coverage thresholds to report on. (default: %(default)s)')
+    parser.add_argument('--assembly_dir', default='data/02_assembly',
+        help='Directory with assembly outputs. (default: %(default)s)')
+    parser.add_argument('--assembly_tmp', default='tmp/02_assembly',
+        help='Directory with assembly temp files. (default: %(default)s)')
+    parser.add_argument('--align_dir', default='data/02_align_to_self',
+        help='Directory with reads aligned to own assembly. (default: %(default)s)')
+    util.cmd.attach_main(parser, assembly_stats, split_args=True)
+    return parser
+__commands__.append(('assembly_stats', parser_assembly_stats))
+
+
+
+def get_refalign_stats(sample):
+    pass
+
+
 def consolidate_bamstats(inFiles, outFile):
     '''Consolidate multiple bamstats reports into one.'''
     with util.file.open_or_gzopen(outFile, 'wt') as outf:
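The coverage fields in get_assembly_stats come straight from a pysam pileup over the self-aligned BAM. The core of that calculation, isolated as a hedged sketch (coverage_summary is an illustrative name; it uses the same pysam calls as the patch and, like the patch, only sees positions that pileup() reports):

    import pysam
    from numpy import mean, median

    def coverage_summary(bam_path, thresholds=(1, 5, 20, 100)):
        # Depth at each piled-up position of a coordinate-sorted, indexed BAM.
        with pysam.AlignmentFile(bam_path, 'rb') as bam:
            depths = [pcol.nsegments for pcol in bam.pileup()]
        out = {'cov_median': median(depths), 'cov_mean': mean(depths)}
        for t in thresholds:
            out['cov_%dX' % t] = sum(1 for d in depths if d >= t)
        return out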
diff --git a/taxon_filter.py b/taxon_filter.py
index dfea893f8..7d5ddea25 100755
--- a/taxon_filter.py
+++ b/taxon_filter.py
@@ -379,15 +379,18 @@ def partition_bmtagger(inFastq1, inFastq2, databases,
            # Technically, this violates the loop invariant ;-)
            break
        log.debug("starting select_reads")
-        matches = set(line.strip() for line in open(matchesFile))
+        with open(matchesFile) as inf:
+            matches = set(line.strip() for line in inf)
        noMatchFcn = lambda rec : strip12(rec.id) not in matches
        select_reads(prevReads1, curReads1, noMatchFcn)
        select_reads(prevReads2, curReads2, noMatchFcn)
    if outMatch != None :
        log.debug("preparing outMatch files")
-        allMatches = set(line.strip()
-                         for matchesFile in matchesFiles
-                         for line in open(matchesFile))
+        allMatches = set()
+        for matchesFile in matchesFiles:
+            with open(matchesFile) as inf:
+                newMatches = set(line.strip() for line in inf)
+            allMatches = allMatches.union(newMatches)
        matchFcn = lambda rec : strip12(rec.id) in allMatches
        select_reads(inFastq1, outMatch[0], matchFcn)
        select_reads(inFastq2, outMatch[1], matchFcn)
diff --git a/tools/__init__.py b/tools/__init__.py
index 4efa5f6c3..7fbcc04e5 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -66,6 +66,8 @@ def execute(self, args):
        assert not os.system(self.exec_path + ' ' + args)
    def install_and_get_path(self) :
        self.install()
+        if self.executable_path()==None:
+            raise NameError("unsuccessful in installing " + type(self).__name__)
        return self.executable_path()
 
 class InstallMethod(object):
diff --git a/tools/last.py b/tools/last.py
index 54ed877b1..8c055235a 100644
--- a/tools/last.py
+++ b/tools/last.py
@@ -33,12 +33,14 @@ def post_download(self) :
        mafConvertPath = os.path.join(binpath, 'maf-convert')
        os.system('2to3 {mafConvertPath} -w --no-diffs'.format(**locals()))
        # Still more fixes needed: the first for 3.x, the second for 2.7
-        fileContents = open(mafConvertPath).read()
-        fileContents = fileContents.replace('string.maketrans("", "")', 'None')
-        fileContents = fileContents.replace(
-            '#! /usr/bin/env python',
-            '#! /usr/bin/env python\nfrom __future__ import print_function')
-        open(mafConvertPath, 'w').write(fileContents)
+        with open(mafConvertPath, 'rt') as inf:
+            fileContents = inf.read()
+        fileContents = fileContents.replace('string.maketrans("", "")', 'None')
+        fileContents = fileContents.replace(
+            '#! /usr/bin/env python',
+            '#! /usr/bin/env python\nfrom __future__ import print_function')
+        with open(mafConvertPath, 'wt') as outf:
+            outf.write(fileContents)
 
    def verify_install(self) :
        'Default checks + verify python 2.7/3.x compatibility fixes were done'
@@ -47,7 +49,8 @@ def verify_install(self) :
            self.lastWithVersion, 'bin', 'maf-convert')
        if not os.access(mafConvertPath, os.X_OK | os.R_OK) :
            return False
-        return 'print_function' in open(mafConvertPath).read()
+        with open(mafConvertPath, 'rt') as inf:
+            return 'print_function' in inf.read()
 
 class Lastal(LastTools) :
    subtoolName = 'lastal'
diff --git a/util/cmd.py b/util/cmd.py
index 93393cabb..11a6b1ff2 100644
--- a/util/cmd.py
+++ b/util/cmd.py
@@ -112,7 +112,7 @@ def main_argparse(commands, description):
        log.info("software version: %s, python version: %s" % (__version__, sys.version))
        log.info("command: %s %s %s" % (
            sys.argv[0], sys.argv[1],
-            ' '.join(["%s=%s" % (k,v) for k,v in vars(args).items()])))
+            ' '.join(["%s=%s" % (k,v) for k,v in vars(args).items() if k not in ('command', 'func_main')])))
 
    if hasattr(args, 'tmpDir'):
        ''' If this command has a tmpDir option, use that as a base directory
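The util/cmd.py change only filters argparse bookkeeping entries out of the logged command line. A minimal, standalone illustration of the same filtering (the argument names here are made up for the example):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('inBam')
    parser.add_argument('--n_reads', type=int, default=100000)
    args = parser.parse_args(['reads.bam', '--n_reads', '50000'])

    # Drop internal entries (e.g. the dispatch function) when echoing arguments.
    skip = ('command', 'func_main')
    print(' '.join('%s=%s' % (k, v) for k, v in vars(args).items() if k not in skip))
    # -> inBam=reads.bam n_reads=50000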