Commit

Merge pull request #128 from broadinstitute/dp-fix
pylint/landscape inspired code fixes
dpark01 committed Apr 17, 2015
2 parents d77b1a8 + 0972ebb commit 1ba2dc4
Showing 25 changed files with 96 additions and 102 deletions.
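
The bulk of the diff swaps eager string construction inside log calls for logging's lazy %-style argument passing, which pylint reports via its logging checks. A minimal before/after sketch of the recurring pattern (the logger setup here is illustrative, not part of the diff):

    import logging

    log = logging.getLogger(__name__)
    n = 100000

    # Before: the message is built with .format() even if INFO records
    # are being discarded by the handler's level.
    log.info("PRE-SUBSAMPLE COUNT: {} read pairs".format(n))

    # After: n is stored on the LogRecord and only interpolated when a
    # handler actually emits the record.
    log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)
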
1 change: 1 addition & 0 deletions README.md
@@ -1,5 +1,6 @@
[![Build Status](https://travis-ci.org/broadinstitute/viral-ngs.svg?branch=master)](https://travis-ci.org/broadinstitute/viral-ngs)
[![Coverage Status](https://coveralls.io/repos/broadinstitute/viral-ngs/badge.png)](https://coveralls.io/r/broadinstitute/viral-ngs)
+ [![Code Health](https://landscape.io/github/broadinstitute/viral-ngs/master/landscape.svg?style=flat)](https://landscape.io/github/broadinstitute/viral-ngs)
[![Documentation Status](https://readthedocs.org/projects/viral-ngs/badge/?version=latest)](https://readthedocs.org/projects/viral-ngs/?badge=latest)

viral-ngs
24 changes: 12 additions & 12 deletions assembly.py
@@ -48,7 +48,7 @@ def trim_rmdup_subsamp_reads(inBam, clipDb, outBam, n_reads=100000):
# Log count
with open(purgefq[0], 'rt') as inf:
n = int(sum(1 for line in inf)/4)
log.info("PRE-SUBSAMPLE COUNT: {} read pairs".format(n))
log.info("PRE-SUBSAMPLE COUNT: %s read pairs", n)

# Subsample
subsampfq = list(map(util.file.mkstempfname, ['.subsamp.1.fastq', '.subsamp.2.fastq']))
@@ -506,7 +506,7 @@ def main_modify_contig(args):
__commands__.append(('modify_contig', parser_modify_contig))


- class ContigModifier:
+ class ContigModifier(object):
''' Initial modifications to Trinity+VFAT assembly output based on
MUSCLE alignment to known reference genome
author: rsealfon
@@ -592,7 +592,7 @@ def remove_end_ns(self):



- class MutableSequence:
+ class MutableSequence(object):
def __init__(self, name, start, stop, init_seq=None):
if not (stop>=start>=1):
raise IndexError("coords out of bounds")
@@ -661,9 +661,9 @@ def vcfrow_parse_and_call_snps(vcfrow, samples, min_dp=0, major_cutoff=0.5, min_
alleles = [vcfrow[3]] + [a for a in vcfrow[4].split(',') if a not in '.']
start = int(vcfrow[1])
stop = start + len(vcfrow[3]) - 1
- format = vcfrow[8].split(':')
- format = dict((format[i], i) for i in range(len(format)))
- assert 'GT' in format and format['GT']==0 # required by VCF spec
+ format_col = vcfrow[8].split(':')
+ format_col = dict((format_col[i], i) for i in range(len(format_col)))
+ assert 'GT' in format_col and format_col['GT']==0 # required by VCF spec
assert len(vcfrow)==9+len(samples)
info = [x.split('=') for x in vcfrow[7].split(';') if x != '.']
info = dict(x for x in info if len(x)==2)
@@ -676,18 +676,18 @@ def vcfrow_parse_and_call_snps(vcfrow, samples, min_dp=0, major_cutoff=0.5, min_
# require a minimum read coverage
if len(alleles)==1:
# simple invariant case
- dp = ('DP' in format and len(rec)>format['DP']) and int(rec[format['DP']]) or 0
+ dp = ('DP' in format_col and len(rec)>format_col['DP']) and int(rec[format_col['DP']]) or 0
if dp < min_dp:
continue
geno = alleles
if info_dp and float(dp)/info_dp < min_dp_ratio:
log.warn("dropping invariant call at %s:%s %s (%s) due to low DP ratio (%s / %s = %s < %s)" % (
c,p,sample,geno,dp,info_dp,float(dp)/info_dp,min_dp_ratio))
log.warn("dropping invariant call at %s:%s-%s %s (%s) due to low DP ratio (%s / %s = %s < %s)",
c,start,stop,sample,geno,dp,info_dp,float(dp)/info_dp,min_dp_ratio)
continue
else:
# variant: manually call the highest read count allele if it exceeds a threshold
- assert ('AD' in format and len(rec)>format['AD'])
- allele_depths = list(map(int, rec[format['AD']].split(',')))
+ assert ('AD' in format_col and len(rec)>format_col['AD'])
+ allele_depths = list(map(int, rec[format_col['AD']].split(',')))
assert len(allele_depths)==len(alleles)
allele_depths = [(allele_depths[i], alleles[i]) for i in range(len(alleles)) if allele_depths[i]>0]
allele_depths = list(reversed(sorted((n,a) for n,a in allele_depths if n>=min_dp)))
@@ -736,7 +736,7 @@ def vcf_to_seqs(vcfIter, chrlens, samples, min_dp=0, major_cutoff=0.5, min_dp_ra
# mix of indels with no clear winner... force the most popular one
seqs[s].replace(start, stop, alleles[0])
except:
- log.exception("Exception occurred while parsing VCF file. Row: '%s'" % vcfrow)
+ log.exception("Exception occurred while parsing VCF file. Row: '%s'", vcfrow)
raise

# at the end, dump the last chromosome
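
The format → format_col rename in vcfrow_parse_and_call_snps stops a local from shadowing the format() builtin (pylint's redefined-builtin check). A sketch of the idiom under the new name — the helper function is hypothetical, but the parsing matches the diff:

    def parse_format_column(vcfrow):
        # Column 8 of a VCF row is the FORMAT field, e.g. "GT:DP:AD".
        # Map each field name to its index without clobbering format().
        format_col = vcfrow[8].split(':')
        format_col = dict((name, i) for i, name in enumerate(format_col))
        assert 'GT' in format_col and format_col['GT'] == 0  # required by VCF spec
        return format_col

    parse_format_column(['chr1', '100', '.', 'A', 'G', '.', '.', '.', 'GT:DP:AD'])
    # -> {'GT': 0, 'DP': 1, 'AD': 2}
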
1 change: 0 additions & 1 deletion broad_utils.py
@@ -261,7 +261,6 @@ def main_illumina_basecalls(args):
picardOpts = dict((opt, getattr(args, opt))
for opt in tools.picard.IlluminaBasecallsToSamTool.option_list
if hasattr(args, opt) and getattr(args, opt)!=None)
- params_file = util.file.mkstempfname('library_params.txt')
if not picardOpts.get('run_start_date'):
picardOpts['run_start_date'] = get_earliest_date(args.inBustardDir)
#if not picardOpts.get('read_group_id'):
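
The deleted params_file line allocated a temp file that nothing ever read (pylint's unused-variable check), so removing it also stops leaking an empty file per run. The surviving option-harvesting logic looks roughly like this — helper name hypothetical; note the original keeps != None, which pylint's singleton-comparison check would rather see as is not None:

    def picard_opts_from_args(args, option_list):
        # Keep only the Picard options actually set on the parsed
        # argparse namespace.
        return dict((opt, getattr(args, opt))
                    for opt in option_list
                    if hasattr(args, opt) and getattr(args, opt) is not None)
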
27 changes: 14 additions & 13 deletions interhost.py
@@ -8,7 +8,7 @@

import Bio.AlignIO
from Bio import SeqIO
- import argparse, logging, os, sys, array, bisect
+ import argparse, logging, os, array, bisect

try:
from itertools import zip_longest
@@ -22,7 +22,7 @@

# =========== CoordMapper =================

- class CoordMapper :
+ class CoordMapper(object):
""" Map (chrom, coordinate) between genome A and genome B.
Coordinates are 1-based.
Indels are handled as follows after corresponding sequences are aligned:
@@ -308,17 +308,17 @@ def multichr_mafft(args):
prefix = "" if args.outFilePrefix == None else args.outFilePrefix

# reorder the data into new FASTA files, where each FASTA file has only variants of its respective chromosome
- transposedFiles = transposeChromosomeFiles(args.inFastas, absoluteOutDirectory, prefix)
+ transposedFiles = transposeChromosomeFiles(args.inFastas)

# since the FASTA files are
for idx, filePath in enumerate(transposedFiles):
pass

# execute MAFFT alignment. The input file is passed within a list, since argparse ordinarily
# passes input files in this way, and the MAFFT tool expects lists,
# but in this case we are creating the input file ourselves
tools.mafft.MafftTool().execute(
inFastas = [os.path.abspath(filePath)],
- outFile = absoluteOutDirectory + "/{}_{}.fasta".format(prefix, idx),
+ outFile = os.path.join(absoluteOutDirectory, "{}_{}.fasta".format(prefix, idx)),
localpair = args.localpair,
globalpair = args.globalpair,
preservecase = args.preservecase,
@@ -364,17 +364,17 @@ def make_vcf(a, ref_idx, chrom):
alt.append(a[j][i])
if len(alt) > 0:
row = [chrom, i+1, '.', a[ref_idx][i], ','.join(alt), '.', '.', '.', 'GT']
- vars = []
+ genos = []
for k in range(len(a)):
if a[k][i] == a[ref_idx][i]:
- vars.append(0)
+ genos.append(0)
elif a[k][i] not in bases:
vars.append(".")
genos.append(".")
else:
for m in range(0, len(alt)):
if a[k][i] == alt[m]:
- vars.append(m+1)
- yield row+vars
+ genos.append(m+1)
+ yield row+genos

def transposeChromosomeFiles(inputFilenamesList):
''' Input: a list of FASTA files representing a genome for each sample.
@@ -394,18 +394,19 @@ def transposeChromosomeFiles(inputFilenamesList):
fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]

# for each interleaved record
- for idx, chrRecordList in enumerate( zip_longest(*fastaFiles) ):
+ for chrRecordList in zip_longest(*fastaFiles):
if any(rec==None for rec in chrRecordList):
raise Exception("input files must all have the same number of sequences")

outputFilename = util.file.mkstempfname('.fasta')
outputFilenames.append(outputFilename)
with open(outputFilename, "w") as outf:
# write the corresonding records to a new FASTA file
- countWritten = SeqIO.write(chrRecordList, outf, 'fasta')
+ SeqIO.write(chrRecordList, outf, 'fasta')

# close all input files
- [x.close() for x in inputFilesList]
+ for x in inputFilesList:
+ x.close()

return outputFilenames

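
Two more pylint-driven idioms appear in interhost.py: classes gain an explicit object base (old-style classes exist only in Python 2; under Python 3 the two spellings are equivalent), and a list comprehension run purely for its side effects becomes a plain loop. A sketch with illustrative data:

    import io

    class CoordMapper(object):
        # Deriving from object opts into new-style class semantics on
        # Python 2: consistent MRO, descriptors, property(), super().
        pass

    open_handles = [io.StringIO(u">seq1\nACGT\n") for _ in range(2)]

    # Before: [x.close() for x in open_handles] built a throwaway list of
    # None values just for the side effect; the loop says what it means.
    for handle in open_handles:
        handle.close()
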
34 changes: 17 additions & 17 deletions intrahost.py
@@ -7,11 +7,11 @@
"[email protected], [email protected]"
__commands__ = []

- import argparse, logging, itertools, re, shutil, tempfile, os
+ import argparse, logging, itertools, re, shutil, os
import Bio.AlignIO, Bio.SeqIO, Bio.Data.IUPACData
import pysam
import util.cmd, util.file, util.vcf, util.misc
- from util.stats import mean, median, fisher_exact, chi2_contingency
+ from util.stats import median, fisher_exact, chi2_contingency
from interhost import CoordMapper
from tools.vphaser2 import Vphaser2Tool
from tools.samtools import SamtoolsTool
@@ -117,7 +117,7 @@ def filter_strand_bias(isnvs, minReadsEach = None, maxBias = None) :
if maxBias == None :
maxBias = defaultMaxBias
for row in isnvs:
- front = row[:alleleCol]
+ #front = row[:alleleCol]
for fieldInd in range(len(row) - 1, alleleCol - 1, -1) :
f, r = AlleleFieldParser(row[fieldInd]).strand_counts()
if (int(f)<minReadsEach or int(r)<minReadsEach
@@ -179,8 +179,8 @@ def compute_library_bias(isnvs, inBam, inConsFasta) :
libBam = rgBams[0]
samtoolsTool.index(libBam)
n_reads = samtoolsTool.count(libBam)
log.debug("LB:{} has {} reads in {} read groups ({})".format(
lib, n_reads, len(rgs), ', '.join(rgs)))
log.debug("LB:%s has %s reads in %s read groups (%s)",
lib, n_reads, len(rgs), ', '.join(rgs))
libBams.append(libBam)

for row in isnvs :
@@ -454,8 +454,8 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
if tot_n>0 and float(n)/tot_n >= 0.005)
# drop this position:sample if no variation left
if len(row['allele_counts']) < 2:
log.info("dropping iSNV at %s:%s (%s) because no variation remains after simple filtering" % (
row['s_chrom'], row['s_pos'], row['sample']))
log.info("dropping iSNV at %s:%s (%s) because no variation remains after simple filtering",
row['s_chrom'], row['s_pos'], row['sample'])
continue
# reposition vphaser deletions minus one to be consistent with
# VCF conventions
@@ -512,7 +512,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
cons_stop = samp_to_cmap[s].mapBtoA(ref_sequence.id, end, side = 1)[1]
if cons_start == None or cons_stop == None :
log.info("variant is outside consensus assembly "
"for %s at %s:%s-%s." % (s, ref_sequence.id, pos, end))
"for %s at %s:%s-%s.", s, ref_sequence.id, pos, end)
continue
cons = samp_to_seqIndex[s][samp_to_cmap[s].mapBtoA(ref_sequence.id)]
allele = str(cons[cons_start-1:cons_stop].seq).upper()
@@ -521,7 +521,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
if all(a in set(('A','C','T','G')) for a in allele):
consAlleles[s] = allele
else:
log.warning("dropping ambiguous consensus for %s at %s:%s-%s: %s" % (s, ref_sequence.id, pos, end, allele))
log.warning("dropping ambiguous consensus for %s at %s:%s-%s: %s", s, ref_sequence.id, pos, end, allele)

# define genotypes and fractions
iSNVs = {} # {sample : {allele : fraction, ...}, ...}
@@ -575,8 +575,8 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
raise Exception()
if f>0.5 and a!=consAllele[samp_offsets[s]]:
log.warning("vPhaser and assembly pipelines mismatch at "
"%s:%d (%s) - consensus %s, vPhaser %s" %
(ref_sequence.id, pos, s, consAllele[samp_offsets[s]], a))
"%s:%d (%s) - consensus %s, vPhaser %s",
ref_sequence.id, pos, s, consAllele[samp_offsets[s]], a)
new_allele = list(consAllele)
new_allele[samp_offsets[s]] = a
a = ''.join(new_allele)
@@ -612,7 +612,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
# if we filtered any alleles above, make sure to omit absent alleles
alleles_isnv2.append((len(counts),sum(counts),a))
else:
log.info("dropped allele {} at position {}:{}".format(a, ref_sequence.id, pos))
log.info("dropped allele %s at position %s:%s", a, ref_sequence.id, pos)
alleles_isnv = list(allele for n_samples, n_reads, allele in reversed(sorted(alleles_isnv2)))
alleles = list(util.misc.unique([refAllele] + alleles_cons + alleles_isnv))

@@ -621,7 +621,7 @@ def merge_to_vcf(refFasta, outVcf, samples, isnvs, assemblies, strip_chr_version
raise Exception()
elif len(alleles)==1:
# if we filtered any alleles above, skip this position if there is no variation left here
log.info("dropped position {}:{} due to lack of variation".format(ref_sequence.id, pos))
log.info("dropped position %s:%s due to lack of variation", ref_sequence.id, pos)
continue
alleleMap = dict((a,i) for i,a in enumerate(alleles))
genos = [str(alleleMap.get(consAlleles.get(s),'.')) for s in samples]
@@ -690,10 +690,10 @@ def parser_merge_to_vcf(parser=argparse.ArgumentParser()):
# ===================================================

def compute_Fws(vcfrow):
- format = vcfrow[8].split(':')
- if 'AF' not in format:
+ format_col = vcfrow[8].split(':')
+ if 'AF' not in format_col:
return None
- af_idx = format.index('AF')
+ af_idx = format_col.index('AF')

freqs = [dat.split(':') for dat in vcfrow[9:]]
freqs = [float(dat[af_idx].split(',')[0]) for dat in freqs if len(dat)>af_idx and dat[af_idx]!='.' and dat[0]!='.' and int(dat[0])<=1]
@@ -857,7 +857,7 @@ def iSNV_table(vcf_iter):
out['Fws_snp'] = info['FWS']
yield out
except:
log.error("VCF parsing error at {}:{}".format(row['CHROM'], row['POS']))
log.error("VCF parsing error at %s:%s", row['CHROM'], row['POS'])
raise

def parser_iSNV_table(parser=argparse.ArgumentParser()):
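
iSNV_table keeps its bare except: (pylint would flag that separately as bare-except), but the error log inside it now defers formatting too. The log-then-reraise shape is worth keeping: a bare raise re-raises the active exception with its traceback intact. A sketch with hypothetical row data:

    import logging

    log = logging.getLogger(__name__)

    def parse_or_log(row):
        try:
            raise ValueError("bad genotype field")  # stand-in for real parsing
        except Exception:
            # Lazy %-args work on error()/exception() as well.
            log.error("VCF parsing error at %s:%s", row['CHROM'], row['POS'])
            raise
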
8 changes: 4 additions & 4 deletions read_utils.py
@@ -449,8 +449,8 @@ def split_bam(inBam, outBams) :
# accomplished by Picard RevertSam with SANITIZE=true)
totalReadCount = samtools.count(inBam)
maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
log.info("splitting %d reads into %d files of %d reads each" % (
totalReadCount, len(outBams), maxReads))
log.info("splitting %d reads into %d files of %d reads each",
totalReadCount, len(outBams), maxReads)

# load BAM header into memory
header = samtools.getHeader(inBam)
@@ -539,7 +539,7 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
if 'PU' in rg:
fname = rg['PU']
lb_to_files[rg.get('LB','none')].add(os.path.join(tempDir, fname))
log.info("found %d distinct libraries and %d read groups" % (len(lb_to_files), len(read_groups)))
log.info("found %d distinct libraries and %d read groups", len(lb_to_files), len(read_groups))

# For each library, merge FASTQs and run rmdup for entire library
readList = mkstempfname('.keep_reads.txt')
@@ -558,7 +558,7 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
outf.write(line)
os.unlink(fn)
else:
log.warn("no reads found in %s, assuming that's because there's no reads in that read group" % fn)
log.warn("no reads found in %s, assuming that's because there's no reads in that read group", fn)

# M-Vicuna DupRm to see what we should keep (append IDs to running file)
mvicuna_fastqs_to_readlist(infastqs[0], infastqs[1], readList)
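
A note on the split_bam arithmetic shown above: the per-file cap is rounded up to an even number, presumably so that mate pairs adjacent in the queryname-grouped BAM are never split across output files. Worked through:

    import math

    total_reads, n_files = 1000000, 3
    max_reads = int(math.ceil(float(total_reads) / n_files / 2) * 2)
    # ceil(1000000 / 3 / 2) * 2 == 333334: two files get 333334 reads,
    # the last gets 333332, and every count is even.
    assert max_reads == 333334
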
2 changes: 1 addition & 1 deletion reports.py
@@ -5,7 +5,7 @@
__author__ = "[email protected]"
__commands__ = []

- import argparse, logging, subprocess, glob, os, os.path, time
+ import argparse, logging, subprocess, glob, os, time
import pysam, Bio.SeqIO
import util.cmd, util.file, util.misc
import tools.samtools
9 changes: 4 additions & 5 deletions taxon_filter.py
@@ -7,7 +7,7 @@
+ "[email protected]"
__commands__ = []

- import argparse, logging, subprocess, os, tempfile, errno, shutil
+ import argparse, logging, subprocess, os, tempfile, shutil
from Bio import SeqIO
import util.cmd, util.file
import tools, tools.blast
@@ -117,7 +117,6 @@ def lastal_get_hits(inFastq, db, outList):
lastalPath = tools.last.Lastal().install_and_get_path()
mafSortPath = tools.last.MafSort().install_and_get_path()
mafConvertPath = tools.last.MafConvert().install_and_get_path()
- prinseqPath = tools.prinseq.PrinseqTool().install_and_get_path()
noBlastLikeHitsPath = os.path.join( util.file.get_scripts_path(),
'noBlastLikeHits.py')

@@ -560,7 +559,7 @@ def deplete_blastn(inFastq, outFastq, refDbs) :
## Run blastn using each of the databases in turn
blastOutFiles = []
for db in refDbs :
log.info("running blastn on {} against {}".format(inFastq, db))
log.info("running blastn on %s against %s", inFastq, db)
blastOutFiles += blastn_chunked_fasta(inFasta, db)

## Combine results from different databases
@@ -640,7 +639,7 @@ def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
read_utils.fastq_to_fasta(fastq1, fasta)
os.unlink(fastq1)
os.unlink(fastq2)
log.info("running blastn on {} pair 1 against {}".format(inBam, db))
log.info("running blastn on %s pair 1 against %s", inBam, db)
blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
with open(blast_hits, 'wt') as outf:
for blastOutFile in blastOutFiles:
@@ -662,7 +661,7 @@ def deplete_blastn_bam(inBam, db, outBam, chunkSize=1000000, JVMmemory=None):
read_utils.fastq_to_fasta(fastq2, fasta)
os.unlink(fastq1)
os.unlink(fastq2)
log.info("running blastn on {} pair 2 against {}".format(inBam, db))
log.info("running blastn on %s pair 2 against %s", inBam, db)
blastOutFiles = blastn_chunked_fasta(fasta, db, chunkSize)
with open(blast_hits, 'wt') as outf:
for blastOutFile in blastOutFiles:
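
The dropped errno import and the unused prinseqPath assignment are exactly what pylint's unused-import and unused-variable checks flag, and landscape.io runs such checks on every push. A hedged sketch of reproducing just those checks programmatically, assuming the epylint helper that shipped with pylint at the time (since removed from newer releases):

    from pylint import epylint

    def dead_code_report(path):
        # Disable everything, then re-enable only the dead-code checks.
        stdout, _ = epylint.py_run(
            "{} --disable=all --enable=unused-import,unused-variable".format(path),
            return_std=True)
        return stdout.read()

    print(dead_code_report("taxon_filter.py"))
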