diff --git a/bioframework/consensus.py b/bioframework/consensus.py index 8dd433c..746e915 100755 --- a/bioframework/consensus.py +++ b/bioframework/consensus.py @@ -1,15 +1,23 @@ """ Usage: - consensus --ref --vcf [--mind ] [--majority ] [-o ] + consensus --ref --vcf [--mind ] [--majority ] [-o ] [--sample ] [--bam ] [--minbq ] Options: --ref= Reference fasta file --vcf= VCF output --majority= Percentage required [default: 80] --mind= minimum depth to call base non-N [default: 10] - --output,-o= output file [default: ] + --minbq= bases below this quality are ignored when computing depth [default: 0] + --sample= sample name + --bam= bam file, required to check for coverage + --output,-o= output file [default: ] """ #stdlib +# fb_consensus --ref +# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/Den3__Thailand__FJ744727__2001.fasta +# --bam $dir/tagged.bam --mind 10 --vcf +# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/freebayes.vcf + from operator import itemgetter as get from functools import partial from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest @@ -20,13 +28,16 @@ from Bio import SeqIO #done from Bio.SeqRecord import SeqRecord #done +from Bio.Seq import Seq + import vcf #done from vcf.model import _Record -import sh #todo +# import sh #todo #from toolz import compose from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done from docopt import docopt #ignore -from schema import Schema, Use #ignore +from schema import Schema, Use, Optional #ignore +from plumbum.cmd import samtools #from contracts import contract, new_contract #can ignore #from mypy.types import VCFRow ############# @@ -197,30 +208,71 @@ def single_consensus(muts, ref): ########## # I/O # ########## -def consensus_str(ref, consensus): # type: (SeqRecord, str) -> str - return ">{0}:Consensus\n{1}".format(ref.id, consensus) +def consensus_str(sample, ref, consensus): # type: (SeqRecord, str) -> str + return ">{0}_{1}:Consensus\n{2}".format(sample if sample else '', ref.id, consensus) -def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] - pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) - get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int] - return imap(get_pos, pileup) +#def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int] +# pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True) +# get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int] +# return imap(get_pos, pileup) #TODO: is pileup 0-based or 1-based index? def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str start, end = next(positions), collections.deque(positions, 1)[0] return '-'*start + ref[:start:end] + '-'*(len(ref) - end) +# def samtoolsDepth(bam): +# lines = samtools['depth'][bam]().split('\n') +# lines = map(str.split, lines) +# stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) +# return stats + +def samtoolsDepth(ref_id, bam, minbq): + lines = samtools['depth'][bam, '-r', ref_id, '-q', minbq]().split('\n') + lines = filter(lambda x: x.strip(), lines) + lines = map(lambda x: x.split('\t'), lines) + stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines) + return stats + +#def uncoveredPositions(mind, bam): +# depthStats = samtoolsDepth(bam) +# underStats = filter(lambda x: x['depth'] < mind, depthStats) +# return map(lambda x: x['pos'], underStats) + +def uncoveredPositions(mind, minbq, bam, ref): + depthStats = samtoolsDepth(str(ref.id), bam, minbq) # use ref string + allPositions = range(1, len(ref.seq)+1) + underStats = filter(lambda x: x['depth'] < mind, depthStats) + underPositions = map(lambda x: x['pos'], underStats) + allDepthPositions = map(lambda x: x['pos'], depthStats) + zeroPositions = set(allPositions) - set(allDepthPositions) + return set(underPositions) | zeroPositions + # return map(lambda x: x['pos'], underStats) + +#def addNsAtUncovered(positions, ref): +# new_ref = ref +# for pos in positions: +# new_ref[pos-1] = 'N' +# return new_ref + +def addNsAtUncovered(mind, minbq, bam, ref_seq): + badPositions = uncoveredPositions(mind, minbq, bam, ref_seq) + new_ref = list(str(ref_seq.seq)) + for pos in badPositions: + new_ref[pos-1] = 'N' + return SeqRecord(seq=Seq(''.join(new_ref)), id=ref_seq.id) #@contract(ref_fasta=str, vcf=str, mind=int, majority=int) -def run(ref_fasta, freebayes_vcf, outfile, mind, majority): +def run(ref_fasta, freebayes_vcf, outfile, mind, minbq, majority, sample, bam): # type: (str, str, BinaryIO, int, int) -> int _refs = SeqIO.parse(ref_fasta, 'fasta') with open(freebayes_vcf, 'r') as vcf_handle: _muts = map(flatten_vcf_record, vcf.Reader(vcf_handle)) refs, muts = list(_refs), list(_muts) + refs = map(partial(addNsAtUncovered, mind, minbq, bam), refs) the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority) - strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts)) + strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts)) result = '\n'.join(strings) outfile.write(result) outfile.close() @@ -230,13 +282,16 @@ def main(): # type: () -> None scheme = Schema( { '--vcf' : os.path.isfile, '--ref' : os.path.isfile, + Optional('--sample') : lambda x: True, + Optional('--bam') : lambda x: True, '--majority' : Use(int), '--mind' : Use(int), + '--minbq' : Use(int), '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))}) raw_args = docopt(__doc__, version='Version 1.0') args = scheme.validate(raw_args) run(args['--ref'], args['--vcf'], args['--output'], - args['--mind'], args['--output']) + args['--mind'], args['--minbq'], args['--output'], args['--sample'], args['--bam']) if __name__ == '__main__': main() diff --git a/bioframework/tagreads.py b/bioframework/tagreads.py index 42fa565..5acd7d1 100644 --- a/bioframework/tagreads.py +++ b/bioframework/tagreads.py @@ -155,14 +155,19 @@ def get_bam_header( bam ): return hdr.rstrip() def parse_args( args=sys.argv[1:] ): - from ngs_mapper import config - conf_parser, args, config, configfile = config.get_config_argparse(args) - defaults = config['tagreads'] + defaults = { 'SM' : { 'default' : None, + 'help' : 'Sets the SM tag value inside of each read group. Default is the portion of the filname that preceeds the .bam[Default: %(default)s]' + }, + 'CN' : { 'default' : None, + 'help' : 'Sets the CN tag inside of each read group to the value specified.[Default: %(default)s]' + } + } + + # config['tagreads'] parser = argparse.ArgumentParser( description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file. Will tag all reads in the alignment for each of the platforms based on the read name''', - parents=[conf_parser] ) parser.add_argument( diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..8886a67 --- /dev/null +++ b/install.sh @@ -0,0 +1,3 @@ +conda install --file requirements-conda.txt +pip install -r requirements.txt +python setup.py install diff --git a/requirements-conda.txt b/requirements-conda.txt index d2b04c0..62509a0 100644 --- a/requirements-conda.txt +++ b/requirements-conda.txt @@ -1,5 +1,6 @@ # Dependencies that are software and/or large such as scikit-learn... bwa samtools +freebayes cutadapt biopython diff --git a/scripts/Makefile b/scripts/Makefile new file mode 100644 index 0000000..1f85921 --- /dev/null +++ b/scripts/Makefile @@ -0,0 +1,25 @@ + +SAMPLE=$(notdir $(NGSDIR)) +BAM=$(NGSDIR)/$(SAMPLE).bam +REF=$(NGSDIR)/$(SAMPLE).ref.fasta +TAGGED=$(NGSDIR)/tagged.bam +FBVCF=$(NGSDIR)/freebayes.vcf +FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta +# MIND=0 +# MINBQ=0 + +all: $(FBCONSENSUS) + +$(TAGGED): $(BAM) + fb_tagreads $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=) + +$(TAGGED).bai: $(TAGGED) + samtools index $< + +$(FBVCF): $(TAGGED) $(NGSDIR)/tagged.bam.bai $(REF) + freebayes -f $(REF) $< > $@ + +$(FBCONSENSUS): $(FBVCF) $(REF) + #fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) > $@ + fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) --bam $(TAGGED) --mind $(MIND) --minbq $(MINBQ) > $@ +#cd ~/bioframes && python bioframes/consensus.py $^ > $@ diff --git a/scripts/consensus.sh b/scripts/consensus.sh new file mode 100644 index 0000000..dbba369 --- /dev/null +++ b/scripts/consensus.sh @@ -0,0 +1,9 @@ +if [ -z "$1" ] || [ -z "$2" ]; +then + echo "Usage: $(basename $0) "; + exit 1 +fi; +makefile=/media/VD_Research/Admin/PBS/Software/bioframework/scripts/Makefile + +# make -f $(dirname $0)/Makefile NGSDIR=$1 +make -f $makefile NGSDIR=$1 REF=$2 diff --git a/setup.py b/setup.py index 1d09b17..aaaac3f 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,13 @@ def jip_modules(path=bioframework.JIP_PATH): description = bioframework.__description__, license = "GPLv2", keywords = bioframework.__keywords__, + # scripts = glob('scripts/*'), + scripts = ['scripts/consensus.sh'], entry_points = { + 'console_scripts': [ + 'fb_tagreads = bioframework.tagreads:main', + 'fb_consensus = bioframework.consensus:main' + ] }, install_requires = [ 'pyjip', @@ -26,7 +32,8 @@ def jip_modules(path=bioframework.JIP_PATH): 'docopt', 'schema', 'pyvcf', - 'typing' + 'typing', + 'plumbum' ], data_files=[ (join(sys.prefix,'bin'), jip_modules()),