Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consensus #23

Open
wants to merge 14 commits into
base: bam-processing
Choose a base branch
from
81 changes: 68 additions & 13 deletions bioframework/consensus.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
"""
Usage:
consensus --ref <ref> --vcf <vcf> [--mind <mind>] [--majority <majority>] [-o <output>]
consensus --ref <ref> --vcf <vcf> [--mind <mind>] [--majority <majority>] [-o <output>] [--sample <sample>] [--bam <bam>] [--minbq <minbq>]

Options:
--ref=<ref> Reference fasta file
--vcf=<vcf> VCF output
--majority=<majority> Percentage required [default: 80]
--mind=<mind> minimum depth to call base non-N [default: 10]
--output,-o=<output> output file [default: ]
--minbq=<minbq>  bases below this quality are ignored when computing depth [default: 0]
--sample=<sample> sample name
--bam=<bam> bam file, required to check for coverage
--output,-o=<output> output file [default: ]
"""
#stdlib
# fb_consensus --ref
# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/Den3__Thailand__FJ744727__2001.fasta
# --bam $dir/tagged.bam --mind 10 --vcf
# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/freebayes.vcf

from operator import itemgetter as get
from functools import partial
from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest
Expand All @@ -20,13 +28,16 @@

from Bio import SeqIO #done
from Bio.SeqRecord import SeqRecord #done
from Bio.Seq import Seq

import vcf #done
from vcf.model import _Record
import sh #todo
# import sh #todo
#from toolz import compose
from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done
from docopt import docopt #ignore
from schema import Schema, Use #ignore
from schema import Schema, Use, Optional #ignore
from plumbum.cmd import samtools
#from contracts import contract, new_contract #can ignore
#from mypy.types import VCFRow
#############
Expand Down Expand Up @@ -197,30 +208,71 @@ def single_consensus(muts, ref):
##########
# I/O #
##########
def consensus_str(sample, ref, consensus): # type: (str, SeqRecord, str) -> str
    """Format a consensus sequence as a single FASTA entry.

    The header is ``><sample>_<ref.id>:Consensus``; when sample is falsy
    (e.g. --sample was not supplied) the header is just
    ``><ref.id>:Consensus`` rather than one with a stray leading underscore.
    """
    prefix = "{0}_".format(sample) if sample else ''
    return ">{0}{1}:Consensus\n{2}".format(prefix, ref.id, consensus)

def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int]
pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int]
return imap(get_pos, pileup)
#def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int]
# pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
# get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int]
# return imap(get_pos, pileup)

#TODO: is pileup 0-based or 1-based index?
def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str
    """Mask a reference down to the covered region.

    ``positions`` is an iterator of covered indexes; everything before the
    first and after the last covered position is replaced with ``-``.
    Raises StopIteration/IndexError if ``positions`` is empty.
    """
    start = next(positions)
    # deque with maxlen=1 drains the iterator, keeping only the final element
    end = collections.deque(positions, 1)[0]
    # BUG FIX: the original sliced ref[:start:end], using `end` as the slice
    # *step*; the covered middle section must be ref[start:end].
    return '-'*start + ref[start:end] + '-'*(len(ref) - end)

# def samtoolsDepth(bam):
# lines = samtools['depth'][bam]().split('\n')
# lines = map(str.split, lines)
# stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines)
# return stats

def samtoolsDepth(ref_id, bam, minbq):
    # type: (str, str, int) -> List[Dict[str, int]]
    """Run ``samtools depth`` over ``bam`` restricted to region ``ref_id``.

    Bases with quality below ``minbq`` are excluded (-q).  Returns a list of
    ``{'pos': int, 'depth': int}`` dicts, one per line samtools reports
    (positions are as emitted by samtools depth; positions with zero
    coverage are omitted from its output).
    """
    raw = samtools['depth'][bam, '-r', ref_id, '-q', minbq]()
    stats = []
    for line in raw.split('\n'):
        if not line.strip():
            continue  # skip the trailing blank line(s) from split
        fields = line.split('\t')
        stats.append({'pos': int(fields[1]), 'depth': int(fields[2])})
    return stats

#def uncoveredPositions(mind, bam):
# depthStats = samtoolsDepth(bam)
# underStats = filter(lambda x: x['depth'] < mind, depthStats)
# return map(lambda x: x['pos'], underStats)

def uncoveredPositions(mind, minbq, bam, ref):
    # type: (int, int, str, SeqRecord) -> Set[int]
    """Return the set of 1-based positions in ``ref`` considered uncovered.

    A position is uncovered when its reported depth is below ``mind``, or
    when ``samtools depth`` does not report it at all (zero coverage).
    """
    depth_rows = samtoolsDepth(str(ref.id), bam, minbq)
    reported = set()
    below_min = set()
    for row in depth_rows:
        reported.add(row['pos'])
        if row['depth'] < mind:
            below_min.add(row['pos'])
    # positions samtools depth never mentioned have zero coverage
    unreported = set(range(1, len(ref.seq) + 1)) - reported
    return below_min | unreported

#def addNsAtUncovered(positions, ref):
# new_ref = ref
# for pos in positions:
# new_ref[pos-1] = 'N'
# return new_ref


def addNsAtUncovered(mind, minbq, bam, ref_seq):
    # type: (int, int, str, SeqRecord) -> SeqRecord
    """Return a copy of ``ref_seq`` with every under-covered base set to 'N'.

    Coverage is judged by uncoveredPositions (depth < mind, or absent from
    samtools depth output).  Positions there are 1-based, hence the i+1.
    """
    masked = uncoveredPositions(mind, minbq, bam, ref_seq)
    bases = ['N' if (i + 1) in masked else base
             for i, base in enumerate(str(ref_seq.seq))]
    return SeqRecord(seq=Seq(''.join(bases)), id=ref_seq.id)

#@contract(ref_fasta=str, vcf=str, mind=int, majority=int)
def run(ref_fasta, freebayes_vcf, outfile, mind, majority):
def run(ref_fasta, freebayes_vcf, outfile, mind, minbq, majority, sample, bam):
# type: (str, str, BinaryIO, int, int) -> int
_refs = SeqIO.parse(ref_fasta, 'fasta')
with open(freebayes_vcf, 'r') as vcf_handle:
_muts = map(flatten_vcf_record, vcf.Reader(vcf_handle))
refs, muts = list(_refs), list(_muts)
refs = map(partial(addNsAtUncovered, mind, minbq, bam), refs)
the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority)
strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts))
strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts))
result = '\n'.join(strings)
outfile.write(result)
outfile.close()
def main(): # type: () -> None
    """CLI entry point: validate docopt arguments and generate the consensus."""
    scheme = Schema(
        { '--vcf' : os.path.isfile,
          '--ref' : os.path.isfile,
          Optional('--sample') : lambda x: True,
          Optional('--bam') : lambda x: True,
          '--majority' : Use(int),
          '--mind' : Use(int),
          '--minbq' : Use(int),
          '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))})
    raw_args = docopt(__doc__, version='Version 1.0')
    args = scheme.validate(raw_args)
    # BUG FIX: the original call passed args['--output'] a second time where
    # run() expects the `majority` threshold, and never forwarded --majority.
    run(args['--ref'], args['--vcf'], args['--output'],
        args['--mind'], args['--minbq'], args['--majority'],
        args['--sample'], args['--bam'])

if __name__ == '__main__':
main()
13 changes: 9 additions & 4 deletions bioframework/tagreads.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,19 @@ def get_bam_header( bam ):
return hdr.rstrip()

def parse_args( args=sys.argv[1:] ):
from ngs_mapper import config
conf_parser, args, config, configfile = config.get_config_argparse(args)
defaults = config['tagreads']
defaults = { 'SM' : { 'default' : None,
'help' : 'Sets the SM tag value inside of each read group. Default is the portion of the filname that preceeds the .bam[Default: %(default)s]'
},
'CN' : { 'default' : None,
'help' : 'Sets the CN tag inside of each read group to the value specified.[Default: %(default)s]'
}
}

# config['tagreads']

parser = argparse.ArgumentParser(
description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file.
Will tag all reads in the alignment for each of the platforms based on the read name''',
parents=[conf_parser]
)

parser.add_argument(
Expand Down
3 changes: 3 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Install conda-managed tools, then pip requirements, then the package itself.
# Fail fast: without set -e a failed conda/pip step would not stop the
# install, leaving a half-configured environment.
set -e
# --yes: do not prompt for confirmation when run unattended
conda install --yes --file requirements-conda.txt
pip install -r requirements.txt
python setup.py install
1 change: 1 addition & 0 deletions requirements-conda.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Dependencies that are software and/or large such as scikit-learn...
bwa
samtools
freebayes
cutadapt
biopython
25 changes: 25 additions & 0 deletions scripts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

SAMPLE=$(notdir $(NGSDIR))
BAM=$(NGSDIR)/$(SAMPLE).bam
REF=$(NGSDIR)/$(SAMPLE).ref.fasta
TAGGED=$(NGSDIR)/tagged.bam
FBVCF=$(NGSDIR)/freebayes.vcf
FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta
# BUG FIX: MIND/MINBQ were commented out yet used in the fb_consensus
# recipe, so they expanded to nothing and broke the command line.
# ?= assigns only when unset, so `make MIND=20 ...` still overrides.
# Defaults match fb_consensus's own docopt defaults.
MIND?=10
MINBQ?=0

all: $(FBCONSENSUS)

$(TAGGED): $(BAM)
	fb_tagreads $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=)

$(TAGGED).bai: $(TAGGED)
	samtools index $<

# Prerequisite spelled $(TAGGED).bai to match the rule that builds it
# (same file as $(NGSDIR)/tagged.bam.bai, but consistent naming).
$(FBVCF): $(TAGGED) $(TAGGED).bai $(REF)
	freebayes -f $(REF) $< > $@

$(FBCONSENSUS): $(FBVCF) $(REF)
	fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) --bam $(TAGGED) --mind $(MIND) --minbq $(MINBQ) > $@
9 changes: 9 additions & 0 deletions scripts/consensus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
if [ -z "$1" ] || [ -z "$2" ];
then
    echo "Usage: $(basename "$0") <ngsdir> <reference>";
    exit 1
fi;
# Prefer the Makefile that ships next to this script so the pipeline is
# portable; fall back to the shared install path only if no local copy.
makefile="$(dirname "$0")/Makefile"
if [ ! -f "$makefile" ]; then
    makefile=/media/VD_Research/Admin/PBS/Software/bioframework/scripts/Makefile
fi

make -f "$makefile" NGSDIR="$1" REF="$2"
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,22 @@ def jip_modules(path=bioframework.JIP_PATH):
description = bioframework.__description__,
license = "GPLv2",
keywords = bioframework.__keywords__,
# scripts = glob('scripts/*'),
scripts = ['scripts/consensus.sh'],
entry_points = {
'console_scripts': [
'fb_tagreads = bioframework.tagreads:main',
'fb_consensus = bioframework.consensus:main'
]
},
install_requires = [
'pyjip',
'toolz',
'docopt',
'schema',
'pyvcf',
'typing'
'typing',
'plumbum'
],
data_files=[
(join(sys.prefix,'bin'), jip_modules()),
Expand Down