Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consensus #23

Open
wants to merge 14 commits into
base: bam-processing
Choose a base branch
from
81 changes: 68 additions & 13 deletions bioframework/consensus.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,23 @@
"""
Usage:
consensus --ref <ref> --vcf <vcf> [--mind <mind>] [--majority <majority>] [-o <output>]
consensus --ref <ref> --vcf <vcf> [--mind <mind>] [--majority <majority>] [-o <output>] [--sample <sample>] [--bam <bam>] [--minbq <minbq>]

Options:
--ref=<ref> Reference fasta file
--vcf=<vcf> VCF output
--majority=<majority> Percentage required [default: 80]
--mind=<mind> minimum depth to call base non-N [default: 10]
--output,-o=<output> output file [default: ]
--minbq=<minbq>  bases below this quality are ignored when computing depth [default: 0]
--sample=<sample> sample name
--bam=<bam> bam file, required to check for coverage
--output,-o=<output> output file [default: ]
"""
#stdlib
# fb_consensus --ref
# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/Den3__Thailand__FJ744727__2001.fasta
# --bam $dir/tagged.bam --mind 10 --vcf
# /media/VD_Research/Analysis/ProjectBased_Analysis/melanie/share/Issue_11973_freeBayes_MP/Dengue/Issue_12657_80-20/Projects/2195/freebayes.vcf

from operator import itemgetter as get
from functools import partial
from itertools import ifilter, imap, groupby, takewhile, repeat, starmap, izip_longest
Expand All @@ -20,13 +28,16 @@

from Bio import SeqIO #done
from Bio.SeqRecord import SeqRecord #done
from Bio.Seq import Seq

import vcf #done
from vcf.model import _Record
import sh #todo
# import sh #todo
#from toolz import compose
from toolz.dicttoolz import merge, dissoc, merge_with, valfilter, keyfilter #done
from docopt import docopt #ignore
from schema import Schema, Use #ignore
from schema import Schema, Use, Optional #ignore
from plumbum.cmd import samtools
#from contracts import contract, new_contract #can ignore
#from mypy.types import VCFRow
#############
Expand Down Expand Up @@ -197,30 +208,71 @@ def single_consensus(muts, ref):
##########
# I/O #
##########
def consensus_str(sample, ref, consensus): # type: (str, SeqRecord, str) -> str
    """Format a consensus sequence as a single FASTA entry.

    The header is ``><sample>_<ref.id>:Consensus``; when sample is falsy
    (e.g. --sample was not supplied) the header is just
    ``><ref.id>:Consensus`` rather than one with a stray leading underscore.
    """
    prefix = "{0}_".format(sample) if sample else ''
    return ">{0}{1}:Consensus\n{2}".format(prefix, ref.id, consensus)

def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int]
pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int]
return imap(get_pos, pileup)
#def zero_coverage_positions(bam_file, ref_file): # type: (str, str) -> Iterable[int]
# pileup = sh.Command('mpileup')(bam_file, f=ref_file, _iter=True)
# get_pos = lambda x: int(x.split()[1]) # type: Callable[[str],int]
# return imap(get_pos, pileup)

#TODO: is pileup 0-based or 1-based index?
def trim_ref(ref, positions): # type: (str, Iterator[int]) -> str
    """Mask a reference down to the covered region.

    ``positions`` is an iterator of covered indexes; everything before the
    first and after the last covered position is replaced with ``-``.
    Raises StopIteration/IndexError if ``positions`` is empty.
    """
    start = next(positions)
    # deque with maxlen=1 drains the iterator, keeping only the final element
    end = collections.deque(positions, 1)[0]
    # BUG FIX: the original sliced ref[:start:end], using `end` as the slice
    # *step*; the covered middle section must be ref[start:end].
    return '-'*start + ref[start:end] + '-'*(len(ref) - end)

# def samtoolsDepth(bam):
# lines = samtools['depth'][bam]().split('\n')
# lines = map(str.split, lines)
# stats = map(lambda x: { 'pos' : int(x[1]), 'depth' : int(x[2]) }, lines)
# return stats

def samtoolsDepth(ref_id, bam, minbq):
    # type: (str, str, int) -> List[Dict[str, int]]
    """Run ``samtools depth`` over ``bam`` restricted to region ``ref_id``.

    Bases with quality below ``minbq`` are excluded (-q).  Returns a list of
    ``{'pos': int, 'depth': int}`` dicts, one per line samtools reports
    (positions are as emitted by samtools depth; positions with zero
    coverage are omitted from its output).
    """
    raw = samtools['depth'][bam, '-r', ref_id, '-q', minbq]()
    stats = []
    for line in raw.split('\n'):
        if not line.strip():
            continue  # skip the trailing blank line(s) from split
        fields = line.split('\t')
        stats.append({'pos': int(fields[1]), 'depth': int(fields[2])})
    return stats

#def uncoveredPositions(mind, bam):
# depthStats = samtoolsDepth(bam)
# underStats = filter(lambda x: x['depth'] < mind, depthStats)
# return map(lambda x: x['pos'], underStats)

def uncoveredPositions(mind, minbq, bam, ref):
    # type: (int, int, str, SeqRecord) -> Set[int]
    """Return the set of 1-based positions in ``ref`` considered uncovered.

    A position is uncovered when its reported depth is below ``mind``, or
    when ``samtools depth`` does not report it at all (zero coverage).
    """
    depth_rows = samtoolsDepth(str(ref.id), bam, minbq)
    reported = set()
    below_min = set()
    for row in depth_rows:
        reported.add(row['pos'])
        if row['depth'] < mind:
            below_min.add(row['pos'])
    # positions samtools depth never mentioned have zero coverage
    unreported = set(range(1, len(ref.seq) + 1)) - reported
    return below_min | unreported

#def addNsAtUncovered(positions, ref):
# new_ref = ref
# for pos in positions:
# new_ref[pos-1] = 'N'
# return new_ref


def addNsAtUncovered(mind, minbq, bam, ref_seq):
    # type: (int, int, str, SeqRecord) -> SeqRecord
    """Return a copy of ``ref_seq`` with every under-covered base set to 'N'.

    Coverage is judged by uncoveredPositions (depth < mind, or absent from
    samtools depth output).  Positions there are 1-based, hence the i+1.
    """
    masked = uncoveredPositions(mind, minbq, bam, ref_seq)
    bases = ['N' if (i + 1) in masked else base
             for i, base in enumerate(str(ref_seq.seq))]
    return SeqRecord(seq=Seq(''.join(bases)), id=ref_seq.id)

#@contract(ref_fasta=str, vcf=str, mind=int, majority=int)
def run(ref_fasta, freebayes_vcf, outfile, mind, majority):
def run(ref_fasta, freebayes_vcf, outfile, mind, minbq, majority, sample, bam):
# type: (str, str, BinaryIO, int, int) -> int
_refs = SeqIO.parse(ref_fasta, 'fasta')
with open(freebayes_vcf, 'r') as vcf_handle:
_muts = map(flatten_vcf_record, vcf.Reader(vcf_handle))
refs, muts = list(_refs), list(_muts)
refs = map(partial(addNsAtUncovered, mind, minbq, bam), refs)
the_refs, seqs_and_muts = all_consensuses(refs, muts, mind, majority)
strings = imap(consensus_str, the_refs, imap(get(0), seqs_and_muts))
strings = imap(partial(consensus_str, sample), the_refs, imap(get(0), seqs_and_muts))
result = '\n'.join(strings)
outfile.write(result)
outfile.close()
def main(): # type: () -> None
    """CLI entry point: validate docopt arguments and generate the consensus."""
    scheme = Schema(
        { '--vcf' : os.path.isfile,
          '--ref' : os.path.isfile,
          Optional('--sample') : lambda x: True,
          Optional('--bam') : lambda x: True,
          '--majority' : Use(int),
          '--mind' : Use(int),
          '--minbq' : Use(int),
          '--output' : Use(lambda x: sys.stdout if not x else open(x, 'w'))})
    raw_args = docopt(__doc__, version='Version 1.0')
    args = scheme.validate(raw_args)
    # BUG FIX: the original call passed args['--output'] a second time where
    # run() expects the `majority` threshold, and never forwarded --majority.
    run(args['--ref'], args['--vcf'], args['--output'],
        args['--mind'], args['--minbq'], args['--majority'],
        args['--sample'], args['--bam'])

if __name__ == '__main__':
main()
13 changes: 9 additions & 4 deletions bioframework/tagreads.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,19 @@ def get_bam_header( bam ):
return hdr.rstrip()

def parse_args( args=sys.argv[1:] ):
from ngs_mapper import config
conf_parser, args, config, configfile = config.get_config_argparse(args)
defaults = config['tagreads']
defaults = { 'SM' : { 'default' : None,
'help' : 'Sets the SM tag value inside of each read group. Default is the portion of the filname that preceeds the .bam[Default: %(default)s]'
},
'CN' : { 'default' : None,
'help' : 'Sets the CN tag inside of each read group to the value specified.[Default: %(default)s]'
}
}

# config['tagreads']

parser = argparse.ArgumentParser(
description='''Adds Sanger, MiSeq, 454 and IonTorrent read groups to a bam file.
Will tag all reads in the alignment for each of the platforms based on the read name''',
parents=[conf_parser]
)

parser.add_argument(
Expand Down
3 changes: 3 additions & 0 deletions install.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/sh
# Install conda-managed tools, then pip requirements, then the package itself.
# Fail fast: without set -e a failed conda/pip step would not stop the
# install, leaving a half-configured environment.
set -e
# --yes: do not prompt for confirmation when run unattended
conda install --yes --file requirements-conda.txt
pip install -r requirements.txt
python setup.py install
1 change: 1 addition & 0 deletions requirements-conda.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Dependencies that are software and/or large such as scikit-learn...
bwa
samtools
freebayes
cutadapt
biopython
25 changes: 25 additions & 0 deletions scripts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

SAMPLE=$(notdir $(NGSDIR))
BAM=$(NGSDIR)/$(SAMPLE).bam
REF=$(NGSDIR)/$(SAMPLE).ref.fasta
TAGGED=$(NGSDIR)/tagged.bam
FBVCF=$(NGSDIR)/freebayes.vcf
FBCONSENSUS=$(NGSDIR)/$(SAMPLE).freebayes_consensus.fasta
# BUG FIX: MIND/MINBQ were commented out yet used in the fb_consensus
# recipe, so they expanded to nothing and broke the command line.
# ?= assigns only when unset, so `make MIND=20 ...` still overrides.
# Defaults match fb_consensus's own docopt defaults.
MIND?=10
MINBQ?=0

all: $(FBCONSENSUS)

$(TAGGED): $(BAM)
	fb_tagreads $< | samtools view - -Sbh | samtools sort - $(TAGGED:.bam=)

$(TAGGED).bai: $(TAGGED)
	samtools index $<

# Prerequisite spelled $(TAGGED).bai to match the rule that builds it
# (same file as $(NGSDIR)/tagged.bam.bai, but consistent naming).
$(FBVCF): $(TAGGED) $(TAGGED).bai $(REF)
	freebayes -f $(REF) $< > $@

$(FBCONSENSUS): $(FBVCF) $(REF)
	fb_consensus --vcf $< --ref $(REF) --sample $(SAMPLE) --bam $(TAGGED) --mind $(MIND) --minbq $(MINBQ) > $@
9 changes: 9 additions & 0 deletions scripts/consensus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
if [ -z "$1" ] || [ -z "$2" ];
then
    echo "Usage: $(basename "$0") <ngsdir> <reference>";
    exit 1
fi;
# Prefer the Makefile that ships next to this script so the pipeline is
# portable; fall back to the shared install path only if no local copy.
makefile="$(dirname "$0")/Makefile"
if [ ! -f "$makefile" ]; then
    makefile=/media/VD_Research/Admin/PBS/Software/bioframework/scripts/Makefile
fi

make -f "$makefile" NGSDIR="$1" REF="$2"
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,22 @@ def jip_modules(path=bioframework.JIP_PATH):
description = bioframework.__description__,
license = "GPLv2",
keywords = bioframework.__keywords__,
# scripts = glob('scripts/*'),
scripts = ['scripts/consensus.sh'],
entry_points = {
'console_scripts': [
'fb_tagreads = bioframework.tagreads:main',
'fb_consensus = bioframework.consensus:main'
]
},
install_requires = [
'pyjip',
'toolz',
'docopt',
'schema',
'pyvcf',
'typing'
'typing',
'plumbum'
],
data_files=[
(join(sys.prefix,'bin'), jip_modules()),
Expand Down