Merge pull request #59 from timothymillar/f/llk-encoding

v0.2.0
PlantandFoodResearch · Oct 6, 2020 · fd3b35b · fd3b35b
2 parents 02efedc + 6504ec4
commit fd3b35b
Show file tree

Hide file tree

Showing 35 changed files with 278 additions and 635 deletions.
diff --git a/mchap/application/assemble.py b/mchap/application/assemble.py
@@ -7,7 +7,7 @@
 import multiprocessing as mp
 
 from mchap.assemble.mcmc.denovo import DenovoMCMC
-from mchap.encoding import symbolic
+from mchap.encoding import character, integer
 from mchap.io import \
     read_bed4, \
     extract_sample_ids, \
@@ -413,12 +413,15 @@ def header(self):
             vcf.formatfields.GQ,
             vcf.formatfields.PHQ,
             vcf.formatfields.DP,
-            vcf.formatfields.RC,
+            vcf.formatfields.RCOUNT,
+            vcf.formatfields.RCALLS,
+            vcf.formatfields.MEC,
             vcf.formatfields.FT,
             vcf.formatfields.GPM,
             vcf.formatfields.PPM, 
             vcf.formatfields.MPGP,
             vcf.formatfields.MPED,
+            vcf.formatfields.RASSIGN,
         )
 
         vcf_header = vcf.VCFHeader(
@@ -454,7 +457,7 @@ def _assemble_locus(self, header, locus):
 
                 # process read data
                 read_count = read_chars.shape[0]
-                read_depth = symbolic.depth(read_chars)
+                read_depth = character.depth(read_chars)
                 read_chars, read_quals = add_nan_read_if_empty(locus, read_chars, read_quals)
                 read_calls = encode_read_alleles(locus, read_chars)
                 read_dists = encode_read_distributions(
@@ -497,11 +500,14 @@ def _assemble_locus(self, header, locus):
                 sample_data[sample].update({
                     'GPM': vcf.formatfields.probabilities(genotype[1], self.precision),
                     'PPM': vcf.formatfields.probabilities(phenotype[1].sum(), self.precision),
-                    'RC': read_count,
+                    'RCOUNT': read_count,
+                    'RCALLS': np.sum(read_calls >= 0),
                     'DP': vcf.formatfields.haplotype_depth(read_depth),
                     'GQ': vcf.formatfields.quality(genotype[1]),
                     'PHQ': vcf.formatfields.quality(phenotype[1].sum()),
-                    'FT': filterset
+                    'MEC': integer.minimum_error_correction(read_calls, genotype[0]).sum(),
+                    'FT': filterset,
+                    'RASSIGN': np.round(integer.read_assignment(read_calls, genotype[0]).sum(axis=0), 1),
                 })
 
                 # Null out the genotype and phenotype arrays
@@ -546,6 +552,10 @@ def _assemble_locus(self, header, locus):
             _, probs, dosage = labeler.label_phenotype_posterior(*phenotype, expected_dosage=True)
             sample_data[sample]['MPGP'] = vcf.formatfields.probabilities(probs, self.precision)
             sample_data[sample]['MPED'] = vcf.formatfields.probabilities(dosage, self.precision)
+
+            # sort the read-count assignment
+            idx = labeler.argsort(genotype[0])
+            sample_data[sample]['RASSIGN'] = list(sample_data[sample]['RASSIGN'][idx])
 
         # construct vcf record
         return vcf.VCFRecord(

diff --git a/mchap/assemble/classes.py b/mchap/assemble/classes.py
@@ -2,7 +2,7 @@
 from dataclasses import dataclass
 
 from mchap import mset
-from mchap.encoding import allelic
+from mchap.encoding import integer
 
 
 @dataclass
@@ -157,7 +157,7 @@ def __post_init__(self):
             n_chains, n_steps = self.genotypes.shape[0:2]
             for c in range(n_chains):
                 for i in range(n_steps):
-                    self.genotypes[c, i] = allelic.sort(self.genotypes[c, i])
+                    self.genotypes[c, i] = integer.sort(self.genotypes[c, i])
 
 
     def burn(self, n):

diff --git a/mchap/assemble/inheritence.py b/mchap/assemble/inheritence.py
@@ -5,7 +5,7 @@
 from collections import Counter as _Counter
 
 from mchap import mset
-from mchap.encoding import allelic
+from mchap.encoding import integer
 
 
 def gamete_probabilities(genotypes,
@@ -132,7 +132,7 @@ def cross_probabilities(maternal_gametes,
                     genotype[idx] = hap
                     idx += 1
 
-            genotype = allelic.sort(genotype)
+            genotype = integer.sort(genotype)
             string = genotype.tostring()
             if string not in string_to_genotype:
                 string_to_genotype[string] = genotype

diff --git a/mchap/assemble/mcmc/denovo.py b/mchap/assemble/mcmc/denovo.py
@@ -7,8 +7,6 @@
 from dataclasses import dataclass
 
 from mchap import mset
-from mchap.encoding import allelic
-from mchap.encoding import probabilistic
 from mchap.assemble.mcmc.step import mutation, structural
 from mchap.assemble.likelihood import log_likelihood
 from mchap.assemble import util, complexity
@@ -163,11 +161,8 @@ def _mcmc(self, reads, initial=None):
 
         # set the initial genotype
         if initial is None:
-            # for each haplotype, take a random sample of mean of reads
-            var_dist = _read_mean_dist(reads_het)
-            genotype = np.empty((self.ploidy, n_het_base), dtype=np.int8)
-            for i in range(self.ploidy):
-                genotype[i] = probabilistic.sample_alleles(var_dist)
+            dist = _read_mean_dist(reads_het)
+            genotype = np.array([util.sample_alleles(dist) for _ in range(self.ploidy)])
         else:
             # use the provided array
             assert initial.shape == (self.ploidy, n_het_base)
@@ -331,6 +326,10 @@ def _read_mean_dist(reads):
     -------
     mean_read : ndarray, float, shape (n_positions, max_allele)
         The mean probabilities of input reads.
+
+    Notes
+    -----
+    If read distributions are normalized if they do not sum to 1.
     
     """
     # work around to avoid nan values caused by gaps
@@ -346,6 +345,10 @@ def _read_mean_dist(reads):
     n_alleles = np.sum(~np.all(reads == 0, axis=0), axis=1, keepdims=True)
     fill = 1 / np.tile(n_alleles, (1, reads.shape[-1]))
     dist[gaps] = fill[gaps]
+
+    # normalize
+    dist /= dist.sum(axis=-1, keepdims=True)
+
     return dist
 
 

diff --git a/mchap/assemble/util.py b/mchap/assemble/util.py
@@ -284,6 +284,43 @@ def count_equivalent_permutations(dosage):
     return numerator // denominator
 
 
+@numba.njit
+def sample_alleles(array, dtype=np.int8):
+    """Sample a random set of alleles from probabilities.
+
+    Parameters
+    ----------
+    array : ndarray, float, shape (n_read, n_base, max_allele)
+        Probability of each allele at each base position
+    dtype : type
+        Numpy dtype of alleles.
+
+    Returns
+    ------- 
+    alleles : ndarray, int, shape (n_read, n_base)
+        Sampled alleles
+
+    Notes
+    -----
+    Does not handle gaps (nan values).
+
+    """
+    # normalize
+    shape = array.shape[0:-1]
+
+    array = array.reshape(-1, array.shape[-1])
+    sums = np.sum(array, axis=-1)
+    dists = array / np.expand_dims(sums, -1)
+    n = len(dists)
+
+    alleles = np.empty(n, dtype=dtype)
+
+    for i in range(n):
+        alleles[i] = random_choice(dists[i])
+
+    return alleles.reshape(shape)
+
+
 @numba.njit
 def seed_numba(seed):
     """Set numba random seed

diff --git a/mchap/encoding/allelic/__init__.py b/mchap/encoding/allelic/__init__.py
diff --git a/mchap/encoding/symbolic/__init__.py → mchap/encoding/character/__init__.py b/mchap/encoding/symbolic/__init__.py → mchap/encoding/character/__init__.py
diff --git a/mchap/encoding/symbolic/sequence.py → mchap/encoding/character/sequence.py b/mchap/encoding/symbolic/sequence.py → mchap/encoding/character/sequence.py
diff --git a/mchap/encoding/symbolic/transcode.py → mchap/encoding/character/transcode.py b/mchap/encoding/symbolic/transcode.py → mchap/encoding/character/transcode.py
diff --git a/mchap/encoding/probabilistic/__init__.py → mchap/encoding/integer/__init__.py b/mchap/encoding/probabilistic/__init__.py → mchap/encoding/integer/__init__.py
@@ -1,3 +1,4 @@
 from .sequence import *
 from .transcode import *
+from .kmer import *
 from .stats import *
diff --git a/mchap/encoding/allelic/kmer.py → mchap/encoding/integer/kmer.py b/mchap/encoding/allelic/kmer.py → mchap/encoding/integer/kmer.py
diff --git a/mchap/encoding/allelic/sequence.py → mchap/encoding/integer/sequence.py b/mchap/encoding/allelic/sequence.py → mchap/encoding/integer/sequence.py
diff --git a/mchap/encoding/allelic/transcode.py → mchap/encoding/integer/transcode.py b/mchap/encoding/allelic/transcode.py → mchap/encoding/integer/transcode.py
@@ -39,12 +39,9 @@ def as_probabilistic(
     (the number of possible non-called alleles) to get the probability of 
     each of the non-called alleles.
     If an position is constrained to have fewer than the possible number 
-    of alternate alleles (e.g. a bi-allelic contstraint) using the
-    n_alleles argument then the probability of the alternate alleles 
-    is calculated for all possible alleles before removing the 
-    non-allowed alleles an re-normalizing the remaining probabilities.
-    This can result in the called allele having a probability of greater
-    than p.    
+    of alternate alleles (e.g. a bi-allelic constraint) using the
+    n_alleles argument then the probability across all remaining alleles
+    will sum to less than 1.    
     """
     # check inputs
     array = np.array(array, copy=False)
@@ -65,16 +62,13 @@ def as_probabilistic(
     new = ((1 - p) / error_factor)[..., None] * ~onehot
     calls = p[..., None] * onehot
     new[onehot] = calls[onehot]
-
-    # zero out non-alleles
-    new[..., n_alleles[..., None] <= alleles] = 0
-
-    # normalize 
-    new /= np.nansum(new, axis=-1, keepdims=True)
-
-    # nan fill gaps and re-zero
+
+    # nan fill gaps
     new[array < 0] = np.nan
+
+    # zero out non-alleles
     new[..., n_alleles[..., None] <= alleles] = 0
+
     return new
 
 

diff --git a/mchap/encoding/probabilistic/sequence.py b/mchap/encoding/probabilistic/sequence.py