Skip to content

Commit

Permalink
Merge pull request #59 from timothymillar/f/llk-encoding
Browse files Browse the repository at this point in the history
v0.2.0
  • Loading branch information
timothymillar authored Oct 6, 2020
2 parents 02efedc + 6504ec4 commit fd3b35b
Show file tree
Hide file tree
Showing 35 changed files with 278 additions and 635 deletions.
20 changes: 15 additions & 5 deletions mchap/application/assemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import multiprocessing as mp

from mchap.assemble.mcmc.denovo import DenovoMCMC
from mchap.encoding import symbolic
from mchap.encoding import character, integer
from mchap.io import \
read_bed4, \
extract_sample_ids, \
Expand Down Expand Up @@ -413,12 +413,15 @@ def header(self):
vcf.formatfields.GQ,
vcf.formatfields.PHQ,
vcf.formatfields.DP,
vcf.formatfields.RC,
vcf.formatfields.RCOUNT,
vcf.formatfields.RCALLS,
vcf.formatfields.MEC,
vcf.formatfields.FT,
vcf.formatfields.GPM,
vcf.formatfields.PPM,
vcf.formatfields.MPGP,
vcf.formatfields.MPED,
vcf.formatfields.RASSIGN,
)

vcf_header = vcf.VCFHeader(
Expand Down Expand Up @@ -454,7 +457,7 @@ def _assemble_locus(self, header, locus):

# process read data
read_count = read_chars.shape[0]
read_depth = symbolic.depth(read_chars)
read_depth = character.depth(read_chars)
read_chars, read_quals = add_nan_read_if_empty(locus, read_chars, read_quals)
read_calls = encode_read_alleles(locus, read_chars)
read_dists = encode_read_distributions(
Expand Down Expand Up @@ -497,11 +500,14 @@ def _assemble_locus(self, header, locus):
sample_data[sample].update({
'GPM': vcf.formatfields.probabilities(genotype[1], self.precision),
'PPM': vcf.formatfields.probabilities(phenotype[1].sum(), self.precision),
'RC': read_count,
'RCOUNT': read_count,
'RCALLS': np.sum(read_calls >= 0),
'DP': vcf.formatfields.haplotype_depth(read_depth),
'GQ': vcf.formatfields.quality(genotype[1]),
'PHQ': vcf.formatfields.quality(phenotype[1].sum()),
'FT': filterset
'MEC': integer.minimum_error_correction(read_calls, genotype[0]).sum(),
'FT': filterset,
'RASSIGN': np.round(integer.read_assignment(read_calls, genotype[0]).sum(axis=0), 1),
})

# Null out the genotype and phenotype arrays
Expand Down Expand Up @@ -546,6 +552,10 @@ def _assemble_locus(self, header, locus):
_, probs, dosage = labeler.label_phenotype_posterior(*phenotype, expected_dosage=True)
sample_data[sample]['MPGP'] = vcf.formatfields.probabilities(probs, self.precision)
sample_data[sample]['MPED'] = vcf.formatfields.probabilities(dosage, self.precision)

# sort the read-count assignment
idx = labeler.argsort(genotype[0])
sample_data[sample]['RASSIGN'] = list(sample_data[sample]['RASSIGN'][idx])

# construct vcf record
return vcf.VCFRecord(
Expand Down
4 changes: 2 additions & 2 deletions mchap/assemble/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from dataclasses import dataclass

from mchap import mset
from mchap.encoding import allelic
from mchap.encoding import integer


@dataclass
Expand Down Expand Up @@ -157,7 +157,7 @@ def __post_init__(self):
n_chains, n_steps = self.genotypes.shape[0:2]
for c in range(n_chains):
for i in range(n_steps):
self.genotypes[c, i] = allelic.sort(self.genotypes[c, i])
self.genotypes[c, i] = integer.sort(self.genotypes[c, i])


def burn(self, n):
Expand Down
4 changes: 2 additions & 2 deletions mchap/assemble/inheritence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from collections import Counter as _Counter

from mchap import mset
from mchap.encoding import allelic
from mchap.encoding import integer


def gamete_probabilities(genotypes,
Expand Down Expand Up @@ -132,7 +132,7 @@ def cross_probabilities(maternal_gametes,
genotype[idx] = hap
idx += 1

genotype = allelic.sort(genotype)
genotype = integer.sort(genotype)
string = genotype.tostring()
if string not in string_to_genotype:
string_to_genotype[string] = genotype
Expand Down
17 changes: 10 additions & 7 deletions mchap/assemble/mcmc/denovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
from dataclasses import dataclass

from mchap import mset
from mchap.encoding import allelic
from mchap.encoding import probabilistic
from mchap.assemble.mcmc.step import mutation, structural
from mchap.assemble.likelihood import log_likelihood
from mchap.assemble import util, complexity
Expand Down Expand Up @@ -163,11 +161,8 @@ def _mcmc(self, reads, initial=None):

# set the initial genotype
if initial is None:
# for each haplotype, take a random sample of mean of reads
var_dist = _read_mean_dist(reads_het)
genotype = np.empty((self.ploidy, n_het_base), dtype=np.int8)
for i in range(self.ploidy):
genotype[i] = probabilistic.sample_alleles(var_dist)
dist = _read_mean_dist(reads_het)
genotype = np.array([util.sample_alleles(dist) for _ in range(self.ploidy)])
else:
# use the provided array
assert initial.shape == (self.ploidy, n_het_base)
Expand Down Expand Up @@ -331,6 +326,10 @@ def _read_mean_dist(reads):
-------
mean_read : ndarray, float, shape (n_positions, max_allele)
The mean probabilities of input reads.
Notes
-----
If read distributions are normalized if they do not sum to 1.
"""
# work around to avoid nan values caused by gaps
Expand All @@ -346,6 +345,10 @@ def _read_mean_dist(reads):
n_alleles = np.sum(~np.all(reads == 0, axis=0), axis=1, keepdims=True)
fill = 1 / np.tile(n_alleles, (1, reads.shape[-1]))
dist[gaps] = fill[gaps]

# normalize
dist /= dist.sum(axis=-1, keepdims=True)

return dist


Expand Down
37 changes: 37 additions & 0 deletions mchap/assemble/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,43 @@ def count_equivalent_permutations(dosage):
return numerator // denominator


@numba.njit
def sample_alleles(array, dtype=np.int8):
"""Sample a random set of alleles from probabilities.
Parameters
----------
array : ndarray, float, shape (n_read, n_base, max_allele)
Probability of each allele at each base position
dtype : type
Numpy dtype of alleles.
Returns
-------
alleles : ndarray, int, shape (n_read, n_base)
Sampled alleles
Notes
-----
Does not handle gaps (nan values).
"""
# normalize
shape = array.shape[0:-1]

array = array.reshape(-1, array.shape[-1])
sums = np.sum(array, axis=-1)
dists = array / np.expand_dims(sums, -1)
n = len(dists)

alleles = np.empty(n, dtype=dtype)

for i in range(n):
alleles[i] = random_choice(dists[i])

return alleles.reshape(shape)


@numba.njit
def seed_numba(seed):
"""Set numba random seed
Expand Down
3 changes: 0 additions & 3 deletions mchap/encoding/allelic/__init__.py

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .sequence import *
from .transcode import *
from .kmer import *
from .stats import *
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,9 @@ def as_probabilistic(
(the number of possible non-called alleles) to get the probability of
each of the non-called alleles.
If an position is constrained to have fewer than the possible number
of alternate alleles (e.g. a bi-allelic contstraint) using the
n_alleles argument then the probability of the alternate alleles
is calculated for all possible alleles before removing the
non-allowed alleles an re-normalizing the remaining probabilities.
This can result in the called allele having a probability of greater
than p.
of alternate alleles (e.g. a bi-allelic constraint) using the
n_alleles argument then the probability across all remaining alleles
will sum to less than 1.
"""
# check inputs
array = np.array(array, copy=False)
Expand All @@ -65,16 +62,13 @@ def as_probabilistic(
new = ((1 - p) / error_factor)[..., None] * ~onehot
calls = p[..., None] * onehot
new[onehot] = calls[onehot]

# zero out non-alleles
new[..., n_alleles[..., None] <= alleles] = 0

# normalize
new /= np.nansum(new, axis=-1, keepdims=True)

# nan fill gaps and re-zero

# nan fill gaps
new[array < 0] = np.nan

# zero out non-alleles
new[..., n_alleles[..., None] <= alleles] = 0

return new


Expand Down
150 changes: 0 additions & 150 deletions mchap/encoding/probabilistic/sequence.py

This file was deleted.

Loading

0 comments on commit fd3b35b

Please sign in to comment.