Negative binomial fixes (#109)
* Fix homozygous log likelihood calculation

* Use built-in logpmf instead of math.log to stop math domain errors

* Force use negative binomial

* Fix rounding errors causing domain errors from trying log(0)

* Always genotype using negative binomial; disable het calling

* remove duplicated code

* update to use cluster_vcf_records 0.13.1 and vcflib from apt install

* add gramtools to dependencies; update vcflib description

* Install cluster_vcf_records version 0.13.1
martinghunt authored Jan 21, 2021
1 parent ccbe2cc commit 0b8920b
Showing 19 changed files with 115 additions and 391 deletions.
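The commit message's points about `logpmf` and `math domain error`s describe the same failure mode: `nbinom.pmf` underflows to exactly 0.0 for coverages far from the mean (and rounding can produce a 0 elsewhere), and `math.log(0)` then raises `ValueError: math domain error`. `scipy.stats.nbinom.logpmf` works in log space and stays finite. A minimal sketch of the difference, with illustrative parameters rather than values taken from the code:

```python
import math
from scipy.stats import nbinom

# Parameters as the genotyper derives them for mean depth 10, variance 20:
# no_of_successes = mean^2 / (variance - mean) = 10, prob_of_success = mean / variance = 0.5
n, p = 10, 0.5

extreme_coverage = 2000                       # far above the mean, so the pmf underflows
print(nbinom.pmf(extreme_coverage, n, p))     # 0.0
print(nbinom.logpmf(extreme_coverage, n, p))  # finite negative number

try:
    math.log(nbinom.pmf(extreme_coverage, n, p))  # taking the log of an underflowed probability
except ValueError as error:
    print(error)                              # "math domain error"
```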
12 changes: 2 additions & 10 deletions .ci/install_dependencies.sh
@@ -26,6 +26,7 @@ apt-get install -y \
python3-pip \
python3-setuptools \
tabix \
libvcflib-tools \
wget \
zlib1g-dev

@@ -35,10 +36,7 @@ if [ ! -d $install_root ]; then
fi
cd $install_root

git clone https://github.com/iqbal-lab-org/cluster_vcf_records.git
cd cluster_vcf_records
git checkout fd41155cea23c35196ae56f66402eb8e1b287b37
pip3 install .
pip3 install 'cluster_vcf_records==0.13.1'

#_________________________ bcftools _______________________#
cd $install_root
@@ -63,12 +61,6 @@ make
cd ..
cp -s vt-git/vt .

#______________________vcflib _________________________________#
cd $install_root
git clone --recursive https://github.com/vcflib/vcflib.git
cd vcflib
make

# Why six>=1.14.0?
# See https://github.com/pypa/virtualenv/issues/1551
pip3 install tox "six>=1.14.0"
2 changes: 1 addition & 1 deletion .travis.yml
@@ -7,7 +7,7 @@ python:
install: sudo ./.ci/install_dependencies.sh $HOME/tools

before_script:
- export PATH=$HOME/tools:$HOME/tools/vcflib/bin:$PATH
- export PATH=$HOME/tools:$PATH
# the install script makes $HOME/.nextflow, owned by root with permissions
# that make the tests fail because when nextflow runs, it tries to write
# there
3 changes: 3 additions & 0 deletions README.md
@@ -9,8 +9,11 @@ Variant call adjudication.
Dependencies:

* Python 3 (tested on version 3.6.9)
* [gramtools](https://github.com/iqbal-lab-org/gramtools) commit
9313eceb606a6fc159e4a14c168b7a6f888c5ed2
* [vt](https://github.com/atks/vt.git)
* [vcflib](https://github.com/vcflib/vcflib.git). Specifically,
either `vcflib`, or all three of
`vcfbreakmulti`, `vcfallelicprimitives`, and `vcfuniq` must be installed.
* Optionally, [nextflow](https://www.nextflow.io/), if you want to use the
pipeline to regenotype a large number of samples.
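The updated README entry above accepts either the combined `vcflib` binary or all three of `vcfbreakmulti`, `vcfallelicprimitives`, and `vcfuniq`. A rough way to check that requirement locally, using only the standard library (this helper is illustrative and not part of minos):

```python
import shutil

def vcflib_requirement_met():
    """True if either `vcflib` or all three individual tools are on PATH."""
    if shutil.which("vcflib") is not None:
        return True
    tools = ("vcfbreakmulti", "vcfallelicprimitives", "vcfuniq")
    return all(shutil.which(tool) is not None for tool in tools)

print(vcflib_requirement_met())
```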
4 changes: 2 additions & 2 deletions Singularity.def
@@ -3,7 +3,7 @@ OSVersion: bionic
MirrorURL: http://us.archive.ubuntu.com/ubuntu/

%environment
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
export PATH=/bioinf-tools/:$PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LANG=C.UTF-8

@@ -15,7 +15,7 @@ export LANG=C.UTF-8

%post
#_____________________ setup $PATH _______________________#
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
export PATH=/bioinf-tools/:$PATH
export LANG=C.UTF-8

/minos/install_dependencies.sh /bioinf-tools
3 changes: 2 additions & 1 deletion minos/__main__.py
@@ -115,7 +115,8 @@ def main(args=None):
subparser_adjudicate.add_argument(
"--include_het_calls",
action="store_true",
help="Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.",
#help="NOT IMPLEMENTED! Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.",
help=argparse.SUPPRESS,
)
subparser_adjudicate.add_argument("outdir", help="Name of output directory")
subparser_adjudicate.add_argument(
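For the `--include_het_calls` change above: `help=argparse.SUPPRESS` keeps the option parseable but hides it from the `--help` output, which is the usual argparse way to retire a flag without breaking existing command lines. A small standalone demonstration (the parser below is illustrative, not the minos one):

```python
import argparse

parser = argparse.ArgumentParser(prog="demo")
parser.add_argument("--visible_flag", action="store_true", help="Shown in --help")
parser.add_argument("--hidden_flag", action="store_true", help=argparse.SUPPRESS)

parser.print_help()                           # only --visible_flag is listed
args = parser.parse_args(["--hidden_flag"])   # the hidden flag still parses
print(args.hidden_flag)                       # True
```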
2 changes: 2 additions & 0 deletions minos/adjudicator.py
@@ -117,6 +117,8 @@ def __init__(
self.filter_min_gcp = filter_min_gcp
self.filter_min_frs = filter_min_frs
self.call_hets = call_hets
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")
self.debug = debug
self.cluster_input_vcfs = cluster_input_vcfs
self.ref_seq_lengths = {
14 changes: 12 additions & 2 deletions minos/dependencies.py
@@ -4,6 +4,7 @@
import shutil

import pyfastaq
from cluster_vcf_records import utils as cluster_utils

from minos import utils, __version__

@@ -50,7 +51,7 @@ def get_version_of_program(program, binary=None, allow_fail=False):
return None
return version
return None
elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]:
elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib"]:
return "Unknown"
else:
raise Exception(
@@ -102,8 +103,17 @@ def find_binaries_and_versions(programs=None):
"vt",
]

try:
vcflib_binaries = cluster_utils._get_vcflib_binaries()
except:
vcflib_binaries = {x: None for x in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]}

for program in programs:
binary = find_binary(program, allow_fail=True)
if program in vcflib_binaries:
binary = vcflib_binaries[program]
else:
binary = find_binary(program, allow_fail=True)

if binary is None:
binary = "NOT_FOUND"
version = "NOT_FOUND"
11 changes: 5 additions & 6 deletions minos/genotype_confidence_simulator.py
@@ -24,6 +24,8 @@ def __init__(
self.min_conf_score = None
self.max_conf_score = None
self.call_hets = call_hets
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")

@classmethod
def _simulate_confidence_scores(
@@ -48,12 +50,9 @@ def _simulate_confidence_scores(
)

while i < iterations:
if gtyper.use_nbinom:
correct_coverage = np.random.negative_binomial(
gtyper.no_of_successes, gtyper.prob_of_success
)
else:
correct_coverage = np.random.poisson(mean_depth)
correct_coverage = np.random.negative_binomial(
gtyper.no_of_successes, gtyper.prob_of_success
)

incorrect_coverage = np.random.binomial(mean_depth, error_rate)
if correct_coverage + incorrect_coverage == 0:
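With the Poisson branch removed, the simulator always draws the correct-allele coverage from a negative binomial whose parameters come from the mean and variance, exactly as `genotyper.py` derives them. A self-contained sketch of that sampling step, assuming numpy; the function and variable names are illustrative:

```python
import numpy as np

def simulate_coverage_pairs(mean_depth, depth_variance, error_rate, iterations=5, seed=42):
    """Draw (correct, incorrect) coverage pairs as in the confidence simulation."""
    # Negative binomial parameters as in genotyper.py:
    #   no_of_successes = mean^2 / (variance - mean)
    #   prob_of_success = 1 - (variance - mean) / variance = mean / variance
    no_of_successes = mean_depth ** 2 / (depth_variance - mean_depth)
    prob_of_success = mean_depth / depth_variance
    rng = np.random.default_rng(seed)
    for _ in range(iterations):
        correct = rng.negative_binomial(no_of_successes, prob_of_success)
        incorrect = rng.binomial(mean_depth, error_rate)
        yield correct, incorrect

print(list(simulate_coverage_pairs(mean_depth=30, depth_variance=60, error_rate=0.005)))
```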
182 changes: 26 additions & 156 deletions minos/genotyper.py
@@ -3,7 +3,7 @@
import math
import operator

from scipy.stats import nbinom, poisson
from scipy.stats import nbinom


class Genotyper:
@@ -13,68 +13,50 @@ def __init__(
depth_variance,
error_rate,
call_hets=False,
force_poisson=False,
):
self.mean_depth = mean_depth
self.depth_variance = depth_variance
self.error_rate = error_rate
self.call_hets = call_hets
self.force_poisson = force_poisson
self._determine_poisson_or_nbinom()
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")
self._set_nbinom_parameters()
self._set_min_cov_more_than_error()
self._init_alleles_and_genotypes()
logging.info(
f"Genotyper. use_het_model={call_hets}, mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, using_nbinom={self.use_nbinom}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}"
f"Genotyper parameters: mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}"
)

def _determine_poisson_or_nbinom(self):
# We can only use nbinom if variance > depth. Otherwise fall back on
# poisson model, which is likely to be worse. Should only happen when
# read depth is very low
if self.force_poisson:
self.use_nbinom = False
else:
self.use_nbinom = self.depth_variance > self.mean_depth
def _set_nbinom_parameters(self):
# We can only use nbinom if variance > depth. Otherwise, force it by
# setting variance = 2 * depth. This is very rare - only seen it on
# simulated data, never on real data.
if self.depth_variance <= self.mean_depth:
logging.warning(f"Mean read depth ({self.mean_depth}) > depth variance ({self.depth_variance}) . Setting variance = 2 * mean for genotype model")
self.depth_variance = 2 * self.mean_depth

if self.use_nbinom:
if self.mean_depth == 0:
self.no_of_successes = 0
self.prob_of_success = 0
else:
self.no_of_successes = (self.mean_depth ** 2) / (
self.depth_variance - self.mean_depth
)
self.prob_of_success = (
1 - (self.depth_variance - self.mean_depth) / self.depth_variance
)
self.half_no_of_successes = ((0.5 * self.mean_depth) ** 2) / (
self.depth_variance - (0.5 * self.mean_depth)
)
self.half_prob_of_success = (
1
- (self.depth_variance - (0.5 * self.mean_depth)) / self.depth_variance
)
else:
self.no_of_successes = None
self.prob_of_success = None
self.half_no_of_successes = None
self.half_prob_of_success = None

def _nbinom_or_poisson_pmf(self, value, half_mean=False):
if self.use_nbinom:
if half_mean:
return nbinom.pmf(
value, self.half_no_of_successes, self.half_prob_of_success
)
else:
return nbinom.pmf(value, self.no_of_successes, self.prob_of_success)
def _nbinom_pmf(self, value, log=False):
if log:
return nbinom.logpmf(value, self.no_of_successes, self.prob_of_success)
else:
if half_mean:
return poisson.pmf(value, 0.5 * self.mean_depth)
else:
return poisson.pmf(value, self.mean_depth)
return nbinom.pmf(value, self.no_of_successes, self.prob_of_success)

def _set_min_cov_more_than_error(self):
self.min_cov_more_than_error = 0
if self.mean_depth == 0:
return
while self._nbinom_or_poisson_pmf(self.min_cov_more_than_error) <= pow(
while self._nbinom_pmf(self.min_cov_more_than_error) <= pow(
self.error_rate, self.min_cov_more_than_error
):
self.min_cov_more_than_error += 1
@@ -125,90 +107,14 @@ def _haploid_allele_coverages(
haploid_allele_coverages[allele] += coverage
return haploid_allele_coverages

@classmethod
def _coverage_of_diploid_alleles(
cls, allele1, allele2, allele_combination_cov, allele_groups_dict,
):
"""
Collects coverage on each of two alleles.
Define specific coverage for one allele as the sum of coverages for equivalence classes involving that
allele but not the other one.
When an equivalence class contains both alleles, the coverage on it is dispatched
to each allele proportionally to how much specific coverage each has.
"""
shared_cov = 0
allele1_total_cov, allele2_total_cov = 0, 0

for allele_key, coverage in allele_combination_cov.items():
assert coverage >= 0
allele_combination = allele_groups_dict[allele_key]
has_allele_1 = allele1 in allele_combination
has_allele_2 = allele2 in allele_combination
if has_allele_1 and has_allele_2:
shared_cov += coverage
elif has_allele_1:
allele1_total_cov += coverage
elif has_allele_2:
allele2_total_cov += coverage

## Perform the dispatching in ambiguous equiv classes ##
if (
allele1_total_cov == 0 and allele2_total_cov == 0
): # If both are zero, do equal dispatching
allele1_belonging = 0.5
else:
allele1_belonging = allele1_total_cov / (
allele1_total_cov + allele2_total_cov
)

allele1_total_cov += allele1_belonging * shared_cov
allele2_total_cov += (1 - allele1_belonging) * shared_cov
return allele1_total_cov, allele2_total_cov

def _log_likelihood_homozygous(
self, allele_depth, total_depth, allele_length, non_zeros
):
return sum(
[
-self.mean_depth * (1 + (allele_length - non_zeros) / allele_length),
allele_depth * math.log(self.mean_depth),
-math.lgamma(allele_depth + 1),
return sum([
self._nbinom_pmf(allele_depth, log=True),
(total_depth - allele_depth) * math.log(self.error_rate),
non_zeros
* math.log(1 - self._nbinom_or_poisson_pmf(0))
/ allele_length,
]
)

def _log_likelihood_heterozygous(
self,
allele_depth1,
allele_depth2,
total_depth,
allele_length1,
allele_length2,
non_zeros1,
non_zeros2,
):
return sum(
[
-self.mean_depth
* (
1
+ 0.5
* (
(1 - (non_zeros1 / allele_length1))
+ (1 - (non_zeros2 / allele_length2))
)
),
(allele_depth1 + allele_depth2) * math.log(0.5 * self.mean_depth),
-math.lgamma(allele_depth1 + 1),
-math.lgamma(allele_depth2 + 1),
(total_depth - allele_depth1 - allele_depth2)
* math.log(self.error_rate),
((non_zeros1 / allele_length1) + (non_zeros2 / allele_length2))
* math.log(1 - self._nbinom_or_poisson_pmf(0, half_mean=True)),
math.log(1 - self._nbinom_pmf(0)) * non_zeros / allele_length,
self._nbinom_pmf(0, log=True) * (allele_length - non_zeros) / allele_length,
]
)

@@ -255,46 +161,10 @@ def _calculate_log_likelihoods(self):
self.allele_combination_cov, self.allele_groups_dict
)

if not self.call_hets:
self.likelihoods.sort(key=operator.itemgetter(1), reverse=True)
logging.debug(f"likelihoods: {self.likelihoods}")
return

for (allele_number1, allele_number2) in itertools.combinations(
self.singleton_allele_coverages.keys(), 2
):
allele1_depth, allele2_depth = Genotyper._coverage_of_diploid_alleles(
allele_number1,
allele_number2,
self.allele_combination_cov,
self.allele_groups_dict,
)
allele1_length = len(self.allele_per_base_cov[allele_number1])
allele2_length = len(self.allele_per_base_cov[allele_number2])
non_zeros1 = non_zeros_per_allele[allele_number1]
non_zeros2 = non_zeros_per_allele[allele_number2]

log_likelihood = self._log_likelihood_heterozygous(
allele1_depth,
allele2_depth,
total_depth,
allele1_length,
allele2_length,
non_zeros1,
non_zeros2,
)
frac_support = (
0
if total_depth == 0
else round((allele1_depth + allele2_depth) / total_depth, 4)
)
self.likelihoods.append(
(set([allele_number1, allele_number2]), log_likelihood, frac_support)
)

self.likelihoods.sort(key=operator.itemgetter(1), reverse=True)
logging.debug(f"likelihoods: {self.likelihoods}")


def run(self, allele_combination_cov, allele_per_base_cov, allele_groups_dict):
self._init_alleles_and_genotypes(
allele_combination_cov=allele_combination_cov,
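Taken together, the visible additions to `_log_likelihood_homozygous` amount to four log-space terms. A standalone sketch with the negative binomial parameters passed in explicitly (not a drop-in for the class method, but the same shape of calculation):

```python
import math
from scipy.stats import nbinom

def log_likelihood_homozygous(allele_depth, total_depth, allele_length, non_zeros,
                              n, p, error_rate):
    """Homozygous log-likelihood built from nbinom log-probabilities."""
    return sum([
        nbinom.logpmf(allele_depth, n, p),                     # depth observed on this allele
        (total_depth - allele_depth) * math.log(error_rate),   # remaining reads treated as errors
        math.log(1 - nbinom.pmf(0, n, p)) * non_zeros / allele_length,         # positions with coverage
        nbinom.logpmf(0, n, p) * (allele_length - non_zeros) / allele_length,  # positions without coverage
    ])

# Example with mean depth 30, variance 60, so n = 30 and p = 0.5:
print(log_likelihood_homozygous(allele_depth=28, total_depth=30, allele_length=5,
                                non_zeros=5, n=30, p=0.5, error_rate=0.005))
```

The per-read term uses `logpmf` directly, so an underflowing pmf no longer turns into `log(0)` and a domain error.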