Negative binomial fixes (#109)
* Fix homozygous log likelihood calculation

* Use built-in logpmf instead of math.log to stop math domain errors

* Force use negative binomial

* Fix rounding errors causing domain errors from trying log(0)

* Always genotype using negative binomial; disable het calling

* remove duplicated code

* update to use cluster_vcf_records 0.13.1 and vcflib from apt install

* add gramtools to dependencies; update vcflib description

* Install cluster_vcf_records version 0.13.1
martinghunt authored Jan 21, 2021
1 parent ccbe2cc commit 0b8920b
Showing 19 changed files with 115 additions and 391 deletions.
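The commit message's points about `logpmf` and `math domain error`s describe the same failure mode: `nbinom.pmf` underflows to exactly 0.0 for coverages far from the mean (and rounding can produce a 0 elsewhere), and `math.log(0)` then raises `ValueError: math domain error`. `scipy.stats.nbinom.logpmf` works in log space and stays finite. A minimal sketch of the difference, with illustrative parameters rather than values taken from the code:

```python
import math
from scipy.stats import nbinom

# Parameters as the genotyper derives them for mean depth 10, variance 20:
# no_of_successes = mean^2 / (variance - mean) = 10, prob_of_success = mean / variance = 0.5
n, p = 10, 0.5

extreme_coverage = 2000                       # far above the mean, so the pmf underflows
print(nbinom.pmf(extreme_coverage, n, p))     # 0.0
print(nbinom.logpmf(extreme_coverage, n, p))  # finite negative number

try:
    math.log(nbinom.pmf(extreme_coverage, n, p))  # taking the log of an underflowed probability
except ValueError as error:
    print(error)                              # "math domain error"
```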
12 changes: 2 additions & 10 deletions .ci/install_dependencies.sh
@@ -26,6 +26,7 @@ apt-get install -y \
python3-pip \
python3-setuptools \
tabix \
libvcflib-tools \
wget \
zlib1g-dev

@@ -35,10 +36,7 @@ if [ ! -d $install_root ]; then
fi
cd $install_root

git clone https://github.com/iqbal-lab-org/cluster_vcf_records.git
cd cluster_vcf_records
git checkout fd41155cea23c35196ae56f66402eb8e1b287b37
pip3 install .
pip3 install 'cluster_vcf_records==0.13.1'

#_________________________ bcftools _______________________#
cd $install_root
@@ -63,12 +61,6 @@ make
cd ..
cp -s vt-git/vt .

#______________________vcflib _________________________________#
cd $install_root
git clone --recursive https://github.com/vcflib/vcflib.git
cd vcflib
make

# Why six>=1.14.0?
# See https://github.com/pypa/virtualenv/issues/1551
pip3 install tox "six>=1.14.0"
2 changes: 1 addition & 1 deletion .travis.yml
@@ -7,7 +7,7 @@ python:
install: sudo ./.ci/install_dependencies.sh $HOME/tools

before_script:
- export PATH=$HOME/tools:$HOME/tools/vcflib/bin:$PATH
- export PATH=$HOME/tools:$PATH
# the install script makes $HOME/.nextflow, owned by root with permissions
# that make the tests fail because when nextflow runs, it tries to write
# there
3 changes: 3 additions & 0 deletions README.md
@@ -9,8 +9,11 @@ Variant call adjudication.
Dependencies:

* Python 3 (tested on version 3.6.9)
* [gramtools](https://github.com/iqbal-lab-org/gramtools) commit
9313eceb606a6fc159e4a14c168b7a6f888c5ed2
* [vt](https://github.com/atks/vt.git)
* [vcflib](https://github.com/vcflib/vcflib.git). Specifically,
either `vcflib`, or all three of
`vcfbreakmulti`, `vcfallelicprimitives`, and `vcfuniq` must be installed.
* Optionally, [nextflow](https://www.nextflow.io/), if you want to use the
pipeline to regenotype a large number of samples.
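The updated README entry above accepts either the combined `vcflib` binary or all three of `vcfbreakmulti`, `vcfallelicprimitives`, and `vcfuniq`. A rough way to check that requirement locally, using only the standard library (this helper is illustrative and not part of minos):

```python
import shutil

def vcflib_requirement_met():
    """True if either `vcflib` or all three individual tools are on PATH."""
    if shutil.which("vcflib") is not None:
        return True
    tools = ("vcfbreakmulti", "vcfallelicprimitives", "vcfuniq")
    return all(shutil.which(tool) is not None for tool in tools)

print(vcflib_requirement_met())
```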
4 changes: 2 additions & 2 deletions Singularity.def
@@ -3,7 +3,7 @@ OSVersion: bionic
MirrorURL: http://us.archive.ubuntu.com/ubuntu/

%environment
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
export PATH=/bioinf-tools/:$PATH
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
export LANG=C.UTF-8

@@ -15,7 +15,7 @@ export LANG=C.UTF-8

%post
#_____________________ setup $PATH _______________________#
export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH
export PATH=/bioinf-tools/:$PATH
export LANG=C.UTF-8

/minos/install_dependencies.sh /bioinf-tools
3 changes: 2 additions & 1 deletion minos/__main__.py
@@ -115,7 +115,8 @@ def main(args=None):
subparser_adjudicate.add_argument(
"--include_het_calls",
action="store_true",
help="Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.",
#help="NOT IMPLEMENTED! Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.",
help=argparse.SUPPRESS,
)
subparser_adjudicate.add_argument("outdir", help="Name of output directory")
subparser_adjudicate.add_argument(
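For the `--include_het_calls` change above: `help=argparse.SUPPRESS` keeps the option parseable but hides it from the `--help` output, which is the usual argparse way to retire a flag without breaking existing command lines. A small standalone demonstration (the parser below is illustrative, not the minos one):

```python
import argparse

parser = argparse.ArgumentParser(prog="demo")
parser.add_argument("--visible_flag", action="store_true", help="Shown in --help")
parser.add_argument("--hidden_flag", action="store_true", help=argparse.SUPPRESS)

parser.print_help()                           # only --visible_flag is listed
args = parser.parse_args(["--hidden_flag"])   # the hidden flag still parses
print(args.hidden_flag)                       # True
```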
2 changes: 2 additions & 0 deletions minos/adjudicator.py
@@ -117,6 +117,8 @@ def __init__(
self.filter_min_gcp = filter_min_gcp
self.filter_min_frs = filter_min_frs
self.call_hets = call_hets
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")
self.debug = debug
self.cluster_input_vcfs = cluster_input_vcfs
self.ref_seq_lengths = {
14 changes: 12 additions & 2 deletions minos/dependencies.py
@@ -4,6 +4,7 @@
import shutil

import pyfastaq
from cluster_vcf_records import utils as cluster_utils

from minos import utils, __version__

@@ -50,7 +51,7 @@ def get_version_of_program(program, binary=None, allow_fail=False):
return None
return version
return None
elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]:
elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib"]:
return "Unknown"
else:
raise Exception(
@@ -102,8 +103,17 @@ def find_binaries_and_versions(programs=None):
"vt",
]

try:
vcflib_binaries = cluster_utils._get_vcflib_binaries()
except:
vcflib_binaries = {x: None for x in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]}

for program in programs:
binary = find_binary(program, allow_fail=True)
if program in vcflib_binaries:
binary = vcflib_binaries[program]
else:
binary = find_binary(program, allow_fail=True)

if binary is None:
binary = "NOT_FOUND"
version = "NOT_FOUND"
11 changes: 5 additions & 6 deletions minos/genotype_confidence_simulator.py
@@ -24,6 +24,8 @@ def __init__(
self.min_conf_score = None
self.max_conf_score = None
self.call_hets = call_hets
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")

@classmethod
def _simulate_confidence_scores(
@@ -48,12 +50,9 @@ def _simulate_confidence_scores(
)

while i < iterations:
if gtyper.use_nbinom:
correct_coverage = np.random.negative_binomial(
gtyper.no_of_successes, gtyper.prob_of_success
)
else:
correct_coverage = np.random.poisson(mean_depth)
correct_coverage = np.random.negative_binomial(
gtyper.no_of_successes, gtyper.prob_of_success
)

incorrect_coverage = np.random.binomial(mean_depth, error_rate)
if correct_coverage + incorrect_coverage == 0:
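With the Poisson branch removed, the simulator always draws the correct-allele coverage from a negative binomial whose parameters come from the mean and variance, exactly as `genotyper.py` derives them. A self-contained sketch of that sampling step, assuming numpy; the function and variable names are illustrative:

```python
import numpy as np

def simulate_coverage_pairs(mean_depth, depth_variance, error_rate, iterations=5, seed=42):
    """Draw (correct, incorrect) coverage pairs as in the confidence simulation."""
    # Negative binomial parameters as in genotyper.py:
    #   no_of_successes = mean^2 / (variance - mean)
    #   prob_of_success = 1 - (variance - mean) / variance = mean / variance
    no_of_successes = mean_depth ** 2 / (depth_variance - mean_depth)
    prob_of_success = mean_depth / depth_variance
    rng = np.random.default_rng(seed)
    for _ in range(iterations):
        correct = rng.negative_binomial(no_of_successes, prob_of_success)
        incorrect = rng.binomial(mean_depth, error_rate)
        yield correct, incorrect

print(list(simulate_coverage_pairs(mean_depth=30, depth_variance=60, error_rate=0.005)))
```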
182 changes: 26 additions & 156 deletions minos/genotyper.py
@@ -3,7 +3,7 @@
import math
import operator

from scipy.stats import nbinom, poisson
from scipy.stats import nbinom


class Genotyper:
@@ -13,68 +13,50 @@ def __init__(
depth_variance,
error_rate,
call_hets=False,
force_poisson=False,
):
self.mean_depth = mean_depth
self.depth_variance = depth_variance
self.error_rate = error_rate
self.call_hets = call_hets
self.force_poisson = force_poisson
self._determine_poisson_or_nbinom()
if self.call_hets:
raise NotImplementedError("Heterozygous calling is not implemented")
self._set_nbinom_parameters()
self._set_min_cov_more_than_error()
self._init_alleles_and_genotypes()
logging.info(
f"Genotyper. use_het_model={call_hets}, mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, using_nbinom={self.use_nbinom}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}"
f"Genotyper parameters: mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}"
)

def _determine_poisson_or_nbinom(self):
# We can only use nbinom if variance > depth. Otherwise fall back on
# poisson model, which is likely to be worse. Should only happen when
# read depth is very low
if self.force_poisson:
self.use_nbinom = False
else:
self.use_nbinom = self.depth_variance > self.mean_depth
def _set_nbinom_parameters(self):
# We can only use nbinom if variance > depth. Otherwise, force it by
# setting variance = 2 * depth. This is very rare - only seen it on
# simulated data, never on real data.
if self.depth_variance <= self.mean_depth:
logging.warning(f"Mean read depth ({self.mean_depth}) > depth variance ({self.depth_variance}) . Setting variance = 2 * mean for genotype model")
self.depth_variance = 2 * self.mean_depth

if self.use_nbinom:
if self.mean_depth == 0:
self.no_of_successes = 0
self.prob_of_success = 0
else:
self.no_of_successes = (self.mean_depth ** 2) / (
self.depth_variance - self.mean_depth
)
self.prob_of_success = (
1 - (self.depth_variance - self.mean_depth) / self.depth_variance
)
self.half_no_of_successes = ((0.5 * self.mean_depth) ** 2) / (
self.depth_variance - (0.5 * self.mean_depth)
)
self.half_prob_of_success = (
1
- (self.depth_variance - (0.5 * self.mean_depth)) / self.depth_variance
)
else:
self.no_of_successes = None
self.prob_of_success = None
self.half_no_of_successes = None
self.half_prob_of_success = None

def _nbinom_or_poisson_pmf(self, value, half_mean=False):
if self.use_nbinom:
if half_mean:
return nbinom.pmf(
value, self.half_no_of_successes, self.half_prob_of_success
)
else:
return nbinom.pmf(value, self.no_of_successes, self.prob_of_success)
def _nbinom_pmf(self, value, log=False):
if log:
return nbinom.logpmf(value, self.no_of_successes, self.prob_of_success)
else:
if half_mean:
return poisson.pmf(value, 0.5 * self.mean_depth)
else:
return poisson.pmf(value, self.mean_depth)
return nbinom.pmf(value, self.no_of_successes, self.prob_of_success)

def _set_min_cov_more_than_error(self):
self.min_cov_more_than_error = 0
if self.mean_depth == 0:
return
while self._nbinom_or_poisson_pmf(self.min_cov_more_than_error) <= pow(
while self._nbinom_pmf(self.min_cov_more_than_error) <= pow(
self.error_rate, self.min_cov_more_than_error
):
self.min_cov_more_than_error += 1
@@ -125,90 +107,14 @@ def _haploid_allele_coverages(
haploid_allele_coverages[allele] += coverage
return haploid_allele_coverages

@classmethod
def _coverage_of_diploid_alleles(
cls, allele1, allele2, allele_combination_cov, allele_groups_dict,
):
"""
Collects coverage on each of two alleles.
Define specific coverage for one allele as the sum of coverages for equivalence classes involving that
allele but not the other one.
When an equivalence class contains both alleles, the coverage on it is dispatched
to each allele proportionally to how much specific coverage each has.
"""
shared_cov = 0
allele1_total_cov, allele2_total_cov = 0, 0

for allele_key, coverage in allele_combination_cov.items():
assert coverage >= 0
allele_combination = allele_groups_dict[allele_key]
has_allele_1 = allele1 in allele_combination
has_allele_2 = allele2 in allele_combination
if has_allele_1 and has_allele_2:
shared_cov += coverage
elif has_allele_1:
allele1_total_cov += coverage
elif has_allele_2:
allele2_total_cov += coverage

## Perform the dispatching in ambiguous equiv classes ##
if (
allele1_total_cov == 0 and allele2_total_cov == 0
): # If both are zero, do equal dispatching
allele1_belonging = 0.5
else:
allele1_belonging = allele1_total_cov / (
allele1_total_cov + allele2_total_cov
)

allele1_total_cov += allele1_belonging * shared_cov
allele2_total_cov += (1 - allele1_belonging) * shared_cov
return allele1_total_cov, allele2_total_cov

def _log_likelihood_homozygous(
self, allele_depth, total_depth, allele_length, non_zeros
):
return sum(
[
-self.mean_depth * (1 + (allele_length - non_zeros) / allele_length),
allele_depth * math.log(self.mean_depth),
-math.lgamma(allele_depth + 1),
return sum([
self._nbinom_pmf(allele_depth, log=True),
(total_depth - allele_depth) * math.log(self.error_rate),
non_zeros
* math.log(1 - self._nbinom_or_poisson_pmf(0))
/ allele_length,
]
)

def _log_likelihood_heterozygous(
self,
allele_depth1,
allele_depth2,
total_depth,
allele_length1,
allele_length2,
non_zeros1,
non_zeros2,
):
return sum(
[
-self.mean_depth
* (
1
+ 0.5
* (
(1 - (non_zeros1 / allele_length1))
+ (1 - (non_zeros2 / allele_length2))
)
),
(allele_depth1 + allele_depth2) * math.log(0.5 * self.mean_depth),
-math.lgamma(allele_depth1 + 1),
-math.lgamma(allele_depth2 + 1),
(total_depth - allele_depth1 - allele_depth2)
* math.log(self.error_rate),
((non_zeros1 / allele_length1) + (non_zeros2 / allele_length2))
* math.log(1 - self._nbinom_or_poisson_pmf(0, half_mean=True)),
math.log(1 - self._nbinom_pmf(0)) * non_zeros / allele_length,
self._nbinom_pmf(0, log=True) * (allele_length - non_zeros) / allele_length,
]
)

@@ -255,46 +161,10 @@ def _calculate_log_likelihoods(self):
self.allele_combination_cov, self.allele_groups_dict
)

if not self.call_hets:
self.likelihoods.sort(key=operator.itemgetter(1), reverse=True)
logging.debug(f"likelihoods: {self.likelihoods}")
return

for (allele_number1, allele_number2) in itertools.combinations(
self.singleton_allele_coverages.keys(), 2
):
allele1_depth, allele2_depth = Genotyper._coverage_of_diploid_alleles(
allele_number1,
allele_number2,
self.allele_combination_cov,
self.allele_groups_dict,
)
allele1_length = len(self.allele_per_base_cov[allele_number1])
allele2_length = len(self.allele_per_base_cov[allele_number2])
non_zeros1 = non_zeros_per_allele[allele_number1]
non_zeros2 = non_zeros_per_allele[allele_number2]

log_likelihood = self._log_likelihood_heterozygous(
allele1_depth,
allele2_depth,
total_depth,
allele1_length,
allele2_length,
non_zeros1,
non_zeros2,
)
frac_support = (
0
if total_depth == 0
else round((allele1_depth + allele2_depth) / total_depth, 4)
)
self.likelihoods.append(
(set([allele_number1, allele_number2]), log_likelihood, frac_support)
)

self.likelihoods.sort(key=operator.itemgetter(1), reverse=True)
logging.debug(f"likelihoods: {self.likelihoods}")


def run(self, allele_combination_cov, allele_per_base_cov, allele_groups_dict):
self._init_alleles_and_genotypes(
allele_combination_cov=allele_combination_cov,
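Taken together, the visible additions to `_log_likelihood_homozygous` amount to four log-space terms. A standalone sketch with the negative binomial parameters passed in explicitly (not a drop-in for the class method, but the same shape of calculation):

```python
import math
from scipy.stats import nbinom

def log_likelihood_homozygous(allele_depth, total_depth, allele_length, non_zeros,
                              n, p, error_rate):
    """Homozygous log-likelihood built from nbinom log-probabilities."""
    return sum([
        nbinom.logpmf(allele_depth, n, p),                     # depth observed on this allele
        (total_depth - allele_depth) * math.log(error_rate),   # remaining reads treated as errors
        math.log(1 - nbinom.pmf(0, n, p)) * non_zeros / allele_length,         # positions with coverage
        nbinom.logpmf(0, n, p) * (allele_length - non_zeros) / allele_length,  # positions without coverage
    ])

# Example with mean depth 30, variance 60, so n = 30 and p = 0.5:
print(log_likelihood_homozygous(allele_depth=28, total_depth=30, allele_length=5,
                                non_zeros=5, n=30, p=0.5, error_rate=0.005))
```

The per-read term uses `logpmf` directly, so an underflowing pmf no longer turns into `log(0)` and a domain error.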