diff --git a/.ci/install_dependencies.sh b/.ci/install_dependencies.sh index 1677c87..d70cc77 100755 --- a/.ci/install_dependencies.sh +++ b/.ci/install_dependencies.sh @@ -26,6 +26,7 @@ apt-get install -y \ python3-pip \ python3-setuptools \ tabix \ + libvcflib-tools \ wget \ zlib1g-dev @@ -35,10 +36,7 @@ if [ ! -d $install_root ]; then fi cd $install_root -git clone https://github.com/iqbal-lab-org/cluster_vcf_records.git -cd cluster_vcf_records -git checkout fd41155cea23c35196ae56f66402eb8e1b287b37 -pip3 install . +pip3 install 'cluster_vcf_records==0.13.1' #_________________________ bcftools _______________________# cd $install_root @@ -63,12 +61,6 @@ make cd .. cp -s vt-git/vt . -#______________________vcflib _________________________________# -cd $install_root -git clone --recursive https://github.com/vcflib/vcflib.git -cd vcflib -make - # Why six>=1.14.0? # See https://github.com/pypa/virtualenv/issues/1551 pip3 install tox "six>=1.14.0" diff --git a/.travis.yml b/.travis.yml index 24c6429..db28dbc 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: install: sudo ./.ci/install_dependencies.sh $HOME/tools before_script: - - export PATH=$HOME/tools:$HOME/tools/vcflib/bin:$PATH + - export PATH=$HOME/tools:$PATH # the install script makes $HOME/.nextflow, owned by root with permissions # tha tmake the tests fail because when nextflow runs, it tries to write # there diff --git a/README.md b/README.md index aa871d2..4957087 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,11 @@ Variant call adjudication. Dependencies: * Python 3 (tested on version 3.6.9) +* [gramtools](https://github.com/iqbal-lab-org/gramtools) commit + 9313eceb606a6fc159e4a14c168b7a6f888c5ed2 * [vt](https://github.com/atks/vt.git) * [vcflib](https://github.com/vcflib/vcflib.git). Specifically, + either `vcflib`, or all three of `vcfbreakmulti`, `vcfallelicprimitives`, and `vcfuniq` must be installed. * Optionally, [nextflow](https://www.nextflow.io/), if you want to use the pipeline to regenotype a large number of samples. diff --git a/Singularity.def b/Singularity.def index d6abe11..91bb948 100644 --- a/Singularity.def +++ b/Singularity.def @@ -3,7 +3,7 @@ OSVersion: bionic MirrorURL: http://us.archive.ubuntu.com/ubuntu/ %environment -export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH +export PATH=/bioinf-tools/:$PATH export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH export LANG=C.UTF-8 @@ -15,7 +15,7 @@ export LANG=C.UTF-8 %post #_____________________ setup $PATH _______________________# - export PATH=/bioinf-tools/:/bioinf-tools/vcflib/bin:$PATH + export PATH=/bioinf-tools/:$PATH export LANG=C.UTF-8 /minos/install_dependencies.sh /bioinf-tools diff --git a/minos/__main__.py b/minos/__main__.py index 5753f01..000b512 100755 --- a/minos/__main__.py +++ b/minos/__main__.py @@ -115,7 +115,8 @@ def main(args=None): subparser_adjudicate.add_argument( "--include_het_calls", action="store_true", - help="Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.", + #help="NOT IMPLEMENTED! Consider heterozygous calls when genotyping each allele, instead of only making homozygous calls.", + help=argparse.SUPPRESS, ) subparser_adjudicate.add_argument("outdir", help="Name of output directory") subparser_adjudicate.add_argument( diff --git a/minos/adjudicator.py b/minos/adjudicator.py index 769bd41..6bc1b63 100644 --- a/minos/adjudicator.py +++ b/minos/adjudicator.py @@ -117,6 +117,8 @@ def __init__( self.filter_min_gcp = filter_min_gcp self.filter_min_frs = filter_min_frs self.call_hets = call_hets + if self.call_hets: + raise NotImplementedError("Heterozygous calling is not implemented") self.debug = debug self.cluster_input_vcfs = cluster_input_vcfs self.ref_seq_lengths = { diff --git a/minos/dependencies.py b/minos/dependencies.py index 61ba2c1..90449a5 100644 --- a/minos/dependencies.py +++ b/minos/dependencies.py @@ -4,6 +4,7 @@ import shutil import pyfastaq +from cluster_vcf_records import utils as cluster_utils from minos import utils, __version__ @@ -50,7 +51,7 @@ def get_version_of_program(program, binary=None, allow_fail=False): return None return version return None - elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]: + elif program in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq", "vcflib"]: return "Unknown" else: raise Exception( @@ -102,8 +103,17 @@ def find_binaries_and_versions(programs=None): "vt", ] + try: + vcflib_binaries = cluster_utils._get_vcflib_binaries() + except: + vcflib_binaries = {x: None for x in ["vcfbreakmulti", "vcfallelicprimitives", "vcfuniq"]} + for program in programs: - binary = find_binary(program, allow_fail=True) + if program in vcflib_binaries: + binary = vcflib_binaries[program] + else: + binary = find_binary(program, allow_fail=True) + if binary is None: binary = "NOT_FOUND" version = "NOT_FOUND" diff --git a/minos/genotype_confidence_simulator.py b/minos/genotype_confidence_simulator.py index fe6e621..78ffbd6 100644 --- a/minos/genotype_confidence_simulator.py +++ b/minos/genotype_confidence_simulator.py @@ -24,6 +24,8 @@ def __init__( self.min_conf_score = None self.max_conf_score = None self.call_hets = call_hets + if self.call_hets: + raise NotImplementedError("Heterozygous calling is not implemented") @classmethod def _simulate_confidence_scores( @@ -48,12 +50,9 @@ def _simulate_confidence_scores( ) while i < iterations: - if gtyper.use_nbinom: - correct_coverage = np.random.negative_binomial( - gtyper.no_of_successes, gtyper.prob_of_success - ) - else: - correct_coverage = np.random.poisson(mean_depth) + correct_coverage = np.random.negative_binomial( + gtyper.no_of_successes, gtyper.prob_of_success + ) incorrect_coverage = np.random.binomial(mean_depth, error_rate) if correct_coverage + incorrect_coverage == 0: diff --git a/minos/genotyper.py b/minos/genotyper.py index 807a3ad..f4d72f2 100644 --- a/minos/genotyper.py +++ b/minos/genotyper.py @@ -3,7 +3,7 @@ import math import operator -from scipy.stats import nbinom, poisson +from scipy.stats import nbinom class Genotyper: @@ -13,68 +13,50 @@ def __init__( depth_variance, error_rate, call_hets=False, - force_poisson=False, ): self.mean_depth = mean_depth self.depth_variance = depth_variance self.error_rate = error_rate self.call_hets = call_hets - self.force_poisson = force_poisson - self._determine_poisson_or_nbinom() + if self.call_hets: + raise NotImplementedError("Heterozygous calling is not implemented") + self._set_nbinom_parameters() self._set_min_cov_more_than_error() self._init_alleles_and_genotypes() logging.info( - f"Genotyper. use_het_model={call_hets}, mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, using_nbinom={self.use_nbinom}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}" + f"Genotyper parameters: mean_depth={mean_depth}, depth_variance={depth_variance}, error_rate={error_rate}, nbinom_no_of_successes={self.no_of_successes}, nbinom_prob_of_success={self.prob_of_success}, min_cov_more_than_error={self.min_cov_more_than_error}" ) - def _determine_poisson_or_nbinom(self): - # We can only use nbinom if variance > depth. Otherwise fall back on - # poisson model, which is likely to be worse. Should only happen when - # read depth is very low - if self.force_poisson: - self.use_nbinom = False - else: - self.use_nbinom = self.depth_variance > self.mean_depth + def _set_nbinom_parameters(self): + # We can only use nbinom if variance > depth. Otherwise, force it by + # setting variance = 2 * depth. This is very rare - only seen it on + # simulated data, never on real data. + if self.depth_variance <= self.mean_depth: + logging.warning(f"Mean read depth ({self.mean_depth}) > depth variance ({self.depth_variance}) . Setting variance = 2 * mean for genotype model") + self.depth_variance = 2 * self.mean_depth - if self.use_nbinom: + if self.mean_depth == 0: + self.no_of_successes = 0 + self.prob_of_success = 0 + else: self.no_of_successes = (self.mean_depth ** 2) / ( self.depth_variance - self.mean_depth ) self.prob_of_success = ( 1 - (self.depth_variance - self.mean_depth) / self.depth_variance ) - self.half_no_of_successes = ((0.5 * self.mean_depth) ** 2) / ( - self.depth_variance - (0.5 * self.mean_depth) - ) - self.half_prob_of_success = ( - 1 - - (self.depth_variance - (0.5 * self.mean_depth)) / self.depth_variance - ) - else: - self.no_of_successes = None - self.prob_of_success = None - self.half_no_of_successes = None - self.half_prob_of_success = None - def _nbinom_or_poisson_pmf(self, value, half_mean=False): - if self.use_nbinom: - if half_mean: - return nbinom.pmf( - value, self.half_no_of_successes, self.half_prob_of_success - ) - else: - return nbinom.pmf(value, self.no_of_successes, self.prob_of_success) + def _nbinom_pmf(self, value, log=False): + if log: + return nbinom.logpmf(value, self.no_of_successes, self.prob_of_success) else: - if half_mean: - return poisson.pmf(value, 0.5 * self.mean_depth) - else: - return poisson.pmf(value, self.mean_depth) + return nbinom.pmf(value, self.no_of_successes, self.prob_of_success) def _set_min_cov_more_than_error(self): self.min_cov_more_than_error = 0 if self.mean_depth == 0: return - while self._nbinom_or_poisson_pmf(self.min_cov_more_than_error) <= pow( + while self._nbinom_pmf(self.min_cov_more_than_error) <= pow( self.error_rate, self.min_cov_more_than_error ): self.min_cov_more_than_error += 1 @@ -125,90 +107,14 @@ def _haploid_allele_coverages( haploid_allele_coverages[allele] += coverage return haploid_allele_coverages - @classmethod - def _coverage_of_diploid_alleles( - cls, allele1, allele2, allele_combination_cov, allele_groups_dict, - ): - """ - Collects coverage on each of two alleles. - Define specific coverage for one allele as the sum of coverages for equivalence classes involving that - allele but not the other one. - - When an equivalence class contains both alleles, the coverage on it is dispatched - to each allele proportionally to how much specific coverage each has. - """ - shared_cov = 0 - allele1_total_cov, allele2_total_cov = 0, 0 - - for allele_key, coverage in allele_combination_cov.items(): - assert coverage >= 0 - allele_combination = allele_groups_dict[allele_key] - has_allele_1 = allele1 in allele_combination - has_allele_2 = allele2 in allele_combination - if has_allele_1 and has_allele_2: - shared_cov += coverage - elif has_allele_1: - allele1_total_cov += coverage - elif has_allele_2: - allele2_total_cov += coverage - - ## Perform the dispatching in ambiguous equiv classes ## - if ( - allele1_total_cov == 0 and allele2_total_cov == 0 - ): # If both are zero, do equal dispatching - allele1_belonging = 0.5 - else: - allele1_belonging = allele1_total_cov / ( - allele1_total_cov + allele2_total_cov - ) - - allele1_total_cov += allele1_belonging * shared_cov - allele2_total_cov += (1 - allele1_belonging) * shared_cov - return allele1_total_cov, allele2_total_cov - def _log_likelihood_homozygous( self, allele_depth, total_depth, allele_length, non_zeros ): - return sum( - [ - -self.mean_depth * (1 + (allele_length - non_zeros) / allele_length), - allele_depth * math.log(self.mean_depth), - -math.lgamma(allele_depth + 1), + return sum([ + self._nbinom_pmf(allele_depth, log=True), (total_depth - allele_depth) * math.log(self.error_rate), - non_zeros - * math.log(1 - self._nbinom_or_poisson_pmf(0)) - / allele_length, - ] - ) - - def _log_likelihood_heterozygous( - self, - allele_depth1, - allele_depth2, - total_depth, - allele_length1, - allele_length2, - non_zeros1, - non_zeros2, - ): - return sum( - [ - -self.mean_depth - * ( - 1 - + 0.5 - * ( - (1 - (non_zeros1 / allele_length1)) - + (1 - (non_zeros2 / allele_length2)) - ) - ), - (allele_depth1 + allele_depth2) * math.log(0.5 * self.mean_depth), - -math.lgamma(allele_depth1 + 1), - -math.lgamma(allele_depth2 + 1), - (total_depth - allele_depth1 - allele_depth2) - * math.log(self.error_rate), - ((non_zeros1 / allele_length1) + (non_zeros2 / allele_length2)) - * math.log(1 - self._nbinom_or_poisson_pmf(0, half_mean=True)), + math.log(1 - self._nbinom_pmf(0)) * non_zeros / allele_length, + self._nbinom_pmf(0, log=True) * (allele_length - non_zeros) / allele_length, ] ) @@ -255,46 +161,10 @@ def _calculate_log_likelihoods(self): self.allele_combination_cov, self.allele_groups_dict ) - if not self.call_hets: - self.likelihoods.sort(key=operator.itemgetter(1), reverse=True) - logging.debug(f"likelihoods: {self.likelihoods}") - return - - for (allele_number1, allele_number2) in itertools.combinations( - self.singleton_allele_coverages.keys(), 2 - ): - allele1_depth, allele2_depth = Genotyper._coverage_of_diploid_alleles( - allele_number1, - allele_number2, - self.allele_combination_cov, - self.allele_groups_dict, - ) - allele1_length = len(self.allele_per_base_cov[allele_number1]) - allele2_length = len(self.allele_per_base_cov[allele_number2]) - non_zeros1 = non_zeros_per_allele[allele_number1] - non_zeros2 = non_zeros_per_allele[allele_number2] - - log_likelihood = self._log_likelihood_heterozygous( - allele1_depth, - allele2_depth, - total_depth, - allele1_length, - allele2_length, - non_zeros1, - non_zeros2, - ) - frac_support = ( - 0 - if total_depth == 0 - else round((allele1_depth + allele2_depth) / total_depth, 4) - ) - self.likelihoods.append( - (set([allele_number1, allele_number2]), log_likelihood, frac_support) - ) - self.likelihoods.sort(key=operator.itemgetter(1), reverse=True) logging.debug(f"likelihoods: {self.likelihoods}") + def run(self, allele_combination_cov, allele_per_base_cov, allele_groups_dict): self._init_alleles_and_genotypes( allele_combination_cov=allele_combination_cov, diff --git a/minos/gramtools.py b/minos/gramtools.py index a7e8b6b..a812798 100644 --- a/minos/gramtools.py +++ b/minos/gramtools.py @@ -362,6 +362,9 @@ def write_vcf_annotated_using_coverage_from_gramtools( returned by load_gramtools_vcf_and_allele_coverage_files(). Writes a new VCF that has allele counts for all the ALTs""" assert len(vcf_records) == len(all_allele_coverage) + if call_hets: + raise NotImplementedError("Heterozygous calling is not implemented") + header_lines = [ "##fileformat=VCFv4.2", diff --git a/requirements.txt b/requirements.txt index 9145547..aca805f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -cluster_vcf_records >= 0.13.0 +cluster_vcf_records >= 0.13.1 matplotlib pyfastaq >= 3.14.0 pysam >= 0.12 diff --git a/tests/adjudicator_test.py b/tests/adjudicator_test.py index d948c9a..77bbfca 100644 --- a/tests/adjudicator_test.py +++ b/tests/adjudicator_test.py @@ -174,7 +174,7 @@ def test_add_gt_conf_percentile_and_filters_to_vcf_file(): shutil.copyfile(original_file, tmp_file) error_rate = 0.00026045894282438386 simulations = genotype_confidence_simulator.GenotypeConfidenceSimulator( - 60, 100, error_rate, iterations=1000, call_hets=True, + 60, 100, error_rate, iterations=1000, call_hets=False, ) simulations.run_simulations() adjudicator.Adjudicator._add_gt_conf_percentile_and_filters_to_vcf_file( diff --git a/tests/data/adjudicator/add_gt_conf_percentile_to_vcf_file.expect.vcf b/tests/data/adjudicator/add_gt_conf_percentile_to_vcf_file.expect.vcf index 108799c..47d2bb6 100644 --- a/tests/data/adjudicator/add_gt_conf_percentile_to_vcf_file.expect.vcf +++ b/tests/data/adjudicator/add_gt_conf_percentile_to_vcf_file.expect.vcf @@ -12,8 +12,8 @@ ##FILTER= ##minos_max_read_length=200 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_name -ref 100 . T G . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:63:0,63:0.9:609.67:52.86 -ref 142 . A C . MIN_DP . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:1:0,67:1:641.39:67.68 -ref 200 . C A . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:44:0,44:0.91:455.2:5.03 +ref 100 . T G . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:63:0,63:0.9:609.67:65.81 +ref 142 . A C . MIN_DP . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:1:0,67:1:641.39:79.2 +ref 200 . C A . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:44:0,44:0.91:455.2:6.06 ref 300 . C T . MIN_DP;MIN_FRS;MIN_GCP . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 0/1:1:49,6:0.89:27.98:0.0 -ref 333 . G T . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:57:0,57:1:561.6:31.0 +ref 333 . G T . PASS . GT:DP:COV:FRS:GT_CONF:GT_CONF_PERCENTILE 1/1:57:0,57:1:561.6:41.26 diff --git a/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf b/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf index f412e49..ccf3d49 100644 --- a/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf +++ b/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf @@ -1,6 +1,6 @@ ##fileformat=VCFv4.2 ##source=minos, version 0.10.0 -##fileDate=2020-07-13 +##fileDate=2021-01-20 ##FORMAT= ##FORMAT= ##FORMAT= @@ -10,6 +10,6 @@ ##minosMeanReadDepth=5.5 ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_42 -ref 1 . G T . . . GT:DP:DPF:COV:FRS:GT_CONF 1/1:11:2.0:0,10:1.0:11.16 -ref 2 . A C,TA,G . . . GT:DP:DPF:COV:FRS:GT_CONF 2/3:10:1.8182:1,0,3,5:0.9:15.9 +ref 1 . G T . . . GT:DP:DPF:COV:FRS:GT_CONF 1/1:11:2.0:0,10:1.0:68.16 +ref 2 . A C,TA,G . . . GT:DP:DPF:COV:FRS:GT_CONF 3/3:10:1.8182:1,0,3,5:0.6:13.45 ref 42 . A G . . . GT:DP:DPF:COV:FRS:GT_CONF ./.:0:0:0,0:.:0.0 diff --git a/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf b/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf index 41f252a..07ebe35 100644 --- a/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf +++ b/tests/data/gramtools/write_vcf_annotated_using_coverage_from_gramtools.out.vcf.filter.vcf @@ -1,6 +1,6 @@ ##fileformat=VCFv4.2 ##source=minos, version 0.10.0 -##fileDate=2020-07-13 +##fileDate=2021-01-20 ##FORMAT= ##FORMAT= ##FORMAT= @@ -10,6 +10,6 @@ ##minosMeanReadDepth=5.5 ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample_42 -ref 1 . G T . . . GT:DP:DPF:COV:FRS:GT_CONF 1/1:11:2.0:0,10:1.0:11.16 -ref 2 . A TA,G . . . GT:DP:DPF:COV:FRS:GT_CONF 1/2:10:1.8182:1,3,5:0.9:15.9 +ref 1 . G T . . . GT:DP:DPF:COV:FRS:GT_CONF 1/1:11:2.0:0,10:1.0:68.16 +ref 2 . A TA,G . . . GT:DP:DPF:COV:FRS:GT_CONF 2/2:10:1.8182:1,3,5:0.6:13.45 ref 42 . A G . . . GT:DP:DPF:COV:FRS:GT_CONF ./.:0:0:0,0:.:0.0 diff --git a/tests/dependencies_test.py b/tests/dependencies_test.py index 112ed3d..0e59aaa 100644 --- a/tests/dependencies_test.py +++ b/tests/dependencies_test.py @@ -55,10 +55,6 @@ def test_find_binaries_and_versions(self): self.assertIsNotNone(version) self.assertIsNotNone(path) - for version, path in got.items(): - self.assertIsNotNone(version) - self.assertIsNotNone(path) - def test_dependencies_report(self): """test dependencies_report""" got_ok, got_lines = dependencies.dependencies_report() diff --git a/tests/genotype_confidence_simulator_test.py b/tests/genotype_confidence_simulator_test.py index 75dc0f4..ed09a5d 100644 --- a/tests/genotype_confidence_simulator_test.py +++ b/tests/genotype_confidence_simulator_test.py @@ -12,7 +12,7 @@ def test_simulate_confidence_scores(): got = genotype_confidence_simulator.GenotypeConfidenceSimulator._simulate_confidence_scores( 50, 300, 0.1, iterations=5 ) - expected = [99, 114, 143, 200, 215] + expected = [76, 90, 119, 140, 156] assert got == expected #  Since the genotype confidence normalises by length, we shpuld get the same @@ -20,7 +20,6 @@ def test_simulate_confidence_scores(): got = genotype_confidence_simulator.GenotypeConfidenceSimulator._simulate_confidence_scores( 50, 300, 0.1, iterations=5, allele_length=2 ) - expected = [99, 114, 143, 200, 215] assert got == expected @@ -37,77 +36,57 @@ def test_make_conf_to_percentile_dict(): def test_run_simulations_and_get_percentile_allele_length_1(): """test run_simulations and get_percentile""" simulator = genotype_confidence_simulator.GenotypeConfidenceSimulator( - 50, 300, 0.01, iterations=5, call_hets=True + 50, 300, 0.01, iterations=5 ) simulator.run_simulations() expected_confidence_scores_percentiles = { - 55: 20.0, - 256: 40.0, - 285: 60.0, - 337: 80.0, - 368: 100.0, + 193: 20.0, + 221: 40.0, + 271: 60.0, + 278: 80.0, + 303: 100.0 } assert ( simulator.confidence_scores_percentiles == expected_confidence_scores_percentiles ) - assert simulator.get_percentile(55) == 20.00 - assert simulator.get_percentile(256) == 40.00 + assert simulator.get_percentile(193) == 20.00 + assert simulator.get_percentile(221) == 40.00 # Try getting number that is not in the dict and will have to be inferred - assert simulator.get_percentile(150) == 29.45 + assert simulator.get_percentile(207) == 30.0 # Try values outside the range of what we already have - simulator.get_percentile(53) == 0.00 - simulator.get_percentile(52) == 0.00 - simulator.get_percentile(369) == 100.00 - simulator.get_percentile(370) == 100.00 - - -def test_run_simulations_and_get_percentile_allele_length_1_no_het_calls(): - """test run_simulations and get_percentile with no het calls""" - simulator = genotype_confidence_simulator.GenotypeConfidenceSimulator( - 50, 300, 0.1, iterations=5, call_hets=False, - ) - simulator.run_simulations() - expected_confidence_scores_percentiles = { - 99: 20.0, - 114: 40.0, - 143: 60.0, - 200: 80.0, - 215: 100.0, - } - assert ( - simulator.confidence_scores_percentiles - == expected_confidence_scores_percentiles - ) - assert simulator.get_percentile(200) == 80.00 + simulator.get_percentile(192) == 0.00 + simulator.get_percentile(191) == 0.00 + simulator.get_percentile(304) == 100.00 + simulator.get_percentile(305) == 100.00 def test_run_simulations_and_get_percentile_allele_length_2(): """test run_simulations and get_percentile""" simulator = genotype_confidence_simulator.GenotypeConfidenceSimulator( - 50, 300, 0.01, allele_length=2, iterations=5, call_hets=True, + 50, 300, 0.01, allele_length=2, iterations=5 ) simulator.run_simulations() expected_confidence_scores_percentiles = { - 55: 20.0, - 256: 40.0, - 285: 60.0, - 337: 80.0, - 368: 100.0, + 193: 20.0, + 221: 40.0, + 271: 60.0, + 278: 80.0, + 303: 100.0 } assert ( simulator.confidence_scores_percentiles == expected_confidence_scores_percentiles ) - assert simulator.get_percentile(55) == 20.00 - assert simulator.get_percentile(256) == 40.00 + assert simulator.get_percentile(193) == 20.00 + assert simulator.get_percentile(221) == 40.00 # Try getting number that is not in the dict and will have to be inferred - assert simulator.get_percentile(150) == 29.45 + assert simulator.get_percentile(207) == 30.0 # Try values outside the range of what we already have - simulator.get_percentile(53) == 0.00 - simulator.get_percentile(52) == 0.00 - simulator.get_percentile(369) == 100.00 - simulator.get_percentile(370) == 100.00 + simulator.get_percentile(192) == 0.00 + simulator.get_percentile(191) == 0.00 + simulator.get_percentile(304) == 100.00 + simulator.get_percentile(305) == 100.00 def test_simulations(): diff --git a/tests/genotyper_test.py b/tests/genotyper_test.py index c98b54e..04d6a27 100644 --- a/tests/genotyper_test.py +++ b/tests/genotyper_test.py @@ -10,33 +10,30 @@ def test_init(): """test init""" gtyper = genotyper.Genotyper(0, 20, 0.0001) - assert gtyper.use_nbinom assert gtyper.min_cov_more_than_error == 0 assert gtyper.no_of_successes == 0 assert gtyper.prob_of_success == 0 gtyper = genotyper.Genotyper(10, 20, 0.0001) - assert gtyper.use_nbinom assert gtyper.no_of_successes == 10 assert gtyper.prob_of_success == 0.5 assert gtyper.min_cov_more_than_error == 1 gtyper = genotyper.Genotyper(10, 20, 0.01) - assert gtyper.use_nbinom assert gtyper.no_of_successes == 10 assert gtyper.prob_of_success == 0.5 assert gtyper.min_cov_more_than_error == 2 gtyper = genotyper.Genotyper(100, 200, 0.001) - assert gtyper.use_nbinom assert gtyper.no_of_successes == 100 assert gtyper.prob_of_success == 0.5 assert gtyper.min_cov_more_than_error == 8 + # variance < mean, so will hit the code where it forces + # variance = 2 * mean = 20 gtyper = genotyper.Genotyper(10, 5, 0.01) - assert not gtyper.use_nbinom - assert gtyper.no_of_successes == None - assert gtyper.prob_of_success == None + assert gtyper.no_of_successes == 10 + assert gtyper.prob_of_success == 0.5 assert gtyper.min_cov_more_than_error == 2 @@ -75,41 +72,6 @@ def test_haploid_allele_coverages(): assert got == [0, 21, 1, 0, 0, 0, 0] -def test_coverage_of_diploid_alleles_equal_dispatching(): - """ - test _coverage_of_diploid_alleles function - # Equiv class -> coverage: - # {0,1}: 9 - We collect coverages of alleles 0 and 1. - This is an edge case where the only coverage is on both: then we dispatch equally. - """ - allele_combination_cov = {"1": 9} - allele_groups_dict = {"1": {0, 1}} - got = genotyper.Genotyper._coverage_of_diploid_alleles( - 0, 1, allele_combination_cov, allele_groups_dict, - ) - assert got == (4.5, 4.5) - - -def test_coverage_of_diploid_alleles_correct_dispatching(): - """ - test _coverage_of_diploid_alleles function - # Equiv class -> coverage: - # {0}: 17 - # {1}: 80 - # {0,1}: 10 - # {0,2): 3 - We collect coverage for alleles 0 and 1: 20 units specific to 0, 80 specific to 1. - Thus we expect a 1:4 dispatching ratio of the 10 reads which agree with both 0 and 1. - """ - allele_combination_cov = {"1": 17, "2": 80, "3": 10, "4": 3} - allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {0, 2}} - got = genotyper.Genotyper._coverage_of_diploid_alleles( - 0, 1, allele_combination_cov, allele_groups_dict, - ) - assert got == (22, 88) - - def test_log_likelihood_homozygous(): """test _log_likelihood_homozygous""" gtyper = genotyper.Genotyper(100, 200, 0.01) @@ -120,7 +82,7 @@ def test_log_likelihood_homozygous(): got = gtyper._log_likelihood_homozygous( allele_depth, total_depth, allele_length, non_zeros ) - assert round(got, 2) == -26.71 + assert round(got, 2) == -26.78 gtyper = genotyper.Genotyper(10, 200, 0.01) allele_depth = 1 @@ -128,55 +90,12 @@ def test_log_likelihood_homozygous(): got = gtyper._log_likelihood_homozygous( allele_depth, total_depth, allele_length, non_zeros ) - assert round(got, 2) == -44.77 - - gtyper = genotyper.Genotyper(10, 200, 0.01, force_poisson=True) - allele_depth = 1 - total_depth = 9 - got = gtyper._log_likelihood_homozygous( - allele_depth, total_depth, allele_length, non_zeros - ) - assert round(got, 2) == -44.54 - - -def test_log_likelihood_heterozygous(): - """test _log_likelihood_heterozygous""" - gtyper = genotyper.Genotyper(100, 200, 0.01) - allele_depth1 = 45 - allele_depth2 = 40 - total_depth = 95 - allele_length1 = 3 - allele_length2 = 3 - non_zeros1 = 3 - non_zeros2 = 3 - got = gtyper._log_likelihood_heterozygous( - allele_depth1, - allele_depth2, - total_depth, - allele_length1, - allele_length2, - non_zeros1, - non_zeros2, - ) - assert round(got, 2) == -52.97 - - non_zeros1 = 2 - non_zeros2 = 2 - got = gtyper._log_likelihood_heterozygous( - allele_depth1, - allele_depth2, - total_depth, - allele_length1, - allele_length2, - non_zeros1, - non_zeros2, - ) - assert round(got, 2) == -86.31 + assert round(got, 2) == -39.34 def test_calculate_log_likelihoods(): """test _calculate_log_likelihoods""" - gtyper = genotyper.Genotyper(20, 40, 0.01, call_hets=True) + gtyper = genotyper.Genotyper(20, 40, 0.01) allele_combination_cov = {"1": 2, "2": 20, "3": 1} allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}} allele_per_base_cov = [[0, 1], [20, 19]] @@ -189,48 +108,25 @@ def test_calculate_log_likelihoods(): allele_groups_dict=allele_groups_dict, ) gtyper._calculate_log_likelihoods() - assert len(gtyper.likelihoods) == 3 + assert len(gtyper.likelihoods) == 2 expected = [ - ({1}, -11.68, depth1), - ({0, 1}, -22.93, depth01), - ({0}, -124.91, depth0), + ({1}, -12.03, depth1), + ({0}, -114.57, depth0), ] gtyper.likelihoods = [(x[0], round(x[1], 2), x[2]) for x in gtyper.likelihoods] assert gtyper.likelihoods == expected -def test_run_with_call_hets_false(): - """test run call_hets False""" - gtyper = genotyper.Genotyper(20, 40, 0.01, call_hets=False) - allele_combination_cov = {"1": 2, "2": 20, "3": 1} - allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}} - allele_per_base_cov = [[0, 1], [20, 19]] - gtyper.run(allele_combination_cov, allele_per_base_cov, allele_groups_dict) - depth0 = round(3 / 23, 4) - depth1 = round(21 / 23, 4) - expected = [({1}, -11.68, depth1), ({0}, -124.91, depth0)] - assert len(gtyper.likelihoods) == len(expected) - for i in range(len(expected)): - assert gtyper.likelihoods[i][0] == expected[i][0] - assert round(gtyper.likelihoods[i][1], 2) == round(expected[i][1], 2) - assert gtyper.likelihoods[i][2] == expected[i][2] - - def test_run(): - """test run call_hets True""" - gtyper = genotyper.Genotyper(20, 40, 0.01, call_hets=True) + """test run""" + gtyper = genotyper.Genotyper(20, 40, 0.01) allele_combination_cov = {"1": 2, "2": 20, "3": 1} allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}} allele_per_base_cov = [[0, 1], [20, 19]] gtyper.run(allele_combination_cov, allele_per_base_cov, allele_groups_dict) depth0 = round(3 / 23, 4) - depth01 = 1 depth1 = round(21 / 23, 4) - expected = [ - ({1}, -11.68, depth1), - ({0, 1}, -22.93, depth01), - ({0}, -124.91, depth0), - ] + expected = [({1}, -12.03, depth1), ({0}, -114.57, depth0)] assert len(gtyper.likelihoods) == len(expected) for i in range(len(expected)): assert gtyper.likelihoods[i][0] == expected[i][0] @@ -240,7 +136,7 @@ def test_run(): def test_run_zero_coverage(): """test run when all alleles have zero coverage""" - gtyper = genotyper.Genotyper(20, 40, 0.01, call_hets=True) + gtyper = genotyper.Genotyper(20, 40, 0.01) allele_combination_cov = {} allele_groups_dict = {"1": {0}, "2": {1}, "3": {0, 1}, "4": {2}} allele_per_base_cov = [[0], [0, 0]] @@ -255,7 +151,7 @@ def test_nomatherror_mean_depth0(): Can get a mean_depth of zero but try to genotype a non-zero coverage site due to rounding imprecision. In which case we need to avoid trying to do log(0) in likelihood calculation and should return no call. """ - gtyper = genotyper.Genotyper(0, 0, 0.01, call_hets=True) + gtyper = genotyper.Genotyper(0, 0, 0.01) allele_combination_cov = {"1": 1} allele_groups_dict = {"1": {0}, "2": {1}} allele_per_base_cov = [[1], [0, 0]] diff --git a/tests/gramtools_test.py b/tests/gramtools_test.py index b4c9a05..34a560a 100644 --- a/tests/gramtools_test.py +++ b/tests/gramtools_test.py @@ -196,33 +196,6 @@ def test_load_gramtools_vcf_and_allele_coverage_files(): gramtools.load_gramtools_vcf_and_allele_coverage_files(vcf_file, quasimap_dir) -def test_update_vcf_record_using_gramtools_allele_depths_heterozygous(): - """test update_using_gramtools_allele_depths heterozygous""" - record = vcf_record.VcfRecord( - "ref\t4\t.\tT\tA,G,TC\t228\t.\tINDEL;IDV=54;IMF=0.885246;DP=61;VDB=7.33028e-19;SGB=-0.693147;MQSB=0.9725;MQ0F=0;AC=2;AN=2;DP4=0,0,23,31;MQ=57\tGT:PL\t1/1:255,163,0" - ) - allele_combination_cov = {"1": 9, "2": 7, "3": 1} - allele_groups_dict = {"1": {0}, "2": {2}, "3": {2, 3}} - allele_per_base_cov = [[0], [9], [7], [1, 0]] - expected = vcf_record.VcfRecord( - "ref\t4\t.\tT\tA,G,TC\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t0/2:17:1.1333:9,0,7,0:1.0:54.43" - ) - mean_depth = 15 - depth_variance = 30 - error_rate = 0.001 - gtyper = genotyper.Genotyper( - mean_depth, depth_variance, error_rate, call_hets=True, - ) - got_filtered = gramtools.update_vcf_record_using_gramtools_allele_depths( - record, gtyper, allele_combination_cov, allele_per_base_cov, allele_groups_dict, - ) - assert record == expected - expected_filtered = vcf_record.VcfRecord( - "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t0/1:17:1.1333:9,7:1.0:54.43" - ) - assert got_filtered == expected_filtered - - def test_update_vcf_record_using_gramtools_allele_depths_homozygous(): """test update_using_gramtools_allele_depths homozygous""" record = vcf_record.VcfRecord( @@ -232,7 +205,7 @@ def test_update_vcf_record_using_gramtools_allele_depths_homozygous(): allele_groups_dict = {"1": {0}, "2": {2}, "3": {1, 2}} allele_per_base_cov = [[1], [0, 0], [80]] expected = vcf_record.VcfRecord( - "ref\t4\t.\tT\tTC,G\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t2/2:81:0.9529:1,0,80:0.9877:708.01" + "ref\t4\t.\tT\tTC,G\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t2/2:81:0.9529:1,0,80:0.9877:646.06" ) mean_depth = 85 depth_variance = 200 @@ -245,7 +218,7 @@ def test_update_vcf_record_using_gramtools_allele_depths_homozygous(): ) assert record == expected expected_filtered = vcf_record.VcfRecord( - "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t1/1:81:0.9529:1,80:0.9877:708.01" + "ref\t4\t.\tT\tG\t.\t.\t.\tGT:DP:DPF:COV:FRS:GT_CONF\t1/1:81:0.9529:1,80:0.9877:646.06" ) assert got_filtered == expected_filtered @@ -306,7 +279,7 @@ def test_write_vcf_annotated_using_coverage_from_gramtools(): sample_name="sample_42", filtered_outfile=tmp_outfile_filtered, ref_seq_lengths={"ref": "10000"}, - call_hets=True, + call_hets=False, ) expected_vcf = os.path.join( data_dir, "write_vcf_annotated_using_coverage_from_gramtools.out.vcf"