From dcda4ad0d75ee244d3742cff0e474d791609c5b6 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Wed, 28 Feb 2024 21:16:01 +0000 Subject: [PATCH] Update docstrings --- python/tests/beagle.py | 653 ----------------------------------- python/tests/beagle_numba.py | 39 ++- 2 files changed, 23 insertions(+), 669 deletions(-) delete mode 100644 python/tests/beagle.py diff --git a/python/tests/beagle.py b/python/tests/beagle.py deleted file mode 100644 index 47ebb6920b..0000000000 --- a/python/tests/beagle.py +++ /dev/null @@ -1,653 +0,0 @@ -""" -Implementation of the BEAGLE 4.1 algorithm to impute alleles. - -This was implemented while closely consulting the BEAGLE 4.1 paper: -Browning & Browning (2016). Genotype imputation with millions of reference samples. -Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 - -The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was also consulted: -https://faculty.washington.edu/browning/beagle/b4_1.html - -These notations are used throughout this implementation: -h = number of reference haplotypes. -m = number of genotyped markers. -x = number of ungenotyped markers. - -This implementation takes the following inputs: -1. Panel of reference haplotypes in a matrix of size (m + x, h). -2. One query haplotype in an array of size (m + x). -3. Site positions of all the markers in an array of size (m + x). - -In the query haplotype: -1. The genotyped markers take values of 0, 1, 2, or 3 (ACGT encoding). -2. The ungenotyped markers take -1. - -The forward and backward probability matrices are of size (m, h). -The HMM state probability matrix is of size (m + 1, h). -The interpolated allele probability matrix is of size (x, 4), -The imputed alleles are the maximum a posteriori (MAP) alleles. - -For efficiency, BEAGLE uses aggregated markers, which are clusters of markers -within a 0.005 cM interval (default). Because the genotypes are phased, -the alleles in the aggregated markers form distinct "allele sequences". -Below, we do not use aggregated markers or allele sequences, which would complicate -the implementation. - -Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm, -this implementation computes Equation 1 of BB2016. We keep functions used -in a first attempt to faithfully implement the BEAGLE algorithm for documentation. -""" -import numpy as np - - -""" -TODO: Update use of genetic map. -TODO: Refactor and clean up this code. -TODO: Move useful documentation over to beagle_numba.py. -""" - - -def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None): - """ - Convert physical positions (bp) to genetic map positions (cM). - - In BEAGLE 4.1, when a genetic map is not specified, it is assumed - that the recombination rate is constant (1 cM / 1 Mb). - - If a genetic map is specified, then the genetic map positions are - either taken straight from it or interpolated from it. The genetic map - needs to contain physical positions and corresponding genetic positions. - See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details. - - :param numpy.ndarray pos: Physical positions (bp). - :param numpy.ndarray genetic_map_pos: Physical positions of genetic map. - :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map. - :return: Genetic map positions (cM). - :rtype: numpy.ndarray - """ - if genetic_map_pos is None or genetic_map_cm is None: - return pos / 1e6 - # TODO: Better way to pass the components of a genetic map. - base_pos = genetic_map_pos - gen_pos = genetic_map_cm - assert len(base_pos) == len(gen_pos), "Bad genetic map." - assert np.all(pos >= base_pos[0]) and np.all( - pos <= base_pos[-1] - ), "Physical positions outside of genetic map." - left_idx = np.searchsorted(base_pos, pos, side="right") - xMa = np.zeros(len(pos), dtype=np.float32) - bMa = np.zeros(len(pos), dtype=np.float32) - fbMfa = np.zeros(len(pos), dtype=np.float32) - for i in range(len(pos)): - xMa[i] = pos[i] - base_pos[left_idx[i] - 1] - bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1] - fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1] - return xMa / bMa * fbMfa - - -def get_weights(genotyped_pos, ungenotyped_pos, genetic_map=None): - """ - Get weights at ungenotyped markers in the query haplotype, which are used in - linear interpolation of HMM state probabilities at ungenotyped markers. - - In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 - at ungenotyped marker x is denoted lambda_m,x. - - lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], - where g(i) = genetic map position of marker i. - - :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. - :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. - :param msprime.RateMap genetic_map: Genetic map. - :return: Weights and associated marker interval start indices. - :rtype: tuple(numpy.ndarray, numpy.ndarray) - """ - MIN_CM_DIST = 1e-7 # See `ImpData.java` in BEAGLE 4.1 source code - assert len(genotyped_pos) > 1 - assert len(ungenotyped_pos) > 0 - # Check that the two sets are mutually exclusive. - m = len(genotyped_pos) - x = len(ungenotyped_pos) - assert len(np.union1d(genotyped_pos, ungenotyped_pos)) == m + x - if genetic_map is None: - genotyped_cm = convert_to_genetic_map_position(genotyped_pos) - ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) - else: - genotyped_cm = genetic_map.get_cumulative_mass(genotyped_pos) - ungenotyped_cm = genetic_map.get_cumulative_mass(ungenotyped_pos) - # Calculate weights for ungenotyped markers. - weights = np.zeros(x, dtype=np.float32) - # Identify genotype markers k and k + 1 sandwiching marker ungenotyped marker i. - marker_interval_start = np.zeros(x, dtype=np.int32) - for i in range(x): - if ungenotyped_pos[i] < genotyped_pos[0]: - # Ungenotyped marker is before first genotyped marker. - k = 0 - weights[i] = 1.0 - elif ungenotyped_pos[i] > genotyped_pos[-1]: - # Ungenotyped marker is after last genotyped marker. - k = m - 1 - weights[i] = 0.0 - else: - # Ungenotyped marker is between genotyped markers k and k + 1. - k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) # Inefficient - cm_mP1_x = genotyped_cm[k + 1] - ungenotyped_cm[i] - cm_mP1_m = genotyped_cm[k + 1] - genotyped_cm[k] - if cm_mP1_m == 0: - cm_mP1_m = MIN_CM_DIST - weights[i] = cm_mP1_x / cm_mP1_m - marker_interval_start[i] = k - assert 0 <= np.min(weights) - assert np.max(weights) <= 1 - return (weights, marker_interval_start) - - -def get_mismatch_prob(pos, miscall_rate): - """ - Set mismatch probabilities at genotyped markers. - - In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. - By default, it is set to 0.0001 and capped at 0.50. - - :param numpy.ndarray pos: Site positions. - :param float miscall_rate: Allelic miscall rate. - :return: Mismatch probabilities. - :rtype: numpy.ndarray - """ - assert isinstance(miscall_rate, float) - if miscall_rate >= 0.5: - # TODO: Print warning. - miscall_rate = 0.5 - mu = np.zeros(len(pos), dtype=np.float32) + miscall_rate - return mu - - -def get_switch_prob(pos, h, ne, genetic_map=None): - """ - Set switch probabilities at genotyped markers. - - Note that in BEAGLE, the default value of `ne` is set to 1e6. - If `h` is small and `ne` is large, the switch probability is nearly 1. - In such cases, it may be desirable to set `ne` to a small value - to avoid switching between haplotypes all the time. - - :param numpy.ndarray pos: Site positions. - :param int h: Number of reference haplotypes. - :param float ne: Effective population size. - :param msprime.RateMap genetic_map: Genetic map. - :return: Switch probabilities. - :rtype: numpy.ndarray - """ - assert np.min(pos) > 0 - assert isinstance(h, int) - assert isinstance(ne, float) - # Get genetic distance between adjacent markers. - if genetic_map is None: - cm = convert_to_genetic_map_position(pos) - else: - cm = genetic_map.get_cumulative_mass(pos) - d = np.zeros(len(pos), dtype=np.float32) - d[0] = cm[0] - d[1:] = cm[1:] - cm[:-1] - assert len(d) == len(pos) - rho = -np.expm1(-(4 * ne * d / h)) - assert len(d) == len(rho) - return rho - - -def compute_forward_matrix(ref_h, query_h, rho, mu): - """ - Implement forward algorithm to compute a forward probablity matrix of size (m, h). - - Reference haplotypes and query haplotype are subsetted to genotyped markers. - So, they are a matrix of size (m, h) and an array of size m, respectively. - - :param numpy.ndarray ref_h: Reference haplotypes. - :param numpy.ndarray query_h: One query haplotype. - :param numpy.ndarray rho: Switch probabilities. - :param numpy.ndarray mu: Mismatch probabilities. - :return: Forward probability matrix. - :rtype: numpy.ndarray - """ - alleles = np.arange(4) # ACGT - assert np.all(np.isin(ref_h, alleles)) - assert np.all(np.isin(query_h, alleles)) - m = ref_h.shape[0] - h = ref_h.shape[1] - assert m == len(query_h) - assert m == len(rho) - assert m == len(mu) - assert 0 <= np.min(rho) - assert np.max(rho) <= 1 - assert 0 <= np.min(mu) - assert np.max(mu) <= 0.5 - fm = np.zeros((m, h), dtype=np.float32) - last_sum = 1.0 # Part of normalization factor - for i in range(m): - # Get site-specific parameters. - shift = rho[i] / h - scale = (1 - rho[i]) / last_sum - # Get allele at genotyped marker i on query haplotype. - query_a = query_h[i] - for j in range(h): - # Get allele at genotyped marker i on reference haplotype j. - ref_a = ref_h[i, j] - # Get emission probability. - em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] - fm[i, j] = em_prob - if i == 0: - fm[i, j] *= 1.0 / h - else: - fm[i, j] *= scale * fm[i - 1, j] + shift - site_sum = np.sum(fm[i, :]) - last_sum = site_sum / h if i == 0 else site_sum - return fm - - -def compute_backward_matrix(ref_h, query_h, rho, mu): - """ - Implement backward algorithm to compute a backward probablity matrix of size (m, h). - - Reference haplotypes and query haplotype are subsetted to genotyped markers. - So, they are a matrix of size (m, h) and an array of size m, respectively. - - In BEAGLE 4.1, the values are kept one site at a time. - Here, we keep the values at all the sites. - - :param numpy.ndarray ref_h: Reference haplotypes. - :param numpy.ndarray query_h: One query haplotype. - :param numpy.ndarray rho: Switch probabilities. - :param numpy.ndarray mu: Mismatch probabilities. - :return: Backward probability matrix. - :rtype: numpy.ndarray - """ - alleles = np.arange(4) # ACGT - assert np.all(np.isin(ref_h, alleles)) - assert np.all(np.isin(query_h, alleles)) - m = ref_h.shape[0] - h = ref_h.shape[1] - assert m == len(query_h) - assert m == len(rho) - assert m == len(mu) - assert 0 <= np.min(rho) - assert np.max(rho) <= 1 - assert 0 <= np.min(mu) - assert np.max(mu) <= 0.5 - bm = np.zeros((m, h), dtype=np.float32) - bm[-1, :] = 1.0 / h # Initialise the last column - for i in range(m - 2, -1, -1): - query_a = query_h[i + 1] - for j in range(h): - ref_a = ref_h[i + 1, j] - em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] - # Note that BEAGLE keeps the values at one site at a time. - bm[i, j] = bm[i + 1, j] * em_prob - sum_site = np.sum(bm[i, :]) - scale = (1 - rho[i + 1]) / sum_site - shift = rho[i + 1] / h - bm[i, :] = scale * bm[i, :] + shift - return bm - - -def _get_reference_haplotype_segments(a, b): - """ - WARN: This function is part of an attempt to faithfully reimplement - the BEAGLE 4.1 algorithm. It is not complete and not used - in the current implementation. It is kept for documentation. - - Assuming all biallelic sites, get the index of a reference haplotype segment (i) - defined by the alleles a and b at two adjacent genotyped markers. - - #i, a, b - 0, 0, 0 - 1, 1, 0 - 2, 0, 1 - 3, 1, 1 - - See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 - - This is a helper function for `_compute_state_probability_matrix`. - """ - ref_hap_idx = a * 1 + b * 2 - return ref_hap_idx - - -def _compute_state_prob_matrix(fm, bm, ref_h, query_h, rho, mu): - """ - WARN: This function is part of an attempt to faithfully reimplement - the BEAGLE 4.1 algorithm. It is not complete and not used - in the current implementation. It is kept for documentation. - - Implement the forward-backward algorithm to compute: - 1. A HMM state probability matrix across genotyped markers. - 2. "Forward haplotype probabilities". - 3. "Backward haplotype probabilities". - - The HMM state probability matrix is of size (m + 1, h). - - Note that the additional column at m + 1 is for interpolation of - ungenotyped markers right of the last genotyped marker. - - Both the forward and backward haplotype probability matrices - are of size (m, 2), assuming only biallelic sites. - - Reference haplotypes and query haplotype are subsetted to genotyped markers. - So, they are a matrix of size (m, h) and an array of size m, respectively. - - This implementation is based on the function `setStateProbs` - of `LSHapBaum.java` in BEAGLE. It performs the computation - described in Equation 2 of BB2016. - - :param numpy.ndarray fm: Forward probability matrix. - :param numpy.ndarray bm: Backward probability matrix. - :param numpy.ndarray ref_h: Reference haplotypes. - :param numpy.ndarray query_h: One query haplotype. - :param numpy.ndarray rho: Switch probabilities. - :param numpy.ndarray mu: Mismatch probabilities. - :return: HMM state probability matrix, - forward haplotype probability matrix, and - backward haplotype probability matrix. - :rtype: tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray) - """ - alleles = np.arange(4) # ACGT - m = fm.shape[0] - h = fm.shape[1] - assert (m, h) == bm.shape - assert not np.any(fm < 0) - assert not np.any(np.isnan(fm)) - assert not np.any(bm < 0) - assert not np.any(np.isnan(bm)) - assert np.all(np.isin(ref_h, alleles)) - assert np.all(np.isin(query_h, alleles)) - assert m == len(query_h) - assert m == len(rho) - assert m == len(mu) - assert 0 <= np.min(rho) - assert np.max(rho) <= 1 - assert 0 <= np.min(mu) - assert np.max(mu) <= 0.5 - # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker. - sm = np.zeros((m + 1, h), dtype=np.float32) - fwd_hap_probs = np.zeros((m, 4), dtype=np.float32) - bwd_hap_probs = np.zeros((m, 4), dtype=np.float32) - for i in range(m - 1, -1, -1): - for j in range(h): - sm[i, j] = fm[i, j] * bm[i, j] - ref_hap_seg_mP1 = _get_reference_haplotype_segments( - ref_h[i, j], ref_h[i + 1, j] - ) - ref_hap_seg_m = _get_reference_haplotype_segments( - ref_h[i - 1, j], ref_h[i, j] - ) - fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] - bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] - sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) - fwd_hap_probs[i, :] /= sum_fwd_hap_probs - bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs - sm[m, :] = sm[m - 1, :] # Not in BEAGLE - return (sm, fwd_hap_probs, bwd_hap_probs) - - -def _interpolate_allele_prob( - fwd_hap_probs, bwd_hap_probs, genotyped_pos, ungenotyped_pos, genetic_map=None -): - """ - WARN: This function is part of an attempt to faithfully reimplement - the BEAGLE 4.1 algorithm. It is not complete and not used - in the current implementation. It is kept for documentation. - - Compute the interpolated allele probabilities at the ungenotyped markers. - - The forward and backward haplotype probability matrices are of size (m, 2). - The interpolated allele probability matrix is of size (x, 2). - We assume all biallelic sites, hence the 2. - - This implementation is based on Equation 2 (the rearranged one) in BB2016. - - :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix. - :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix. - :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. - :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. - :param msprime.RateMap genetic_map: Genetic map. - :return: Interpolated allele probabilities. - :rtype: numpy.ndarray - """ - m = len(genotyped_pos) - x = len(ungenotyped_pos) - assert (m, 2) == fwd_hap_probs.shape - assert (m, 2) == bwd_hap_probs.shape - genotyped_cm = convert_to_genetic_map_position(genotyped_pos) - assert m == len(genotyped_cm) - ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) - assert x == len(ungenotyped_cm) - weights, _ = get_weights(genotyped_pos, ungenotyped_pos, genetic_map=genetic_map) - assert x == len(weights) - i_hap_probs = np.zeros((x, 2), dtype=np.float32) - for i in range(x): - # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. - if ungenotyped_pos[i] < genotyped_pos[0]: - k = 0 - elif ungenotyped_pos[i] > genotyped_pos[-1]: - k = m - 2 - else: - k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) - # Vectorised over the allelic states. - i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :] - i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :] - return i_hap_probs - - -def compute_state_prob_matrix(fm, bm): - """ - Compute the HMM state probability matrix, in which each element is - the probability of a HMM state at a genotyped marker. - - Implementing this is simpler than faithfully reproducing the BEAGLE 4.1 algorithm. - - The output HMM state probability matrix is of size (m + 1, h). - - Note that the additional column at m + 1 is used for interpolation of - ungenotyped markers to the right of the last genotyped marker. - - The forward and backward probability matrices are of size (m, h). - - :param numpy.ndarray fm: Forward probability matrix. - :param numpy.ndarray bm: Backward probability matrix. - :return: HMM state probability matrix. - :rtype: numpy.ndarray - """ - assert fm.shape == bm.shape - m = fm.shape[0] - h = bm.shape[1] - assert not np.any(fm < 0) - assert not np.any(np.isnan(fm)) - assert not np.any(bm < 0) - assert not np.any(np.isnan(bm)) - # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker. - sm = np.zeros((m + 1, h), dtype=np.float32) - sm[:-1, :] = np.multiply(fm, bm) - sm[m, :] = sm[m - 1, :] # Not in BEAGLE - return sm - - -def interpolate_allele_probabilities( - sm, ref_h, genotyped_pos, ungenotyped_pos, genetic_map=None -): - """ - Compute the interpolated allele probabilities at ungenotyped markers of - a query haplotype following Equation 1 of BB2016. - - The interpolated allele probability matrix is of size (x, a), - where a is the number of alleles. - - This function takes the output of `compute_state_prob_matrix`. - - Note that this function takes: - 1. HMM state probability matrix across genotyped markers of size (m + 1, h). - 2. reference haplotypes subsetted to ungenotyped markers of size (x, h). - - :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. - :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped markers. - :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. - :param numpy.ndarray ungenotyped_pos: Site positions at ungenotyped markers. - :param msprime.RateMap genetic_map: Genetic map. - :return: Interpolated allele probabilities. - :rtype: numpy.ndarray - """ - alleles = np.arange(4) # ACGT - assert not np.any(sm < 0) - assert not np.any(np.isnan(sm)) - m = sm.shape[0] - 1 - h = sm.shape[1] - assert m == len(genotyped_pos) - x = len(ungenotyped_pos) - assert (x, h) == ref_h.shape - weights, marker_interval_start = get_weights( - genotyped_pos, ungenotyped_pos, genetic_map=genetic_map - ) - assert x == len(weights) - assert x == len(marker_interval_start) - p = np.zeros((x, len(alleles)), dtype=np.float32) - # Compute allele probabilities as per Equation 1 in BB2016. - for a in alleles: - # Going from left to right. - for i in range(x): - # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. - k = marker_interval_start[i] - # Sum over the reference haplotypes where the allele is a at marker i. - has_a = ref_h[i, :] == a - p[i, a] += np.sum(weights[i] * sm[k, has_a]) - p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) - # Rescale probabilities. - p_rescaled = p / np.sum(p, axis=1)[:, np.newaxis] - return p_rescaled - - -def get_map_alleles(allele_prob): - """ - Compute the maximum a posteriori alleles at ungenotyped markers of a query haplotype. - - The imputed alleles is an array of size x. - - WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. - - :param numpy.ndarray allele_prob: Interpolated allele probabilities. - :return: Imputed alleles and their associated probabilities. - :rtype: tuple(numpy.ndarray, numpy.ndarray) - """ - assert np.all(allele_prob >= 0) - assert not np.any(np.isnan(allele_prob)) - imputed_alleles = np.argmax(allele_prob, axis=1) - max_allele_prob = np.max(allele_prob, axis=1) - assert len(imputed_alleles) == allele_prob.shape[0] - assert len(max_allele_prob) == allele_prob.shape[0] - return (imputed_alleles, max_allele_prob) - - -def run_beagle( - ref_h, query_h, pos, miscall_rate=1e-4, ne=1e6, genetic_map=None, debug=False -): - """ - Run a simplified version of the BEAGLE 4.1 imputation algorithm - based on Equation 1 of BB2016. - - Reference haplotypes and query haplotype are of size (m + x, h) and (m + x), - respectively, - - The site positions of all the markers are an array of size (m + x). - - This produces imputed alleles and interpolated allele probabilities - at the ungenotyped markers of the query haplotype. - - :param numpy.ndarray ref_h: Reference haplotypes. - :param numpy.ndarray query_h: One query haplotype. - :param numpy.ndarray pos: Site positions of all the markers. - :param float miscall_rate: Allelic miscall rate. - :param int ne: Effective population size. - :param msprime.RateMap genetic_map: Genetic map. - :param bool debug: Whether to print intermediate results. - :return: Imputed alleles and their associated probabilities. - :rtype: tuple(numpy.ndarray, numpy.ndarray) - """ - # Set precision for printing. - if debug: - np.set_printoptions(precision=20) - assert ref_h.shape[0] == len(pos) - assert len(query_h) == len(pos) - # Indices of genotyped markers in the query haplotype. - genotyped_idx = np.where(query_h != -1)[0] - assert len(genotyped_idx) > 1 - # Indices of ungenotyped markers in the query haplotype. - ungenotyped_idx = np.where(query_h == -1)[0] - assert len(ungenotyped_idx) > 0 - # Site positions of genotyped markers. - genotyped_pos = pos[genotyped_idx] - # Site positions of ungenotyped markers. - ungenotyped_pos = pos[ungenotyped_idx] - h = ref_h.shape[1] - m = len(genotyped_pos) - x = len(ungenotyped_pos) - assert m + x == len(pos) - # Subset reference haplotypes to genotyped markers. - ref_h_genotyped = ref_h[genotyped_idx, :] - if debug: - print("Reference haplotypes subsetted to genotyped markers") - print(ref_h_genotyped) - assert (m, h) == ref_h_genotyped.shape - # Subset query haplotype to genotyped markers. - query_h_genotyped = query_h[genotyped_idx] - if debug: - print("Query haplotype subsetted to genotyped markers") - print(query_h_genotyped) - assert m == len(query_h_genotyped) - # Set mismatch probabilities at genotyped markers. - mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) - if debug: - print("Mismatch probabilities") - print(mu) - assert m == len(mu) - # Set switch probabilities at genotyped markers. - rho = get_switch_prob(genotyped_pos, h, ne=ne) - if debug: - print("Switch probabilities") - print(rho) - assert m == len(rho) - # Compute forward probability matrix at genotyped markers. - fm = compute_forward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) - if debug: - print("Forward probability matrix") - print(fm) - assert (m, h) == fm.shape - # Compute backward probability matrix at genotyped markers. - bm = compute_backward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) - if debug: - print("Backward probability matrix") - print(bm) - assert (m, h) == bm.shape - # Compute HMM state probability matrix at genotyped markers. - sm = compute_state_prob_matrix(fm, bm) - if debug: - print("HMM state probability matrix") - print(sm) - assert (m + 1, h) == sm.shape - # Subset the reference haplotypes to ungenotyped markers. - ref_h_ungenotyped = ref_h[ungenotyped_idx, :] - # Interpolate allele probabilities at ungenotyped markers. - alleles = np.arange(4) # ACGT - i_allele_prob = interpolate_allele_probabilities( - sm, ref_h_ungenotyped, genotyped_pos, ungenotyped_pos, genetic_map=genetic_map - ) - if debug: - print("Interpolated allele probabilities") - print(i_allele_prob) - assert (x, len(alleles)) == i_allele_prob.shape - # Get MAP alleles at ungenotyped markers. - imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob) - if debug: - print("Imputed alleles") - print(imputed_alleles) - print("Max allele probabilities") - print(max_allele_prob) - assert x == len(imputed_alleles) - assert x == len(max_allele_prob) - return (imputed_alleles, max_allele_prob) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py index dee2c01f74..7de8667336 100644 --- a/python/tests/beagle_numba.py +++ b/python/tests/beagle_numba.py @@ -2,12 +2,9 @@ Implementation of the BEAGLE 4.1 algorithm to impute alleles by linear interpolation of state probabilities at ungenotyped positions in the hidden state probability matrix. -This was implemented while closely consulting the BEAGLE 4.1 paper: -Browning & Browning (2016). Genotype imputation with millions of reference samples. -Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 - -The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was closely consulted: -https://faculty.washington.edu/browning/beagle/b4_1.html +This was implemented while closely consulting the BEAGLE 4.1 paper and source code: +* Browning & Browning (2016). Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 +* Source code: https://faculty.washington.edu/browning/beagle/b4_1.html These notations are used throughout: h = number of reference haplotypes. @@ -15,18 +12,28 @@ x = number of ungenotyped positions. This implementation takes the following inputs: -1. Panel of reference haplotypes in a matrix of size (m + x, h). -2. One query haplotype in an array of size (m + x). -3. Physical positions of all the positions in an array of size (m + x). +* Panel of reference haplotypes in a matrix of size (m + x, h). +* Query haplotype in an array of size (m + x). +* Physical positions of all the markers in an array of size (m + x). +* Genetic map. In the query haplotype: -1. The genotyped positions take values of 0, 1, 2, or 3 (ACGT encoding). -2. The ungenotyped positions take -1. - -The forward and backward probability matrices are of size (m, h). -The state probability matrix is of size (m, h). -The imputed allele probability matrix is of size (x, 4), -The imputed alleles are the maximum a posteriori (MAP) alleles. +* Genotyped positions take values of 0, 1, 2, or 3 (i.e. ACGT encoding). +* Ungenotyped positions take -1. + +The following objects are computed: +* Forward and backward probability matrices of size (m, h). +* Hidden state probability matrix of size (m, h). +* Interpolated state probability matrix of size (x, h). +* Imputed allele probability matrix of size (x, 4), +* Imputed alleles as the maximum a posteriori alleles. + +The following evaluation metrics are produced in VCF format: +* Estimated allelic R-squared (AR2). +* Dosage R-squared (DR2). +* Estimated allele frequency (AF). +* Genotype probabilities of 00, 01/10, and 11 (GP). +* Estimated dosage (DS). To improve computational efficiency, BEAGLE uses aggregated markers, which are clusters of markers within a 0.005 cM interval (default).