From 2178c3cfd1f0ca56aea75a2fbdbc5d82b53f3d4b Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Mon, 7 Aug 2023 18:19:00 +0100 Subject: [PATCH] Add tests to check LS HMM of _tskit.lshmm compared to BEAGLE --- python/tests/beagle.py | 653 +++++++++++++++++++++++ python/tests/beagle_numba.py | 890 ++++++++++++++++++++++++++++++++ python/tests/test_imputation.py | 719 ++++++++++++++++++++++++++ 3 files changed, 2262 insertions(+) create mode 100644 python/tests/beagle.py create mode 100644 python/tests/beagle_numba.py create mode 100644 python/tests/test_imputation.py diff --git a/python/tests/beagle.py b/python/tests/beagle.py new file mode 100644 index 0000000000..47ebb6920b --- /dev/null +++ b/python/tests/beagle.py @@ -0,0 +1,653 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute alleles. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was also consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +These notations are used throughout this implementation: +h = number of reference haplotypes. +m = number of genotyped markers. +x = number of ungenotyped markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Site positions of all the markers in an array of size (m + x). + +In the query haplotype: +1. The genotyped markers take values of 0, 1, 2, or 3 (ACGT encoding). +2. The ungenotyped markers take -1. + +The forward and backward probability matrices are of size (m, h). +The HMM state probability matrix is of size (m + 1, h). +The interpolated allele probability matrix is of size (x, 4), +The imputed alleles are the maximum a posteriori (MAP) alleles. + +For efficiency, BEAGLE uses aggregated markers, which are clusters of markers +within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct "allele sequences". +Below, we do not use aggregated markers or allele sequences, which would complicate +the implementation. + +Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm, +this implementation computes Equation 1 of BB2016. We keep functions used +in a first attempt to faithfully implement the BEAGLE algorithm for documentation. +""" +import numpy as np + + +""" +TODO: Update use of genetic map. +TODO: Refactor and clean up this code. +TODO: Move useful documentation over to beagle_numba.py. +""" + + +def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None): + """ + Convert physical positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed + that the recombination rate is constant (1 cM / 1 Mb). + + If a genetic map is specified, then the genetic map positions are + either taken straight from it or interpolated from it. The genetic map + needs to contain physical positions and corresponding genetic positions. + See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details. + + :param numpy.ndarray pos: Physical positions (bp). + :param numpy.ndarray genetic_map_pos: Physical positions of genetic map. + :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map. + :return: Genetic map positions (cM). 
+ :rtype: numpy.ndarray + """ + if genetic_map_pos is None or genetic_map_cm is None: + return pos / 1e6 + # TODO: Better way to pass the components of a genetic map. + base_pos = genetic_map_pos + gen_pos = genetic_map_cm + assert len(base_pos) == len(gen_pos), "Bad genetic map." + assert np.all(pos >= base_pos[0]) and np.all( + pos <= base_pos[-1] + ), "Physical positions outside of genetic map." + left_idx = np.searchsorted(base_pos, pos, side="right") + xMa = np.zeros(len(pos), dtype=np.float32) + bMa = np.zeros(len(pos), dtype=np.float32) + fbMfa = np.zeros(len(pos), dtype=np.float32) + for i in range(len(pos)): + xMa[i] = pos[i] - base_pos[left_idx[i] - 1] + bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1] + fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1] + return xMa / bMa * fbMfa + + +def get_weights(genotyped_pos, ungenotyped_pos, genetic_map=None): + """ + Get weights at ungenotyped markers in the query haplotype, which are used in + linear interpolation of HMM state probabilities at ungenotyped markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at ungenotyped marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. + :param msprime.RateMap genetic_map: Genetic map. + :return: Weights and associated marker interval start indices. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + MIN_CM_DIST = 1e-7 # See `ImpData.java` in BEAGLE 4.1 source code + assert len(genotyped_pos) > 1 + assert len(ungenotyped_pos) > 0 + # Check that the two sets are mutually exclusive. + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert len(np.union1d(genotyped_pos, ungenotyped_pos)) == m + x + if genetic_map is None: + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) + else: + genotyped_cm = genetic_map.get_cumulative_mass(genotyped_pos) + ungenotyped_cm = genetic_map.get_cumulative_mass(ungenotyped_pos) + # Calculate weights for ungenotyped markers. + weights = np.zeros(x, dtype=np.float32) + # Identify genotype markers k and k + 1 sandwiching marker ungenotyped marker i. + marker_interval_start = np.zeros(x, dtype=np.int32) + for i in range(x): + if ungenotyped_pos[i] < genotyped_pos[0]: + # Ungenotyped marker is before first genotyped marker. + k = 0 + weights[i] = 1.0 + elif ungenotyped_pos[i] > genotyped_pos[-1]: + # Ungenotyped marker is after last genotyped marker. + k = m - 1 + weights[i] = 0.0 + else: + # Ungenotyped marker is between genotyped markers k and k + 1. + k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) # Inefficient + cm_mP1_x = genotyped_cm[k + 1] - ungenotyped_cm[i] + cm_mP1_m = genotyped_cm[k + 1] - genotyped_cm[k] + if cm_mP1_m == 0: + cm_mP1_m = MIN_CM_DIST + weights[i] = cm_mP1_x / cm_mP1_m + marker_interval_start[i] = k + assert 0 <= np.min(weights) + assert np.max(weights) <= 1 + return (weights, marker_interval_start) + + +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. 
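+
+# A minimal sketch (hypothetical helper) of get_weights() above, assuming the
+# constant-rate 1 cM / 1 Mb conversion (no genetic map): genotyped markers at
+# 10 kb and 30 kb with ungenotyped markers at 15 kb and 25 kb give weights of
+# (0.030 - 0.015) / 0.020 = 0.75 and (0.030 - 0.025) / 0.020 = 0.25.
+def _example_get_weights():
+    typed_pos = np.array([10_000, 30_000])
+    untyped_pos = np.array([15_000, 25_000])
+    weights, start = get_weights(typed_pos, untyped_pos)
+    # weights == [0.75, 0.25]; both ungenotyped markers fall in interval 0.
+    return weights, start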
+ :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + assert isinstance(miscall_rate, float) + if miscall_rate >= 0.5: + # TODO: Print warning. + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float32) + miscall_rate + return mu + + +def get_switch_prob(pos, h, ne, genetic_map=None): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :param msprime.RateMap genetic_map: Genetic map. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert np.min(pos) > 0 + assert isinstance(h, int) + assert isinstance(ne, float) + # Get genetic distance between adjacent markers. + if genetic_map is None: + cm = convert_to_genetic_map_position(pos) + else: + cm = genetic_map.get_cumulative_mass(pos) + d = np.zeros(len(pos), dtype=np.float32) + d[0] = cm[0] + d[1:] = cm[1:] - cm[:-1] + assert len(d) == len(pos) + rho = -np.expm1(-(4 * ne * d / h)) + assert len(d) == len(rho) + return rho + + +def compute_forward_matrix(ref_h, query_h, rho, mu): + """ + Implement forward algorithm to compute a forward probablity matrix of size (m, h). + + Reference haplotypes and query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + alleles = np.arange(4) # ACGT + assert np.all(np.isin(ref_h, alleles)) + assert np.all(np.isin(query_h, alleles)) + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) + assert np.max(rho) <= 1 + assert 0 <= np.min(mu) + assert np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float32) + last_sum = 1.0 # Part of normalization factor + for i in range(m): + # Get site-specific parameters. + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype. + query_a = query_h[i] + for j in range(h): + # Get allele at genotyped marker i on reference haplotype j. + ref_a = ref_h[i, j] + # Get emission probability. + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + fm[i, j] = em_prob + if i == 0: + fm[i, j] *= 1.0 / h + else: + fm[i, j] *= scale * fm[i - 1, j] + shift + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if i == 0 else site_sum + return fm + + +def compute_backward_matrix(ref_h, query_h, rho, mu): + """ + Implement backward algorithm to compute a backward probablity matrix of size (m, h). + + Reference haplotypes and query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. 
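+
+# A minimal sketch (hypothetical helper and values) of get_switch_prob() above:
+# with h = 100 reference haplotypes, ne = 10, and genotyped markers 10 kb apart
+# (0.01 cM under the constant-rate map), each switch probability is
+# 1 - exp(-4 * ne * d / h) = 1 - exp(-0.004), roughly 0.004.
+def _example_get_switch_prob():
+    pos = np.array([10_000, 20_000, 30_000])
+    return get_switch_prob(pos, h=100, ne=10.0)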
+ :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + alleles = np.arange(4) # ACGT + assert np.all(np.isin(ref_h, alleles)) + assert np.all(np.isin(query_h, alleles)) + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) + assert np.max(rho) <= 1 + assert 0 <= np.min(mu) + assert np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float32) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in range(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in range(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def _get_reference_haplotype_segments(a, b): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Assuming all biallelic sites, get the index of a reference haplotype segment (i) + defined by the alleles a and b at two adjacent genotyped markers. + + #i, a, b + 0, 0, 0 + 1, 1, 0 + 2, 0, 1 + 3, 1, 1 + + See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 + + This is a helper function for `_compute_state_probability_matrix`. + """ + ref_hap_idx = a * 1 + b * 2 + return ref_hap_idx + + +def _compute_state_prob_matrix(fm, bm, ref_h, query_h, rho, mu): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Implement the forward-backward algorithm to compute: + 1. A HMM state probability matrix across genotyped markers. + 2. "Forward haplotype probabilities". + 3. "Backward haplotype probabilities". + + The HMM state probability matrix is of size (m + 1, h). + + Note that the additional column at m + 1 is for interpolation of + ungenotyped markers right of the last genotyped marker. + + Both the forward and backward haplotype probability matrices + are of size (m, 2), assuming only biallelic sites. + + Reference haplotypes and query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + This implementation is based on the function `setStateProbs` + of `LSHapBaum.java` in BEAGLE. It performs the computation + described in Equation 2 of BB2016. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix, + forward haplotype probability matrix, and + backward haplotype probability matrix. 
+ :rtype: tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray) + """ + alleles = np.arange(4) # ACGT + m = fm.shape[0] + h = fm.shape[1] + assert (m, h) == bm.shape + assert not np.any(fm < 0) + assert not np.any(np.isnan(fm)) + assert not np.any(bm < 0) + assert not np.any(np.isnan(bm)) + assert np.all(np.isin(ref_h, alleles)) + assert np.all(np.isin(query_h, alleles)) + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) + assert np.max(rho) <= 1 + assert 0 <= np.min(mu) + assert np.max(mu) <= 0.5 + # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float32) + fwd_hap_probs = np.zeros((m, 4), dtype=np.float32) + bwd_hap_probs = np.zeros((m, 4), dtype=np.float32) + for i in range(m - 1, -1, -1): + for j in range(h): + sm[i, j] = fm[i, j] * bm[i, j] + ref_hap_seg_mP1 = _get_reference_haplotype_segments( + ref_h[i, j], ref_h[i + 1, j] + ) + ref_hap_seg_m = _get_reference_haplotype_segments( + ref_h[i - 1, j], ref_h[i, j] + ) + fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] + bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] + sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) + fwd_hap_probs[i, :] /= sum_fwd_hap_probs + bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return (sm, fwd_hap_probs, bwd_hap_probs) + + +def _interpolate_allele_prob( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, ungenotyped_pos, genetic_map=None +): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Compute the interpolated allele probabilities at the ungenotyped markers. + + The forward and backward haplotype probability matrices are of size (m, 2). + The interpolated allele probability matrix is of size (x, 2). + We assume all biallelic sites, hence the 2. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix. + :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix. + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. + :param msprime.RateMap genetic_map: Genetic map. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert (m, 2) == fwd_hap_probs.shape + assert (m, 2) == bwd_hap_probs.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + assert m == len(genotyped_cm) + ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) + assert x == len(ungenotyped_cm) + weights, _ = get_weights(genotyped_pos, ungenotyped_pos, genetic_map=genetic_map) + assert x == len(weights) + i_hap_probs = np.zeros((x, 2), dtype=np.float32) + for i in range(x): + # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. + if ungenotyped_pos[i] < genotyped_pos[0]: + k = 0 + elif ungenotyped_pos[i] > genotyped_pos[-1]: + k = m - 2 + else: + k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) + # Vectorised over the allelic states. 
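+        # Equation 2 of BB2016 (rearranged): interpolated probability =
+        # lambda * backward hap probs at marker k
+        # + (1 - lambda) * forward hap probs at marker k.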
+ i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :] + i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :] + return i_hap_probs + + +def compute_state_prob_matrix(fm, bm): + """ + Compute the HMM state probability matrix, in which each element is + the probability of a HMM state at a genotyped marker. + + Implementing this is simpler than faithfully reproducing the BEAGLE 4.1 algorithm. + + The output HMM state probability matrix is of size (m + 1, h). + + Note that the additional column at m + 1 is used for interpolation of + ungenotyped markers to the right of the last genotyped marker. + + The forward and backward probability matrices are of size (m, h). + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + assert fm.shape == bm.shape + m = fm.shape[0] + h = bm.shape[1] + assert not np.any(fm < 0) + assert not np.any(np.isnan(fm)) + assert not np.any(bm < 0) + assert not np.any(np.isnan(bm)) + # m + 1 columns to allow for ungenotyped markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float32) + sm[:-1, :] = np.multiply(fm, bm) + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return sm + + +def interpolate_allele_probabilities( + sm, ref_h, genotyped_pos, ungenotyped_pos, genetic_map=None +): + """ + Compute the interpolated allele probabilities at ungenotyped markers of + a query haplotype following Equation 1 of BB2016. + + The interpolated allele probability matrix is of size (x, a), + where a is the number of alleles. + + This function takes the output of `compute_state_prob_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m + 1, h). + 2. reference haplotypes subsetted to ungenotyped markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions at ungenotyped markers. + :param msprime.RateMap genetic_map: Genetic map. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + alleles = np.arange(4) # ACGT + assert not np.any(sm < 0) + assert not np.any(np.isnan(sm)) + m = sm.shape[0] - 1 + h = sm.shape[1] + assert m == len(genotyped_pos) + x = len(ungenotyped_pos) + assert (x, h) == ref_h.shape + weights, marker_interval_start = get_weights( + genotyped_pos, ungenotyped_pos, genetic_map=genetic_map + ) + assert x == len(weights) + assert x == len(marker_interval_start) + p = np.zeros((x, len(alleles)), dtype=np.float32) + # Compute allele probabilities as per Equation 1 in BB2016. + for a in alleles: + # Going from left to right. + for i in range(x): + # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. + k = marker_interval_start[i] + # Sum over the reference haplotypes where the allele is a at marker i. + has_a = ref_h[i, :] == a + p[i, a] += np.sum(weights[i] * sm[k, has_a]) + p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) + # Rescale probabilities. + p_rescaled = p / np.sum(p, axis=1)[:, np.newaxis] + return p_rescaled + + +def get_map_alleles(allele_prob): + """ + Compute the maximum a posteriori alleles at ungenotyped markers of a query haplotype. + + The imputed alleles is an array of size x. 
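+
+    For example, an interpolated allele probability row of [0.1, 0.7, 0.1, 0.1]
+    yields imputed allele 1 with probability 0.7.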
+ + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + + :param numpy.ndarray allele_prob: Interpolated allele probabilities. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + assert np.all(allele_prob >= 0) + assert not np.any(np.isnan(allele_prob)) + imputed_alleles = np.argmax(allele_prob, axis=1) + max_allele_prob = np.max(allele_prob, axis=1) + assert len(imputed_alleles) == allele_prob.shape[0] + assert len(max_allele_prob) == allele_prob.shape[0] + return (imputed_alleles, max_allele_prob) + + +def run_beagle( + ref_h, query_h, pos, miscall_rate=1e-4, ne=1e6, genetic_map=None, debug=False +): + """ + Run a simplified version of the BEAGLE 4.1 imputation algorithm + based on Equation 1 of BB2016. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x), + respectively, + + The site positions of all the markers are an array of size (m + x). + + This produces imputed alleles and interpolated allele probabilities + at the ungenotyped markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :param msprime.RateMap genetic_map: Genetic map. + :param bool debug: Whether to print intermediate results. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + # Set precision for printing. + if debug: + np.set_printoptions(precision=20) + assert ref_h.shape[0] == len(pos) + assert len(query_h) == len(pos) + # Indices of genotyped markers in the query haplotype. + genotyped_idx = np.where(query_h != -1)[0] + assert len(genotyped_idx) > 1 + # Indices of ungenotyped markers in the query haplotype. + ungenotyped_idx = np.where(query_h == -1)[0] + assert len(ungenotyped_idx) > 0 + # Site positions of genotyped markers. + genotyped_pos = pos[genotyped_idx] + # Site positions of ungenotyped markers. + ungenotyped_pos = pos[ungenotyped_idx] + h = ref_h.shape[1] + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert m + x == len(pos) + # Subset reference haplotypes to genotyped markers. + ref_h_genotyped = ref_h[genotyped_idx, :] + if debug: + print("Reference haplotypes subsetted to genotyped markers") + print(ref_h_genotyped) + assert (m, h) == ref_h_genotyped.shape + # Subset query haplotype to genotyped markers. + query_h_genotyped = query_h[genotyped_idx] + if debug: + print("Query haplotype subsetted to genotyped markers") + print(query_h_genotyped) + assert m == len(query_h_genotyped) + # Set mismatch probabilities at genotyped markers. + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + if debug: + print("Mismatch probabilities") + print(mu) + assert m == len(mu) + # Set switch probabilities at genotyped markers. + rho = get_switch_prob(genotyped_pos, h, ne=ne) + if debug: + print("Switch probabilities") + print(rho) + assert m == len(rho) + # Compute forward probability matrix at genotyped markers. + fm = compute_forward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + if debug: + print("Forward probability matrix") + print(fm) + assert (m, h) == fm.shape + # Compute backward probability matrix at genotyped markers. 
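+    # Together with the forward matrix, this yields the HMM state
+    # probabilities used for interpolation below.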
+ bm = compute_backward_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + if debug: + print("Backward probability matrix") + print(bm) + assert (m, h) == bm.shape + # Compute HMM state probability matrix at genotyped markers. + sm = compute_state_prob_matrix(fm, bm) + if debug: + print("HMM state probability matrix") + print(sm) + assert (m + 1, h) == sm.shape + # Subset the reference haplotypes to ungenotyped markers. + ref_h_ungenotyped = ref_h[ungenotyped_idx, :] + # Interpolate allele probabilities at ungenotyped markers. + alleles = np.arange(4) # ACGT + i_allele_prob = interpolate_allele_probabilities( + sm, ref_h_ungenotyped, genotyped_pos, ungenotyped_pos, genetic_map=genetic_map + ) + if debug: + print("Interpolated allele probabilities") + print(i_allele_prob) + assert (x, len(alleles)) == i_allele_prob.shape + # Get MAP alleles at ungenotyped markers. + imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob) + if debug: + print("Imputed alleles") + print(imputed_alleles) + print("Max allele probabilities") + print(max_allele_prob) + assert x == len(imputed_alleles) + assert x == len(max_allele_prob) + return (imputed_alleles, max_allele_prob) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py new file mode 100644 index 0000000000..f01252acbc --- /dev/null +++ b/python/tests/beagle_numba.py @@ -0,0 +1,890 @@ +""" +Implementation of the BEAGLE algorithm to impute alleles by linear interpolation of +state probabilities at ungenotyped markers in the hidden state probability matrix. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was closely consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +These notations are used throughout: +h = number of reference haplotypes. +m = number of genotyped markers. +x = number of ungenotyped markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Physical positions of all the markers in an array of size (m + x). + +In the query haplotype: +1. The genotyped positions take values of 0, 1, 2, or 3 (ACGT encoding). +2. The ungenotyped positions take -1. + +The forward and backward probability matrices are of size (m, h). +The state probability matrix is of size (m, h). +The imputed allele probability matrix is of size (x, 4), +The imputed alleles are the maximum a posteriori (MAP) alleles. + +To improve computational efficiency, BEAGLE uses aggregated markers, +which are clusters of markers within a 0.005 cM interval (default). +Because the genotypes are phased, the alleles in the aggregated markers +form distinct "allele sequences". Below, we do not use aggregated markers +or allele sequences, which would complicate the implementation. + +Rather than trying to exactly replicate the original BEAGLE algorithm, +this implementation uses Equation 1 of BB2016. +""" +import warnings +from dataclasses import dataclass + +import numpy as np +from numba import njit + +import _tskit +import tskit + + +__VERSION__ = "0.0.0" +__DATE__ = "20XXXXXX" + +_ACGT_ALLELES = [0, 1, 2, 3, tskit.MISSING_DATA] + + +@dataclass(frozen=True) +class GeneticMap: + """ + A genetic map containing: + * Physical positions (bp). + * Genetic map positions (cM). 
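+
+    For example, a uniform 1 cM / 1 Mb map over 2 Mb is (a minimal sketch)
+    GeneticMap(base_pos=np.array([1, 1_000_000, 2_000_000]),
+               gen_pos=np.array([0.0, 1.0, 2.0])).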
+ """ + + base_pos: np.ndarray + gen_pos: np.ndarray + + def __post_init__(self): + assert len(self.base_pos) == len( + self.gen_pos + ), "Lengths of physical positions and genetic map positions differ." + assert np.all( + self.base_pos[1:] > self.base_pos[:-1] + ), "Physical positions are not in strict ascending order." + + +@dataclass(frozen=True) +class ImpData: + """ + Imputation data containing: + * Individual names. + * Physical positions of imputed sites (bp). + * Designated REF allele at each site. + * Designated ALT allele at each site. + * Imputed alleles at each site. + * Imputed allele probabilities at each site. + + Assume that all the sites are biallelic. + + Let x = number of imputed sites and q = number of query haplotypes. + Since the query haplotypes are from diploid individuals, q is equal to + twice the number of individuals. + + Imputed alleles is a matrix of size (q, x). + Imputed allele probabilities is a matrix of size (q, x). + """ + + individual_names: list + site_pos: np.ndarray + refs: np.ndarray + alts: np.ndarray + alleles: np.ndarray + allele_probs: np.ndarray + + def __post_init__(self): + assert len(self.individual_names) > 0, "There must be at least one individual." + assert len(self.site_pos) > 0, "There must be at least one site." + assert self.alleles.shape[0] / 2 == len( + self.individual_names + ), "Number of query haplotypes is not equal to twice the number of individuals." + assert len(self.site_pos) == len( + self.alts + ), "Number of sites in refs is not equal to the number of site positions." + assert len(self.site_pos) == len( + self.refs + ), "Number of sites in alts is not equal to the number of site positions." + assert ( + len(self.site_pos) == self.alleles.shape[1] + ), "Number of sites in alleles is not equal to the number of site positions." + assert ( + self.alleles.shape == self.allele_probs.shape + ), "Dimensions in alleles and allele probabilities don't match." + for i in range(len(self.alleles)): + assert np.all(np.isin(self.alleles[:, i], [self.refs[i], self.alts[i]])) + assert np.all( + np.isin(np.unique(self.refs), _ACGT_ALLELES) + ), "Unrecognized alleles are in REF alleles." + assert np.all( + np.isin(np.unique(self.alts), _ACGT_ALLELES) + ), "Unrecognized alleles are in ALT alleles." + assert np.all( + np.isin(np.unique(self.alleles), _ACGT_ALLELES) + ), "Unrecognized alleles are in alleles." + + @property + def num_sites(self): + return len(self.site_pos) + + @property + def num_samples(self): + return self.alleles.shape[0] + + @property + def num_individuals(self): + return len(self.individual_names) + + def get_ref_allele_at_site(self, i): + return self.refs[i] + + def get_alt_allele_at_site(self, i): + return self.alts[i] + + def get_alleles_at_site(self, i): + idx_hap1 = np.arange(0, self.num_samples, 2) + idx_hap2 = np.arange(1, self.num_samples, 2) + a1 = self.alleles[idx_hap1, i] + ap1 = self.allele_probs[idx_hap1, i] + a2 = self.alleles[idx_hap2, i] + ap2 = self.allele_probs[idx_hap2, i] + return a1, ap1, a2, ap2 + + +""" Helper functions. """ + + +def remap_alleles(a): + """ + Map an array of alleles encoded as characters to integers. + + :param np.ndarray a: Alleles. + :return: Recoded alleles. 
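+    For example, remap_alleles(np.array(["A", "C", "G", "T", ""], dtype=object))
+    returns array([0, 1, 2, 3, -1], dtype=int8).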
+ :rtype: np.ndarray(dtype=np.int8) + """ + _ALLELES_ACGT = "ACGT" + b = np.zeros(len(a), dtype=np.int8) - 1 # Encoded as missing by default + for i in range(len(a)): + if a[i] in [None, ""]: + continue + elif a[i] in _ALLELES_ACGT: + b[i] = _ALLELES_ACGT.index(a[i]) + else: + raise AssertionError(f"Allele {a[i]} is not recognised.") + return b + + +def check_data(ref_h, query_h): + """ + For each site, check if the alleles in the query haplotype + are represented in the reference haplotypes. + + Missing data (i.e. -1) are ignored. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :return: True if alleles in query are in reference at all the sites. + :rtype: bool + """ + m = ref_h.shape[0] + assert m == len(query_h), "Reference and query haplotypes differ in length." + for i in range(m): + if query_h[i] != tskit.MISSING_DATA: + ref_a = np.unique(ref_h[i, :]) + if query_h[i] not in ref_a: + print(f"Allele {query_h[i]} at {i}-th site is not in reference.") + return False + return True + + +def convert_to_genetic_map_positions(pos, genetic_map=None): + """ + Convert physical positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed + that the recombination rate is constant (1 cM / 1 Mb). + + If a genetic map is specified, then the genetic map positions are + either taken straight from it or interpolated from it. The genetic map + needs to contain physical positions and corresponding genetic map positions. + See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details. + + :param numpy.ndarray pos: Physical positions (bp). + :param GeneticMap genetic_map: Genetic map. + :return: Genetic map positions (cM). + :rtype: numpy.ndarray + """ + if genetic_map is None: + return pos / 1e6 + assert np.all(pos >= genetic_map.base_pos[0]) and np.all( + pos < genetic_map.base_pos[-1] + ), "Physical positions outside of genetic map." + # Approximate genetic map distances by linear interpolation. + # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i]. + right_idx = np.searchsorted(genetic_map.base_pos, pos, side="right") + est_cm = np.zeros(len(pos), dtype=np.float64) # BEAGLE 4.1 uses double in Java. + for i in range(len(pos)): + a = genetic_map.base_pos[right_idx[i] - 1] + b = genetic_map.base_pos[right_idx[i]] + fa = genetic_map.gen_pos[right_idx[i] - 1] + fb = genetic_map.gen_pos[right_idx[i]] + assert pos[i] >= a, f"Query position not >= left-bound position: {pos[i]}, {a}." + assert ( + fb >= fa + ), f"Genetic map positions not in monotonically ascending order: {fb}, {fa}." + est_cm[i] = fa + est_cm[i] += (fb - fa) * (pos[i] - a) / (b - a) + return est_cm + + +""" Li & Stephens HMM. """ + + +def get_mismatch_probs(pos, error_rate): + """ + Compute mismatch probabilities at genotyped positions. + + Mutation rates should be dominated by the rate of allele error, + which should be the main source of mismatch between query and + reference haplotypes. + + In BEAGLE 4.1/5.4, the default value is set to 1e-4 and capped at 0.50. + In IMPUTE5, the default value is also set to 1e-4. + + When using `_tskit.LsHmm`, this is `mu`. + + :param numpy.ndarray pos: Physical positions (bp). + :param float error_rate: Allele error rate. + :return: Mismatch probabilities. 
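+
+# A minimal sketch (hypothetical helper and map values) of
+# convert_to_genetic_map_positions() above: a query position halfway between
+# two genetic map anchors is linearly interpolated.
+def _example_interpolated_cm():
+    gmap = GeneticMap(
+        base_pos=np.array([1, 1_000_000, 2_000_000]),
+        gen_pos=np.array([0.0, 1.0, 3.0]),
+    )
+    # 1.5 Mb sits halfway between 1 Mb (1.0 cM) and 2 Mb (3.0 cM) -> 2.0 cM.
+    return convert_to_genetic_map_positions(np.array([1_500_000]), gmap)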
+ :rtype: numpy.ndarray + """ + MAX_ERROR_RATE = 0.50 + if error_rate >= MAX_ERROR_RATE: + error_rate = MAX_ERROR_RATE + mismatch_probs = np.zeros(len(pos), dtype=np.float64) + error_rate + return mismatch_probs + + +def get_transition_probs(cm, h, ne): + """ + Compute transition probabilities at genotyped positions. + + Note that in BEAGLE 4.1, the default value of `ne` is set to 1e6, + whereas in BEAGLE 5.4, the default value of `ne` is set to 1e5. + In BEAGLE, this default value was optimized empirically on + datasets from large outbred human populations. Also, in IMPUTE5, + the default value of `ne` is set to 1e6. + + If `h` is small and `ne` is large, the transition probabilities are ~1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + When using `_tskit.LsHmm`, this is `rho`. + + :param numpy.ndarray cm: Genetic map positions (cM). + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Transition probabilities. + :rtype: numpy.ndarray + """ + # Recombination rate of first site is always 0. + r = np.zeros(len(cm), dtype=np.float64) + r[1:] = np.diff(cm) + c = -0.04 * (ne / h) + rho = -1 * np.expm1(c * r) + return rho + + +@njit +def compute_forward_matrix(ref_h, query_h, tp, mp): + """ + Implement forward algorithm to compute a forward probablity matrix of size (m, h). + + Reference haplotypes and query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray tp: Per-site transition probabilities. + :param numpy.ndarray mp: Per-site mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + fm = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Part of normalization factor. + for i in range(m): + # Get site-specific parameters. + shift = tp[i] / h + scale = (1 - tp[i]) / last_sum + # Get allele at genotyped marker i on query haplotype. + query_a = query_h[i] + for j in range(h): + # Get allele at genotyped marker i on reference haplotype j. + ref_a = ref_h[i, j] + # Get emission probability. + em_prob = mp[i] if query_a != ref_a else 1.0 - mp[i] + fm[i, j] = em_prob + if i == 0: + fm[i, j] *= 1.0 / h # Prior probabilities. + else: + fm[i, j] *= scale * fm[i - 1, j] + shift + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if i == 0 else site_sum + return fm + + +@njit +def compute_backward_matrix(ref_h, query_h, tp, mp): + """ + Implement backward algorithm to compute a backward probablity matrix of size (m, h). + + Reference haplotypes and query haplotype are subsetted to genotyped positions. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + In BEAGLE 4.1, the values are kept one position at a time. Here, we keep the values + at all the positions. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray tp: Per-site transition probabilities. + :param numpy.ndarray mp: Per-site mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + bm = np.zeros((m, h), dtype=np.float64) + bm[-1, :] = 1.0 / h # Initialise the last column. 
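+    # Work backwards from the second-to-last position: multiply by the
+    # emission probability at position i + 1, renormalise, and mix in the
+    # transition probability (scale * value + shift), mirroring the forward
+    # pass.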
+ for i in range(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in range(h): + ref_a = ref_h[i + 1, j] + em_prob = mp[i + 1] if ref_a != query_a else 1.0 - mp[i + 1] + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - tp[i + 1]) / sum_site + shift = tp[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def compute_state_prob_matrix(fm, bm): + """ + Compute the hidden state probability matrix, in which each element is + the probability of an HMM state at a genotyped position. + + The forward and backward probability matrices are of size (m, h). + The output hidden state probability matrix is of size (m, h). + + Implementing this is simpler than faithfully reproducing BEAGLE 4.1. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :return: Hidden state probability matrix. + :rtype: numpy.ndarray + """ + assert fm.shape == bm.shape, "Forward and backward matrices differ in shape." + sm = np.multiply(fm, bm) + return sm + + +""" Genotype imputation. """ + + +@njit +def get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm): + """ + Get weights for the ungenotyped positions in a query haplotype, which are used in + linear interpolation of hidden state probabilities at the ungenotyped positions. + + In BB2016 (see below Equation 1), a weight between genotyped positions m and (m + 1) + bounding ungenotyped marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + This function looks for the right-bounding position instead of the left-bounding. + + :param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp). + :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp). + :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers (cM). + :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers (cM). + :return: Weights for ungenotyped positions and indices of right-bounding positions. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + _MIN_CM_DIST = 1e-7 # Avoid division by zero. + m = len(typed_pos) + x = len(untyped_pos) + # Identify genotype positions (m - 1) and m bounding ungenotyped position i. + # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i]. + right_idx = np.searchsorted(typed_pos, untyped_pos, side="right") + # Calculate weights for ungenotyped positions. + weights = np.zeros(x, dtype=np.float64) + for i in range(len(right_idx)): + k = right_idx[i] + if k == 0: + # Left of the first genotyped position. + weights[i] = 1.0 + elif k == m: + # Right of the last genotyped position. + weights[i] = 0.0 + else: + # Between two genotyped positions. + cm_m2x = typed_cm[k] - untyped_cm[i] + # TODO: Check if this a subtle source of error. + cm_m2mM1 = max(typed_cm[k] - typed_cm[k - 1], _MIN_CM_DIST) + weights[i] = cm_m2x / cm_m2mM1 + return (weights, right_idx) + + +@njit +def interpolate_allele_probs(sm, ref_h, typed_pos, untyped_pos, typed_cm, untyped_cm): + """ + Interpolate allele probabilities at the ungenotyped positions of a query haplotype + following Equation 1 of BB2016. + + The interpolated allele probability matrix is of size (x, a), + where a is the number of alleles. + + Note that this function takes: + 1. Hidden state probability matrix at genotyped positions of size (m, h). + 2. Reference haplotypes subsetted to ungenotyped positions of size (x, h). 
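+
+    For example, with a constant-rate map, genotyped positions at 10 kb and
+    30 kb and an ungenotyped position at 25 kb give a weight of
+    (0.030 - 0.025) / 0.020 = 0.25, so the probability of an allele at 25 kb
+    is 0.25 times the summed state probabilities at 10 kb plus 0.75 times
+    those at 30 kb, over the haplotypes carrying that allele.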
+ + :param numpy.ndarray sm: HMM state probability matrix at genotyped positions. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped positions. + :param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp). + :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp). + :param numpy.ndarray typed_cm: Genetic map positions at genotyped markers (cM). + :param numpy.ndarray untyped_cm: Genetic map positions at ungenotyped markers (cM). + :return: Imputed allele probabilities. + :rtype: numpy.ndarray + """ + # TODO: Allow for biallelic site matrix. Work with `_tskit.lshmm` properly. + alleles = np.arange(len(tskit.ALLELES_ACGT)) + m = sm.shape[0] + x = ref_h.shape[0] + h = ref_h.shape[1] + weights, right_idx = get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm) + probs = np.zeros((x, len(alleles)), dtype=np.float64) + for i in range(x): + k = right_idx[i] + w = weights[i] + for j in range(h): + for a in alleles: + if ref_h[i, j] == a: + if k == 0: + assert w == 1.0, "Weight should be 1.0." + probs[i, a] += sm[k, j] + elif k == m: + # FIXME: It's unclear how BEAGLE handles this case. + assert w == 0.0, "Weight should be 0.0." + probs[i, a] += sm[k - 1, j] + else: + probs[i, a] += w * sm[k - 1, j] + probs[i, a] += (1 - w) * sm[k, j] + # Zero trivial probabilities. + # TODO: Investigate whether this arbitrary threshold matters. + # _THRESHOLD = min(0.005, 1 / h) + # probs[probs < _THRESHOLD] = 0 + # Rescale probabilities. + # TODO: Check if this is necessary. Could this be a subtle source of error? + probs_rescaled = probs / np.sum(probs, axis=1)[:, np.newaxis] + return probs_rescaled + + +def get_map_alleles(allele_probs): + """ + Compute maximum a posteriori (MAP) alleles at the ungenotyped markers + of a query haplotype, based on posterior marginal allele probabilities. + + The imputed alleles is an array of size x. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + TODO: Investigate how often this happens and the effect of this arbitrary choice. + + :param numpy.ndarray allele_probs: Imputed allele probabilities. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + imputed_alleles = np.argmax(allele_probs, axis=1) + max_allele_probs = np.max(allele_probs, axis=1) + return (imputed_alleles, max_allele_probs) + + +def run_interpolation_beagle( + ref_h, + query_h, + pos, + ne=1e6, + error_rate=1e-4, + genetic_map=None, +): + """ + Perform a simplified procedure of interpolation-style imputation + based on Equation 1 of BB2016. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x). + + The physical positions of all the markers are an array of size (m + x). + + This produces imputed alleles and their probabilities at the ungenotyped markers + of the query haplotype. + + The default values of `ne` and `error_rate` are taken from BEAGLE 4.1, not 5.4. + In BEAGLE 5.4, the default value of `ne` is 1e5 and `error_rate` is data-dependent. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Physical positions of all the markers (bp). + :param int ne: Effective population size. + :param float error_rate: Allele error rate. + :param GeneticMap genetic_map: Genetic map. + :return: Imputed alleles and their probabilities. 
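+
+# A minimal usage sketch (hypothetical helper and toy values) of
+# run_interpolation_beagle(): three sites, two reference haplotypes in ACGT
+# encoding, with the middle site missing in the query.
+def _example_run_interpolation_beagle():
+    ref_h = np.array([[0, 1], [0, 1], [0, 1]], dtype=np.int32)
+    query_h = np.array([0, tskit.MISSING_DATA, 0], dtype=np.int32)
+    pos = np.array([10_000, 20_000, 30_000])
+    alleles, probs = run_interpolation_beagle(ref_h, query_h, pos, ne=10.0)
+    # The query matches reference haplotype 0, so allele 0 (A) is expected
+    # at the ungenotyped middle site.
+    return alleles, probs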
+ :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + warnings.warn("This function is experimental and not fully tested.", stacklevel=1) + warnings.warn( + "Check the reference and query haplotypes use the same allele encoding.", + stacklevel=1, + ) + # Set marker indices. + typed_idx = np.where(query_h != tskit.MISSING_DATA)[0] + untyped_idx = np.where(query_h == tskit.MISSING_DATA)[0] + # Set physical positions of markers. + typed_pos = pos[typed_idx] + untyped_pos = pos[untyped_idx] + # Get genetic map positions of markers. + typed_cm = convert_to_genetic_map_positions(typed_pos, genetic_map) + untyped_cm = convert_to_genetic_map_positions(untyped_pos, genetic_map) + # Subset haplotypes. + ref_h_typed = ref_h[typed_idx, :] + ref_h_untyped = ref_h[untyped_idx, :] + query_h_typed = query_h[typed_idx] + # Get rates at genotyped markers. + h = ref_h.shape[1] + transition_probs = get_transition_probs(typed_cm, h=h, ne=ne) + mismatch_probs = get_mismatch_probs(typed_pos, error_rate=error_rate) + # Compute matrices at genotyped markers. + fm = compute_forward_matrix( + ref_h_typed, query_h_typed, transition_probs, mismatch_probs + ) + bm = compute_backward_matrix( + ref_h_typed, query_h_typed, transition_probs, mismatch_probs + ) + sm = compute_state_prob_matrix(fm, bm) + # Perform linear interpolation. + imputed_allele_probs = interpolate_allele_probs( + sm=sm, + ref_h=ref_h_untyped, + typed_pos=typed_pos, + untyped_pos=untyped_pos, + typed_cm=typed_cm, + untyped_cm=untyped_cm, + ) + imputed_alleles, max_allele_probs = get_map_alleles(imputed_allele_probs) + return (imputed_alleles, max_allele_probs) + + +def run_tsimpute( + ref_ts, + query_h, + pos, + tp, + mp, + precision=22, + genetic_map=None, +): + """ + Perform interpolation-style imputation, except that the forward and backward + probability matrices are computed from a tree sequence. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x). + + The physical positions of all the markers are an array of size (m + x). + + This produces imputed alleles and their probabilities at the ungenotyped markers + of the query haplotype. + + TODO: Set default precision. What should it be? + TODO: Allow for imputation on user-specified genomic interval. + TODO: Handle `acgt_alleles` properly. + TODO: Put this function elsewhere. + + :param numpy.ndarray ref_ts: Tree sequence with reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Physical positions of all the markers (bp). + :param numpy.ndarray tp: Per-site transition probabilities. + :param numpy.ndarray mp: Per-site mismatch probabilities. + :param int precision: Precision when running Li & Stephens HMM. + :param GeneticMap genetic_map: Genetic map. + :return: Imputed alleles and their probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + warnings.warn( + "Check the reference and query haplotypes use the same allele encoding.", + stacklevel=1, + ) + # Set markers indices. + typed_idx = np.where(query_h != tskit.MISSING_DATA)[0] + untyped_idx = np.where(query_h == tskit.MISSING_DATA)[0] + # Set physical positions of markers. + typed_pos = pos[typed_idx] + untyped_pos = pos[untyped_idx] + # Get genetic map positions of markers. + typed_cm = convert_to_genetic_map_positions(typed_pos, genetic_map) + untyped_cm = convert_to_genetic_map_positions(untyped_pos, genetic_map) + # Get rates at genotyped markers. + transition_probs = tp[typed_idx] + mismatch_probs = mp[typed_idx] + # Subset haplotypes. 
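+    # The LS HMM runs on the genotyped sites only, while interpolation needs
+    # the reference alleles at the ungenotyped sites.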
+ ref_ts_typed = ref_ts.delete_sites(site_ids=untyped_idx) + ref_ts_untyped = ref_ts.delete_sites(site_ids=typed_idx) + ref_h_untyped = ref_ts_untyped.genotype_matrix(alleles=tskit.ALLELES_ACGT) + query_h_typed = query_h[typed_idx] + # Get matrices from tree sequence. + fm = _tskit.CompressedMatrix(ref_ts_typed._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ref_ts_typed._ll_tree_sequence) + # WARN: Be careful with argument `acgt_alleles`!!! + ls_hmm = _tskit.LsHmm( + ref_ts_typed._ll_tree_sequence, + recombination_rate=transition_probs, # Transition probabilities + mutation_rate=mismatch_probs, # Mismatch probabilities + acgt_alleles=True, + precision=precision, + ) + ls_hmm.forward_matrix(query_h_typed.T, fm) + ls_hmm.backward_matrix(query_h_typed.T, fm.normalisation_factor, bm) + sm = compute_state_prob_matrix(fm.decode(), bm.decode()) + # Perform linear interpolation. + imputed_allele_probs = interpolate_allele_probs( + sm=sm, + ref_h=ref_h_untyped, + typed_pos=typed_pos, + untyped_pos=untyped_pos, + typed_cm=typed_cm, + untyped_cm=untyped_cm, + ) + imputed_alleles, max_allele_probs = get_map_alleles(imputed_allele_probs) + return (imputed_alleles, max_allele_probs) + + +""" Evaluation metrics and printing of results. """ + + +# Individual-level data. +def compute_individual_scores(alleles_1, allele_probs_1, alleles_2, allele_probs_2): + """ + Compute genotype probabilities and allele dosages of diploid individuals at a site + based on posterior allele probabilities. + + Assume that all sites are biallelic. Otherwise, the calculation below is incorrect. + Note 0 refers to the major allele and 1 the minor allele in the reference panel. + + Genotype probabilities are: P(00), P(01 or 10), P(11). + Allele dosages are: 00 = 0, 01 or 10 = 1, 11 = 2. + + In BEAGLE 4.1 output, + GP: "Estimated Genotype Probability", and + DS: "Estimated ALT dose [P(RA) + P(AA)]". + + :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. + :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. + :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. + :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. + :return: Genotype probabilities and allele dosages. + :rtype: [numpy.ndarray, numpy.ndarray] + """ + n = len(alleles_1) # Number of individuals + assert len(alleles_2) == n, "Lengths of alleles differ." + assert n > 1, "There must be at least one individual." + assert len(allele_probs_1) == n, "Lengths of alleles and probabilities differ." + assert len(allele_probs_2) == n, "Lengths of alleles and probabilities differ." + gt_probs = np.zeros((n, 3), dtype=np.float32) + dosages = np.zeros(n, dtype=np.float32) + for i in range(n): + ap_hap1_0 = allele_probs_1[i] if alleles_1[i] == 0 else 1 - allele_probs_1[i] + ap_hap1_1 = 1 - ap_hap1_0 + ap_hap2_0 = allele_probs_2[i] if alleles_2[i] == 0 else 1 - allele_probs_2[i] + ap_hap2_1 = 1 - ap_hap2_0 + gt_probs[i, 0] = ap_hap1_0 * ap_hap2_0 # P(00) + gt_probs[i, 1] = ap_hap1_0 * ap_hap2_1 # P(01) + gt_probs[i, 1] += ap_hap1_1 * ap_hap2_0 # P(10) + gt_probs[i, 2] = ap_hap1_1 * ap_hap2_1 # P(11) + dosages[i] = gt_probs[i, 1] + 2 * gt_probs[i, 2] + return gt_probs, dosages + + +# Site-level data. +def compute_estimated_allelic_r_squared(gt_probs): + """ + Compute the estimated allelic R^2 at a site from posterior genotype probabilities. + + Assume that site is biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to the major allele and 1 the minor allele. 
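+
+# A minimal sketch (hypothetical helper and values) of
+# compute_individual_scores() above: two diploid individuals at one site,
+# each imputed as 0|1 with per-haplotype probability 0.9.
+def _example_individual_scores():
+    gt_probs, dosages = compute_individual_scores(
+        alleles_1=np.array([0, 0]),
+        allele_probs_1=np.array([0.9, 0.9]),
+        alleles_2=np.array([1, 1]),
+        allele_probs_2=np.array([0.9, 0.9]),
+    )
+    # Per individual: P(00) = 0.09, P(01 or 10) = 0.82, P(11) = 0.09,
+    # so the ALT dosage is 0.82 + 2 * 0.09 = 1.0.
+    return gt_probs, dosages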
+ + It is not the true allelic R^2, which needs access to true genotypes to compute. + The true allelic R^s is the squared correlation between true and imputed ALT dosages. + It has been shown the true and estimated allelic R-squared are highly correlated. + + In BEAGLE 4.1, it is AR2: "Allelic R-Squared: estimated squared correlation + between most probable REF dose and true REF dose". + + See the formulation in the Appendix 1 of Browning & Browning (2009). + Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005. + + :param numpy.ndarray alleles_1: Imputed alleles for haplotype 1. + :param numpy.ndarray allele_probs_1: Imputed allele probabilities for haplotype 1. + :param numpy.ndarray alleles_2: Imputed alleles for haplotype 2. + :param numpy.ndarray allele_probs_2: Imputed allele probabilities for haplotype 2. + :return: Estimated allelic R-squared. + :rtype: float + """ + n = len(gt_probs) # Number of individuals + z = np.argmax(gt_probs, axis=1) # Most likely imputed genotypes + u = gt_probs[:, 1] + 2 * gt_probs[:, 2] # E[X | y_i] + w = gt_probs[:, 1] + 4 * gt_probs[:, 2] # E[X^2 | y_i] + t_1 = (np.sum(z * u) - (1 / n) * (np.sum(u) * np.sum(z))) ** 2 + # TODO: Properly handle cases where terms 2 and/or 3 are 0. + t_2 = np.sum(w) - (1 / n) * np.sum(u) ** 2 + t_3 = np.sum(z**2) - (1 / n) * np.sum(z) ** 2 + est_allelic_rsq = -1 if t_2 * t_3 == 0 else t_1 / (t_2 * t_3) + return est_allelic_rsq + + +def compute_dosage_r_squared(): + """ + In BEAGLE 4.1, DR2: "Dosage R-Squared: estimated squared correlation + between estimated REF dose [P(RA) + 2 * P(RR)] and true REF dose". + """ + pass + + +def compute_allele_frequency(gt_probs, allele=1): + """ + Estimate the frequency of an allele at a site from posterior genotype probabilities. + + Assume that site is biallelic. Otherwise, the calculation below is incorrect. + Note that 0 refers to the major allele and 1 the minor allele. + + Input are the posterior genotype probabilities at a site for n diploid individuals: + P(00), P(01 or 10), and P(11). The matrix is of size (n, 3). + + See the note in "Standardized Allele-Frequency Error" in Browning & Browning (2009). + Am J Hum Genet. 84(2): 210–223. doi: 10.1016/j.ajhg.2009.01.005. + + In BEAGLE 4.1, AF: "Estimated ALT Allele Frequencies". + + :param np.ndarray gt_probs: Genotype probabilities at a site. + :param int allele: Allele (default = 1). + :return: Estimated allele frequency. + :rtype: float + """ + n = len(gt_probs) # Number of individuals + assert n > 0, "There must be at least one individual." + assert allele in [0, 1], f"Allele {allele} is not recognized." + if allele == 0: + est_allele_count_0 = 2 * np.sum(gt_probs[:, 0]) + np.sum(gt_probs[:, 1]) + return est_allele_count_0 / (2 * n) + else: + est_allele_count_1 = np.sum(gt_probs[:, 1]) + 2 * np.sum(gt_probs[:, 2]) + return est_allele_count_1 / (2 * n) + + +def write_vcf(ref_ts, impdata, out_file, chr_name="1"): + """ + Print imputation results in VCF format, following the VCF output of BEAGLE 4.1. + + x = number of imputed sites + n = number of diploid individuals + + TODO: Print VCF records for genotyped sites. + + :param tskit.TreeSequence ref_ts: Tree sequence with reference haplotypes. + :param ImpData impdata: Object containing imputation data. + :param str out_file: Path to output VCF file. + :param str chr_name: Chromosome name. 
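+
+# A minimal sketch (hypothetical helper and values) of
+# compute_allele_frequency() above: two individuals, each with genotype
+# probabilities (P(00), P(01 or 10), P(11)) = (0.09, 0.82, 0.09).
+def _example_allele_frequency():
+    gt_probs = np.array([[0.09, 0.82, 0.09], [0.09, 0.82, 0.09]])
+    # Estimated ALT allele count per individual is 0.82 + 2 * 0.09 = 1.0,
+    # giving an estimated ALT frequency of 0.5.
+    return compute_allele_frequency(gt_probs, allele=1)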
+ :return: None + """ + _HEADER = [ + "##fileformat=VCFv4.2", + f"##filedata={__DATE__}", + f"##source=tsimpute (version {__VERSION__})" + "##INFO=', + "##INFO=', + "##INFO=', + '##INFO=', + '##FORMAT=', + "##FORMAT=', + "##FORMAT=', + "##FORMAT=', + ] + _COL_NAMES = [ + "CHROM", + "POS", + "ID", + "REF", + "ALT", + "QUAL", + "FILTER", + "INFO", + "FORMAT", + ] + with open(out_file, "w") as f: + # Add header with metadata and definitions. + for line in _HEADER: + f.write(line + "\n") + # Add column names. + col_str = "#" + col_str += "\t".join(_COL_NAMES) + "\t" + col_str += "\t".join(impdata.individual_names) + f.write(col_str + "\n") + # Add VCF records. + for i in range(impdata.num_sites): + a1, ap1, a2, ap2 = impdata.get_alleles_at_site(i) + gt_probs, dosages = compute_individual_scores(a1, ap1, a2, ap2) + ar2 = round(compute_estimated_allelic_r_squared(gt_probs), 2) + line_str = chr_name + "\t" + line_str += str(int(impdata.site_pos[i])) + "\t" + line_str += str(i) + "\t" + REF = _ACGT_ALLELES[impdata.get_ref_allele_at_site(i)] + ALT = _ACGT_ALLELES[impdata.get_alt_allele_at_site(i)] + line_str += REF + "\t" + line_str += ALT + "\t" + # QUAL + line_str += "." + "\t" + # FILTER + line_str += "PASS" + "\t" + # INFO + dr2 = round(-1, 2) # TODO + af = round(compute_allele_frequency(gt_probs, allele=1), 2) + info_str = f"AR2={ar2};DR2={dr2};AF={af};IMP" + line_str += info_str + "\t" + # FORMAT + line_str += "GT:DS:GP" + "\t" + # DATA + data_str = "" + for j in range(impdata.num_individuals): + data_str += str(a1[j]) + "|" + str(a2[j]) + ":" + data_str += str(round(dosages[j], 2)) + ":" + data_str += ",".join([str(round(gt_probs[j, k], 2)) for k in range(3)]) + data_str += "\t" + line_str += data_str + f.write(line_str + "\n") diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..96cdaaf3a3 --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,719 @@ +""" +Tests for interpolation-style genotype imputation. +""" +import io + +import numpy as np +import pytest + +import tests.beagle +import tests.beagle_numba +import tskit + + +# The following toy query haplotypes were imputed from the toy reference haplotypes +# using BEAGLE 4.1 with Ne set to 10. +# +# There are 9 sites, starting from 10,000 to 90,000, with 10,000 increments. +# The REF is A and ALT is C for all the sites in the VCF input to BEAGLE. +# The ancestral state is A and derived state is C for all the sites. +# In this setup, A is encoded as 0 and C as 1 whether the allele encoding is +# ancestral/derived (0/1) or ACGT (0123). +# +# Case 0: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are missing in the query haplotypes. 
+# fmt: off +toy_ref_0 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_0 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_0.shape == (9, 2) +assert toy_query_0.shape == (2, 9) +assert toy_query_0_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_0[0] == 0) == 4 +assert np.sum(toy_query_0[0] == -1) == 5 +assert np.array_equal(toy_query_0[0], toy_query_0[1]) +assert np.all(toy_query_0_beagle_imputed[0] == 0) +assert np.all(toy_query_0_beagle_imputed[1] == 0) +assert toy_query_0_beagle_allele_probs.shape == (2, 2, 5) + +# Case 1: +# Reference haplotypes and query haplotypes have 1|1 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_1 = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_1 = np.array( + [ + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_imputed = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + # query_0, haplotype 1 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_1.shape == (9, 2) +assert toy_query_1.shape == (2, 9) +assert toy_query_1_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_1[0] == 1) == 4 +assert np.sum(toy_query_1[0] == -1) == 5 +assert np.array_equal(toy_query_1[0], toy_query_1[1]) +assert np.all(toy_query_1_beagle_imputed[0] == 1) +assert np.all(toy_query_1_beagle_imputed[1] == 1) +assert toy_query_1_beagle_allele_probs.shape == (2, 2, 5) + +# Case 2: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are genotyped in the query haplotypes. 
+# fmt: off +toy_ref_2 = np.copy(toy_ref_0) +toy_query_2 = np.array( + [ + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 0 + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_2.shape == (9, 2) +assert toy_query_2.shape == (2, 9) +assert toy_query_2_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_2[0] == 0) == 6 +assert np.sum(toy_query_2[0] == -1) == 3 +assert np.array_equal(toy_query_2[0], toy_query_2[1]) +assert np.all(toy_query_2_beagle_imputed[0] == 0) +assert np.all(toy_query_2_beagle_imputed[1] == 0) +assert toy_query_2_beagle_allele_probs.shape == (2, 2, 3) + +# Case 3: +# Reference haplotypes and query haplotypes have 0|1 at all sites. +# fmt: off +toy_ref_3 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_3 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_allele_probs = np.array( + # Ungenotyped markers only + # See comment https://github.com/tskit-dev/tskit/pull/2815#issuecomment-1708178257 + [ + # query_0, haplotype 0 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_3.shape == (9, 2) +assert toy_query_3.shape == (2, 9) +assert toy_query_3_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_3[0] == 0) == 4 +assert np.sum(toy_query_3[1] == 1) == 4 +assert np.sum(toy_query_3[0] == -1) == np.sum(toy_query_3[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_3_beagle_imputed[0], toy_query_3_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 4: +# Reference haplotypes and query haplotypes have alternating 0|1 and 1|0 genotypes. +# Query haplotypes have 0|1 at all genotyped sites. 
+# fmt: off
+toy_ref_4 = np.array(
+ [
+ [0, 1, 0, 1, 0, 1, 0, 1, 0], # ref_0, haplotype 0
+ [1, 0, 1, 0, 1, 0, 1, 0, 1], # ref_0, haplotype 1
+ ], dtype=np.int32
+).T
+toy_query_4 = np.array(
+ [
+ [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0
+ [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_4_beagle_imputed = np.array(
+ [
+ [1, 0, 1, 0, 1, 0, 1, 0, 1], # query_0, haplotype 0
+ [0, 1, 0, 1, 0, 1, 0, 1, 0], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_4_beagle_allele_probs = np.array(
+ # Ungenotyped markers only
+ [
+ # query_0, haplotype 0
+ [
+ [0.0 , 0.0 , 0.0, 0.0 , 0.9999998],
+ [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0],
+ ],
+ # query_0, haplotype 1
+ [
+ [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0],
+ [0.0 , 0.0 , 0.0, 0.0 , 0.9999998],
+ ],
+ ],
+ dtype=np.float32,
+)
+# fmt: on
+assert toy_ref_4.shape == (9, 2)
+assert toy_query_4.shape == (2, 9)
+assert toy_query_4_beagle_imputed.shape == (2, 9)
+assert np.sum(toy_query_4[0] == 0) == 4
+assert np.sum(toy_query_4[1] == 1) == 4
+assert np.sum(toy_query_4[0] == -1) == np.sum(toy_query_4[1] == -1) == 5
+assert np.all(
+ np.invert(np.equal(toy_query_4_beagle_imputed[0], toy_query_4_beagle_imputed[1]))
+)
+assert toy_query_4_beagle_allele_probs.shape == (2, 2, 5)
+
+# Case 5:
+# The reference panel has two individuals. The first individual has 0|0 at all sites,
+# and the second individual has 1|1 at all sites.
+# The query haplotypes have one recombination breakpoint in the middle.
+# fmt: off
+toy_ref_5 = np.array(
+ [
+ [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0
+ [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0
+ [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1
+ [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1
+ ], dtype=np.int32
+).T
+toy_query_5 = np.array(
+ [
+ [-1, 0, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 0
+ [-1, 1, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_5_beagle_imputed = np.array(
+ [
+ [0, 0, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 0
+ [1, 1, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_5_beagle_allele_probs = np.array(
+ # Ungenotyped markers only
+ [
+ # query_0, haplotype 0
+ [
+ [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912],
+ [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909],
+ ],
+ # query_0, haplotype 1
+ [
+ [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909],
+ [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912],
+ ],
+ ],
+ dtype=np.float32,
+)
+# fmt: on
+assert toy_ref_5.shape == (9, 4)
+assert toy_query_5.shape == (2, 9)
+assert toy_query_5_beagle_imputed.shape == (2, 9)
+assert np.sum(toy_query_5[0] == 0) == 3
+assert np.sum(toy_query_5[0] == 1) == 1
+assert np.sum(toy_query_5[0] == -1) == 5
+assert np.sum(toy_query_5[1] == 0) == 1
+assert np.sum(toy_query_5[1] == 1) == 3
+assert np.sum(toy_query_5[1] == -1) == 5
+assert np.all(
+ np.invert(np.equal(toy_query_5_beagle_imputed[0], toy_query_5_beagle_imputed[1]))
+)
+assert toy_query_5_beagle_allele_probs.shape == (2, 2, 5)
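+
+# Note on case 5: for haplotype 0 of query_0, the allele probabilities at the
+# ungenotyped marker between the last two genotyped markers (position 70,000) are
+# split roughly 0.55/0.45, consistent with the ~0.5 interpolation weight between
+# the genotyped markers flanking the recombination breakpoint, while the trailing
+# ungenotyped marker (position 90,000) gets weight 0 and so follows the HMM state
+# probabilities at the last genotyped marker (position 80,000).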
+
+# Case 6:
+# The reference panel has two individuals. The first individual has 0|0 at all sites,
+# and the second individual has 1|1 at all sites.
+# The query haplotypes have two recombination breakpoints in the middle.
+# fmt: off
+toy_ref_6 = np.copy(toy_ref_5)
+toy_query_6 = np.array(
+ [
+ [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0
+ [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_6_beagle_imputed = np.array(
+ [
+ [0, 0, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 0
+ [1, 1, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_6_beagle_allele_probs = np.array(
+ # Ungenotyped markers only
+ [
+ # query_0, haplotype 0
+ [
+ [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066],
+ [0.09016396, 0.53229386, 0.97451866 , 0.5316114 , 0.088609315],
+ ],
+ # query_0, haplotype 1
+ [
+ [0.09016395, 0.53229386, 0.97451866 , 0.5316114 , 0.0886093],
+ [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066],
+ ],
+ ],
+ dtype=np.float32,
+)
+# fmt: on
+assert toy_ref_6.shape == (9, 4)
+assert toy_query_6.shape == (2, 9)
+assert toy_query_6_beagle_imputed.shape == (2, 9)
+assert np.sum(toy_query_6[0] == 0) == 2
+assert np.sum(toy_query_6[0] == 1) == 2
+assert np.sum(toy_query_6[0] == -1) == 5
+assert np.sum(toy_query_6[1] == 0) == 2
+assert np.sum(toy_query_6[1] == 1) == 2
+assert np.sum(toy_query_6[1] == -1) == 5
+assert np.all(
+ np.invert(np.equal(toy_query_6_beagle_imputed[0], toy_query_6_beagle_imputed[1]))
+)
+assert toy_query_6_beagle_allele_probs.shape == (2, 2, 5)
+
+# Case 7:
+# Hand-crafted example.
+# fmt: off
+toy_ref_7 = np.array(
+ [
+ [0, 1, 2, 0, 1, 1, 2, 3, 1], # ref_0
+ [1, 1, 2, 3, 1, 1, 2, 3, 1], # ref_0
+ [0, 1, 3, 0, 1, 1, 2, 3, 1], # ref_1
+ [0, 2, 2, 3, 0, 1, 2, 0, 0], # ref_1
+ [0, 1, 2, 3, 0, 2, 3, 0, 0], # ref_2
+ [1, 1, 2, 3, 1, 1, 2, 3, 1], # ref_2
+ [0, 1, 3, 0, 1, 1, 2, 3, 1], # ref_3
+ [0, 1, 2, 3, 0, 1, 2, 3, 1], # ref_3
+ [0, 2, 2, 3, 0, 1, 2, 0, 0], # ref_4
+ [0, 1, 2, 3, 0, 2, 3, 0, 0], # ref_4
+ ], dtype=np.int32
+).T
+toy_query_7 = np.array(
+ [
+ [0, 1, -1, 0, -1, 2, -1, 3, 1], # query_0, haplotype 0
+ [1, 2, -1, 3, -1, 1, -1, 0, 0], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_7_beagle_imputed = np.array(
+ [
+ [0, 1, 3, 0, 1, 2, 2, 3, 1], # query_0, haplotype 0
+ [1, 2, 2, 3, 0, 1, 2, 0, 0], # query_0, haplotype 1
+ ], dtype=np.int32
+)
+toy_query_7_beagle_allele_probs = np.array(
+ # Ungenotyped markers only
+ [
+ # query_0, haplotype 0
+ [
+ [0.33317572, 0.0 , 0.99945664],
+ [0.66635144, 0.9991437, 0.0],
+ ],
+ # query_0, haplotype 1
+ [
+ [0.9995998, 0.99971414, 0.99974835],
+ [0.0 , 0.0 , 0.0],
+ ],
+ ],
+ dtype=np.float32,
+)
+# fmt: on
+assert toy_ref_7.shape == (9, 10)
+assert toy_query_7.shape == (2, 9)
+assert toy_query_7_beagle_imputed.shape == (2, 9)
+assert np.sum(toy_query_7[0] == -1) == np.sum(toy_query_7[1] == -1) == 3
+assert toy_query_7_beagle_allele_probs.shape == (2, 2, 3)
+
+# Case 8:
+# Same as case 7, except that the first and last genotyped markers are missing.
+# fmt: off +toy_ref_8 = np.copy(toy_ref_7) +toy_query_8 = np.array( + [ + [-1, 1, -1, 0, -1, 2, -1, 3, -1], # query_0, haplotype 0 + [-1, 2, -1, 3, -1, 1, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_imputed = np.array( + [ + [0, 1, 3, 0, 1, 2, 2, 3, 1], # query_0, haplotype 0 + [0, 2, 2, 3, 0, 1, 2, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.9997735, 0.33310473, 0.0 , 0.9992305, 0.9997736], + [0.0 , 0.66620946, 0.99893093, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999998, 0.9999998, 0.99991965, 0.0], + [0.0 , 0.0 , 0.0 , 0.0 , 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_8.shape == (9, 10) +assert toy_query_8.shape == (2, 9) +assert toy_query_8_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_8[0] == -1) == 5 +assert np.sum(toy_query_8[1] == -1) == 5 +assert toy_query_8_beagle_allele_probs.shape == (2, 2, 5) + + +@pytest.mark.parametrize( + "input_ref,input_query,expected", + [ + (toy_ref_0, toy_query_0, toy_query_0_beagle_imputed), + (toy_ref_1, toy_query_1, toy_query_1_beagle_imputed), + (toy_ref_2, toy_query_2, toy_query_2_beagle_imputed), + (toy_ref_3, toy_query_3, toy_query_3_beagle_imputed), + (toy_ref_4, toy_query_4, toy_query_4_beagle_imputed), + (toy_ref_5, toy_query_5, toy_query_5_beagle_imputed), + (toy_ref_6, toy_query_6, toy_query_6_beagle_imputed), + (toy_ref_7, toy_query_7, toy_query_7_beagle_imputed), + (toy_ref_8, toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_beagle_vanilla(input_ref, input_query, expected): + """Compare imputed alleles from Python BEAGLE implementation and BEAGLE 4.1.""" + assert input_query.shape == expected.shape + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + num_ungenotyped_markers = np.sum(input_query[0] == -1) + imputed_alleles = np.zeros( + (num_query_haps, num_ungenotyped_markers), dtype=np.int32 + ) + expected_ungenotyped = expected[:, input_query[0] == -1] + assert imputed_alleles.shape == expected_ungenotyped.shape + for i in range(num_query_haps): + imputed_alleles[i], _ = tests.beagle.run_beagle( + input_ref, input_query[i], pos, miscall_rate=1e-4, ne=10.0 + ) + np.testing.assert_array_equal(imputed_alleles[i], expected_ungenotyped[i]) + + +@pytest.mark.parametrize( + "input_ref,input_query", + [ + (toy_ref_0, toy_query_0), + (toy_ref_1, toy_query_1), + (toy_ref_2, toy_query_2), + (toy_ref_3, toy_query_3), + (toy_ref_4, toy_query_4), + (toy_ref_5, toy_query_5), + (toy_ref_6, toy_query_6), + (toy_ref_7, toy_query_7), + (toy_ref_8, toy_query_8), + ], +) +def test_beagle_numba(input_ref, input_query): + """Compare imputed alleles from vanilla and numba Python BEAGLE implementations.""" + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + for i in range(num_query_haps): + imputed_alleles_vanilla, _ = tests.beagle.run_beagle( + input_ref, + input_query[i], + pos, + ne=10.0, + miscall_rate=1e-4, + ) + imputed_alleles_numba, _ = tests.beagle_numba.run_interpolation_beagle( + input_ref, + input_query[i], + pos, + ne=10.0, + error_rate=1e-4, + ) + np.testing.assert_array_equal(imputed_alleles_vanilla, imputed_alleles_numba) + + +# Below is toy data set case 7 in tree sequence format. 
+toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 1 0.000000 0 3 +7 1 0.000000 0 3 +8 1 0.000000 0 4 +9 1 0.000000 0 4 +10 0 0.009923 0 -1 +11 0 0.038603 0 -1 +12 0 0.057935 0 -1 +13 0 0.145141 0 -1 +14 0 0.238045 0 -1 +15 0 0.528344 0 -1 +16 0 0.646418 0 -1 +17 0 1.462199 0 -1 +18 0 2.836600 0 -1 +19 0 3.142225 0 -1 +20 0 4.056253 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 100000.000000 10 1 +0.000000 100000.000000 10 5 +0.000000 100000.000000 11 3 +0.000000 100000.000000 11 8 +0.000000 100000.000000 12 2 +0.000000 100000.000000 12 6 +0.000000 100000.000000 13 0 +0.000000 100000.000000 13 12 +0.000000 100000.000000 14 10 +0.000000 100000.000000 14 13 +0.000000 40443.000000 15 9 +0.000000 40443.000000 15 14 +40443.000000 100000.000000 16 4 +40443.000000 100000.000000 16 9 +0.000000 40443.000000 17 4 +0.000000 100000.000000 17 11 +40443.000000 100000.000000 17 16 +0.000000 12721.000000 18 7 +0.000000 12721.000000 18 17 +12721.000000 100000.000000 19 7 +40443.000000 100000.000000 19 14 +12721.000000 40443.000000 19 15 +0.000000 12721.000000 20 15 +12721.000000 100000.000000 20 17 +0.000000 12721.000000 20 18 +12721.000000 100000.000000 20 19 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +10000.000000 A +20000.000000 C +30000.000000 G +40000.000000 T +50000.000000 A +60000.000000 C +70000.000000 G +80000.000000 T +90000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 10 unknown C -1 +1 11 unknown G -1 +2 12 unknown T -1 +3 13 unknown A -1 +4 14 unknown C -1 +5 16 unknown G -1 +6 16 unknown T -1 +7 17 unknown A -1 +8 19 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +0 +0 +""" + + +def get_toy_data(): + """Get toy example 7 in tree sequence format.""" + ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + return ts + + +def parse_matrix(csv_text): + # TODO: Maybe remove. + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text)) + + +@pytest.mark.parametrize( + "input_query,expected", + [ + (toy_query_7, toy_query_7_beagle_imputed), + (toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_tsimpute(input_query, expected): + """ + Compare imputed alleles from tsimpute and BEAGLE 4.1. + """ + toy_ref_ts = get_toy_data() # Same for both cases + pos = toy_ref_ts.sites_position + num_query_haps = input_query.shape[0] + mu = np.zeros(len(pos), dtype=np.float32) + 1e-8 + rho = np.zeros(len(pos), dtype=np.float32) + 1e-8 + expected_subset = expected[:, input_query[0] == -1] + for i in range(num_query_haps): + imputed_alleles, _ = tests.beagle_numba.run_tsimpute( + toy_ref_ts, + input_query[i], + pos, + rho, + mu, + ) + np.testing.assert_array_equal(imputed_alleles, expected_subset[i]) + + +# Tests for helper functions. +@pytest.mark.parametrize( + "typed_pos,untyped_pos,expected_weights,expected_idx", + [ + # All ungenotyped markers are between genotyped markers. + ( + np.array([10, 20]) * 1e6, + np.array([15]) * 1e6, + np.array([0.5]), + np.array([0]), + ), + # Same as above, but more genotyped markers. 
+ (
+ np.array([10, 20, 30, 40]) * 1e6,
+ np.array([15, 25, 35]) * 1e6,
+ np.array([0.5, 0.5, 0.5]),
+ np.array([0, 1, 2]),
+ ),
+ # One ungenotyped marker is left of the first genotyped marker.
+ (
+ np.array([10, 20]) * 1e6,
+ np.array([5, 15]) * 1e6,
+ np.array([1.0, 0.5]),
+ np.array([0, 0]),
+ ),
+ # One ungenotyped marker is right of the last genotyped marker.
+ (
+ np.array([10, 20]) * 1e6,
+ np.array([15, 25]) * 1e6,
+ np.array([0.5, 0.0]),
+ np.array([0, 1]),
+ ),
+ # Denominator below min threshold.
+ (np.array([10, 20]), np.array([15]), np.array([0.001]), np.array([0])),
+ ],
+)
+def test_get_weights(typed_pos, untyped_pos, expected_weights, expected_idx):
+ # beagle, no numba
+ actual_weights, actual_idx = tests.beagle.get_weights(typed_pos, untyped_pos)
+ np.testing.assert_allclose(actual_weights, expected_weights)
+ np.testing.assert_array_equal(actual_idx, expected_idx)
+ # beagle, numba
+ genotyped_cm = tests.beagle_numba.convert_to_genetic_map_positions(typed_pos)
+ ungenotyped_cm = tests.beagle_numba.convert_to_genetic_map_positions(untyped_pos)
+ actual_weights, actual_idx = tests.beagle_numba.get_weights(
+ typed_pos, untyped_pos, genotyped_cm, ungenotyped_cm
+ )
+ np.testing.assert_allclose(actual_weights, expected_weights)
+ np.testing.assert_array_equal(actual_idx, expected_idx)
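+
+
+# A minimal sketch of a sanity check for the per-site summary helpers in
+# tests.beagle, assuming compute_allele_frequency and
+# compute_estimated_allelic_r_squared keep the signatures used above.
+# The expected values are worked out by hand from the definitions in that module;
+# this is illustrative and not a comparison against BEAGLE output.
+@pytest.mark.parametrize(
+ "gt_probs,expected_af,expected_ar2",
+ [
+ # Two individuals called with full confidence: one hom. REF, one hom. ALT.
+ (np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]), 0.5, 1.0),
+ # No variation in the most probable genotypes: AR2 is undefined (-1 by convention).
+ (np.array([[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]), 0.0, -1),
+ ],
+)
+def test_per_site_summaries(gt_probs, expected_af, expected_ar2):
+ af = tests.beagle.compute_allele_frequency(gt_probs, allele=1)
+ ar2 = tests.beagle.compute_estimated_allelic_r_squared(gt_probs)
+ np.testing.assert_allclose(af, expected_af)
+ np.testing.assert_allclose(ar2, expected_ar2)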