From b00c9b4ce7972a15545761ea06ad11bab241a930 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Mon, 7 Aug 2023 18:19:00 +0100 Subject: [PATCH] Add tests to check LS HMM of _tskit.lshmm compared to BEAGLE --- python/tests/beagle.py | 689 ++++++++++++++++++++++++++++++++ python/tests/beagle_numba.py | 441 ++++++++++++++++++++ python/tests/test_imputation.py | 628 +++++++++++++++++++++++++++++ 3 files changed, 1758 insertions(+) create mode 100644 python/tests/beagle.py create mode 100644 python/tests/beagle_numba.py create mode 100644 python/tests/test_imputation.py diff --git a/python/tests/beagle.py b/python/tests/beagle.py new file mode 100644 index 0000000000..312c34e4ba --- /dev/null +++ b/python/tests/beagle.py @@ -0,0 +1,689 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute genotypes. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The source code of BEAGLE 4.1 (particularly `LSHapBaum.java`) was also consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +Here are some notations used throughout this implementation: +h = Number of reference haplotypes. +m = Number of genotyped markers. +x = Number of imputed markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Genomic site positions of all the markers in an array of size (m + x). + +For simplicity, we assume that: +1. All the sites are biallelic (0/1 encoding). +2. The genetic map is defined by equating 1 Mbp to 1 cM. + +In the query haplotype: +1. The genotyped markers take values of 0 or 1. +2. The imputed markers take -1. + +The forward and backward probability matrices are of size (m, h). +The HMM state probability matrix is of size (m + 1, h). +The interpolated haplotype probability matrix is of size (x, 2), +The imputed alleles in the query haplotype are the maximum a posteriori (MAP) alleles. + +For efficiency, BEAGLE uses aggregated markers, which are clusters of markers +within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct "allele sequences". +Below, we do not use aggregated markers or allele sequences, which would complicate +the implementation. + +Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm, +this implementation computes Equation 1 of BB2016. The functions used in an attempt +to faithfully implement the BEAGLE algorithm are kept for documentation. +""" +import numpy as np + +import _tskit + + +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed that 1 Mbp = 1cM. + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + assert 0 < np.min(pos) + return pos / 1e6 + + +def get_weights(genotyped_pos, imputed_pos): + """ + Get weights at imputed markers in the query haplotype. + + These weights are used in linear interpolation of HMM state probabilities + at imputed markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at imputed marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Weights used in linear interpolation. + :rtype: numpy.ndarray + """ + # Check that the two sets are mutually exclusive. + assert not np.any(np.isin(genotyped_pos, imputed_pos)) + # Set min genetic distance to avoid division by zero. + MIN_CM_DIST = 0.005 + m = len(genotyped_pos) + x = len(imputed_pos) + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + assert not np.any(np.isin(genotyped_cm, imputed_cm)) + # Calculate weights for imputed markers. + weights = np.zeros(x, dtype=np.float64) + # Identify genotype markers k and k + 1 between imputed marker i + for i in np.arange(x): + if imputed_cm[i] < genotyped_cm[0]: + # Special case: imputed marker is before the first genotyped marker. + k = 0 + weights[i] = 1.0 + elif imputed_cm[i] > genotyped_cm[-1]: + # Special case: imputed marker is after the last genotyped marker. + k = m - 1 + weights[i] = 0.0 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + cm_mP1_x = genotyped_cm[k + 1] - imputed_cm[i] + cm_mP1_m = np.max([genotyped_cm[k + 1] - genotyped_cm[k], MIN_CM_DIST]) + weights[i] = cm_mP1_x / cm_mP1_m + assert 0 <= np.min(weights) and np.max(weights) <= 1, "Weights are not in [0, 1]." + return weights + + +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + assert isinstance(miscall_rate, float) + if miscall_rate >= 0.5: + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float64) + miscall_rate + return mu + + +def get_switch_prob(pos, h, ne): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert pos[0] > 0, "Site positions are not 1-based." + assert isinstance(h, int), "Number of reference haplotypes is not an integer." + assert isinstance(ne, float), "Effective population size is not a float." + # Get genetic distance between adjacent markers. + cm = np.concatenate([[0], convert_to_genetic_map_position(pos)]) + gen_dist = cm[1:] - cm[:-1] + assert len(gen_dist) == len(pos) + rho = -np.expm1(-(4 * ne * gen_dist / h)) + return rho + + +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the forward algorithm. + + The forward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype j + query_a = query_h[i] + for j in np.arange(h): + # Get allele at genotyped marker i on reference haplotype j + ref_a = ref_h[i, j] + # Get emission probability + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the backward algorithm. + + The backward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float64) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in np.arange(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def _get_ref_hap_seg(a, b): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Assuming all biallelic sites, get the index of a reference haplotype segment (i) + defined by the alleles a and b at two adjacent genotyped markers. + + #i, a, b + 0, 0, 0 + 1, 1, 0 + 2, 0, 1 + 3, 1, 1 + + See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 + + This is a helper function for `_compute_state_probability_matrix`. + """ + ref_hap_idx = a * 1 + b * 2 + return ref_hap_idx + + +def _compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Implement the HMM forward-backward algorithm to compute: + 1. A HMM state probability matrix across genotyped markers; + 2. "Forward haplotype probabilities"; and + 3. "Backward haplotype probabilities". + + A HMM state is an index of a reference haplotype. + + The HMM state probability matrix is of size (m + 1, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + Note that the additional column at m + 1 is for interpolation of + imputed markers right of the last genotyped marker. + + Both the forward and backward haplotype probability matrices + are of size (m, 2), assuming only biallelic sites. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + This implementation is based on the function `setStateProbs` + of `LSHapBaum.java` in BEAGLE. It performs the computation + described in Equation 2 of BB2016. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix, + forward haplotype probability matrix, and + backward haplotype probability matrix. + :rtype: tuple + """ + m = fm.shape[0] + h = fm.shape[1] + assert (m, h) == bm.shape + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float64) + fwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + bwd_hap_probs = np.zeros((m, 4), dtype=np.float64) + for i in np.arange(m - 1, -1, -1): + for j in np.arange(h): + sm[i, j] = fm[i, j] * bm[i, j] + ref_hap_seg_mP1 = _get_ref_hap_seg(ref_h[i, j], ref_h[i + 1, j]) + ref_hap_seg_m = _get_ref_hap_seg(ref_h[i - 1, j], ref_h[i, j]) + fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] + bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] + sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) + fwd_hap_probs[i, :] /= sum_fwd_hap_probs + bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return (sm, fwd_hap_probs, bwd_hap_probs) + + +def _interpolate_allele_probabilities( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, imputed_pos +): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Compute the interpolated allele probabilities at the imputed markers + of a query haplotype. + + The forward and backward haplotype probability matrices are of size (m, 2), + where m is the number of genotyped markers. + Here, we assume all biallelic sites, hence the 2. + + The interpolated allele probability matrix is of size (x, 2), + where x is the number of imputed markers. + Again, we assume all biallelic sites, hence the 2. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix. + :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix. + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(imputed_pos) + assert (m, 2) == fwd_hap_probs.shape + assert (m, 2) == bwd_hap_probs.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + assert m == len(genotyped_cm) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + assert x == len(imputed_cm) + weights = get_weights(genotyped_cm, imputed_cm) + assert x == len(weights) + i_hap_probs = np.zeros((x, 2), dtype=np.float64) + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 between imputed marker i + if imputed_cm[i] < genotyped_cm[0]: + k = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + k = m - 2 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + # Vectorised over the allelic states + i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :] + i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :] + return i_hap_probs + + +def compute_state_probability_matrix(fm, bm, ref_h, query_h): + """ + Implement the HMM forward-backward algorithm following Equation 1 of BB2016. + This is simpler than faithfully reimplementing the original BEAGLE 4.1 algorithm. + + A HMM state is (index of a genotyped marker, index of a reference haplotype). + + The output HMM state probability matrix is of size (m + 1, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + Note that the additional column at m + 1 is for interpolation of + imputed markers right of the last genotyped marker. + + The forward and backward probability matrices are both of size (m, h). + + The reference haplotypes of size (m, h), and the query haplotype is of size m. + Here, they are both subsetted to genotyped markers. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert np.all(np.isin(query_h, [0, 1])), "Query haplotype has non-biallelic values." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert (m, h) == fm.shape + assert (m, h) == bm.shape + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float64) + sm[:-1, :] = np.multiply(fm, bm) + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return sm + + +def interpolate_allele_probabilities( + sm, ref_h, genotyped_pos, imputed_pos, alleles=(0, 1) +): + """ + Compute the interpolated allele probabilities at imputed markers of + a query haplotype following Equation 1 of BB2016. + + Assuming all biallelic sites, the interpolated allele probability matrix + is of size (x, 2), where x is the number of imputed markers. + + This function takes the output of `compute_state_probability_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m + 1, h). + 2. reference haplotypes subsetted to imputed markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to imputed markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray imputed_pos: Site positions at imputed markers. + :param list alleles: Distince alleles in the haplotypes. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + assert not np.any(sm < 0), "HMM state probability matrix has negative values." + assert not np.any(np.isnan(sm)), "HMM state probability matrix has NaN values." + m = sm.shape[0] - 1 + h = sm.shape[1] + assert np.all( + np.isin(ref_h, [0, 1]) + ), "Reference haplotypes have non-biallelic values." + assert m == len(genotyped_pos) + x = len(imputed_pos) + assert (x, h) == ref_h.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + weights = get_weights(genotyped_pos, imputed_pos) + assert x == len(weights) + p = np.zeros((x, len(alleles)), dtype=np.float64) + # Compute allele probabilities as per Equation 1 in BB2016. + for a in alleles: + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 between the imputed marker i + # The indices k should mirror those in `get_weights`. + if imputed_cm[i] < genotyped_cm[0]: + # Special case: imputed marker is before the first genotyped marker. + k = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + # Special case: imputed marker is after the last genotyped marker. + k = m - 1 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + # Sum over the reference haplotypes where the allele is a at marker i. + has_a = ref_h[i, :] == a + p[i, a] += np.sum(weights[i] * sm[k, has_a]) + p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) + return p + + +def get_map_alleles(allele_probs): + """ + Compute the maximum a posteriori alleles at the imputed markers of a query haplotype. + + Assuming all biallelic sites, the output is an array of size x, + where x is the number of imputed markers. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + + :param numpy.ndarray allele_probs: Interpolated allele probabilities. + :return: Imputed alleles in the query haplotype. + :rtype: numpy.ndarray + """ + assert not np.any(allele_probs < 0), "Allele probabilities have negative values." + assert not np.any(np.isnan(allele_probs)), "Allele probabilities have NaN values." + imputed_alleles = np.argmax(allele_probs, axis=1) + assert len(imputed_alleles) == allele_probs.shape[0] + return imputed_alleles + + +def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6, debug=False): + """ + Run a simplified version of the BEAGLE imputation algorithm + based on Equation 1 of BB2016. + + `ref_h` and `query_h` span all genotyped and imputed marker sites. + So, their sizes are (m + x, h) and (m + x), respectively, + where m is the number of genotyped markers + and x is the number of imputed markers. + + The genomic site positions of all the markers are in `pos`. + + This produces imputed alleles and interpolated allele probabilities + at the imputed markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :param bool debug: Whether to print intermediate results. + :return: Imputed alleles and interpolated allele probabilities. + :rtype: list(numpy.ndarray, numpy.ndarray) + """ + # Set precision for printing + if debug: + np.set_printoptions(precision=20) + assert ref_h.shape[0] == len(pos) + assert len(query_h) == len(pos) + # Indices of genotyped markers in the query haplotype + genotyped_pos_idx = np.where(query_h != -1)[0] + assert len(genotyped_pos_idx) > 1 + # Indices of imputed markers in the query haplotype + imputed_pos_idx = np.where(query_h == -1)[0] + assert len(imputed_pos_idx) > 0 + # Site positions of genotyped markers + genotyped_pos = pos[genotyped_pos_idx] + # Site positions of imputed markers + imputed_pos = pos[imputed_pos_idx] + h = ref_h.shape[1] + m = len(genotyped_pos) + x = len(imputed_pos) + assert m + x == len(pos) + # Subset the reference haplotypes to genotyped markers + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + if debug: + print("Reference haplotypes subsetted to genotyped markers") + print(ref_h_genotyped) + assert ref_h_genotyped.shape == (m, h) + # Subset the query haplotype to genotyped markers + query_h_genotyped = query_h[genotyped_pos_idx] + if debug: + print("Query haplotype subsetted to genotyped markers") + print(query_h_genotyped) + assert len(query_h_genotyped) == m + # Set mismatch probabilities at genotyped markers + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + if debug: + print("Mismatch probabilities") + print(mu) + assert len(mu) == m + # Set switch probabilities at genotyped markers + rho = get_switch_prob(genotyped_pos, h, ne=ne) + if debug: + print("Switch probabilities") + print(rho) + assert len(rho) == m + # Compute forward probability matrix at genotyped markers + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + if debug: + print("Forward probability matrix") + print(fm) + assert fm.shape == (m, h) + # Compute backward probability matrix at genotyped markers + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + if debug: + print("Backward probability matrix") + print(bm) + assert bm.shape == (m, h) + # Compute HMM state probability matrix at genotyped markers + sm = compute_state_probability_matrix(fm, bm, ref_h_genotyped, query_h_genotyped) + if debug: + print("HMM state probability matrix") + print(sm) + assert sm.shape == (m + 1, h) + # Subset the reference haplotypes to imputed markers + ref_h_imputed = ref_h[imputed_pos_idx, :] + # Interpolate allele probabilities at imputed markers + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_imputed, genotyped_pos, imputed_pos + ) + if debug: + print("Interpolated allele probabilities") + print(i_allele_probs) + assert i_allele_probs.shape == (x, 2) + # Get MAP alleles at imputed markers + imputed_alleles = get_map_alleles(i_allele_probs) + if debug: + print("Imputed alleles") + print(imputed_alleles) + assert len(imputed_alleles) == x + return (imputed_alleles, i_allele_probs) + + +def run_tsimpute(ref_ts, query_h, pos, mu, rho): + """ + Run the simplified BEAGLE algorithm above on a tree sequence. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param numpy.ndarray mu: Mismatch probabilities. + :param numpy.ndarray rho: Switch probabilities. + :param bool debug: Whether to print intermediate results. + :return: Imputed alleles and interpolated allele probabilities. + :rtype: list(numpy.ndarray, numpy.ndarray) + """ + # Prepare marker positions. + genotyped_site_ids = np.where(query_h != -1)[0] + genotyped_pos = pos[genotyped_site_ids] + imputed_site_ids = np.where(query_h == -1)[0] + imputed_pos = pos[imputed_site_ids] + # Prepare reference haplotypes. + ref_ts_m = ref_ts.delete_sites(site_ids=imputed_site_ids) + ref_h_m = ref_ts_m.genotype_matrix() + ref_ts_x = ref_ts.delete_sites(site_ids=genotyped_site_ids) + ref_h_x = ref_ts_x.genotype_matrix() + query_h_m = query_h[genotyped_site_ids] + # Get forward and backward matrices from ts. + fm = _tskit.CompressedMatrix(ref_ts_m._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ref_ts_m._ll_tree_sequence) + ls_hmm = _tskit.LsHmm(ref_ts_m._ll_tree_sequence, mu, rho, acgt_alleles=True) + ls_hmm.forward_matrix(query_h_m.T, fm) + ls_hmm.backward_matrix(query_h_m.T, fm.normalisation_factor, bm) + # Compute state probability matrix. + sm = compute_state_probability_matrix( + fm.decode(), + bm.decode(), + ref_h_m, + query_h_m, + ) + # Interpolate allele probabilities. + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_x, genotyped_pos, imputed_pos + ) + # Get MAP alleles at imputed markers. + imputed_alleles = get_map_alleles(i_allele_probs) + return (imputed_alleles, i_allele_probs) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py new file mode 100644 index 0000000000..e292a77215 --- /dev/null +++ b/python/tests/beagle_numba.py @@ -0,0 +1,441 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute genotypes. + +This is the numba-fied version of `beagle.py`. +""" +import numpy as np +from numba import njit + +import _tskit + + +@njit +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed that 1 Mbp = 1cM. + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + assert 0 < np.min(pos), "Site positions are not 1-based." + return pos / 1e6 + + +@njit +def get_weights(genotyped_pos, imputed_pos): + """ + Get weights at imputed markers in the query haplotype. + + These weights are used in linear interpolation of HMM state probabilities + at imputed markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at imputed marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray imputed_pos: Site positions of imputed markers. + :return: Weights used in linear interpolation. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(imputed_pos) + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + # Calculate weights for imputed markers. + weights = np.zeros(x, dtype=np.float64) + # Identify genotype markers k and k + 1 between imputed marker i + idx_m = m - 1 + for idx_x in np.arange(x - 1, -1, -1): + assert ( + imputed_cm[idx_x] != genotyped_cm[idx_m] + ), "Genotyped markers and imputed markers overlap." + while imputed_cm[idx_x] < genotyped_cm[idx_m] and idx_m > -1: + idx_m = max(idx_m - 1, -1) + if idx_m == m - 1: + # Right of the last genotyped marker + weights[idx_x] = 0.0 + elif idx_m == -1: + # Left of the first genotyped marker + weights[idx_x] = 1.0 + else: + # Between two genotyped markers + cm_mP1_x = genotyped_cm[idx_m + 1] - imputed_cm[idx_x] + cm_mP1_m = genotyped_cm[idx_m + 1] - genotyped_cm[idx_m] + # Set min genetic distance to avoid division by zero. + cm_mP1_m = cm_mP1_m if cm_mP1_m > 0.005 else 0.005 + weights[idx_x] = cm_mP1_x / cm_mP1_m + assert 0 <= np.min(weights), "Some weights are negative." + assert np.max(weights) <= 1, "Some weights are greater than 1." + return weights + + +@njit +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + if miscall_rate >= 0.5: + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float64) + miscall_rate + return mu + + +@njit +def get_switch_prob(pos, h, ne): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert 0 < np.min(pos), "Site positions are not 1-based." + # Get genetic distances between adjacent markers. + cm = convert_to_genetic_map_position(pos) + d = np.zeros(len(pos), dtype=np.float64) + d[0] = cm[0] + d[1:] = cm[1:] - cm[:-1] + rho = -np.expm1(-(4 * ne * d / h)) + return rho + + +@njit +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the forward algorithm. + + The forward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + # BEAGLE caps mismatch probabilities at 0.5. + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float64) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype j + query_a = query_h[i] + for j in np.arange(h): + # Get allele at genotyped marker i on reference haplotype j + ref_a = ref_h[i, j] + # Get emission probability + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +@njit +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement the backward algorithm. + + The backward probablity matrix is of size (m, h), + where m is the number of genotyped markers + and h is the number of reference haplotypes. + + `ref_h` is a panel of reference haplotypes of size m x h. + Here, it is subsetted to genotyped markers. + + `query_h` is also subsetted to genotyped markers (length m). + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float64) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in np.arange(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +@njit +def compute_state_probability_matrix(fm, bm, ref_h, query_h): + """ + Implement the HMM forward-backward algorithm following Equation 1 of BB2016. + This is simpler than faithfully reimplementing the original BEAGLE 4.1 algorithm. + + A HMM state is (index of a genotyped marker, index of a reference haplotype). + + The output HMM state probability matrix is of size (m + 1, h), + where m is the number of genotyped markers in the query haplotype + and h is the number of reference haplotypes. + + Note that the additional column at m + 1 is for interpolation of + imputed markers right of the last genotyped marker. + + The forward and backward probability matrices are both of size (m, h). + + The reference haplotypes of size (m, h), and the query haplotype is of size m. + Here, they are both subsetted to genotyped markers. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert (m, h) == fm.shape + assert (m, h) == bm.shape + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float64) + sm[:-1, :] = np.multiply(fm, bm) + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return sm + + +@njit +def interpolate_allele_probabilities( + sm, ref_h, genotyped_pos, imputed_pos, alleles=(0, 1) +): + """ + Compute the interpolated allele probabilities at imputed markers of + a query haplotype following Equation 1 of BB2016. + + Assuming all biallelic sites, the interpolated allele probability matrix + is of size (x, 2), where x is the number of imputed markers. + + This function takes the output of `compute_state_probability_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m + 1, h). + 2. reference haplotypes subsetted to imputed markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to imputed markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray imputed_pos: Site positions at imputed markers. + :param list alleles: Distince alleles in the haplotypes. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + m = sm.shape[0] - 1 + h = sm.shape[1] + assert m == len(genotyped_pos) + x = len(imputed_pos) + assert (x, h) == ref_h.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + imputed_cm = convert_to_genetic_map_position(imputed_pos) + weights = get_weights(genotyped_pos, imputed_pos) + assert x == len(weights) + p = np.zeros((x, len(alleles)), dtype=np.float64) + # Compute allele probabilities as per Equation 1 in BB2016. + for a in alleles: + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 between the imputed marker i + # The indices k should mirror those in `get_weights`. + if imputed_cm[i] < genotyped_cm[0]: + # Special case: imputed marker is before the first genotyped marker. + k = 0 + elif imputed_cm[i] > genotyped_cm[-1]: + # Special case: imputed marker is after the last genotyped marker. + k = m - 1 + else: + k = np.max(np.where(imputed_cm[i] > genotyped_cm)[0]) + # Sum over the reference haplotypes where the allele is a at marker i. + has_a = ref_h[i, :] == a + p[i, a] += np.sum(weights[i] * sm[k, has_a]) + p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) + return p + + +@njit +def get_map_alleles(allele_probs): + """ + Compute the maximum a posteriori alleles at the imputed markers of a query haplotype. + + Assuming all biallelic sites, the output is an array of size x, + where x is the number of imputed markers. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + + :param numpy.ndarray allele_probs: Interpolated allele probabilities. + :return: Imputed alleles in the query haplotype. + :rtype: numpy.ndarray + """ + imputed_alleles = np.argmax(allele_probs, axis=1) + assert len(imputed_alleles) == allele_probs.shape[0] + return imputed_alleles + + +@njit +def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6): + """ + Run a simplified version of the BEAGLE imputation algorithm + based on Equation 1 of BB2016. + + `ref_h` and `query_h` span all genotyped and imputed marker sites. + So, their sizes are (m + x, h) and (m + x), respectively, + where m is the number of genotyped markers + and x is the number of imputed markers. + + The genomic site positions of all the markers are in `pos`. + + This produces imputed alleles and interpolated allele probabilities + at the imputed markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :return: Imputed alleles and interpolated allele probabilities. + :rtype: list(numpy.ndarray, numpy.ndarray) + """ + assert ref_h.shape[0] == len(pos) + assert len(query_h) == len(pos) + # Indices of genotyped markers in the query haplotype + genotyped_pos_idx = np.where(query_h != -1)[0] + assert len(genotyped_pos_idx) > 1 + # Indices of imputed markers in the query haplotype + imputed_pos_idx = np.where(query_h == -1)[0] + assert len(imputed_pos_idx) > 0 + # Site positions of genotyped markers + genotyped_pos = pos[genotyped_pos_idx] + # Site positions of imputed markers + imputed_pos = pos[imputed_pos_idx] + h = ref_h.shape[1] + m = len(genotyped_pos) + x = len(imputed_pos) + assert m + x == len(pos) + # Subset the reference haplotypes to genotyped markers + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + assert ref_h_genotyped.shape == (m, h) + # Subset the query haplotype to genotyped markers + query_h_genotyped = query_h[genotyped_pos_idx] + assert len(query_h_genotyped) == m + # Set mismatch probabilities at genotyped markers + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + assert len(mu) == m + # Set switch probabilities at genotyped markers + rho = get_switch_prob(genotyped_pos, h, ne=ne) + assert len(rho) == m + # Compute forward probability matrix at genotyped markers + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + assert fm.shape == (m, h) + # Compute backward probability matrix at genotyped markers + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + assert bm.shape == (m, h) + # Compute HMM state probability matrix at genotyped markers + sm = compute_state_probability_matrix(fm, bm, ref_h_genotyped, query_h_genotyped) + assert sm.shape == (m + 1, h) + # Subset the reference haplotypes to imputed markers + ref_h_imputed = ref_h[imputed_pos_idx, :] + # Interpolate allele probabilities at imputed markers + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_imputed, genotyped_pos, imputed_pos + ) + assert i_allele_probs.shape == (x, 2) + # Get MAP alleles at imputed markers + imputed_alleles = get_map_alleles(i_allele_probs) + assert len(imputed_alleles) == x + return (imputed_alleles, i_allele_probs) + + +def run_tsimpute(ref_ts, query_h, pos, mu, rho): + # Prepare marker positions. + genotyped_site_ids = np.where(query_h != -1)[0] + genotyped_pos = pos[genotyped_site_ids] + imputed_site_ids = np.where(query_h == -1)[0] + imputed_pos = pos[imputed_site_ids] + # Prepare reference haplotypes. + ref_ts_m = ref_ts.delete_sites(site_ids=imputed_site_ids) + ref_h_m = ref_ts_m.genotype_matrix() + ref_ts_x = ref_ts.delete_sites(site_ids=genotyped_site_ids) + ref_h_x = ref_ts_x.genotype_matrix() + query_h_m = query_h[genotyped_site_ids] + # Get forward and backward matrices from ts. + fm = _tskit.CompressedMatrix(ref_ts_m._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ref_ts_m._ll_tree_sequence) + ls_hmm = _tskit.LsHmm(ref_ts_m._ll_tree_sequence, mu, rho, acgt_alleles=True) + ls_hmm.forward_matrix(query_h_m.T, fm) + ls_hmm.backward_matrix(query_h_m.T, fm.normalisation_factor, bm) + # Compute state probability matrix. + sm = compute_state_probability_matrix( + fm.decode(), + bm.decode(), + ref_h_m, + query_h_m, + ) + # Interpolate allele probabilities. + ap = interpolate_allele_probabilities(sm, ref_h_x, genotyped_pos, imputed_pos) + # Get imputed alleles. + ia = get_map_alleles(ap) + return ia diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..05c886812e --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,628 @@ +""" +Tests for genotype imputation (forward and Baum-Welsh algorithms). +""" +import io + +import numpy as np +import pytest + +import tskit +from tests.beagle import run_beagle + + +# The following toy query haplotypes were imputed from the toy reference haplotypes +# using BEAGLE 4.1 with Ne set to 10. +# +# There are 9 sites, starting from 10,000 to 90,000, with 10,000 increments. +# The REF is A and ALT is T for all the sites in the VCF input to BEAGLE. +# +# Case 0: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_0 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 1 + ] +).T +toy_query_0 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 1 + ] +) +toy_query_0_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ] +) +toy_query_0_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_0.shape == (9, 2) +assert toy_query_0.shape == (2, 9) +assert toy_query_0_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_0[0] == 0) == 4 +assert np.sum(toy_query_0[0] == -1) == 5 +assert np.array_equal(toy_query_0[0], toy_query_0[1]) +assert np.all(toy_query_0_beagle_imputed[0] == 0) +assert np.all(toy_query_0_beagle_imputed[1] == 0) +assert toy_query_0_beagle_allele_probs.shape == (2, 2, 5) + +# Case 1: +# Reference haplotypes and query haplotypes have 1|1 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_1 = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ] +).T +toy_query_1 = np.array( + [ + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ] +) +toy_query_1_beagle_imputed = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ] +) +toy_query_1_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + # query_0, haplotype 1 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_1.shape == (9, 2) +assert toy_query_1.shape == (2, 9) +assert toy_query_1_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_1[0] == 1) == 4 +assert np.sum(toy_query_1[0] == -1) == 5 +assert np.array_equal(toy_query_1[0], toy_query_1[1]) +assert np.all(toy_query_1_beagle_imputed[0] == 1) +assert np.all(toy_query_1_beagle_imputed[1] == 1) +assert toy_query_1_beagle_allele_probs.shape == (2, 2, 5) + +# Case 2: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are genotyped in the query haplotypes. +# fmt: off +toy_ref_2 = np.copy(toy_ref_0) +toy_query_2 = np.array( + [ + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 0 + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 1 + ] +) +toy_query_2_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ] +) +toy_query_2_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_2.shape == (9, 2) +assert toy_query_2.shape == (2, 9) +assert toy_query_2_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_2[0] == 0) == 6 +assert np.sum(toy_query_2[0] == -1) == 3 +assert np.array_equal(toy_query_2[0], toy_query_2[1]) +assert np.all(toy_query_2_beagle_imputed[0] == 0) +assert np.all(toy_query_2_beagle_imputed[1] == 0) +assert toy_query_2_beagle_allele_probs.shape == (2, 2, 3) + +# Case 3: +# Reference haplotypes and query haplotypes have 0|1 at all sites. +# fmt: off +toy_ref_3 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ] +).T +toy_query_3 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ] +) +toy_query_3_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ] +) +toy_query_3_beagle_allele_probs = np.array( + # Imputed markers only + # See comment https://github.com/tskit-dev/tskit/pull/2815#issuecomment-1708178257 + [ + # query_0, haplotype 0 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_3.shape == (9, 2) +assert toy_query_3.shape == (2, 9) +assert toy_query_3_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_3[0] == 0) == 4 +assert np.sum(toy_query_3[1] == 1) == 4 +assert np.sum(toy_query_3[0] == -1) == np.sum(toy_query_3[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_3_beagle_imputed[0], toy_query_3_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 4: +# Reference haplotypes and query haplotypes have alternating 0|1 and 1|0 genotypes. +# Query haplotypes have 0|1 at all genotyped sites. +# fmt: off +toy_ref_4 = np.array( + [ + [0, 1, 0, 1, 0, 1, 0, 1, 0], # ref_0, haplotype 0 + [1, 0, 1, 0, 1, 0, 1, 0, 1], # ref_0, haplotype 1 + ] +).T +toy_query_4 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ] +) +toy_query_4_beagle_imputed = np.array( + [ + [1, 0, 1, 0, 1, 0, 1, 0, 1], # query_0, haplotype 0 + [0, 1, 0, 1, 0, 1, 0, 1, 0], # query_0, haplotype 1 + ] +) +toy_query_4_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_4.shape == (9, 2) +assert toy_query_4.shape == (2, 9) +assert toy_query_4_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_4[0] == 0) == 4 +assert np.sum(toy_query_4[1] == 1) == 4 +assert np.sum(toy_query_4[0] == -1) == np.sum(toy_query_4[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_4_beagle_imputed[0], toy_query_4_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 5: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have one recombination breakpoint in the middle. +# fmt: off +toy_ref_5 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + ] +).T +toy_query_5 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 1 + ] +) +toy_query_5_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 1 + ] +) +toy_query_5_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_5.shape == (9, 4) +assert toy_query_5.shape == (2, 9) +assert toy_query_5_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_5[0] == 0) == 3 +assert np.sum(toy_query_5[0] == 1) == 1 +assert np.sum(toy_query_5[0] == -1) == 5 +assert np.sum(toy_query_5[1] == 0) == 1 +assert np.sum(toy_query_5[1] == 1) == 3 +assert np.sum(toy_query_5[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_5_beagle_imputed[0], toy_query_5_beagle_imputed[1])) +) +assert toy_query_5_beagle_allele_probs.shape == (2, 2, 5) + +# Case 6: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have two recombination breakpoints in the middle. +# fmt: off +toy_ref_6 = np.copy(toy_ref_5) +toy_query_6 = np.array( + [ + [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1 + ] +) +toy_query_6_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 0 + [1, 1, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 1 + ] +) +toy_query_6_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + [0.09016396, 0.53229386, 0.97451866 , 0.5316114 , 0.088609315], + ], + # query_0, haplotype 1 + [ + [0.09016395, 0.53229386, 0.97451866 , 0.5316114 , 0.0886093], + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_6.shape == (9, 4) +assert toy_query_6.shape == (2, 9) +assert toy_query_6_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_6[0] == 0) == 2 +assert np.sum(toy_query_6[0] == 1) == 2 +assert np.sum(toy_query_6[0] == -1) == 5 +assert np.sum(toy_query_6[1] == 0) == 2 +assert np.sum(toy_query_6[1] == 1) == 2 +assert np.sum(toy_query_6[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_6_beagle_imputed[0], toy_query_6_beagle_imputed[1])) +) +assert toy_query_6_beagle_allele_probs.shape == (2, 2, 5) + +# Case 7: +# Hand-crafted example. +# fmt: off +toy_ref_7 = np.array( + [ + [0, 0, 0, 1, 1, 0, 0, 0, 1], # ref_0 + [1, 0, 0, 0, 1, 0, 0, 0, 1], # ref_0 + [0, 0, 1, 1, 1, 0, 0, 0, 1], # ref_1 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # ref_1 + [0, 0, 0, 0, 0, 1, 1, 1, 0], # ref_2 + [1, 0, 0, 0, 1, 0, 0, 0, 1], # ref_2 + [0, 0, 1, 1, 1, 0, 0, 0, 1], # ref_3 + [0, 0, 0, 0, 0, 0, 0, 0, 1], # ref_3 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # ref_4 + [0, 0, 0, 0, 0, 1, 1, 1, 0], # ref_4 + ] +).T +toy_query_7 = np.array( + [ + [0, 0, -1, 1, -1, 1, -1, 0, 1], # query_0, haplotype 0 + [1, 1, -1, 0, -1, 0, -1, 1, 0], # query_0, haplotype 1 + ] +) +toy_query_7_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 0, 0, 1], # query_0, haplotype 0 + [1, 1, 0, 0, 0, 0, 0, 1, 0], # query_0, haplotype 1 + ] +) +toy_query_7_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.33317572, 0.0 , 0.99945664], + [0.66635144, 0.9991437, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9995998, 0.99971414, 0.99974835], + [0.0 , 0.0 , 0.0], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_7.shape == (9, 10) +assert toy_query_7.shape == (2, 9) +assert toy_query_7_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_7[0] == -1) == np.sum(toy_query_7[1] == -1) == 3 +assert toy_query_7_beagle_allele_probs.shape == (2, 2, 3) + +# Case 8: +# Same as case 7 except the last genotyped marker is missing. +# fmt: off +toy_ref_8 = np.copy(toy_ref_7) +toy_query_8 = np.array( + [ + [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1 + ] +) +toy_query_8_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 0, 0, 1], # query_0, haplotype 0 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # query_0, haplotype 1 + ] +) +toy_query_8_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [0.9997735, 0.33310473, 0.0 , 0.9992305, 0.9997736], + [0.0 , 0.66620946, 0.99893093, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999998, 0.9999998, 0.99991965, 0.0], + [0.0 , 0.0 , 0.0 , 0.0 , 0.9999998], + ], + ], + dtype=np.float64, +) +# fmt: on +assert toy_ref_8.shape == (9, 10) +assert toy_query_8.shape == (2, 9) +assert toy_query_8_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_8[0] == -1) == 5 +assert np.sum(toy_query_8[1] == -1) == 5 +assert toy_query_8_beagle_allele_probs.shape == (2, 2, 5) + + +@pytest.mark.parametrize( + "input_ref,input_query,expected", + [ + (toy_ref_0, toy_query_0, toy_query_0_beagle_imputed), + (toy_ref_1, toy_query_1, toy_query_1_beagle_imputed), + (toy_ref_2, toy_query_2, toy_query_2_beagle_imputed), + (toy_ref_3, toy_query_3, toy_query_3_beagle_imputed), + (toy_ref_4, toy_query_4, toy_query_4_beagle_imputed), + (toy_ref_5, toy_query_5, toy_query_5_beagle_imputed), + (toy_ref_6, toy_query_6, toy_query_6_beagle_imputed), + (toy_ref_7, toy_query_7, toy_query_7_beagle_imputed), + (toy_ref_8, toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_compare_imputed_alleles(input_ref, input_query, expected): + """Compare with imputed alleles from BEAGLE 4.1.""" + assert input_query.shape == expected.shape + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + num_imputed_markers = np.sum(input_query[0] == -1) # x + imputed_alleles = np.zeros((num_query_haps, num_imputed_markers), dtype=int) + expected_subset = expected[:, input_query[0] == -1] # Subsetted to imputed markers + assert expected_subset.shape == imputed_alleles.shape + for i in np.arange(num_query_haps): + imputed_alleles[i], _ = run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + num_diff = np.sum(imputed_alleles[i] != expected_subset[i]) + assert np.array_equal(num_diff == 0), f"Wrongly imputed alleles: {num_diff}" + + +@pytest.mark.parametrize( + "input_ref,input_query,expected", + [ + (toy_ref_0, toy_query_0, toy_query_0_beagle_allele_probs), + (toy_ref_1, toy_query_1, toy_query_1_beagle_allele_probs), + (toy_ref_2, toy_query_2, toy_query_2_beagle_allele_probs), + (toy_ref_3, toy_query_3, toy_query_3_beagle_allele_probs), + # (toy_ref_4, toy_query_4, toy_query_4_beagle_allele_probs), + # (toy_ref_5, toy_query_5, toy_query_5_beagle_allele_probs), + # (toy_ref_6, toy_query_6, toy_query_6_beagle_allele_probs), + # (toy_ref_7, toy_query_7, toy_query_7_beagle_allele_probs), + # (toy_ref_8, toy_query_8, toy_query_8_beagle_allele_probs), + ], +) +def test_compare_allele_probabilities(input_ref, input_query, expected): + """Check interpolated allele probabilities from BEAGLE 4.1.""" + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + for i in np.arange(num_query_haps): + _, i_allele_probs = run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + # Rescale probabilities before comparison + rescaled_probs = np.zeros_like(i_allele_probs).T + for j in np.arange(rescaled_probs.shape[1]): + rescaled_probs[:, j] = i_allele_probs[j] / np.sum(i_allele_probs[j]) + assert np.allclose(rescaled_probs, expected[i], atol=1e-04) + + +# Below is toy data set case 7 in tree sequence format. +toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 1 0.000000 0 3 +7 1 0.000000 0 3 +8 1 0.000000 0 4 +9 1 0.000000 0 4 +10 0 0.009923 0 -1 +11 0 0.038603 0 -1 +12 0 0.057935 0 -1 +13 0 0.145141 0 -1 +14 0 0.238045 0 -1 +15 0 0.528344 0 -1 +16 0 0.646418 0 -1 +17 0 1.462199 0 -1 +18 0 2.836600 0 -1 +19 0 3.142225 0 -1 +20 0 4.056253 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 100000.000000 10 1 +0.000000 100000.000000 10 5 +0.000000 100000.000000 11 3 +0.000000 100000.000000 11 8 +0.000000 100000.000000 12 2 +0.000000 100000.000000 12 6 +0.000000 100000.000000 13 0 +0.000000 100000.000000 13 12 +0.000000 100000.000000 14 10 +0.000000 100000.000000 14 13 +0.000000 40443.000000 15 9 +0.000000 40443.000000 15 14 +40443.000000 100000.000000 16 4 +40443.000000 100000.000000 16 9 +0.000000 40443.000000 17 4 +0.000000 100000.000000 17 11 +40443.000000 100000.000000 17 16 +0.000000 12721.000000 18 7 +0.000000 12721.000000 18 17 +12721.000000 100000.000000 19 7 +40443.000000 100000.000000 19 14 +12721.000000 40443.000000 19 15 +0.000000 12721.000000 20 15 +12721.000000 100000.000000 20 17 +0.000000 12721.000000 20 18 +12721.000000 100000.000000 20 19 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +10000.000000 A +20000.000000 A +30000.000000 A +40000.000000 A +50000.000000 A +60000.000000 A +70000.000000 A +80000.000000 A +90000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 10 unknown C -1 +1 11 unknown C -1 +2 12 unknown C -1 +3 13 unknown C -1 +4 14 unknown C -1 +5 16 unknown C -1 +6 16 unknown C -1 +7 17 unknown C -1 +8 19 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +0 +0 +""" + + +def get_toy_data(): + """Get toy example 7 in tree sequence format.""" + ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + return ts + + +def parse_matrix(csv_text): + # TODO: Maybe remove. + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text))