From 55f990b20a6080cc2ce20926fb7a231b033d825c Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Mon, 7 Aug 2023 18:19:00 +0100 Subject: [PATCH] Add tests to check LS HMM of _tskit.lshmm compared to BEAGLE --- python/tests/beagle.py | 612 +++++++++++++++++++++++++++ python/tests/beagle_numba.py | 373 +++++++++++++++++ python/tests/test_imputation.py | 708 ++++++++++++++++++++++++++++++++ 3 files changed, 1693 insertions(+) create mode 100644 python/tests/beagle.py create mode 100644 python/tests/beagle_numba.py create mode 100644 python/tests/test_imputation.py diff --git a/python/tests/beagle.py b/python/tests/beagle.py new file mode 100644 index 0000000000..e60f4ca198 --- /dev/null +++ b/python/tests/beagle.py @@ -0,0 +1,612 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute alleles. + +This was implemented while closely consulting the BEAGLE 4.1 paper: +Browning & Browning (2016). Genotype imputation with millions of reference samples. +Am J Hum Genet 98:116-126. doi:10.1016/j.ajhg.2015.11.020 + +The BEAGLE 4.1 source code (particularly `LSHapBaum.java`) was also consulted: +https://faculty.washington.edu/browning/beagle/b4_1.html + +These notations are used throughout this implementation: +h = number of reference haplotypes. +m = number of genotyped markers. +x = number of ungenotyped markers. + +This implementation takes the following inputs: +1. Panel of reference haplotypes in a matrix of size (m + x, h). +2. One query haplotype in an array of size (m + x). +3. Site positions of all the markers in an array of size (m + x). + +For simplicity, we assume that: +1. All the sites are biallelic (but ACGT encoding is used). +2. The genetic map is defined by equating 1 Mbp to 1 cM. + +In the query haplotype: +1. The genotyped markers take values of 0 or 1. +2. The ungenotyped markers take -1. + +The forward and backward probability matrices are of size (m, h). +The HMM state probability matrix is of size (m + 1, h). +The interpolated allele probability matrix is of size (x, 4), +The imputed alleles are the maximum a posteriori (MAP) alleles. + +For efficiency, BEAGLE uses aggregated markers, which are clusters of markers +within a 0.005 cM interval (default). Because the genotypes are phased, +the alleles in the aggregated markers form distinct "allele sequences". +Below, we do not use aggregated markers or allele sequences, which would complicate +the implementation. + +Rather than trying to exactly replicating the original BEAGLE 4.1 algorithm, +this implementation computes Equation 1 of BB2016. We keep functions used +in a first attempt to faithfully implement the BEAGLE algorithm for documentation. +""" +import numpy as np + + +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed + that the recombination rate is constant (1 cM / 1 Mb). + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + assert 0 < np.min(pos), "Site positions are not 1-based or invalid." + return pos / 1e6 # Convert to Mb + + +def get_weights(genotyped_pos, ungenotyped_pos): + """ + Get weights at ungenotyped markers in the query haplotype, which are used in + linear interpolation of HMM state probabilities at ungenotyped markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at ungenotyped marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. + :return: Weights and associated marker interval start indices. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + assert len(genotyped_pos) > 1, "There are fewer than two genotyped markers." + assert len(ungenotyped_pos) > 0, "There are no ungenotyped markers." + # Check that the two sets are mutually exclusive. + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert len(np.union1d(genotyped_pos, ungenotyped_pos)) == m + x + # Set min genetic distance. + MIN_CM_DIST = 0.005 + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) + # Calculate weights for ungenotyped markers. + weights = np.zeros(x, dtype=np.float32) + # Identify genotype markers k and k + 1 sandwiching marker i. + marker_interval_start = np.zeros(x, dtype=np.int32) + for i in np.arange(x): + if ungenotyped_pos[i] < genotyped_pos[0]: + # Ungenotyped marker is before the first genotyped marker. + k = 0 + weights[i] = 1.0 + elif ungenotyped_pos[i] > genotyped_pos[-1]: + # Ungenotyped marker is after the last genotyped marker. + k = m - 1 + weights[i] = 0.0 + else: + # Ungenotyped marker si between genotyped markers k and k + 1. + k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) # Inefficient + cm_mP1_x = genotyped_cm[k + 1] - ungenotyped_cm[i] + cm_mP1_m = np.max([genotyped_cm[k + 1] - genotyped_cm[k], MIN_CM_DIST]) + weights[i] = cm_mP1_x / cm_mP1_m + marker_interval_start[i] = k + assert 0 <= np.min(weights), "Some weights are less than 1." + assert np.max(weights) <= 1, "Some weights are greater than 1." + return (weights, marker_interval_start) + + +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + assert isinstance(miscall_rate, float) + if miscall_rate >= 0.5: + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float32) + miscall_rate + return mu + + +def get_switch_prob(pos, h, ne): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + assert pos[0] > 0, "Site positions are not 1-based." + assert isinstance(h, int), "Number of reference haplotypes is not an integer." + assert isinstance(ne, float), "Effective population size is not a float." + # Get genetic distance between adjacent markers. + cm = convert_to_genetic_map_position(pos) + d = np.zeros(len(pos), dtype=np.float32) + d[0] = cm[0] + d[1:] = cm[1:] - cm[:-1] + assert len(d) == len(pos) + rho = -np.expm1(-(4 * ne * d / h)) + assert len(d) == len(rho) + return rho + + +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement forward algorithm to compute a forward probablity matrix of size (m, h). + + Reference haplotypes and the query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + alleles = np.arange(3) # ACGT encoding + assert np.all( + np.isin(ref_h, alleles) + ), "Reference haplotypes have non-ACGT alleles." + assert np.all(np.isin(query_h, alleles)), "Query haplotype has non-ACGT alleles." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + fm = np.zeros((m, h), dtype=np.float32) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters. + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype j. + query_a = query_h[i] + for j in np.arange(h): + # Get allele at genotyped marker i on reference haplotype j. + ref_a = ref_h[i, j] + # Get emission probability. + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement backward algorithm to compute backward probablity matrix of size (m, h). + + Reference haplotypes and the query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + alleles = np.arange(3) # ACGT encoding + assert np.all( + np.isin(ref_h, alleles) + ), "Reference haplotypes have non-ACGT alleles." + assert np.all(np.isin(query_h, alleles)), "Query haplotype has non-ACGT alleles." + m = ref_h.shape[0] + h = ref_h.shape[1] + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + bm = np.zeros((m, h), dtype=np.float32) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in np.arange(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def _get_reference_haplotype_segments(a, b): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Assuming all biallelic sites, get the index of a reference haplotype segment (i) + defined by the alleles a and b at two adjacent genotyped markers. + + #i, a, b + 0, 0, 0 + 1, 1, 0 + 2, 0, 1 + 3, 1, 1 + + See https://github.com/tskit-dev/tskit/issues/2802#issuecomment-1698767169 + + This is a helper function for `_compute_state_probability_matrix`. + """ + ref_hap_idx = a * 1 + b * 2 + return ref_hap_idx + + +def _compute_state_probability_matrix(fm, bm, ref_h, query_h, rho, mu): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Implement the forward-backward algorithm to compute: + 1. A HMM state probability matrix across genotyped markers. + 2. "Forward haplotype probabilities". + 3. "Backward haplotype probabilities". + + The HMM state probability matrix is of size (m + 1, h). + + Note that the additional column at m + 1 is for interpolation of + ungenotyped markers right of the last genotyped marker. + + Both the forward and backward haplotype probability matrices + are of size (m, 2), assuming only biallelic sites. + + Reference haplotypes and the query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + This implementation is based on the function `setStateProbs` + of `LSHapBaum.java` in BEAGLE. It performs the computation + described in Equation 2 of BB2016. + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: HMM state probability matrix, + forward haplotype probability matrix, and + backward haplotype probability matrix. + :rtype: tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray) + """ + alleles = np.arange(3) # ACGT encoding + m = fm.shape[0] + h = fm.shape[1] + assert (m, h) == bm.shape + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + assert np.all( + np.isin(ref_h, alleles) + ), "Reference haplotypes have non-ACGT alleles." + assert np.all(np.isin(query_h, alleles)), "Query haplotype has non-ACGT alleles." + assert m == len(query_h) + assert m == len(rho) + assert m == len(mu) + assert 0 <= np.min(rho) and np.max(rho) <= 1 + assert 0 <= np.min(mu) and np.max(mu) <= 0.5 + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float32) + fwd_hap_probs = np.zeros((m, 4), dtype=np.float32) + bwd_hap_probs = np.zeros((m, 4), dtype=np.float32) + for i in np.arange(m - 1, -1, -1): + for j in np.arange(h): + sm[i, j] = fm[i, j] * bm[i, j] + ref_hap_seg_mP1 = _get_reference_haplotype_segments( + ref_h[i, j], ref_h[i + 1, j] + ) + ref_hap_seg_m = _get_reference_haplotype_segments( + ref_h[i - 1, j], ref_h[i, j] + ) + fwd_hap_probs[i, ref_hap_seg_mP1] += sm[i, j] + bwd_hap_probs[i, ref_hap_seg_m] += sm[i, j] + sum_fwd_hap_probs = np.sum(fwd_hap_probs[i, :]) + fwd_hap_probs[i, :] /= sum_fwd_hap_probs + bwd_hap_probs[i, :] /= sum_fwd_hap_probs # Sum of forward hap probs + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return (sm, fwd_hap_probs, bwd_hap_probs) + + +def _interpolate_allele_probabilities( + fwd_hap_probs, bwd_hap_probs, genotyped_pos, ungenotyped_pos +): + """ + WARN: This function is part of an attempt to faithfully reimplement + the BEAGLE 4.1 algorithm. It is not complete and not used + in the current implementation. It is kept for documentation. + + Compute the interpolated allele probabilities at the ungenotyped markers. + + The forward and backward haplotype probability matrices are of size (m, 2). + The interpolated allele probability matrix is of size (x, 2). + We assume all biallelic sites, hence the 2. + + This implementation is based on Equation 2 (the rearranged one) in BB2016. + + :param numpy.ndarray fwd_hap_probs: Forward haplotype probability matrix. + :param numpy.ndarray bwd_hap_probs: Backward haplotype probability matrix. + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert (m, 2) == fwd_hap_probs.shape + assert (m, 2) == bwd_hap_probs.shape + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + assert m == len(genotyped_cm) + imputed_cm = convert_to_genetic_map_position(ungenotyped_pos) + assert x == len(imputed_cm) + weights, _ = get_weights(genotyped_cm, imputed_cm) + assert x == len(weights) + i_hap_probs = np.zeros((x, 2), dtype=np.float32) + for i in np.arange(x): + # Identify the genotyped markers k and k + 1 sandwiching ungenotyped marker i. + if ungenotyped_pos[i] < genotyped_pos[0]: + k = 0 + elif ungenotyped_pos[i] > genotyped_pos[-1]: + k = m - 2 + else: + k = np.max(np.where(ungenotyped_pos[i] > genotyped_pos)[0]) + # Vectorised over the allelic states + i_hap_probs[i, :] = weights[i] * bwd_hap_probs[k, :] + i_hap_probs[i, :] += (1 - weights[i]) * fwd_hap_probs[k, :] + return i_hap_probs + + +def compute_state_probability_matrix(fm, bm): + """ + Compute the HMM state probability matrix, in which each element is + the probability of a HMM state at a genotyped marker. + + Implementing this is simpler than faithfully reproducing the BEAGLE 4.1 algorithm. + + The output HMM state probability matrix is of size (m + 1, h). + + Note that the additional column at m + 1 is used for interpolation of + ungenotyped markers to the right of the last genotyped marker. + + The forward and backward probability matrices are both of size (m, h). + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + assert fm.shape == bm.shape + m = fm.shape[0] + h = bm.shape[1] + assert not np.any(fm < 0), "Forward probability matrix has negative values." + assert not np.any(np.isnan(fm)), "Forward probability matrix has NaN values." + assert not np.any(bm < 0), "Backward probability matrix has negative values." + assert not np.any(np.isnan(bm)), "Backward probability matrix has NaN values." + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float32) + sm[:-1, :] = np.multiply(fm, bm) + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return sm + + +def interpolate_allele_probabilities(sm, ref_h, genotyped_pos, ungenotyped_pos): + """ + Compute the interpolated allele probabilities at ungenotyped markers of + a query haplotype following Equation 1 of BB2016. + + The interpolated allele probability matrix is of size (x, a), + where a is the number of alleles. + + This function takes the output of `compute_state_probability_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m + 1, h). + 2. reference haplotypes subsetted to ungenotyped markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to ungenotyped markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions at ungenotyped markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + alleles = np.arange(4) # ACGT encoding + assert not np.any(sm < 0), "HMM state probability matrix has negative values." + assert not np.any(np.isnan(sm)), "HMM state probability matrix has NaN values." + m = sm.shape[0] - 1 + h = sm.shape[1] + assert m == len(genotyped_pos) + x = len(ungenotyped_pos) + assert (x, h) == ref_h.shape + weights, marker_interval_start = get_weights(genotyped_pos, ungenotyped_pos) + assert x == len(weights) + assert x == len(marker_interval_start) + p = np.zeros((x, len(alleles)), dtype=np.float32) + # Compute allele probabilities as per Equation 1 in BB2016. + for a in alleles: + # Going from left to right. + for i in np.arange(x): + # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. + k = marker_interval_start[i] + # Sum over the reference haplotypes where the allele is a at marker i. + has_a = ref_h[i, :] == a + p[i, a] += np.sum(weights[i] * sm[k, has_a]) + p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) + # Rescale probabilities. + rescaled_p = np.zeros_like(p) + for i in np.arange(x): + rescaled_p[i, :] = p[i, :] / np.sum(p[i, :]) + return rescaled_p + + +def get_map_alleles(allele_probs): + """ + Compute the maximum a posteriori alleles at ungenotyped markers of a query haplotype. + + The imputed alleles is an array of size x. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + + :param numpy.ndarray allele_probs: Interpolated allele probabilities. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + assert np.all(allele_probs >= 0), "Some allele probabilities are negative." + assert not np.any(np.isnan(allele_probs)), "Some allele probabilities are NaN." + imputed_alleles = np.argmax(allele_probs, axis=1) + max_allele_probs = np.max(allele_probs, axis=1) + assert len(imputed_alleles) == allele_probs.shape[0] + assert len(max_allele_probs) == allele_probs.shape[0] + return (imputed_alleles, max_allele_probs) + + +def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6, debug=False): + """ + Run a simplified version of the BEAGLE 4.1 imputation algorithm + based on Equation 1 of BB2016. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x), + respectively, + + The site positions of all the markers are in `pos`, of size (m + x). + + This produces imputed alleles and interpolated allele probabilities + at the ungenotyped markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :param bool debug: Whether to print intermediate results. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + # Set precision for printing. + if debug: + np.set_printoptions(precision=20) + assert ref_h.shape[0] == len(pos) + assert len(query_h) == len(pos) + # Indices of genotyped markers in the query haplotype. + genotyped_pos_idx = np.where(query_h != -1)[0] + assert len(genotyped_pos_idx) > 1 + # Indices of ungenotyped markers in the query haplotype. + ungenotyped_pos_idx = np.where(query_h == -1)[0] + assert len(ungenotyped_pos_idx) > 0 + # Site positions of genotyped markers. + genotyped_pos = pos[genotyped_pos_idx] + # Site positions of ungenotyped markers. + ungenotyped_pos = pos[ungenotyped_pos_idx] + h = ref_h.shape[1] + m = len(genotyped_pos) + x = len(ungenotyped_pos) + assert m + x == len(pos) + # Subset reference haplotypes to genotyped markers. + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + if debug: + print("Reference haplotypes subsetted to genotyped markers") + print(ref_h_genotyped) + assert (m, h) == ref_h_genotyped.shape + # Subset query haplotype to genotyped markers. + query_h_genotyped = query_h[genotyped_pos_idx] + if debug: + print("Query haplotype subsetted to genotyped markers") + print(query_h_genotyped) + assert m == len(query_h_genotyped) + # Set mismatch probabilities at genotyped markers. + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + if debug: + print("Mismatch probabilities") + print(mu) + assert m == len(mu) + # Set switch probabilities at genotyped markers. + rho = get_switch_prob(genotyped_pos, h, ne=ne) + if debug: + print("Switch probabilities") + print(rho) + assert m == len(rho) + # Compute forward probability matrix at genotyped markers. + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + if debug: + print("Forward probability matrix") + print(fm) + assert (m, h) == fm.shape + # Compute backward probability matrix at genotyped markers. + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + if debug: + print("Backward probability matrix") + print(bm) + assert (m, h) == bm.shape + # Compute HMM state probability matrix at genotyped markers. + sm = compute_state_probability_matrix(fm, bm) + if debug: + print("HMM state probability matrix") + print(sm) + assert (m + 1, h) == sm.shape + # Subset the reference haplotypes to ungenotyped markers. + ref_h_ungenotyped = ref_h[ungenotyped_pos_idx, :] + # Interpolate allele probabilities at ungenotyped markers. + alleles = np.arange(4) # ACGT + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_ungenotyped, genotyped_pos, ungenotyped_pos + ) + if debug: + print("Interpolated allele probabilities") + print(i_allele_probs) + assert (x, len(alleles)) == i_allele_probs.shape + # Get MAP alleles at ungenotyped markers. + imputed_alleles, max_allele_probs = get_map_alleles(i_allele_probs) + if debug: + print("Imputed alleles") + print(imputed_alleles) + print("Max allele probabilities") + print(max_allele_probs) + assert x == len(imputed_alleles) + assert x == len(max_allele_probs) + return (imputed_alleles, max_allele_probs) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py new file mode 100644 index 0000000000..a8b01e9868 --- /dev/null +++ b/python/tests/beagle_numba.py @@ -0,0 +1,373 @@ +""" +Implementation of the BEAGLE 4.1 algorithm to impute alleles. + +This is the numba-fied version of `beagle.py`. +""" +import numpy as np +from numba import njit + +import _tskit + + +@njit +def convert_to_genetic_map_position(pos): + """ + Convert genomic site positions (bp) to genetic map positions (cM). + + In BEAGLE 4.1, when a genetic map is not specified, it is assumed + that the recombination rate is constant (1 cM / 1 Mb). + + This trivial function is meant for documentation. + + :param numpy.ndarray pos: Site positions. + :return: Genetic map positions. + :rtype: numpy.ndarray + """ + return pos / 1e6 # Convert to Mb + + +@njit +def get_weights(genotyped_pos, ungenotyped_pos): + """ + Get weights at ungenotyped markers in the query haplotype, which are used in + linear interpolation of HMM state probabilities at ungenotyped markers. + + In BB2016 (see below Equation 1), a weight between genotyped markers m and m + 1 + at ungenotyped marker x is denoted lambda_m,x. + + lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], + where g(i) = genetic map position of marker i. + + :param numpy.ndarray genotyped_pos: Site positions of genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions of ungenotyped markers. + :return: Weights and marker interval start indices. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + m = len(genotyped_pos) + x = len(ungenotyped_pos) + genotyped_cm = convert_to_genetic_map_position(genotyped_pos) + ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) + # Calculate weights for ungenotyped markers. + weights = np.zeros(x, dtype=np.float32) + # Identify genotype markers k and k + 1 sandwiching ungenotyped marker i. + marker_interval_start = np.zeros(x, dtype=np.int32) + idx_m = m - 1 + # Going from right to left. + for idx_x in np.arange(x - 1, -1, -1): + while ungenotyped_pos[idx_x] < genotyped_pos[idx_m] and idx_m > 0: + idx_m = max(idx_m - 1, 0) + if idx_m == m - 1: + # Right of the last genotyped marker. + weights[idx_x] = 0.0 + elif idx_m == 0 and ungenotyped_pos[idx_x] < genotyped_pos[idx_m]: + # Left of the first genotyped marker. + weights[idx_x] = 1.0 + else: + # Between two genotyped markers. + cm_mP1_x = genotyped_cm[idx_m + 1] - ungenotyped_cm[idx_x] + cm_mP1_m = genotyped_cm[idx_m + 1] - genotyped_cm[idx_m] + # Set min genetic distance. + cm_mP1_m = cm_mP1_m if cm_mP1_m > 0.005 else 0.005 + weights[idx_x] = cm_mP1_x / cm_mP1_m + marker_interval_start[idx_x] = idx_m + return (weights, marker_interval_start) + + +def get_mismatch_prob(pos, miscall_rate): + """ + Set mismatch probabilities at genotyped markers. + + In BEAGLE, the mismatch probability is dominated by the allelic miscall rate. + By default, it is set to 0.0001 and capped at 0.50. + + :param numpy.ndarray pos: Site positions. + :param float miscall_rate: Allelic miscall rate. + :return: Mismatch probabilities. + :rtype: numpy.ndarray + """ + if miscall_rate >= 0.5: + miscall_rate = 0.5 + mu = np.zeros(len(pos), dtype=np.float32) + miscall_rate + return mu + + +def get_switch_prob(pos, h, ne): + """ + Set switch probabilities at genotyped markers. + + Note that in BEAGLE, the default value of `ne` is set to 1e6. + If `h` is small and `ne` is large, the switch probability is nearly 1. + In such cases, it may be desirable to set `ne` to a small value + to avoid switching between haplotypes all the time. + + :param numpy.ndarray pos: Site positions. + :param int h: Number of reference haplotypes. + :param float ne: Effective population size. + :return: Switch probabilities. + :rtype: numpy.ndarray + """ + # Get genetic distances between adjacent markers. + cm = convert_to_genetic_map_position(pos) + d = np.zeros(len(pos), dtype=np.float32) + d[0] = cm[0] + d[1:] = cm[1:] - cm[:-1] + rho = -np.expm1(-(4 * ne * d / h)) + return rho + + +@njit +def compute_forward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement forward algorithm to compute a forward probablity matrix of size (m, h). + + Reference haplotypes and the query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Forward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + fm = np.zeros((m, h), dtype=np.float32) + last_sum = 1.0 # Part of normalization factor + for i in np.arange(m): + # Get site-specific parameters + shift = rho[i] / h + scale = (1 - rho[i]) / last_sum + # Get allele at genotyped marker i on query haplotype j + query_a = query_h[i] + for j in np.arange(h): + # Get allele at genotyped marker i on reference haplotype j + ref_a = ref_h[i, j] + # Get emission probability + em_prob = mu[i] if query_a != ref_a else 1.0 - mu[i] + if m == 0: + fm[i, j] = em_prob + else: + fm[i, j] = em_prob * (scale * fm[i - 1, j] + shift) + site_sum = np.sum(fm[i, :]) + last_sum = site_sum / h if m == 0 else site_sum + return fm + + +@njit +def compute_backward_probability_matrix(ref_h, query_h, rho, mu): + """ + Implement backward algorithm to compute backward probablity matrix of size (m, h). + + Reference haplotypes and the query haplotype are subsetted to genotyped markers. + So, they are a matrix of size (m, h) and an array of size m, respectively. + + In BEAGLE 4.1, the values are kept one site at a time. + Here, we keep the values at all the sites. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray rho: Switch probabilities. + :param numpy.ndarray mu: Mismatch probabilities. + :return: Backward probability matrix. + :rtype: numpy.ndarray + """ + m = ref_h.shape[0] + h = ref_h.shape[1] + bm = np.zeros((m, h), dtype=np.float32) + bm[-1, :] = 1.0 / h # Initialise the last column + for i in np.arange(m - 2, -1, -1): + query_a = query_h[i + 1] + for j in np.arange(h): + ref_a = ref_h[i + 1, j] + em_prob = mu[i + 1] if ref_a != query_a else 1.0 - mu[i + 1] + # Note that BEAGLE keeps the values at one site at a time. + bm[i, j] = bm[i + 1, j] * em_prob + sum_site = np.sum(bm[i, :]) + scale = (1 - rho[i + 1]) / sum_site + shift = rho[i + 1] / h + bm[i, :] = scale * bm[i, :] + shift + return bm + + +def compute_state_probability_matrix(fm, bm): + """ + Compute the HMM state probability matrix, in which each element is + the probability of a HMM state at a genotyped marker. + + Implementing this is simpler than faithfully reproducing the BEAGLE 4.1 algorithm. + + The output HMM state probability matrix is of size (m + 1, h). + + Note that the additional column at m + 1 is used for interpolation of + ungenotyped markers to the right of the last genotyped marker. + + The forward and backward probability matrices are both of size (m, h). + + :param numpy.ndarray fm: Forward probability matrix. + :param numpy.ndarray bm: Backward probability matrix. + :return: HMM state probability matrix. + :rtype: numpy.ndarray + """ + m = fm.shape[0] + h = bm.shape[1] + # m + 1 columns to allow for imputed markers right of the last genotyped marker. + sm = np.zeros((m + 1, h), dtype=np.float32) + sm[:-1, :] = np.multiply(fm, bm) + sm[m, :] = sm[m - 1, :] # Not in BEAGLE + return sm + + +@njit +def interpolate_allele_probabilities(sm, ref_h, genotyped_pos, ungenotyped_pos): + """ + Compute the interpolated allele probabilities at ungenotyped markers of + a query haplotype following Equation 1 of BB2016. + + The interpolated allele probability matrix is of size (x, a), + where a is the number of alleles. + + This function takes the output of `compute_state_probability_matrix`. + + Note that this function takes: + 1. HMM state probability matrix across genotyped markers of size (m + 1, h). + 2. reference haplotypes subsetted to ungenotyped markers of size (x, h). + + :param numpy.ndarray sm: HMM state probability matrix at genotyped markers. + :param numpy.ndarray ref_h: Reference haplotypes subsetted to imputed markers. + :param numpy.ndarray genotyped_pos: Site positions at genotyped markers. + :param numpy.ndarray ungenotyped_pos: Site positions at ungenotyped markers. + :return: Interpolated allele probabilities. + :rtype: numpy.ndarray + """ + alleles = np.arange(4) # ACGT encoding + x = len(ungenotyped_pos) + weights, marker_interval_start = get_weights(genotyped_pos, ungenotyped_pos) + p = np.zeros((x, len(alleles)), dtype=np.float32) + for a in alleles: + for i in np.arange(x): + # Identify genotyped markers k and k + 1 sandwiching ungenotyped marker i. + k = marker_interval_start[i] + # Sum over the reference haplotypes where the allele is a at marker i. + has_a = ref_h[i, :] == a + p[i, a] += np.sum(weights[i] * sm[k, has_a]) + p[i, a] += np.sum((1 - weights[i]) * sm[k + 1, has_a]) + # Rescale probabilities. + rescaled_p = np.zeros_like(p) + for i in np.arange(x): + rescaled_p[i, :] = p[i, :] / np.sum(p[i, :]) + return rescaled_p + + +def get_map_alleles(allele_probs): + """ + Compute the maximum a posteriori alleles at ungenotyped markers of a query haplotype. + + The imputed alleles is an array of size x. + + WARN: If the allele probabilities are equal, then allele 0 is arbitrarily chosen. + + :param numpy.ndarray allele_probs: Interpolated allele probabilities. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + imputed_alleles = np.argmax(allele_probs, axis=1) + max_allele_probs = np.max(allele_probs, axis=1) + return (imputed_alleles, max_allele_probs) + + +def run_beagle(ref_h, query_h, pos, miscall_rate=0.0001, ne=1e6): + """ + Run a simplified version of the BEAGLE 4.1 imputation algorithm + based on Equation 1 of BB2016. + + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x), + respectively, + + The site positions of all the markers are in `pos`, of size (m + x). + + This produces imputed alleles and interpolated allele probabilities + at the ungenotyped markers of the query haplotype. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param float miscall_rate: Allelic miscall rate. + :param int ne: Effective population size. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + # Set indices of markers. + genotyped_pos_idx = np.where(query_h != -1)[0] + ungenotyped_pos_idx = np.where(query_h == -1)[0] + # Set site positions of markers. + genotyped_pos = pos[genotyped_pos_idx] + ungenotyped_pos = pos[ungenotyped_pos_idx] + # Subset haplotypes. + ref_h_genotyped = ref_h[genotyped_pos_idx, :] + ref_h_ungenotyped = ref_h[ungenotyped_pos_idx, :] + query_h_genotyped = query_h[genotyped_pos_idx] + # Set switch and mismatch probabilities at genotyped markers. + mu = get_mismatch_prob(genotyped_pos, miscall_rate=miscall_rate) + rho = get_switch_prob(genotyped_pos, h=ref_h.shape[1], ne=ne) + # Compute the HMM matrices at genotyped markers. + fm = compute_forward_probability_matrix(ref_h_genotyped, query_h_genotyped, rho, mu) + bm = compute_backward_probability_matrix( + ref_h_genotyped, query_h_genotyped, rho, mu + ) + sm = compute_state_probability_matrix(fm, bm) + # Interpolate allele probabilities at ungenotyped markers. + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_ungenotyped, genotyped_pos, ungenotyped_pos + ) + # Get MAP alleles at ungenotyped markers. + imputed_alleles, max_allele_probs = get_map_alleles(i_allele_probs) + return (imputed_alleles, max_allele_probs) + + +def run_tsimpute(ref_ts, query_h, pos, mu, rho): + """ + Run the BEAGLE algorithm except that the forward and backward probability + matrices are computed from a tree sequence. + + TODO: Put this function elsewhere. + + :param numpy.ndarray ref_h: Reference haplotypes. + :param numpy.ndarray query_h: One query haplotype. + :param numpy.ndarray pos: Site positions of all the markers. + :param numpy.ndarray mu: Mutation rate. + :param numpy.ndarray rho: Recombination rate. + :return: Imputed alleles and their associated probabilities. + :rtype: tuple(numpy.ndarray, numpy.ndarray) + """ + # Set indices of markers. + genotyped_site_idx = np.where(query_h != -1)[0] + ungenotyped_site_idx = np.where(query_h == -1)[0] + # Set site positions of markers. + genotyped_site_pos = pos[genotyped_site_idx] + ungenotyped_site_pos = pos[ungenotyped_site_idx] + # Get parameters at genotyped markers. + mu = mu[genotyped_site_idx] + rho = rho[genotyped_site_idx] + # Subset haplotypes. + ref_ts_genotyped = ref_ts.delete_sites(site_ids=ungenotyped_site_idx) + ref_ts_ungenotyped = ref_ts.delete_sites(site_ids=genotyped_site_idx) + ref_h_ungenotyped = ref_ts_ungenotyped.genotype_matrix() + query_h_genotyped = query_h[genotyped_site_idx] + # Get forward and backward matrices from tree sequence. + fm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence) + bm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence) + ls_hmm = _tskit.LsHmm( + ref_ts_genotyped._ll_tree_sequence, mu, rho, acgt_alleles=True + ) + ls_hmm.forward_matrix(query_h_genotyped.T, fm) + ls_hmm.backward_matrix(query_h_genotyped.T, fm.normalisation_factor, bm) + # Compute state probability matrix. + sm = compute_state_probability_matrix(fm.decode(), bm.decode()) + # Interpolate allele probabilities. + i_allele_probs = interpolate_allele_probabilities( + sm, ref_h_ungenotyped, genotyped_site_pos, ungenotyped_site_pos + ) + # Get MAP alleles at ungenotyped markers. + imputed_alleles, max_allele_probs = get_map_alleles(i_allele_probs) + return (imputed_alleles, max_allele_probs) diff --git a/python/tests/test_imputation.py b/python/tests/test_imputation.py new file mode 100644 index 0000000000..85380233f1 --- /dev/null +++ b/python/tests/test_imputation.py @@ -0,0 +1,708 @@ +""" +Tests for genotype imputation (forward and Baum-Welsh algorithms). +""" +import io + +import numpy as np +import pytest + +import tests.beagle +import tests.beagle_numba +import tskit + + +# The following toy query haplotypes were imputed from the toy reference haplotypes +# using BEAGLE 4.1 with Ne set to 10. +# +# There are 9 sites, starting from 10,000 to 90,000, with 10,000 increments. +# The REF is A and ALT is C for all the sites in the VCF input to BEAGLE. +# The ancestral state is A and derived state is C for all the sites. +# In this setup, A is encoded as 0 and C as 1 whether the allele encoding is +# ancestral/derived (0/1) or ACGT (0123). +# +# TODO: Add more test cases covering imputation of all the ACGT alleles. +# +# Case 0: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_0 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_0 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_0_beagle_allele_probs = np.array( + # Imputed markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0, 1.0, 1.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_0.shape == (9, 2) +assert toy_query_0.shape == (2, 9) +assert toy_query_0_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_0[0] == 0) == 4 +assert np.sum(toy_query_0[0] == -1) == 5 +assert np.array_equal(toy_query_0[0], toy_query_0[1]) +assert np.all(toy_query_0_beagle_imputed[0] == 0) +assert np.all(toy_query_0_beagle_imputed[1] == 0) +assert toy_query_0_beagle_allele_probs.shape == (2, 2, 5) + +# Case 1: +# Reference haplotypes and query haplotypes have 1|1 at all sites. +# First and last reference markers are missing in the query haplotypes. +# fmt: off +toy_ref_1 = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_1 = np.array( + [ + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_imputed = np.array( + [ + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_1_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + # query_0, haplotype 1 + [ + [0.0, 0.0, 0.0, 0.0, 0.0], + [1.0, 1.0, 1.0, 1.0, 1.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_1.shape == (9, 2) +assert toy_query_1.shape == (2, 9) +assert toy_query_1_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_1[0] == 1) == 4 +assert np.sum(toy_query_1[0] == -1) == 5 +assert np.array_equal(toy_query_1[0], toy_query_1[1]) +assert np.all(toy_query_1_beagle_imputed[0] == 1) +assert np.all(toy_query_1_beagle_imputed[1] == 1) +assert toy_query_1_beagle_allele_probs.shape == (2, 2, 5) + +# Case 2: +# Reference haplotypes and query haplotypes have 0|0 at all sites. +# First and last reference markers are genotyped in the query haplotypes. +# fmt: off +toy_ref_2 = np.copy(toy_ref_0) +toy_query_2 = np.array( + [ + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 0 + [0, 0, -1, 0, -1, 0, -1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_2_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + # query_0, haplotype 1 + [ + [1.0, 1.0, 1.0], + [0.0, 0.0, 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_2.shape == (9, 2) +assert toy_query_2.shape == (2, 9) +assert toy_query_2_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_2[0] == 0) == 6 +assert np.sum(toy_query_2[0] == -1) == 3 +assert np.array_equal(toy_query_2[0], toy_query_2[1]) +assert np.all(toy_query_2_beagle_imputed[0] == 0) +assert np.all(toy_query_2_beagle_imputed[1] == 0) +assert toy_query_2_beagle_allele_probs.shape == (2, 2, 3) + +# Case 3: +# Reference haplotypes and query haplotypes have 0|1 at all sites. +# fmt: off +toy_ref_3 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_3 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_3_beagle_allele_probs = np.array( + # Ungenotyped markers only + # See comment https://github.com/tskit-dev/tskit/pull/2815#issuecomment-1708178257 + [ + # query_0, haplotype 0 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.0], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_3.shape == (9, 2) +assert toy_query_3.shape == (2, 9) +assert toy_query_3_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_3[0] == 0) == 4 +assert np.sum(toy_query_3[1] == 1) == 4 +assert np.sum(toy_query_3[0] == -1) == np.sum(toy_query_3[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_3_beagle_imputed[0], toy_query_3_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 4: +# Reference haplotypes and query haplotypes have alternating 0|1 and 1|0 genotypes. +# Query haplotypes have 0|1 at all genotyped sites. +# fmt: off +toy_ref_4 = np.array( + [ + [0, 1, 0, 1, 0, 1, 0, 1, 0], # ref_0, haplotype 0 + [1, 0, 1, 0, 1, 0, 1, 0, 1], # ref_0, haplotype 1 + ], dtype=np.int32 +).T +toy_query_4 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_4_beagle_imputed = np.array( + [ + [1, 0, 1, 0, 1, 0, 1, 0, 1], # query_0, haplotype 0 + [0, 1, 0, 1, 0, 1, 0, 1, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_4_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999999, 1.0, 0.9999999, 0.0], + [0.0 , 0.0 , 0.0, 0.0 , 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_4.shape == (9, 2) +assert toy_query_4.shape == (2, 9) +assert toy_query_4_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_4[0] == 0) == 4 +assert np.sum(toy_query_4[1] == 1) == 4 +assert np.sum(toy_query_4[0] == -1) == np.sum(toy_query_4[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_4_beagle_imputed[0], toy_query_4_beagle_imputed[1])) +) +assert toy_query_3_beagle_allele_probs.shape == (2, 2, 5) + +# Case 5: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have one recombination breakpoint in the middle. +# fmt: off +toy_ref_5 = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [0, 0, 0, 0, 0, 0, 0, 0, 0], # ref_0 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + [1, 1, 1, 1, 1, 1, 1, 1, 1], # ref_1 + ], dtype=np.int32 +).T +toy_query_5 = np.array( + [ + [-1, 0, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 0 + [-1, 1, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_5_beagle_imputed = np.array( + [ + [0, 0, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 0 + [1, 1, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_5_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + ], + # query_0, haplotype 1 + [ + [0.0 , 0.0 , 0.0 , 0.4545909, 0.9090909], + [0.9999999, 0.99999994, 0.9999546, 0.5454091, 0.09090912], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_5.shape == (9, 4) +assert toy_query_5.shape == (2, 9) +assert toy_query_5_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_5[0] == 0) == 3 +assert np.sum(toy_query_5[0] == 1) == 1 +assert np.sum(toy_query_5[0] == -1) == 5 +assert np.sum(toy_query_5[1] == 0) == 1 +assert np.sum(toy_query_5[1] == 1) == 3 +assert np.sum(toy_query_5[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_5_beagle_imputed[0], toy_query_5_beagle_imputed[1])) +) +assert toy_query_5_beagle_allele_probs.shape == (2, 2, 5) + +# Case 6: +# The reference panel has two individuals. The first individual has 0|0 at all sites, +# and the second individual has 1|1 at all sites. +# The query haplotypes have two recombination breakpoints in the middle. +# fmt: off +toy_ref_6 = np.copy(toy_ref_5) +toy_query_6 = np.array( + [ + [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_6_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 1, 0, 0], # query_0, haplotype 0 + [1, 1, 0, 0, 0, 0, 0, 1, 1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_6_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + [0.09016396, 0.53229386, 0.97451866 , 0.5316114 , 0.088609315], + ], + # query_0, haplotype 1 + [ + [0.09016395, 0.53229386, 0.97451866 , 0.5316114 , 0.0886093], + [0.90983605, 0.46770614, 0.025481388, 0.46838862, 0.91139066], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_6.shape == (9, 4) +assert toy_query_6.shape == (2, 9) +assert toy_query_6_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_6[0] == 0) == 2 +assert np.sum(toy_query_6[0] == 1) == 2 +assert np.sum(toy_query_6[0] == -1) == 5 +assert np.sum(toy_query_6[1] == 0) == 2 +assert np.sum(toy_query_6[1] == 1) == 2 +assert np.sum(toy_query_6[1] == -1) == 5 +assert np.all( + np.invert(np.equal(toy_query_6_beagle_imputed[0], toy_query_6_beagle_imputed[1])) +) +assert toy_query_6_beagle_allele_probs.shape == (2, 2, 5) + +# Case 7: +# Hand-crafted example. +# fmt: off +toy_ref_7 = np.array( + [ + [0, 0, 0, 1, 1, 0, 0, 0, 1], # ref_0 + [1, 0, 0, 0, 1, 0, 0, 0, 1], # ref_0 + [0, 0, 1, 1, 1, 0, 0, 0, 1], # ref_1 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # ref_1 + [0, 0, 0, 0, 0, 1, 1, 1, 0], # ref_2 + [1, 0, 0, 0, 1, 0, 0, 0, 1], # ref_2 + [0, 0, 1, 1, 1, 0, 0, 0, 1], # ref_3 + [0, 0, 0, 0, 0, 0, 0, 0, 1], # ref_3 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # ref_4 + [0, 0, 0, 0, 0, 1, 1, 1, 0], # ref_4 + ], dtype=np.int32 +).T +toy_query_7 = np.array( + [ + [0, 0, -1, 1, -1, 1, -1, 0, 1], # query_0, haplotype 0 + [1, 1, -1, 0, -1, 0, -1, 1, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_7_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 0, 0, 1], # query_0, haplotype 0 + [1, 1, 0, 0, 0, 0, 0, 1, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_7_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.33317572, 0.0 , 0.99945664], + [0.66635144, 0.9991437, 0.0], + ], + # query_0, haplotype 1 + [ + [0.9995998, 0.99971414, 0.99974835], + [0.0 , 0.0 , 0.0], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_7.shape == (9, 10) +assert toy_query_7.shape == (2, 9) +assert toy_query_7_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_7[0] == -1) == np.sum(toy_query_7[1] == -1) == 3 +assert toy_query_7_beagle_allele_probs.shape == (2, 2, 3) + +# Case 8: +# Same as case 7 except the last genotyped marker is missing. +# fmt: off +toy_ref_8 = np.copy(toy_ref_7) +toy_query_8 = np.array( + [ + [-1, 0, -1, 1, -1, 1, -1, 0, -1], # query_0, haplotype 0 + [-1, 1, -1, 0, -1, 0, -1, 1, -1], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_imputed = np.array( + [ + [0, 0, 1, 1, 1, 1, 0, 0, 1], # query_0, haplotype 0 + [0, 1, 0, 0, 0, 0, 0, 1, 0], # query_0, haplotype 1 + ], dtype=np.int32 +) +toy_query_8_beagle_allele_probs = np.array( + # Ungenotyped markers only + [ + # query_0, haplotype 0 + [ + [0.9997735, 0.33310473, 0.0 , 0.9992305, 0.9997736], + [0.0 , 0.66620946, 0.99893093, 0.0 , 0.0], + ], + # query_0, haplotype 1 + [ + [0.9999998, 0.9999998, 0.9999998, 0.99991965, 0.0], + [0.0 , 0.0 , 0.0 , 0.0 , 0.9999998], + ], + ], + dtype=np.float32, +) +# fmt: on +assert toy_ref_8.shape == (9, 10) +assert toy_query_8.shape == (2, 9) +assert toy_query_8_beagle_imputed.shape == (2, 9) +assert np.sum(toy_query_8[0] == -1) == 5 +assert np.sum(toy_query_8[1] == -1) == 5 +assert toy_query_8_beagle_allele_probs.shape == (2, 2, 5) + + +@pytest.mark.parametrize( + "input_ref,input_query,expected", + [ + (toy_ref_0, toy_query_0, toy_query_0_beagle_imputed), + (toy_ref_1, toy_query_1, toy_query_1_beagle_imputed), + (toy_ref_2, toy_query_2, toy_query_2_beagle_imputed), + (toy_ref_3, toy_query_3, toy_query_3_beagle_imputed), + (toy_ref_4, toy_query_4, toy_query_4_beagle_imputed), + (toy_ref_5, toy_query_5, toy_query_5_beagle_imputed), + (toy_ref_6, toy_query_6, toy_query_6_beagle_imputed), + (toy_ref_7, toy_query_7, toy_query_7_beagle_imputed), + (toy_ref_8, toy_query_8, toy_query_8_beagle_imputed), + ], +) +def test_compare_imputed_alleles(input_ref, input_query, expected): + """Compare with imputed alleles from BEAGLE 4.1.""" + assert input_query.shape == expected.shape + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + num_imputed_markers = np.sum(input_query[0] == -1) # x + imputed_alleles = np.zeros((num_query_haps, num_imputed_markers), dtype=int) + expected_subset = expected[:, input_query[0] == -1] # Subsetted to imputed markers + assert expected_subset.shape == imputed_alleles.shape + for i in np.arange(num_query_haps): + imputed_alleles[i], _ = tests.beagle.run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + np.testing.assert_array_equal(imputed_alleles[i], expected_subset[i]) + + +@pytest.mark.parametrize( + "input_ref,input_query", + [ + (toy_ref_0, toy_query_0), + (toy_ref_1, toy_query_1), + (toy_ref_2, toy_query_2), + (toy_ref_3, toy_query_3), + (toy_ref_4, toy_query_4), + (toy_ref_5, toy_query_5), + (toy_ref_6, toy_query_6), + (toy_ref_7, toy_query_7), + (toy_ref_8, toy_query_8), + ], +) +def test_beagle_numba(input_ref, input_query): + pos = (np.arange(9) + 1) * 1e4 + num_query_haps = input_query.shape[0] + for i in np.arange(num_query_haps): + imputed_alleles, allele_probs = tests.beagle.run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + imputed_alleles_numba, allele_probs_numba = tests.beagle_numba.run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + np.testing.assert_array_equal(imputed_alleles, imputed_alleles_numba) + np.testing.assert_allclose(allele_probs, allele_probs_numba) + + +# Below is toy data set case 7 in tree sequence format. +toy_ts_nodes_text = """\ +id is_sample time population individual metadata +0 1 0.000000 0 0 +1 1 0.000000 0 0 +2 1 0.000000 0 1 +3 1 0.000000 0 1 +4 1 0.000000 0 2 +5 1 0.000000 0 2 +6 1 0.000000 0 3 +7 1 0.000000 0 3 +8 1 0.000000 0 4 +9 1 0.000000 0 4 +10 0 0.009923 0 -1 +11 0 0.038603 0 -1 +12 0 0.057935 0 -1 +13 0 0.145141 0 -1 +14 0 0.238045 0 -1 +15 0 0.528344 0 -1 +16 0 0.646418 0 -1 +17 0 1.462199 0 -1 +18 0 2.836600 0 -1 +19 0 3.142225 0 -1 +20 0 4.056253 0 -1 +""" + +toy_ts_edges_text = """\ +left right parent child metadata +0.000000 100000.000000 10 1 +0.000000 100000.000000 10 5 +0.000000 100000.000000 11 3 +0.000000 100000.000000 11 8 +0.000000 100000.000000 12 2 +0.000000 100000.000000 12 6 +0.000000 100000.000000 13 0 +0.000000 100000.000000 13 12 +0.000000 100000.000000 14 10 +0.000000 100000.000000 14 13 +0.000000 40443.000000 15 9 +0.000000 40443.000000 15 14 +40443.000000 100000.000000 16 4 +40443.000000 100000.000000 16 9 +0.000000 40443.000000 17 4 +0.000000 100000.000000 17 11 +40443.000000 100000.000000 17 16 +0.000000 12721.000000 18 7 +0.000000 12721.000000 18 17 +12721.000000 100000.000000 19 7 +40443.000000 100000.000000 19 14 +12721.000000 40443.000000 19 15 +0.000000 12721.000000 20 15 +12721.000000 100000.000000 20 17 +0.000000 12721.000000 20 18 +12721.000000 100000.000000 20 19 +""" + +toy_ts_sites_text = """\ +position ancestral_state metadata +10000.000000 A +20000.000000 A +30000.000000 A +40000.000000 A +50000.000000 A +60000.000000 A +70000.000000 A +80000.000000 A +90000.000000 A +""" + +toy_ts_mutations_text = """\ +site node time derived_state parent metadata +0 10 unknown C -1 +1 11 unknown C -1 +2 12 unknown C -1 +3 13 unknown C -1 +4 14 unknown C -1 +5 16 unknown C -1 +6 16 unknown C -1 +7 17 unknown C -1 +8 19 unknown C -1 +""" + +toy_ts_individuals_text = """\ +flags +0 +0 +0 +0 +0 +""" + + +def get_toy_data(): + """Get toy example 7 in tree sequence format.""" + ts = tskit.load_text( + nodes=io.StringIO(toy_ts_nodes_text), + edges=io.StringIO(toy_ts_edges_text), + sites=io.StringIO(toy_ts_sites_text), + mutations=io.StringIO(toy_ts_mutations_text), + individuals=io.StringIO(toy_ts_individuals_text), + strict=False, + ) + return ts + + +def parse_matrix(csv_text): + # TODO: Maybe remove. + # This returns a record array, which is essentially the same as a + # pandas dataframe, which we can access via df["m"] etc. + return np.lib.npyio.recfromcsv(io.StringIO(csv_text)) + + +@pytest.mark.parametrize( + "input_ref,input_query", + [ + (toy_ref_7, toy_query_7), + (toy_ref_8, toy_query_8), + ], +) +def test_tsimpute(input_ref, input_query): + """ + Test whether the outputs of tsimpute and Python BEAGLE implementation match. + """ + # TODO: Compare interpolated allele probabilities. + toy_ref_ts = get_toy_data() # Same for both cases + pos = toy_ref_ts.sites_position + num_query_haps = input_query.shape[0] + mu = np.zeros(len(pos), dtype=np.float32) + 1e-8 + rho = np.zeros(len(pos), dtype=np.float32) + 1e-8 + for i in np.arange(num_query_haps): + imputed_alleles, _ = tests.beagle.run_beagle( + input_ref, input_query[i], pos, miscall_rate=0.0001, ne=10.0 + ) + imputed_alleles_ts, _ = tests.beagle_numba.run_tsimpute( + toy_ref_ts, input_query[i], pos, mu, rho + ) + np.testing.assert_array_equal(imputed_alleles, imputed_alleles_ts) + + +# Tests for helper functions. +@pytest.mark.parametrize( + "genotyped_pos,ungenotyped_pos,expected_weights,expected_idx", + [ + # All ungenotyped markers are between genotyped markers. + ( + np.array([10, 20]) * 1e6, + np.array([15]) * 1e6, + np.array([0.5]), + np.array([0]), + ), + # Same as above, but more genotyped markers. + ( + np.array([10, 20, 30, 40]) * 1e6, + np.array([15, 25, 35]) * 1e6, + np.array([0.5, 0.5, 0.5]), + np.array([0, 1, 2]), + ), + # Ungenotyped markers are left of the first genotyped marker. + ( + np.array([10, 20]) * 1e6, + np.array([5, 15]) * 1e6, + np.array([1.0, 0.5]), + np.array([0, 0]), + ), + # Ungenotyped markers are right of the last genotyped marker. + ( + np.array([10, 20]) * 1e6, + np.array([15, 25]) * 1e6, + np.array([0.5, 0.0]), + np.array([0, 1]), + ), + # Denominator below min threshold. + (np.array([10, 20]), np.array([15]), np.array([0.001]), np.array([0])), + ], +) +def test_get_weights(genotyped_pos, ungenotyped_pos, expected_weights, expected_idx): + actual_weights, actual_idx = tests.beagle.get_weights( + genotyped_pos, ungenotyped_pos + ) + np.testing.assert_allclose(actual_weights, expected_weights) + np.testing.assert_array_equal(actual_idx, expected_idx) + actual_weights, actual_idx = tests.beagle_numba.get_weights( + genotyped_pos, ungenotyped_pos + ) + np.testing.assert_allclose(actual_weights, expected_weights) + np.testing.assert_array_equal(actual_idx, expected_idx)