From aad036def08210e816fa4c0360a11ecac9491757 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Sun, 14 Jan 2024 09:53:10 +0000 Subject: [PATCH] Add GeneticMap class and refactor --- python/tests/beagle_numba.py | 98 ++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py index dd9544dc75..fec799680b 100644 --- a/python/tests/beagle_numba.py +++ b/python/tests/beagle_numba.py @@ -6,6 +6,8 @@ This is the numba-fied version of `beagle.py`. """ +from dataclasses import dataclass + import numpy as np from numba import njit @@ -13,7 +15,22 @@ import tskit -def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None): +@dataclass +class GeneticMap: + """ + A genetic map having: + 1. Physical positions (bp). + 2. Genetic map positions (cM). + """ + + base_pos: np.ndarray + gen_pos: np.ndarray + + def __post_init__(self): + assert len(self.base_pos) == len(self.gen_pos) + + +def convert_to_genetic_map_positions(pos, genetic_map=None): """ Convert physical positions (bp) to genetic map positions (cM). @@ -26,29 +43,31 @@ def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=No See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details. :param numpy.ndarray pos: Physical positions (bp). - :param numpy.ndarray genetic_map_pos: Physical positions of genetic map. - :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map. + :param GeneticMap genetic_map: Genetic map. :return: Genetic map positions (cM). :rtype: numpy.ndarray """ - if genetic_map_pos is None or genetic_map_cm is None: + if genetic_map is None: return pos / 1e6 - # TODO: Better way to pass the components of a genetic map. - base_pos = genetic_map_pos - gen_pos = genetic_map_cm - assert len(base_pos) == len(gen_pos), "Bad genetic map." - assert np.all(pos >= base_pos[0]) and np.all( - pos <= base_pos[-1] + assert np.all(pos >= genetic_map.base_pos[0]) and np.all( + pos <= genetic_map.base_pos[-1] ), "Physical positions outside of genetic map." - left_idx = np.searchsorted(base_pos, pos, side="right") + # Approximate genetic map distances between adjacent base positions. + left_idx = np.searchsorted(genetic_map.base_pos, pos, side="right") xMa = np.zeros(len(pos), dtype=np.float32) bMa = np.zeros(len(pos), dtype=np.float32) fbMfa = np.zeros(len(pos), dtype=np.float32) for i in range(len(pos)): - xMa[i] = pos[i] - base_pos[left_idx[i] - 1] - bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1] - fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1] - return xMa / bMa * fbMfa + xMa[i] = pos[i] - genetic_map.base_pos[left_idx[i] - 1] + bMa[i] = ( + genetic_map.base_pos[left_idx[i]] - genetic_map.base_pos[left_idx[i] - 1] + ) + fbMfa[i] = ( + genetic_map.gen_pos[left_idx[i]] - genetic_map.gen_pos[left_idx[i] - 1] + ) + est_cM_diff = xMa / bMa * fbMfa + # TODO: Calculate cumulative genetic map distances. + return est_cM_diff @njit @@ -338,19 +357,17 @@ def run_beagle( pos, miscall_rate=1e-4, ne=1e6, - genetic_map_pos=None, - genetic_map_cm=None, + genetic_map=None, ): """ Run a simplified version of the BEAGLE 4.1 imputation algorithm based on Equation 1 of BB2016. - Reference haplotypes and query haplotype are of size (m + x, h) and (m + x), - respectively, + Reference haplotypes and query haplotype are of size (m + x, h) and (m + x). - The site positions of all the markers are an array of size (m + x). + The physical positions of all the markers are an array of size (m + x). - This produces imputed alleles and interpolated allele probabilities + This produces imputed alleles and their interpolated probabilities at the ungenotyped markers of the query haplotype. :param numpy.ndarray ref_h: Reference haplotypes. @@ -358,8 +375,7 @@ def run_beagle( :param numpy.ndarray pos: Physical positions of all the markers. :param float miscall_rate: Allelic miscall rate. :param int ne: Effective population size. - :param numpy.ndarray genetic_map_pos: Physical positions of genetic map. - :param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map. + :param GeneticMap genetic_map: Genetic map. :return: Imputed alleles and their interpolated probabilities. :rtype: tuple(numpy.ndarray, numpy.ndarray) """ @@ -370,16 +386,8 @@ def run_beagle( genotyped_pos = pos[genotyped_idx] ungenotyped_pos = pos[ungenotyped_idx] # Get genetic map positions of markers. - if genetic_map_pos is None or genetic_map_cm is None: - genotyped_cm = convert_to_genetic_map_position(genotyped_pos) - ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) - else: - genotyped_cm = convert_to_genetic_map_position( - genotyped_pos, genetic_map_pos, genetic_map_cm - ) - ungenotyped_cm = convert_to_genetic_map_position( - ungenotyped_pos, genetic_map_pos, genetic_map_cm - ) + genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map) + ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map) # Subset haplotypes. ref_h_genotyped = ref_h[genotyped_idx, :] ref_h_ungenotyped = ref_h[ungenotyped_idx, :] @@ -413,8 +421,7 @@ def run_tsimpute( mu, rho, precision=22, - genetic_map_pos=None, - genetic_map_cm=None, + genetic_map=None, ): """ Run the BEAGLE 4.1 algorithm except that the forward and backward probability @@ -441,16 +448,8 @@ def run_tsimpute( genotyped_pos = pos[genotyped_idx] ungenotyped_pos = pos[ungenotyped_idx] # Get genetic map positions of markers. - if genetic_map_pos is None or genetic_map_cm is None: - genotyped_cm = convert_to_genetic_map_position(genotyped_pos) - ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos) - else: - genotyped_cm = convert_to_genetic_map_position( - genotyped_pos, genetic_map_pos, genetic_map_cm - ) - ungenotyped_cm = convert_to_genetic_map_position( - ungenotyped_pos, genetic_map_pos, genetic_map_cm - ) + genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map) + ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map) # Get parameters at genotyped markers. mu = mu[genotyped_idx] rho = rho[genotyped_idx] @@ -459,10 +458,11 @@ def run_tsimpute( ref_ts_ungenotyped = ref_ts.delete_sites(site_ids=genotyped_idx) ref_h_ungenotyped = ref_ts_ungenotyped.genotype_matrix(alleles=tskit.ALLELES_ACGT) query_h_genotyped = query_h[genotyped_idx] - # Get forward and backward matrices from tree sequence. + # Get matrices from tree sequence. fm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence) bm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence) - # WARN: Be careful with the position arguments rho and mu!!! + # WARN: Be careful with the positional arguments rho and mu!!! + # WARN: Be careful with the argument `acgt_alleles`!!! ls_hmm = _tskit.LsHmm( ref_ts_genotyped._ll_tree_sequence, rho, @@ -472,9 +472,8 @@ def run_tsimpute( ) ls_hmm.forward_matrix(query_h_genotyped.T, fm) ls_hmm.backward_matrix(query_h_genotyped.T, fm.normalisation_factor, bm) - # Compute HMM state probability matrix. sm = compute_state_prob_matrix(fm.decode(), bm.decode()) - # Interpolate allele probabilities. + # Perform interpolation. i_allele_prob = interpolate_allele_prob( sm, ref_h_ungenotyped, @@ -483,6 +482,5 @@ def run_tsimpute( genotyped_cm, ungenotyped_cm, ) - # Get MAP alleles at ungenotyped markers. imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob) return (imputed_alleles, max_allele_prob)