Skip to content

Commit

Permalink
Add GeneticMap class and refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Jan 14, 2024
1 parent 39ab9a4 commit aad036d
Showing 1 changed file with 48 additions and 50 deletions.
98 changes: 48 additions & 50 deletions python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,31 @@
This is the numba-fied version of `beagle.py`.
"""
from dataclasses import dataclass

import numpy as np
from numba import njit

import _tskit
import tskit


def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=None):
@dataclass
class GeneticMap:
"""
A genetic map having:
1. Physical positions (bp).
2. Genetic map positions (cM).
"""

base_pos: np.ndarray
gen_pos: np.ndarray

def __post_init__(self):
assert len(self.base_pos) == len(self.gen_pos)


def convert_to_genetic_map_positions(pos, genetic_map=None):
"""
Convert physical positions (bp) to genetic map positions (cM).
Expand All @@ -26,29 +43,31 @@ def convert_to_genetic_map_position(pos, genetic_map_pos=None, genetic_map_cm=No
See `PlinkGenMap.java` in the BEAGLE 4.1 source code for details.
:param numpy.ndarray pos: Physical positions (bp).
:param numpy.ndarray genetic_map_pos: Physical positions of genetic map.
:param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map.
:param GeneticMap genetic_map: Genetic map.
:return: Genetic map positions (cM).
:rtype: numpy.ndarray
"""
if genetic_map_pos is None or genetic_map_cm is None:
if genetic_map is None:
return pos / 1e6
# TODO: Better way to pass the components of a genetic map.
base_pos = genetic_map_pos
gen_pos = genetic_map_cm
assert len(base_pos) == len(gen_pos), "Bad genetic map."
assert np.all(pos >= base_pos[0]) and np.all(
pos <= base_pos[-1]
assert np.all(pos >= genetic_map.base_pos[0]) and np.all(
pos <= genetic_map.base_pos[-1]
), "Physical positions outside of genetic map."
left_idx = np.searchsorted(base_pos, pos, side="right")
# Approximate genetic map distances between adjacent base positions.
left_idx = np.searchsorted(genetic_map.base_pos, pos, side="right")
xMa = np.zeros(len(pos), dtype=np.float32)
bMa = np.zeros(len(pos), dtype=np.float32)
fbMfa = np.zeros(len(pos), dtype=np.float32)
for i in range(len(pos)):
xMa[i] = pos[i] - base_pos[left_idx[i] - 1]
bMa[i] = base_pos[left_idx[i]] - base_pos[left_idx[i] - 1]
fbMfa[i] = gen_pos[left_idx[i]] - gen_pos[left_idx[i] - 1]
return xMa / bMa * fbMfa
xMa[i] = pos[i] - genetic_map.base_pos[left_idx[i] - 1]
bMa[i] = (
genetic_map.base_pos[left_idx[i]] - genetic_map.base_pos[left_idx[i] - 1]
)
fbMfa[i] = (
genetic_map.gen_pos[left_idx[i]] - genetic_map.gen_pos[left_idx[i] - 1]
)
est_cM_diff = xMa / bMa * fbMfa
# TODO: Calculate cumulative genetic map distances.
return est_cM_diff


@njit
Expand Down Expand Up @@ -338,28 +357,25 @@ def run_beagle(
pos,
miscall_rate=1e-4,
ne=1e6,
genetic_map_pos=None,
genetic_map_cm=None,
genetic_map=None,
):
"""
Run a simplified version of the BEAGLE 4.1 imputation algorithm
based on Equation 1 of BB2016.
Reference haplotypes and query haplotype are of size (m + x, h) and (m + x),
respectively,
Reference haplotypes and query haplotype are of size (m + x, h) and (m + x).
The site positions of all the markers are an array of size (m + x).
The physical positions of all the markers are an array of size (m + x).
This produces imputed alleles and interpolated allele probabilities
This produces imputed alleles and their interpolated probabilities
at the ungenotyped markers of the query haplotype.
:param numpy.ndarray ref_h: Reference haplotypes.
:param numpy.ndarray query_h: One query haplotype.
:param numpy.ndarray pos: Physical positions of all the markers.
:param float miscall_rate: Allelic miscall rate.
:param int ne: Effective population size.
:param numpy.ndarray genetic_map_pos: Physical positions of genetic map.
:param numpy.ndarray genetic_map_cm: Genetic map positions of genetic map.
:param GeneticMap genetic_map: Genetic map.
:return: Imputed alleles and their interpolated probabilities.
:rtype: tuple(numpy.ndarray, numpy.ndarray)
"""
Expand All @@ -370,16 +386,8 @@ def run_beagle(
genotyped_pos = pos[genotyped_idx]
ungenotyped_pos = pos[ungenotyped_idx]
# Get genetic map positions of markers.
if genetic_map_pos is None or genetic_map_cm is None:
genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
else:
genotyped_cm = convert_to_genetic_map_position(
genotyped_pos, genetic_map_pos, genetic_map_cm
)
ungenotyped_cm = convert_to_genetic_map_position(
ungenotyped_pos, genetic_map_pos, genetic_map_cm
)
genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map)
ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map)
# Subset haplotypes.
ref_h_genotyped = ref_h[genotyped_idx, :]
ref_h_ungenotyped = ref_h[ungenotyped_idx, :]
Expand Down Expand Up @@ -413,8 +421,7 @@ def run_tsimpute(
mu,
rho,
precision=22,
genetic_map_pos=None,
genetic_map_cm=None,
genetic_map=None,
):
"""
Run the BEAGLE 4.1 algorithm except that the forward and backward probability
Expand All @@ -441,16 +448,8 @@ def run_tsimpute(
genotyped_pos = pos[genotyped_idx]
ungenotyped_pos = pos[ungenotyped_idx]
# Get genetic map positions of markers.
if genetic_map_pos is None or genetic_map_cm is None:
genotyped_cm = convert_to_genetic_map_position(genotyped_pos)
ungenotyped_cm = convert_to_genetic_map_position(ungenotyped_pos)
else:
genotyped_cm = convert_to_genetic_map_position(
genotyped_pos, genetic_map_pos, genetic_map_cm
)
ungenotyped_cm = convert_to_genetic_map_position(
ungenotyped_pos, genetic_map_pos, genetic_map_cm
)
genotyped_cm = convert_to_genetic_map_positions(genotyped_pos, genetic_map)
ungenotyped_cm = convert_to_genetic_map_positions(ungenotyped_pos, genetic_map)
# Get parameters at genotyped markers.
mu = mu[genotyped_idx]
rho = rho[genotyped_idx]
Expand All @@ -459,10 +458,11 @@ def run_tsimpute(
ref_ts_ungenotyped = ref_ts.delete_sites(site_ids=genotyped_idx)
ref_h_ungenotyped = ref_ts_ungenotyped.genotype_matrix(alleles=tskit.ALLELES_ACGT)
query_h_genotyped = query_h[genotyped_idx]
# Get forward and backward matrices from tree sequence.
# Get matrices from tree sequence.
fm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence)
bm = _tskit.CompressedMatrix(ref_ts_genotyped._ll_tree_sequence)
# WARN: Be careful with the position arguments rho and mu!!!
# WARN: Be careful with the positional arguments rho and mu!!!
# WARN: Be careful with the argument `acgt_alleles`!!!
ls_hmm = _tskit.LsHmm(
ref_ts_genotyped._ll_tree_sequence,
rho,
Expand All @@ -472,9 +472,8 @@ def run_tsimpute(
)
ls_hmm.forward_matrix(query_h_genotyped.T, fm)
ls_hmm.backward_matrix(query_h_genotyped.T, fm.normalisation_factor, bm)
# Compute HMM state probability matrix.
sm = compute_state_prob_matrix(fm.decode(), bm.decode())
# Interpolate allele probabilities.
# Perform interpolation.
i_allele_prob = interpolate_allele_prob(
sm,
ref_h_ungenotyped,
Expand All @@ -483,6 +482,5 @@ def run_tsimpute(
genotyped_cm,
ungenotyped_cm,
)
# Get MAP alleles at ungenotyped markers.
imputed_alleles, max_allele_prob = get_map_alleles(i_allele_prob)
return (imputed_alleles, max_allele_prob)

0 comments on commit aad036d

Please sign in to comment.