From 32d9591b3ddd73bf15cbff8141e8b2867c3a19b3 Mon Sep 17 00:00:00 2001 From: Shing Zhan Date: Sun, 21 Jan 2024 19:51:22 +0000 Subject: [PATCH] Rework get_weights using numpy.searchsorted --- python/tests/beagle_numba.py | 46 +++++++++++++++++------------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py index ed142ce89f..b8145efe18 100644 --- a/python/tests/beagle_numba.py +++ b/python/tests/beagle_numba.py @@ -257,40 +257,38 @@ def get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm): lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)], where g(i) = genetic map position of marker i. - :param numpy.ndarray typed_pos: Physical positions of genotyped markers. - :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers. - :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers. - :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers. - :return: Weights for untyped markers and indices of left-bounding markers. + This function looks for the right-bounding marker instead of the left-bounding. + + :param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp). + :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp). + :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers (cM). + :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers (cM). + :return: Weights for ungenotyped markers and indices of right-bounding markers. :rtype: tuple(numpy.ndarray, numpy.ndarray) """ - MIN_CM_DIST = 1e-7 # Avoid division by zero. + _MIN_CM_DIST = 1e-7 # Avoid division by zero. m = len(typed_pos) x = len(untyped_pos) + # Identify genotype markers m-1 and m bounding ungenotyped marker i. + # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i]. + right_idx = np.searchsorted(typed_pos, untyped_pos, side="right") # Calculate weights for ungenotyped markers. weights = np.zeros(x, dtype=np.float64) - # Identify genotype markers m and m+1 bounding ungenotyped marker i. - left_idx = np.zeros(x, dtype=np.int32) - # Going from right to left. - # TODO: Refactor using np.searchsorted(). - idx_m = m - 1 - for idx_x in range(x - 1, -1, -1): - while idx_m > 0 and untyped_pos[idx_x] < typed_pos[idx_m]: - idx_m = max(idx_m - 1, 0) - if idx_m == m - 1: - # Right of the last genotyped marker. - weights[idx_x] = 0.0 - elif idx_m == 0 and untyped_pos[idx_x] < typed_pos[idx_m]: + for i in range(len(right_idx)): + r = right_idx[i] + if r == 0: # Left of the first genotyped marker. - weights[idx_x] = 1.0 + weights[i] = 1.0 + elif r == m: + # Right of the last genotyped marker. + weights[i] = 0.0 else: # Between two genotyped markers. - cm_mP1_x = typed_cm[idx_m + 1] - untyped_cm[idx_x] + cm_m2x = typed_cm[r] - untyped_cm[i] # TODO: Check if this a subtle source of numerical error. - cm_mP1_m = max(typed_cm[idx_m + 1] - typed_cm[idx_m], MIN_CM_DIST) - weights[idx_x] = cm_mP1_x / cm_mP1_m - left_idx[idx_x] = idx_m - return (weights, left_idx) + cm_m2mM1 = max(typed_cm[r] - typed_cm[r - 1], _MIN_CM_DIST) + weights[i] = cm_m2x / cm_m2mM1 + return (weights, right_idx) @njit