Rework get_weights using numpy.searchsorted

szhan · Jan 21, 2024 · 32d9591 · 32d9591
1 parent 6d8bccf
commit 32d9591
Showing 1 changed file with 22 additions and 24 deletions.
diff --git a/python/tests/beagle_numba.py b/python/tests/beagle_numba.py
@@ -257,40 +257,38 @@ def get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm):
     lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)],
     where g(i) = genetic map position of marker i.
 
-    :param numpy.ndarray typed_pos: Physical positions of genotyped markers.
-    :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers.
-    :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers.
-    :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers.
-    :return: Weights for untyped markers and indices of left-bounding markers.
+    This function looks for the right-bounding marker instead of the left-bounding.
+
+    :param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp).
+    :param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp).
+    :param numpy.ndarray typed_cm: Genetic map positions of genotyped markers (cM).
+    :param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers (cM).
+    :return: Weights for ungenotyped markers and indices of right-bounding markers.
     :rtype: tuple(numpy.ndarray, numpy.ndarray)
     """
-    MIN_CM_DIST = 1e-7  # Avoid division by zero.
+    _MIN_CM_DIST = 1e-7  # Avoid division by zero.
     m = len(typed_pos)
     x = len(untyped_pos)
+    # Identify genotype markers m-1 and m bounding ungenotyped marker i.
+    # Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i].
+    right_idx = np.searchsorted(typed_pos, untyped_pos, side="right")
     # Calculate weights for ungenotyped markers.
     weights = np.zeros(x, dtype=np.float64)
-    # Identify genotype markers m and m+1 bounding ungenotyped marker i.
-    left_idx = np.zeros(x, dtype=np.int32)
-    # Going from right to left.
-    # TODO: Refactor using np.searchsorted().
-    idx_m = m - 1
-    for idx_x in range(x - 1, -1, -1):
-        while idx_m > 0 and untyped_pos[idx_x] < typed_pos[idx_m]:
-            idx_m = max(idx_m - 1, 0)
-        if idx_m == m - 1:
-            # Right of the last genotyped marker.
-            weights[idx_x] = 0.0
-        elif idx_m == 0 and untyped_pos[idx_x] < typed_pos[idx_m]:
+    for i in range(len(right_idx)):
+        r = right_idx[i]
+        if r == 0:
             # Left of the first genotyped marker.
-            weights[idx_x] = 1.0
+            weights[i] = 1.0
+        elif r == m:
+            # Right of the last genotyped marker.
+            weights[i] = 0.0
         else:
             # Between two genotyped markers.
-            cm_mP1_x = typed_cm[idx_m + 1] - untyped_cm[idx_x]
+            cm_m2x = typed_cm[r] - untyped_cm[i]
             # TODO: Check if this a subtle source of numerical error.
-            cm_mP1_m = max(typed_cm[idx_m + 1] - typed_cm[idx_m], MIN_CM_DIST)
-            weights[idx_x] = cm_mP1_x / cm_mP1_m
-        left_idx[idx_x] = idx_m
-    return (weights, left_idx)
+            cm_m2mM1 = max(typed_cm[r] - typed_cm[r - 1], _MIN_CM_DIST)
+            weights[i] = cm_m2x / cm_m2mM1
+    return (weights, right_idx)
 
 
 @njit