Skip to content

Commit

Permalink
Rework get_weights using numpy.searchsorted
Browse files Browse the repository at this point in the history
  • Loading branch information
szhan committed Jan 21, 2024
1 parent 6d8bccf commit 32d9591
Showing 1 changed file with 22 additions and 24 deletions.
46 changes: 22 additions & 24 deletions python/tests/beagle_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,40 +257,38 @@ def get_weights(typed_pos, untyped_pos, typed_cm, untyped_cm):
lambda_m,x = [g(m + 1) - g(x)] / [g(m + 1) - g(m)],
where g(i) = genetic map position of marker i.
:param numpy.ndarray typed_pos: Physical positions of genotyped markers.
:param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers.
:param numpy.ndarray typed_cm: Genetic map positions of genotyped markers.
:param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers.
:return: Weights for untyped markers and indices of left-bounding markers.
This function looks for the right-bounding marker instead of the left-bounding.
:param numpy.ndarray typed_pos: Physical positions of genotyped markers (bp).
:param numpy.ndarray untyped_pos: Physical positions of ungenotyped markers (bp).
:param numpy.ndarray typed_cm: Genetic map positions of genotyped markers (cM).
:param numpy.ndarray untyped_cm: Genetic map positions of ungenotyped markers (cM).
:return: Weights for ungenotyped markers and indices of right-bounding markers.
:rtype: tuple(numpy.ndarray, numpy.ndarray)
"""
MIN_CM_DIST = 1e-7 # Avoid division by zero.
_MIN_CM_DIST = 1e-7 # Avoid division by zero.
m = len(typed_pos)
x = len(untyped_pos)
# Identify genotype markers m-1 and m bounding ungenotyped marker i.
# Note np.searchsorted(a, v, side='right') returns i s.t. a[i-1] <= v < a[i].
right_idx = np.searchsorted(typed_pos, untyped_pos, side="right")
# Calculate weights for ungenotyped markers.
weights = np.zeros(x, dtype=np.float64)
# Identify genotype markers m and m+1 bounding ungenotyped marker i.
left_idx = np.zeros(x, dtype=np.int32)
# Going from right to left.
# TODO: Refactor using np.searchsorted().
idx_m = m - 1
for idx_x in range(x - 1, -1, -1):
while idx_m > 0 and untyped_pos[idx_x] < typed_pos[idx_m]:
idx_m = max(idx_m - 1, 0)
if idx_m == m - 1:
# Right of the last genotyped marker.
weights[idx_x] = 0.0
elif idx_m == 0 and untyped_pos[idx_x] < typed_pos[idx_m]:
for i in range(len(right_idx)):
r = right_idx[i]
if r == 0:
# Left of the first genotyped marker.
weights[idx_x] = 1.0
weights[i] = 1.0
elif r == m:
# Right of the last genotyped marker.
weights[i] = 0.0
else:
# Between two genotyped markers.
cm_mP1_x = typed_cm[idx_m + 1] - untyped_cm[idx_x]
cm_m2x = typed_cm[r] - untyped_cm[i]
# TODO: Check if this a subtle source of numerical error.
cm_mP1_m = max(typed_cm[idx_m + 1] - typed_cm[idx_m], MIN_CM_DIST)
weights[idx_x] = cm_mP1_x / cm_mP1_m
left_idx[idx_x] = idx_m
return (weights, left_idx)
cm_m2mM1 = max(typed_cm[r] - typed_cm[r - 1], _MIN_CM_DIST)
weights[i] = cm_m2x / cm_m2mM1
return (weights, right_idx)


@njit
Expand Down

0 comments on commit 32d9591

Please sign in to comment.