Skip to content

Commit

Permalink
Merge pull request #97 from Goosang-Yu/dev_goosang
Browse files Browse the repository at this point in the history
Update PE7 models
  • Loading branch information
Goosang-Yu authored Sep 10, 2024
2 parents c7b9539 + 906e6b8 commit 56ff4ec
Show file tree
Hide file tree
Showing 6 changed files with 369 additions and 3 deletions.
2 changes: 1 addition & 1 deletion genet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# How to release with 2FA accounts

# [주의!] 먼저 아래의 version을 변경한다!!
__version__ = '0.15.1'
__version__ = '0.16.0'

# hatch build
# twine upload --username __token__ dist/*
Expand Down
262 changes: 262 additions & 0 deletions genet/analysis/Alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import sys
import numpy as np
from genet.analysis.constants import BLOSUM62
from collections import namedtuple


__all__ = ["AlignmentResult", "aligner"]


IS_PY2 = False
if sys.version_info.major == 2:
IS_PY2 = True

# Container for alignment result
AlignmentResult = namedtuple(
'AlignmentResult',
['seq1', 'seq2', 'start1', 'start2',
'end1', 'end2', 'n_gaps1', 'n_gaps2',
'n_mismatches', 'score'])


def aligner(seqj:str, seqi:str, method:str='global',
gap_open:float=-7, gap_extend:float=-7, gap_double:float=-7,
matrix:dict=BLOSUM62, max_hits:int=1):
'''Calculates the alignment of two sequences.
The supported 'methods' are:
* 'global' for a global Needleman-Wunsh algorithm
* 'local' for a local Smith-Waterman alignment
* 'global_cfe' for a global alignment with cost-free ends
* 'glocal' for an alignment which is 'global' only with respect to
the shorter sequence (also known as a 'semi-global' alignment)
Returns the aligned (sub)sequences as character arrays.
Gotoh, O. (1982). J. Mol. Biol. 162, 705-708.
Needleman, S. & Wunsch, C. (1970). J. Mol. Biol. 48(3), 443-53.
Smith, T.F. & Waterman M.S. (1981). J. Mol. Biol. 147, 195-197.
Arguments:
- seqj First aligned iterable object of symbols.
- seqi Second aligned iterable object of symbols.
- method Type of alignment: 'global', 'global_cfe', 'local', 'glocal'.
- gap_open The gap-opening cost.
- gap_extend The cost of extending an open gap.
- gap_double The gap-opening cost if a gap is already open
in the other sequence.
- matrix A score matrix dictionary.
- max_hits The maximum number of results to return in
case multiple alignments with the same score are found. If set to 1,
a single ``AlignmentResult`` object is returned. If set to values
larger than 1, a list containing ``AlignmentResult`` objects are
returned. If set to `None`, all alignments with the maximum score
are returned.
'''
assert max_hits is None or max_hits > 0
NONE, LEFT, UP, DIAG = range(4) # NONE is 0
GAP_CHAR = ord('-') if not IS_PY2 else '-'
max_j = len(seqj)
max_i = len(seqi)

if max_j > max_i:
flip = 1
seqi, seqj = seqj, seqi
max_i, max_j = max_j, max_i
else:
flip = 0

seqi = seqi.encode() if not isinstance(seqi, bytes) else seqi
seqj = seqj.encode() if not isinstance(seqj, bytes) else seqj

F = np.zeros((max_i + 1, max_j + 1), dtype=np.float32)
I = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32)
I.fill(-np.inf)
J = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32)
J.fill(-np.inf)
pointer = np.zeros((max_i + 1, max_j + 1), dtype=np.uint) # NONE

if method == 'global':
pointer[0, 1:] = LEFT
pointer[1:, 0] = UP
F[0, 1:] = gap_open + gap_extend * \
np.arange(0, max_j, dtype=np.float32)
F[1:, 0] = gap_open + gap_extend * \
np.arange(0, max_i, dtype=np.float32)
elif method == 'global_cfe':
pointer[0, 1:] = LEFT
pointer[1:, 0] = UP
elif method == 'glocal':
pointer[0, 1:] = LEFT
F[0, 1:] = gap_open + gap_extend * \
np.arange(0, max_j, dtype=np.float32)

for i in range(1, max_i + 1):
ci = seqi[i - 1:i]
for j in range(1, max_j + 1):
cj = seqj[j - 1:j]
# I
I[i, j] = max(
F[i, j - 1] + gap_open,
I[i, j - 1] + gap_extend,
J[i, j - 1] + gap_double)
# J
J[i, j] = max(
F[i - 1, j] + gap_open,
J[i - 1, j] + gap_extend,
I[i - 1, j] + gap_double)
# F
diag_score = F[i - 1, j - 1] + matrix[cj][ci]
left_score = I[i, j]
up_score = J[i, j]
max_score = max(diag_score, up_score, left_score)

F[i, j] = max(0, max_score) if method == 'local' else max_score

if method == 'local':
if F[i, j] == 0:
pass # point[i,j] = NONE
elif max_score == diag_score:
pointer[i, j] = DIAG
elif max_score == up_score:
pointer[i, j] = UP
elif max_score == left_score:
pointer[i, j] = LEFT
elif method == 'glocal':
# In a semi-global alignment we want to consume as much as
# possible of the longer sequence.
if max_score == up_score:
pointer[i, j] = UP
elif max_score == diag_score:
pointer[i, j] = DIAG
elif max_score == left_score:
pointer[i, j] = LEFT
else:
# global
if max_score == up_score:
pointer[i, j] = UP
elif max_score == left_score:
pointer[i, j] = LEFT
else:
pointer[i, j] = DIAG

# container for traceback coordinates
ij_pairs = []
if method == 'local':
# max anywhere
maxv_indices = np.argwhere(F == F.max())[:max_hits]
for index in maxv_indices:
ij_pairs.append(index)
elif method == 'glocal':
# max in last col
max_score = F[:, -1].max()
maxi_indices = np.argwhere(F[:, -1] == F[:, -1].max())\
.flatten()[:max_hits]
for i in maxi_indices:
ij_pairs.append((i, max_j))
elif method == 'global_cfe':
# from i,j to max(max(last row), max(last col)) for free
row_max = F[-1].max()
col_max = F[:, -1].max()
# expecting max to exist on either last column or last row
if row_max > col_max:
col_idces = np.argwhere(F[-1] == row_max).flatten()[:max_hits]
for cid in col_idces:
ij_pairs.append((i, cid))
elif row_max < col_max:
row_idces = np.argwhere(F[:, -1] == col_max).flatten()[:max_hits]
for rid in row_idces:
ij_pairs.append((rid, j))
# special case: max is on last row, last col
elif row_max == col_max == F[i, j]:
# check if max score also exist on other cells in last row
# or last col. we expect only one of the case.
col_idces = np.argwhere(F[-1] == row_max).flatten()
row_idces = np.argwhere(F[:, -1] == col_max).flatten()
ncol_idces = len(col_idces)
nrow_idces = len(row_idces)

# tiebreaker between row/col is whichever has more max scores
if ncol_idces > nrow_idces:
for cid in col_idces[:max_hits]:
ij_pairs.append((i, cid))
elif ncol_idces < nrow_idces:
for rid in row_idces[:max_hits]:
ij_pairs.append((rid, j))
elif ncol_idces == nrow_idces == 1:
ij_pairs.append((i, j))
else:
raise RuntimeError('Unexpected multiple maximum global_cfe'
' scores.')
else:
raise RuntimeError('Unexpected global_cfe scenario.')
else:
# method must be global at this point
ij_pairs.append((i, j))

results = []
for i, j in ij_pairs:
align_j = []
align_i = []
score = F[i, j]
p = pointer[i, j]
# mimic Python's coord system
if method.startswith("global"):
end_i, end_j = max_i, max_j
else:
end_i, end_j = i, j
n_gaps_i, n_gaps_j, n_mmatch = 0, 0, 0

# special case for global_cfe ~ one cell may contain multiple pointer
# directions
if method == 'global_cfe':
if i < max_i:
align_i.extend([c for c in seqi[i:][::-1]])
align_j.extend([GAP_CHAR] * (max_i - i))
n_gaps_j += 1
elif j < max_j:
align_i.extend([GAP_CHAR] * (max_j - j))
align_j.extend([c for c in seqj[j:][::-1]])
n_gaps_i += 1

while p != NONE:
if p == DIAG:
i -= 1
j -= 1
ichar = seqi[i]
jchar = seqj[j]
if ichar != jchar:
n_mmatch += 1
align_j.append(jchar)
align_i.append(ichar)
elif p == LEFT:
j -= 1
align_j.append(seqj[j])
if not align_i or align_i[-1] != GAP_CHAR:
n_gaps_i += 1
align_i.append(GAP_CHAR)
elif p == UP:
i -= 1
align_i.append(seqi[i])
if not align_j or align_j[-1] != GAP_CHAR:
n_gaps_j += 1
align_j.append(GAP_CHAR)
else:
raise Exception('wtf!')
p = pointer[i, j]

align_i = bytes(align_i[::-1]) \
if not IS_PY2 else ''.join(align_i[::-1])
align_j = bytes(align_j[::-1]) \
if not IS_PY2 else ''.join(align_j[::-1])

aln = (AlignmentResult(align_i, align_j, i, j, end_i, end_j,
n_gaps_i, n_gaps_j, n_mmatch, score)
if flip else
AlignmentResult(align_j, align_i, j, i, end_j, end_i,
n_gaps_j, n_gaps_i, n_mmatch, score))

results.append(aln)

return results
1 change: 0 additions & 1 deletion genet/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from genet.analysis.functional import(
loadseq,
SortByBarcodes,
)

Expand Down
76 changes: 76 additions & 0 deletions genet/analysis/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-

__all__ = ['DNAFULL', 'BLOSUM62']

_NUCL_LETTERS = [
b'A', b'T', b'G', b'C', b'U', # non-ambiguous letters
b'R', b'Y', b'S', b'W', b'K', b'M', # 2-base ambiguity
b'B', b'D', b'H', b'V', # 3-base ambiguity
b'N', # 4-base ambiguity
]

_NUCL_SCORES = {
# A T G C U R Y S W K M B D H V N
b'A': [ 5, -4, -4, -4, -4, 1, -4, -4, 1, -4, 1, -4, -1, -1, -1, -2],
b'T': [-4, 5, -4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2],
b'G': [-4, -4, 5, -4, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2],
b'C': [-4, -4, -4, 5, -4, -4, 1, 1, -4, -4, 1, -1, -4, -1, -1, -2],
b'U': [-4, 5, -4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2],

b'R': [ 1, -4, 1, -4, -4, -1, -4, -2, -2, -2, -2, -3, -1, -3, -1, -1],
b'Y': [-4, 1, -4, 1, 1, -4, -1, -2, -2, -2, -2, -1, -3, -1, -3, -1],
b'S': [-4, -4, 1, 1, -4, -2, -2, -1, -4, -2, -2, -1, -3, -3, -1, -1],
b'W': [ 1, 1, -4, -4, 1, -2, -2, -4, -1, -2, -2, -3, -1, -1, -3, -1],
b'K': [-4, 1, 1, -4, 1, -2, -2, -2, -2, -1, -4, -1, -1, -3, -3, -1],
b'M': [ 1, -4, -4, 1, -4, -2, -2, -2, -2, -4, -1, -3, -3, -1, -1, -1],

b'B': [-4, -1, -1, -1, -1, -3, -1, -1, -3, -1, -3, -1, -2, -2, -2, -1],
b'D': [-1, -1, -1, -4, -1, -1, -3, -3, -1, -1, -3, -2, -1, -2, -2, -1],
b'H': [-1, -1, -4, -1, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1],
b'V': [-1, -4, -1, -1, -4, -1, -3, -1, -3, -3, -1, -2, -2, -2, -1, -1],

b'N': [-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
}

# NOTE: This relies on the ordering of _NUCL_LETTERS and the lists inside
# _NUCL_SCORES above.
DNAFULL = {letter: dict(zip(_NUCL_LETTERS, scores))
for letter, scores in _NUCL_SCORES.items()}

_AA_LETTERS = [
b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'K', b'L', b'M',
b'N', b'P', b'Q', b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z', b'*',
]

_BLOSUM62_SCORES = {
# A B C D E F G H I K L M N P Q R S T V W X Y Z *
b'A': [ 4, -2, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, 0, -2, -1, -4],
b'B': [-2, 4, -3, 4, 1, -3, -1, 0, -3, 0, -4, -3, 3, -2, 0, -1, 0, -1, -3, -4, -1, -3, 1, -4],
b'C': [ 0, -3, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2, -2, -3, -4],
b'D': [-2, 4, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -1, -3, 1, -4],
b'E': [-1, 1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -1, -2, 4, -4],
b'F': [-2, -3, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, -1, 3, -3, -4],
b'G': [ 0, -1, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -1, -3, -2, -4],
b'H': [-2, 0, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, -1, 2, 0, -4],
b'I': [-1, -3, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1, -1, -3, -4],
b'K': [-1, 0, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -1, -2, 1, -4],
b'L': [-1, -4, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1, -1, -3, -4],
b'M': [-1, -3, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1, -1, -1, -4],
b'N': [-2, 3, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -1, -2, 0, -4],
b'P': [-1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -2, -3, -1, -4],
b'Q': [-1, 0, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1, -1, 3, -4],
b'R': [-1, -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -1, -2, 0, -4],
b'S': [ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, 0, -2, 0, -4],
b'T': [ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, 0, -2, -1, -4],
b'V': [ 0, -3, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1, -1, -2, -4],
b'W': [-3, -4, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, -2, 2, -3, -4],
b'X': [ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1, -1, -1, -4],
b'Y': [-2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, - 1, 7, -2, -4],
b'Z': [-1, 1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4, -4],
b'*': [-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1],
}

# NOTE: This relies on the ordering of _AA_LETTERS and the lists inside
# _BLOSUM62_SCORES above.
BLOSUM62 = {letter: dict(zip(_AA_LETTERS, scores))
for letter, scores in _BLOSUM62_SCORES.items()}
2 changes: 1 addition & 1 deletion genet/database/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def _download_summary(self, target_file:str, download_path:str, convert=True):
)

# File will be converted to parquet format automatically
if convert==True: self._convert_to_parquet()
if convert==True: self._convert_to_parquet(target_file)

print(f'[Info] Complete')

Expand Down
Loading

0 comments on commit 56ff4ec

Please sign in to comment.