Merge pull request #97 from Goosang-Yu/dev_goosang

Update PE7 models
Goosang-Yu · Sep 10, 2024 · 56ff4ec · 56ff4ec
2 parents c7b9539 + 906e6b8
commit 56ff4ec
Show file tree

Hide file tree

Showing 6 changed files with 369 additions and 3 deletions.
diff --git a/genet/__init__.py b/genet/__init__.py
@@ -7,7 +7,7 @@
 # How to release with 2FA accounts
 
 # [주의!] 먼저 아래의 version을 변경한다!!
-__version__ = '0.15.1'
+__version__ = '0.16.0'
 
 # hatch build
 # twine upload --username __token__ dist/*

diff --git a/genet/analysis/Alignment.py b/genet/analysis/Alignment.py
@@ -0,0 +1,262 @@
+import sys
+import numpy as np
+from genet.analysis.constants import BLOSUM62
+from collections import namedtuple
+
+
+__all__ = ["AlignmentResult", "aligner"]
+
+
+IS_PY2 = False
+if sys.version_info.major == 2:
+    IS_PY2 = True
+
+# Container for alignment result
+AlignmentResult = namedtuple(
+    'AlignmentResult',
+    ['seq1', 'seq2', 'start1', 'start2',
+     'end1', 'end2', 'n_gaps1', 'n_gaps2',
+     'n_mismatches', 'score'])
+
+
+def aligner(seqj:str, seqi:str, method:str='global', 
+            gap_open:float=-7, gap_extend:float=-7, gap_double:float=-7,
+            matrix:dict=BLOSUM62, max_hits:int=1):
+    '''Calculates the alignment of two sequences.
+
+    The supported 'methods' are:
+        * 'global' for a global Needleman-Wunsh algorithm
+        * 'local' for a local Smith-Waterman alignment
+        * 'global_cfe' for a global alignment with cost-free ends
+        * 'glocal' for an alignment which is 'global' only with respect to
+          the shorter sequence (also known as a 'semi-global' alignment)
+
+    Returns the aligned (sub)sequences as character arrays.
+
+    Gotoh, O. (1982). J. Mol. Biol. 162, 705-708.
+    Needleman, S. & Wunsch, C. (1970). J. Mol. Biol. 48(3), 443-53.
+    Smith, T.F. & Waterman M.S. (1981). J. Mol. Biol. 147, 195-197.
+
+    Arguments:
+
+        - seqj First aligned iterable object of symbols.
+        - seqi Second aligned iterable object of symbols.
+        - method Type of alignment: 'global', 'global_cfe', 'local', 'glocal'.
+        - gap_open The gap-opening cost.
+        - gap_extend The cost of extending an open gap.
+        - gap_double The gap-opening cost if a gap is already open
+          in the other sequence.
+        - matrix  A score matrix dictionary.
+        - max_hits The maximum number of results to return in
+          case multiple alignments with the same score are found. If set to 1,
+          a single ``AlignmentResult`` object is returned. If set to values
+          larger than 1, a list containing ``AlignmentResult`` objects are
+          returned. If set to `None`, all alignments with the maximum score
+          are returned.
+    '''
+    assert max_hits is None or max_hits > 0
+    NONE, LEFT, UP, DIAG = range(4)  # NONE is 0
+    GAP_CHAR = ord('-') if not IS_PY2 else '-'
+    max_j = len(seqj)
+    max_i = len(seqi)
+
+    if max_j > max_i:
+        flip = 1
+        seqi, seqj = seqj, seqi
+        max_i, max_j = max_j, max_i
+    else:
+        flip = 0
+
+    seqi = seqi.encode() if not isinstance(seqi, bytes) else seqi
+    seqj = seqj.encode() if not isinstance(seqj, bytes) else seqj
+
+    F = np.zeros((max_i + 1, max_j + 1), dtype=np.float32)
+    I = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32)
+    I.fill(-np.inf)
+    J = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32)
+    J.fill(-np.inf)
+    pointer = np.zeros((max_i + 1, max_j + 1), dtype=np.uint)  # NONE
+
+    if method == 'global':
+        pointer[0, 1:] = LEFT
+        pointer[1:, 0] = UP
+        F[0, 1:] = gap_open + gap_extend * \
+            np.arange(0, max_j, dtype=np.float32)
+        F[1:, 0] = gap_open + gap_extend * \
+            np.arange(0, max_i, dtype=np.float32)
+    elif method == 'global_cfe':
+        pointer[0, 1:] = LEFT
+        pointer[1:, 0] = UP
+    elif method == 'glocal':
+        pointer[0, 1:] = LEFT
+        F[0, 1:] = gap_open + gap_extend * \
+            np.arange(0, max_j, dtype=np.float32)
+
+    for i in range(1, max_i + 1):
+        ci = seqi[i - 1:i]
+        for j in range(1, max_j + 1):
+            cj = seqj[j - 1:j]
+            # I
+            I[i, j] = max(
+                         F[i, j - 1] + gap_open,
+                         I[i, j - 1] + gap_extend,
+                         J[i, j - 1] + gap_double)
+            # J
+            J[i, j] = max(
+                         F[i - 1, j] + gap_open,
+                         J[i - 1, j] + gap_extend,
+                         I[i - 1, j] + gap_double)
+            # F
+            diag_score = F[i - 1, j - 1] + matrix[cj][ci]
+            left_score = I[i, j]
+            up_score = J[i, j]
+            max_score = max(diag_score, up_score, left_score)
+
+            F[i, j] = max(0, max_score) if method == 'local' else max_score
+
+            if method == 'local':
+                if F[i, j] == 0:
+                    pass  # point[i,j] = NONE
+                elif max_score == diag_score:
+                    pointer[i, j] = DIAG
+                elif max_score == up_score:
+                    pointer[i, j] = UP
+                elif max_score == left_score:
+                    pointer[i, j] = LEFT
+            elif method == 'glocal':
+                # In a semi-global alignment we want to consume as much as
+                # possible of the longer sequence.
+                if max_score == up_score:
+                    pointer[i, j] = UP
+                elif max_score == diag_score:
+                    pointer[i, j] = DIAG
+                elif max_score == left_score:
+                    pointer[i, j] = LEFT
+            else:
+                # global
+                if max_score == up_score:
+                    pointer[i, j] = UP
+                elif max_score == left_score:
+                    pointer[i, j] = LEFT
+                else:
+                    pointer[i, j] = DIAG
+
+    # container for traceback coordinates
+    ij_pairs = []
+    if method == 'local':
+        # max anywhere
+        maxv_indices = np.argwhere(F == F.max())[:max_hits]
+        for index in maxv_indices:
+            ij_pairs.append(index)
+    elif method == 'glocal':
+        # max in last col
+        max_score = F[:, -1].max()
+        maxi_indices = np.argwhere(F[:, -1] == F[:, -1].max())\
+            .flatten()[:max_hits]
+        for i in maxi_indices:
+            ij_pairs.append((i, max_j))
+    elif method == 'global_cfe':
+        # from i,j to max(max(last row), max(last col)) for free
+        row_max = F[-1].max()
+        col_max = F[:, -1].max()
+        # expecting max to exist on either last column or last row
+        if row_max > col_max:
+            col_idces = np.argwhere(F[-1] == row_max).flatten()[:max_hits]
+            for cid in col_idces:
+                ij_pairs.append((i, cid))
+        elif row_max < col_max:
+            row_idces = np.argwhere(F[:, -1] == col_max).flatten()[:max_hits]
+            for rid in row_idces:
+                ij_pairs.append((rid, j))
+        # special case: max is on last row, last col
+        elif row_max == col_max == F[i, j]:
+            # check if max score also exist on other cells in last row
+            # or last col. we expect only one of the case.
+            col_idces = np.argwhere(F[-1] == row_max).flatten()
+            row_idces = np.argwhere(F[:, -1] == col_max).flatten()
+            ncol_idces = len(col_idces)
+            nrow_idces = len(row_idces)
+
+            # tiebreaker between row/col is whichever has more max scores
+            if ncol_idces > nrow_idces:
+                for cid in col_idces[:max_hits]:
+                    ij_pairs.append((i, cid))
+            elif ncol_idces < nrow_idces:
+                for rid in row_idces[:max_hits]:
+                    ij_pairs.append((rid, j))
+            elif ncol_idces == nrow_idces == 1:
+                ij_pairs.append((i, j))
+            else:
+                raise RuntimeError('Unexpected multiple maximum global_cfe'
+                                   ' scores.')
+        else:
+            raise RuntimeError('Unexpected global_cfe scenario.')
+    else:
+        # method must be global at this point
+        ij_pairs.append((i, j))
+
+    results = []
+    for i, j in ij_pairs:
+        align_j = []
+        align_i = []
+        score = F[i, j]
+        p = pointer[i, j]
+        # mimic Python's coord system
+        if method.startswith("global"):
+            end_i, end_j = max_i, max_j
+        else:
+            end_i, end_j = i, j
+        n_gaps_i, n_gaps_j, n_mmatch = 0, 0, 0
+
+        # special case for global_cfe ~ one cell may contain multiple pointer
+        # directions
+        if method == 'global_cfe':
+            if i < max_i:
+                align_i.extend([c for c in seqi[i:][::-1]])
+                align_j.extend([GAP_CHAR] * (max_i - i))
+                n_gaps_j += 1
+            elif j < max_j:
+                align_i.extend([GAP_CHAR] * (max_j - j))
+                align_j.extend([c for c in seqj[j:][::-1]])
+                n_gaps_i += 1
+
+        while p != NONE:
+            if p == DIAG:
+                i -= 1
+                j -= 1
+                ichar = seqi[i]
+                jchar = seqj[j]
+                if ichar != jchar:
+                    n_mmatch += 1
+                align_j.append(jchar)
+                align_i.append(ichar)
+            elif p == LEFT:
+                j -= 1
+                align_j.append(seqj[j])
+                if not align_i or align_i[-1] != GAP_CHAR:
+                    n_gaps_i += 1
+                align_i.append(GAP_CHAR)
+            elif p == UP:
+                i -= 1
+                align_i.append(seqi[i])
+                if not align_j or align_j[-1] != GAP_CHAR:
+                    n_gaps_j += 1
+                align_j.append(GAP_CHAR)
+            else:
+                raise Exception('wtf!')
+            p = pointer[i, j]
+
+        align_i = bytes(align_i[::-1]) \
+            if not IS_PY2 else ''.join(align_i[::-1])
+        align_j = bytes(align_j[::-1]) \
+            if not IS_PY2 else ''.join(align_j[::-1])
+
+        aln = (AlignmentResult(align_i, align_j, i, j, end_i, end_j,
+                               n_gaps_i, n_gaps_j, n_mmatch, score)
+               if flip else
+               AlignmentResult(align_j, align_i, j, i, end_j, end_i,
+                               n_gaps_j, n_gaps_i, n_mmatch, score))
+
+        results.append(aln)
+
+    return results
diff --git a/genet/analysis/__init__.py b/genet/analysis/__init__.py
@@ -1,5 +1,4 @@
 from genet.analysis.functional import(
-    loadseq,
     SortByBarcodes,
 )
 

diff --git a/genet/analysis/constants.py b/genet/analysis/constants.py
@@ -0,0 +1,76 @@
+# -*- coding: utf-8 -*-
+
+__all__ = ['DNAFULL', 'BLOSUM62']
+
+_NUCL_LETTERS = [
+    b'A', b'T', b'G', b'C', b'U',        # non-ambiguous letters
+    b'R', b'Y', b'S', b'W', b'K', b'M',   # 2-base ambiguity
+    b'B', b'D', b'H', b'V',             # 3-base ambiguity
+    b'N',                            # 4-base ambiguity
+]
+
+_NUCL_SCORES = {
+     #     A   T   G   C   U   R   Y   S   W   K   M   B   D   H   V   N
+    b'A': [ 5, -4, -4, -4, -4,  1, -4, -4,  1, -4,  1, -4, -1, -1, -1, -2],
+    b'T': [-4,  5, -4, -4,  5, -4,  1, -4,  1,  1, -4, -1, -1, -1, -4, -2],
+    b'G': [-4, -4,  5, -4, -4,  1, -4,  1, -4,  1, -4, -1, -1, -4, -1, -2],
+    b'C': [-4, -4, -4,  5, -4, -4,  1,  1, -4, -4,  1, -1, -4, -1, -1, -2],
+    b'U': [-4,  5, -4, -4,  5, -4,  1, -4,  1,  1, -4, -1, -1, -1, -4, -2],
+
+    b'R': [ 1, -4,  1, -4, -4, -1, -4, -2, -2, -2, -2, -3, -1, -3, -1, -1],
+    b'Y': [-4,  1, -4,  1,  1, -4, -1, -2, -2, -2, -2, -1, -3, -1, -3, -1],
+    b'S': [-4, -4,  1,  1, -4, -2, -2, -1, -4, -2, -2, -1, -3, -3, -1, -1],
+    b'W': [ 1,  1, -4, -4,  1, -2, -2, -4, -1, -2, -2, -3, -1, -1, -3, -1],
+    b'K': [-4,  1,  1, -4,  1, -2, -2, -2, -2, -1, -4, -1, -1, -3, -3, -1],
+    b'M': [ 1, -4, -4,  1, -4, -2, -2, -2, -2, -4, -1, -3, -3, -1, -1, -1],
+
+    b'B': [-4, -1, -1, -1, -1, -3, -1, -1, -3, -1, -3, -1, -2, -2, -2, -1],
+    b'D': [-1, -1, -1, -4, -1, -1, -3, -3, -1, -1, -3, -2, -1, -2, -2, -1],
+    b'H': [-1, -1, -4, -1, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1],
+    b'V': [-1, -4, -1, -1, -4, -1, -3, -1, -3, -3, -1, -2, -2, -2, -1, -1],
+
+    b'N': [-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
+}
+
+# NOTE: This relies on the ordering of _NUCL_LETTERS and the lists inside
+#       _NUCL_SCORES above.
+DNAFULL = {letter: dict(zip(_NUCL_LETTERS, scores))
+           for letter, scores in _NUCL_SCORES.items()}
+
+_AA_LETTERS = [
+    b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'K', b'L', b'M',
+    b'N', b'P', b'Q', b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z', b'*',
+]
+
+_BLOSUM62_SCORES = {
+     #     A   B   C   D   E   F   G   H   I   K   L   M   N   P   Q   R   S   T   V   W   X   Y   Z   *
+    b'A': [ 4, -2,  0, -2, -1, -2,  0, -2, -1, -1, -1, -1, -2, -1, -1, -1,  1,  0,  0, -3,  0, -2, -1, -4],
+    b'B': [-2,  4, -3,  4,  1, -3, -1,  0, -3,  0, -4, -3,  3, -2,  0, -1,  0, -1, -3, -4, -1, -3,  1, -4],
+    b'C': [ 0, -3,  9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2, -2, -3, -4],
+    b'D': [-2,  4, -3,  6,  2, -3, -1, -1, -3, -1, -4, -3,  1, -1,  0, -2,  0, -1, -3, -4, -1, -3,  1, -4],
+    b'E': [-1,  1, -4,  2,  5, -3, -2,  0, -3,  1, -3, -2,  0, -1,  2,  0,  0, -1, -2, -3, -1, -2,  4, -4],
+    b'F': [-2, -3, -2, -3, -3,  6, -3, -1,  0, -3,  0,  0, -3, -4, -3, -3, -2, -2, -1,  1, -1,  3, -3, -4],
+    b'G': [ 0, -1, -3, -1, -2, -3,  6, -2, -4, -2, -4, -3,  0, -2, -2, -2,  0, -2, -3, -2, -1, -3, -2, -4],
+    b'H': [-2,  0, -3, -1,  0, -1, -2,  8, -3, -1, -3, -2,  1, -2,  0,  0, -1, -2, -3, -2, -1,  2,  0, -4],
+    b'I': [-1, -3, -1, -3, -3,  0, -4, -3,  4, -3,  2,  1, -3, -3, -3, -3, -2, -1,  3, -3, -1, -1, -3, -4],
+    b'K': [-1,  0, -3, -1,  1, -3, -2, -1, -3,  5, -2, -1,  0, -1,  1,  2,  0, -1, -2, -3, -1, -2,  1, -4],
+    b'L': [-1, -4, -1, -4, -3,  0, -4, -3,  2, -2,  4,  2, -3, -3, -2, -2, -2, -1,  1, -2, -1, -1, -3, -4],
+    b'M': [-1, -3, -1, -3, -2,  0, -3, -2,  1, -1,  2,  5, -2, -2,  0, -1, -1, -1,  1, -1, -1, -1, -1, -4],
+    b'N': [-2,  3, -3,  1,  0, -3,  0,  1, -3,  0, -3, -2,  6, -2,  0,  0,  1,  0, -3, -4, -1, -2,  0, -4],
+    b'P': [-1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2,  7, -1, -2, -1, -1, -2, -4, -2, -3, -1, -4],
+    b'Q': [-1,  0, -3,  0,  2, -3, -2,  0, -3,  1, -2,  0,  0, -1,  5,  1,  0, -1, -2, -2, -1, -1,  3, -4],
+    b'R': [-1, -1, -3, -2,  0, -3, -2,  0, -3,  2, -2, -1,  0, -2,  1,  5, -1, -1, -3, -3, -1, -2,  0, -4],
+    b'S': [ 1,  0, -1,  0,  0, -2,  0, -1, -2,  0, -2, -1,  1, -1,  0, -1,  4,  1, -2, -3,  0, -2,  0, -4],
+    b'T': [ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1,  0, -1, -1, -1,  1,  5,  0, -2,  0, -2, -1, -4],
+    b'V': [ 0, -3, -1, -3, -2, -1, -3, -3,  3, -2,  1,  1, -3, -2, -2, -3, -2,  0,  4, -3, -1, -1, -2, -4],
+    b'W': [-3, -4, -2, -4, -3,  1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, -2,  2, -3, -4],
+    b'X': [ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1,  0,  0, -1, -2, -1, -1, -1, -4],
+    b'Y': [-2, -3, -2, -3, -2,  3, -3,  2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, - 1,  7, -2, -4],
+    b'Z': [-1,  1, -3,  1,  4, -3, -2,  0, -3,  1, -3, -1,  0, -1,  3,  0,  0, -1, -2, -3, -1, -2,  4, -4],
+    b'*': [-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4,  1],
+}
+
+# NOTE: This relies on the ordering of _AA_LETTERS and the lists inside
+#       _BLOSUM62_SCORES above.
+BLOSUM62 = {letter: dict(zip(_AA_LETTERS, scores))
+            for letter, scores in _BLOSUM62_SCORES.items()}
diff --git a/genet/database/functional.py b/genet/database/functional.py
@@ -50,7 +50,7 @@ def _download_summary(self, target_file:str, download_path:str, convert=True):
             )
 
         # File will be converted to parquet format automatically
-        if convert==True: self._convert_to_parquet()
+        if convert==True: self._convert_to_parquet(target_file)
 
         print(f'[Info] Complete')