diff --git a/genet/__init__.py b/genet/__init__.py index a0bb127..44d4fe9 100644 --- a/genet/__init__.py +++ b/genet/__init__.py @@ -7,7 +7,7 @@ # How to release with 2FA accounts # [주의!] 먼저 아래의 version을 변경한다!! -__version__ = '0.15.1' +__version__ = '0.16.0' # hatch build # twine upload --username __token__ dist/* diff --git a/genet/analysis/Alignment.py b/genet/analysis/Alignment.py new file mode 100644 index 0000000..41d97e3 --- /dev/null +++ b/genet/analysis/Alignment.py @@ -0,0 +1,262 @@ +import sys +import numpy as np +from genet.analysis.constants import BLOSUM62 +from collections import namedtuple + + +__all__ = ["AlignmentResult", "aligner"] + + +IS_PY2 = False +if sys.version_info.major == 2: + IS_PY2 = True + +# Container for alignment result +AlignmentResult = namedtuple( + 'AlignmentResult', + ['seq1', 'seq2', 'start1', 'start2', + 'end1', 'end2', 'n_gaps1', 'n_gaps2', + 'n_mismatches', 'score']) + + +def aligner(seqj:str, seqi:str, method:str='global', + gap_open:float=-7, gap_extend:float=-7, gap_double:float=-7, + matrix:dict=BLOSUM62, max_hits:int=1): + '''Calculates the alignment of two sequences. + + The supported 'methods' are: + * 'global' for a global Needleman-Wunsh algorithm + * 'local' for a local Smith-Waterman alignment + * 'global_cfe' for a global alignment with cost-free ends + * 'glocal' for an alignment which is 'global' only with respect to + the shorter sequence (also known as a 'semi-global' alignment) + + Returns the aligned (sub)sequences as character arrays. + + Gotoh, O. (1982). J. Mol. Biol. 162, 705-708. + Needleman, S. & Wunsch, C. (1970). J. Mol. Biol. 48(3), 443-53. + Smith, T.F. & Waterman M.S. (1981). J. Mol. Biol. 147, 195-197. + + Arguments: + + - seqj First aligned iterable object of symbols. + - seqi Second aligned iterable object of symbols. + - method Type of alignment: 'global', 'global_cfe', 'local', 'glocal'. + - gap_open The gap-opening cost. + - gap_extend The cost of extending an open gap. + - gap_double The gap-opening cost if a gap is already open + in the other sequence. + - matrix A score matrix dictionary. + - max_hits The maximum number of results to return in + case multiple alignments with the same score are found. If set to 1, + a single ``AlignmentResult`` object is returned. If set to values + larger than 1, a list containing ``AlignmentResult`` objects are + returned. If set to `None`, all alignments with the maximum score + are returned. + ''' + assert max_hits is None or max_hits > 0 + NONE, LEFT, UP, DIAG = range(4) # NONE is 0 + GAP_CHAR = ord('-') if not IS_PY2 else '-' + max_j = len(seqj) + max_i = len(seqi) + + if max_j > max_i: + flip = 1 + seqi, seqj = seqj, seqi + max_i, max_j = max_j, max_i + else: + flip = 0 + + seqi = seqi.encode() if not isinstance(seqi, bytes) else seqi + seqj = seqj.encode() if not isinstance(seqj, bytes) else seqj + + F = np.zeros((max_i + 1, max_j + 1), dtype=np.float32) + I = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32) + I.fill(-np.inf) + J = np.ndarray((max_i + 1, max_j + 1), dtype=np.float32) + J.fill(-np.inf) + pointer = np.zeros((max_i + 1, max_j + 1), dtype=np.uint) # NONE + + if method == 'global': + pointer[0, 1:] = LEFT + pointer[1:, 0] = UP + F[0, 1:] = gap_open + gap_extend * \ + np.arange(0, max_j, dtype=np.float32) + F[1:, 0] = gap_open + gap_extend * \ + np.arange(0, max_i, dtype=np.float32) + elif method == 'global_cfe': + pointer[0, 1:] = LEFT + pointer[1:, 0] = UP + elif method == 'glocal': + pointer[0, 1:] = LEFT + F[0, 1:] = gap_open + gap_extend * \ + np.arange(0, max_j, dtype=np.float32) + + for i in range(1, max_i + 1): + ci = seqi[i - 1:i] + for j in range(1, max_j + 1): + cj = seqj[j - 1:j] + # I + I[i, j] = max( + F[i, j - 1] + gap_open, + I[i, j - 1] + gap_extend, + J[i, j - 1] + gap_double) + # J + J[i, j] = max( + F[i - 1, j] + gap_open, + J[i - 1, j] + gap_extend, + I[i - 1, j] + gap_double) + # F + diag_score = F[i - 1, j - 1] + matrix[cj][ci] + left_score = I[i, j] + up_score = J[i, j] + max_score = max(diag_score, up_score, left_score) + + F[i, j] = max(0, max_score) if method == 'local' else max_score + + if method == 'local': + if F[i, j] == 0: + pass # point[i,j] = NONE + elif max_score == diag_score: + pointer[i, j] = DIAG + elif max_score == up_score: + pointer[i, j] = UP + elif max_score == left_score: + pointer[i, j] = LEFT + elif method == 'glocal': + # In a semi-global alignment we want to consume as much as + # possible of the longer sequence. + if max_score == up_score: + pointer[i, j] = UP + elif max_score == diag_score: + pointer[i, j] = DIAG + elif max_score == left_score: + pointer[i, j] = LEFT + else: + # global + if max_score == up_score: + pointer[i, j] = UP + elif max_score == left_score: + pointer[i, j] = LEFT + else: + pointer[i, j] = DIAG + + # container for traceback coordinates + ij_pairs = [] + if method == 'local': + # max anywhere + maxv_indices = np.argwhere(F == F.max())[:max_hits] + for index in maxv_indices: + ij_pairs.append(index) + elif method == 'glocal': + # max in last col + max_score = F[:, -1].max() + maxi_indices = np.argwhere(F[:, -1] == F[:, -1].max())\ + .flatten()[:max_hits] + for i in maxi_indices: + ij_pairs.append((i, max_j)) + elif method == 'global_cfe': + # from i,j to max(max(last row), max(last col)) for free + row_max = F[-1].max() + col_max = F[:, -1].max() + # expecting max to exist on either last column or last row + if row_max > col_max: + col_idces = np.argwhere(F[-1] == row_max).flatten()[:max_hits] + for cid in col_idces: + ij_pairs.append((i, cid)) + elif row_max < col_max: + row_idces = np.argwhere(F[:, -1] == col_max).flatten()[:max_hits] + for rid in row_idces: + ij_pairs.append((rid, j)) + # special case: max is on last row, last col + elif row_max == col_max == F[i, j]: + # check if max score also exist on other cells in last row + # or last col. we expect only one of the case. + col_idces = np.argwhere(F[-1] == row_max).flatten() + row_idces = np.argwhere(F[:, -1] == col_max).flatten() + ncol_idces = len(col_idces) + nrow_idces = len(row_idces) + + # tiebreaker between row/col is whichever has more max scores + if ncol_idces > nrow_idces: + for cid in col_idces[:max_hits]: + ij_pairs.append((i, cid)) + elif ncol_idces < nrow_idces: + for rid in row_idces[:max_hits]: + ij_pairs.append((rid, j)) + elif ncol_idces == nrow_idces == 1: + ij_pairs.append((i, j)) + else: + raise RuntimeError('Unexpected multiple maximum global_cfe' + ' scores.') + else: + raise RuntimeError('Unexpected global_cfe scenario.') + else: + # method must be global at this point + ij_pairs.append((i, j)) + + results = [] + for i, j in ij_pairs: + align_j = [] + align_i = [] + score = F[i, j] + p = pointer[i, j] + # mimic Python's coord system + if method.startswith("global"): + end_i, end_j = max_i, max_j + else: + end_i, end_j = i, j + n_gaps_i, n_gaps_j, n_mmatch = 0, 0, 0 + + # special case for global_cfe ~ one cell may contain multiple pointer + # directions + if method == 'global_cfe': + if i < max_i: + align_i.extend([c for c in seqi[i:][::-1]]) + align_j.extend([GAP_CHAR] * (max_i - i)) + n_gaps_j += 1 + elif j < max_j: + align_i.extend([GAP_CHAR] * (max_j - j)) + align_j.extend([c for c in seqj[j:][::-1]]) + n_gaps_i += 1 + + while p != NONE: + if p == DIAG: + i -= 1 + j -= 1 + ichar = seqi[i] + jchar = seqj[j] + if ichar != jchar: + n_mmatch += 1 + align_j.append(jchar) + align_i.append(ichar) + elif p == LEFT: + j -= 1 + align_j.append(seqj[j]) + if not align_i or align_i[-1] != GAP_CHAR: + n_gaps_i += 1 + align_i.append(GAP_CHAR) + elif p == UP: + i -= 1 + align_i.append(seqi[i]) + if not align_j or align_j[-1] != GAP_CHAR: + n_gaps_j += 1 + align_j.append(GAP_CHAR) + else: + raise Exception('wtf!') + p = pointer[i, j] + + align_i = bytes(align_i[::-1]) \ + if not IS_PY2 else ''.join(align_i[::-1]) + align_j = bytes(align_j[::-1]) \ + if not IS_PY2 else ''.join(align_j[::-1]) + + aln = (AlignmentResult(align_i, align_j, i, j, end_i, end_j, + n_gaps_i, n_gaps_j, n_mmatch, score) + if flip else + AlignmentResult(align_j, align_i, j, i, end_j, end_i, + n_gaps_j, n_gaps_i, n_mmatch, score)) + + results.append(aln) + + return results \ No newline at end of file diff --git a/genet/analysis/__init__.py b/genet/analysis/__init__.py index 318fccf..04bab3e 100644 --- a/genet/analysis/__init__.py +++ b/genet/analysis/__init__.py @@ -1,5 +1,4 @@ from genet.analysis.functional import( - loadseq, SortByBarcodes, ) diff --git a/genet/analysis/constants.py b/genet/analysis/constants.py new file mode 100644 index 0000000..2d76e40 --- /dev/null +++ b/genet/analysis/constants.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +__all__ = ['DNAFULL', 'BLOSUM62'] + +_NUCL_LETTERS = [ + b'A', b'T', b'G', b'C', b'U', # non-ambiguous letters + b'R', b'Y', b'S', b'W', b'K', b'M', # 2-base ambiguity + b'B', b'D', b'H', b'V', # 3-base ambiguity + b'N', # 4-base ambiguity +] + +_NUCL_SCORES = { + # A T G C U R Y S W K M B D H V N + b'A': [ 5, -4, -4, -4, -4, 1, -4, -4, 1, -4, 1, -4, -1, -1, -1, -2], + b'T': [-4, 5, -4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2], + b'G': [-4, -4, 5, -4, -4, 1, -4, 1, -4, 1, -4, -1, -1, -4, -1, -2], + b'C': [-4, -4, -4, 5, -4, -4, 1, 1, -4, -4, 1, -1, -4, -1, -1, -2], + b'U': [-4, 5, -4, -4, 5, -4, 1, -4, 1, 1, -4, -1, -1, -1, -4, -2], + + b'R': [ 1, -4, 1, -4, -4, -1, -4, -2, -2, -2, -2, -3, -1, -3, -1, -1], + b'Y': [-4, 1, -4, 1, 1, -4, -1, -2, -2, -2, -2, -1, -3, -1, -3, -1], + b'S': [-4, -4, 1, 1, -4, -2, -2, -1, -4, -2, -2, -1, -3, -3, -1, -1], + b'W': [ 1, 1, -4, -4, 1, -2, -2, -4, -1, -2, -2, -3, -1, -1, -3, -1], + b'K': [-4, 1, 1, -4, 1, -2, -2, -2, -2, -1, -4, -1, -1, -3, -3, -1], + b'M': [ 1, -4, -4, 1, -4, -2, -2, -2, -2, -4, -1, -3, -3, -1, -1, -1], + + b'B': [-4, -1, -1, -1, -1, -3, -1, -1, -3, -1, -3, -1, -2, -2, -2, -1], + b'D': [-1, -1, -1, -4, -1, -1, -3, -3, -1, -1, -3, -2, -1, -2, -2, -1], + b'H': [-1, -1, -4, -1, -1, -3, -1, -3, -1, -3, -1, -2, -2, -1, -2, -1], + b'V': [-1, -4, -1, -1, -4, -1, -3, -1, -3, -3, -1, -2, -2, -2, -1, -1], + + b'N': [-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], +} + +# NOTE: This relies on the ordering of _NUCL_LETTERS and the lists inside +# _NUCL_SCORES above. +DNAFULL = {letter: dict(zip(_NUCL_LETTERS, scores)) + for letter, scores in _NUCL_SCORES.items()} + +_AA_LETTERS = [ + b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'K', b'L', b'M', + b'N', b'P', b'Q', b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z', b'*', +] + +_BLOSUM62_SCORES = { + # A B C D E F G H I K L M N P Q R S T V W X Y Z * + b'A': [ 4, -2, 0, -2, -1, -2, 0, -2, -1, -1, -1, -1, -2, -1, -1, -1, 1, 0, 0, -3, 0, -2, -1, -4], + b'B': [-2, 4, -3, 4, 1, -3, -1, 0, -3, 0, -4, -3, 3, -2, 0, -1, 0, -1, -3, -4, -1, -3, 1, -4], + b'C': [ 0, -3, 9, -3, -4, -2, -3, -3, -1, -3, -1, -1, -3, -3, -3, -3, -1, -1, -1, -2, -2, -2, -3, -4], + b'D': [-2, 4, -3, 6, 2, -3, -1, -1, -3, -1, -4, -3, 1, -1, 0, -2, 0, -1, -3, -4, -1, -3, 1, -4], + b'E': [-1, 1, -4, 2, 5, -3, -2, 0, -3, 1, -3, -2, 0, -1, 2, 0, 0, -1, -2, -3, -1, -2, 4, -4], + b'F': [-2, -3, -2, -3, -3, 6, -3, -1, 0, -3, 0, 0, -3, -4, -3, -3, -2, -2, -1, 1, -1, 3, -3, -4], + b'G': [ 0, -1, -3, -1, -2, -3, 6, -2, -4, -2, -4, -3, 0, -2, -2, -2, 0, -2, -3, -2, -1, -3, -2, -4], + b'H': [-2, 0, -3, -1, 0, -1, -2, 8, -3, -1, -3, -2, 1, -2, 0, 0, -1, -2, -3, -2, -1, 2, 0, -4], + b'I': [-1, -3, -1, -3, -3, 0, -4, -3, 4, -3, 2, 1, -3, -3, -3, -3, -2, -1, 3, -3, -1, -1, -3, -4], + b'K': [-1, 0, -3, -1, 1, -3, -2, -1, -3, 5, -2, -1, 0, -1, 1, 2, 0, -1, -2, -3, -1, -2, 1, -4], + b'L': [-1, -4, -1, -4, -3, 0, -4, -3, 2, -2, 4, 2, -3, -3, -2, -2, -2, -1, 1, -2, -1, -1, -3, -4], + b'M': [-1, -3, -1, -3, -2, 0, -3, -2, 1, -1, 2, 5, -2, -2, 0, -1, -1, -1, 1, -1, -1, -1, -1, -4], + b'N': [-2, 3, -3, 1, 0, -3, 0, 1, -3, 0, -3, -2, 6, -2, 0, 0, 1, 0, -3, -4, -1, -2, 0, -4], + b'P': [-1, -2, -3, -1, -1, -4, -2, -2, -3, -1, -3, -2, -2, 7, -1, -2, -1, -1, -2, -4, -2, -3, -1, -4], + b'Q': [-1, 0, -3, 0, 2, -3, -2, 0, -3, 1, -2, 0, 0, -1, 5, 1, 0, -1, -2, -2, -1, -1, 3, -4], + b'R': [-1, -1, -3, -2, 0, -3, -2, 0, -3, 2, -2, -1, 0, -2, 1, 5, -1, -1, -3, -3, -1, -2, 0, -4], + b'S': [ 1, 0, -1, 0, 0, -2, 0, -1, -2, 0, -2, -1, 1, -1, 0, -1, 4, 1, -2, -3, 0, -2, 0, -4], + b'T': [ 0, -1, -1, -1, -1, -2, -2, -2, -1, -1, -1, -1, 0, -1, -1, -1, 1, 5, 0, -2, 0, -2, -1, -4], + b'V': [ 0, -3, -1, -3, -2, -1, -3, -3, 3, -2, 1, 1, -3, -2, -2, -3, -2, 0, 4, -3, -1, -1, -2, -4], + b'W': [-3, -4, -2, -4, -3, 1, -2, -2, -3, -3, -2, -1, -4, -4, -2, -3, -3, -2, -3, 11, -2, 2, -3, -4], + b'X': [ 0, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -1, -1, 0, 0, -1, -2, -1, -1, -1, -4], + b'Y': [-2, -3, -2, -3, -2, 3, -3, 2, -1, -2, -1, -1, -2, -3, -1, -2, -2, -2, -1, 2, - 1, 7, -2, -4], + b'Z': [-1, 1, -3, 1, 4, -3, -2, 0, -3, 1, -3, -1, 0, -1, 3, 0, 0, -1, -2, -3, -1, -2, 4, -4], + b'*': [-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1], +} + +# NOTE: This relies on the ordering of _AA_LETTERS and the lists inside +# _BLOSUM62_SCORES above. +BLOSUM62 = {letter: dict(zip(_AA_LETTERS, scores)) + for letter, scores in _BLOSUM62_SCORES.items()} \ No newline at end of file diff --git a/genet/database/functional.py b/genet/database/functional.py index 0b2681d..cfa8e7b 100644 --- a/genet/database/functional.py +++ b/genet/database/functional.py @@ -50,7 +50,7 @@ def _download_summary(self, target_file:str, download_path:str, convert=True): ) # File will be converted to parquet format automatically - if convert==True: self._convert_to_parquet() + if convert==True: self._convert_to_parquet(target_file) print(f'[Info] Complete') diff --git a/genet/models/constants.py b/genet/models/constants.py index 36ede81..6c89609 100644 --- a/genet/models/constants.py +++ b/genet/models/constants.py @@ -320,6 +320,35 @@ 'repo': 'Goosang-Yu/genet-models/main/genet_models', 'path': 'DeepPrime/DP_variant_A549_PE4max_epegRNA_Opti_220428' }, + + 'PE7max-e-A549': { + 'type': 'DeepPrime-FT', + 'repo': 'Goosang-Yu/genet-models/main/genet_models', + 'path': 'DeepPrime/DeepPrime_FT_A549_PE7_epegRNA_dataset' + }, + 'PE7max-A549': { + 'type': 'DeepPrime-FT', + 'repo': 'Goosang-Yu/genet-models/main/genet_models', + 'path': 'DeepPrime/DeepPrime_FT_A549_PE7_pegRNA_dataset' + }, + + 'PE7max-e-DLD1': { + 'type': 'DeepPrime-FT', + 'repo': 'Goosang-Yu/genet-models/main/genet_models', + 'path': 'DeepPrime/DeepPrime_FT_DLD1_PE7_epegRNA_dataset' + }, + 'PE7max-DLD1': { + 'type': 'DeepPrime-FT', + 'repo': 'Goosang-Yu/genet-models/main/genet_models', + 'path': 'DeepPrime/DeepPrime_FT_DLD1_PE7_pegRNA_dataset' + }, + + 'PE7max-HEK293T': { + 'type': 'DeepPrime-FT', + 'repo': 'Goosang-Yu/genet-models/main/genet_models', + 'path': 'DeepPrime/DeepPrime_FT_HEK293T_PE7_pegRNA_dataset_240909' + }, + 'NRCH_PE2-HEK293T': { 'type': 'DeepPrime-FT', 'repo': 'Goosang-Yu/genet-models/main/genet_models',