From d539187c7d0f46e118693f115dafadecf3f57eab Mon Sep 17 00:00:00 2001
From: Vitaliy Mysak
Date: Fri, 11 Oct 2024 14:45:48 -0700
Subject: [PATCH] Add parse method to CigarHit

---
Review notes: regex group names restored (cigar, q_st, q_ei, r_st, r_ei);
parse_expr.match replaced by fullmatch so trailing text is rejected;
docstring now names the actual parameter (string).

 src/aligntools/cigar_hit.py | 51 +++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/src/aligntools/cigar_hit.py b/src/aligntools/cigar_hit.py
index f30bb7c..1577870 100644
--- a/src/aligntools/cigar_hit.py
+++ b/src/aligntools/cigar_hit.py
@@ -7,6 +7,7 @@
 from dataclasses import dataclass
 from functools import cached_property, reduce
 from fractions import Fraction
+import re
 
 from aligntools.coordinate_mapping import CoordinateMapping
 from aligntools.cigar_actions import CigarActions
@@ -19,6 +20,11 @@ def intervals_overlap(x: Tuple[int, int], y: Tuple[int, int]) -> bool:
     return x[0] <= y[1] and x[1] >= y[0]
 
 
+parse_expr = re.compile(r'(?P<cigar>.+)@'
+                        r'\[(?P<q_st>\d+),(?P<q_ei>\d+)\]->'
+                        r'\[(?P<r_st>\d+),(?P<r_ei>\d+)\]')
+
+
 @dataclass(frozen=True)
 class CigarHit:
     """
@@ -332,6 +338,51 @@ def translate(self, reference_delta: int, query_delta: int) -> 'CigarHit':
                         q_st=self.q_st + query_delta,
                         q_ei=self.q_ei + query_delta)
 
+    @staticmethod
+    def parse_cigar_hit(string: str) -> 'CigarHit':
+        """
+        Parses a string representation of a CigarHit
+        and returns a CigarHit object.
+
+        This method is inverse of CigarHit.__str__.
+
+        :param string: Text of the form "CIGAR@[q_st,q_ei]->[r_st,r_ei]".
+        :return: CigarHit object parsed from the input string.
+        :raises ParseError: If the string cannot be parsed into a CigarHit.
+        """
+
+        # The whole string must match; trailing text is rejected.
+        match = parse_expr.fullmatch(string)
+
+        if not match:
+            raise ex.ParseError(f"Invalid CigarHit string format: {string!r}.")
+
+        try:
+            # Extracting components from the matched regex groups
+            cigar_str = match.group('cigar')
+            q_st = int(match.group('q_st'))
+            q_ei = int(match.group('q_ei'))
+            r_st = int(match.group('r_st'))
+            r_ei = int(match.group('r_ei'))
+        except ValueError as e:
+            raise ex.ParseError(f"Error parsing indices in: {string!r}.") \
+                from e
+
+        # A start index may exceed its end index by at most one;
+        # any larger gap is rejected.
+        if q_st > q_ei + 1:
+            raise ex.ParseError(
+                f"Query start index ({q_st}) "
+                f"greater than end index ({q_ei} + 1) in: {string!r}.")
+
+        if r_st > r_ei + 1:
+            raise ex.ParseError(
+                f"Reference start index ({r_st}) "
+                f"greater than end index ({r_ei} + 1) in: {string!r}.")
+
+        cigar: Cigar = Cigar.coerce(cigar_str)
+        return CigarHit(cigar, r_st, r_ei, q_st, q_ei)
+
     def __repr__(self):
         return 'CigarHit(%r, r_st=%r, r_ei=%r, q_st=%r, q_ei=%r)' \
             % (self.cigar, self.r_st, self.r_ei, self.q_st, self.q_ei)