From 51bc687881d82a6e4a9ce524ff311b793422b52c Mon Sep 17 00:00:00 2001 From: Leon Rauschning <99650940+lrauschning@users.noreply.github.com> Date: Fri, 15 Nov 2024 19:48:29 +0800 Subject: [PATCH] initial realign private implementation --- msyd/pyxfiles/realignment.pyx | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/msyd/pyxfiles/realignment.pyx b/msyd/pyxfiles/realignment.pyx index b1e1cc7..d7c7cf8 100644 --- a/msyd/pyxfiles/realignment.pyx +++ b/msyd/pyxfiles/realignment.pyx @@ -28,8 +28,9 @@ from syri.writeout import getsrtable import msyd.util as util import msyd.cigar as cigar import msyd.intersection as intersection +import msyd.priv as priv import msyd.io as io -from msyd.multisyn import Multisyn +from msyd.multisyn import Multisyn, Private from msyd.coords import Range @@ -118,6 +119,23 @@ cpdef construct_mts(merasyns, old, syn): ## private regions will be at least MIN_REALIGN long ## => kind of enforced on ref anyway? +cpdef mt_to_privates(mt, org, chrom): + """ + Takes a mappingtree, and returns a DataFrame of all intervals left in it as Private objects. + This fn is used to extract leftover non-realigning regions to annotate as private. + :args: + :mt: input mappingtree, typically obtained from subtract_mappingtrees + :org: organism mt corresponds to + :chrom: chromosome we are working on + """ + cdef ret = deque() + for entry in mappingtree: + if entry.end - entry.start >= priv.MIN_PRIV_THRESH: + ret.append(Private(org, chrom, entry.data, entry.data + entry.end - entry.start)) + + return pd.DataFrame(list(ret)) + + cpdef subtract_mts(mappingtrees, merasyns, skip_ref=True): """ Takes a dict containing an `Intervaltree` with offsets for each organism to be realigned (as produced by `construct_mts`), and returns new mappingtrees with regions covered by multisyn objects in `merasyns` subtracted. @@ -651,12 +669,15 @@ cdef process_gaps(df, qrynames, fastas, mp_preset='asm20', ncores=1, annotate_pr ## recalculate mappingtrees from current merasyns to remove newly found merasynteny logger.debug(f"Old Mappingtrees: {mappingtrees}.\n Adding {merasyns[ref]}.") - mappingtrees = subtract_mts(mappingtrees, merasyns[ref], skip_ref=True) + mappingtrees = subtract_mts(mappingtrees, merasyns[ref], skip_ref=not annotate_private) logger.debug(f"New Mappingtrees: {mappingtrees}") # remove all orgs that have already been used as a reference for reforg in merasyns: if reforg in mappingtrees: + if annotate_private: + merasyns[ref] = util.merge(merasyns[ref], mt_to_privates(mappingtrees[reforg], reforg, syn.ref.chr)) + #TODO handle early exit cases del mappingtrees[reforg] ## extract the remaining sequences for future realignment