From f76c8244634ea1c660e753d197a8077311978d54 Mon Sep 17 00:00:00 2001 From: Leon Rauschning <99650940+lrauschning@users.noreply.github.com> Date: Wed, 27 Nov 2024 15:15:13 +0800 Subject: [PATCH] polish implementation, minor fixes --- msyd/pyxfiles/io.pyx | 2 +- msyd/pyxfiles/priv.pyx | 7 ++++--- msyd/pyxfiles/realignment.pyx | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/msyd/pyxfiles/io.pyx b/msyd/pyxfiles/io.pyx index d5c3071..1cb7342 100644 --- a/msyd/pyxfiles/io.pyx +++ b/msyd/pyxfiles/io.pyx @@ -691,7 +691,7 @@ cpdef save_df_to_psf(df, buf, save_cigars=True, emit_header=True, force_ref_pos= else: buf.write('\t'.join(['.', '.', '.', f"PRIVATE{privcounter}", priv.ref.org, priv.ref.chr, str(priv.ref.start), str(priv.ref.end), ''])) - write_multisyns([priv], buf, orgs, save_cigars=save_cigars) + write_multisyn(priv, buf, orgs, save_cigars=save_cigars) privcounter += 1 # write mesyn regions that have a position on reference at their appropriate position diff --git a/msyd/pyxfiles/priv.pyx b/msyd/pyxfiles/priv.pyx index 6a1d908..6d8ade6 100644 --- a/msyd/pyxfiles/priv.pyx +++ b/msyd/pyxfiles/priv.pyx @@ -8,9 +8,6 @@ import sys import logging -logger = util.CustomFormatter.getlogger(__name__) -logger.setLevel(logging.INFO) - from collections import deque, defaultdict from multiprocessing import Pool @@ -22,6 +19,10 @@ import msyd.util as util from msyd.multisyn import Multisyn, Private from msyd.coords import Range +logger = util.CustomFormatter.getlogger(__name__) +logger.setLevel(logging.INFO) + + cdef int MIN_PRIV_THRESH = intersection.get_min_syn_thresh() diff --git a/msyd/pyxfiles/realignment.pyx b/msyd/pyxfiles/realignment.pyx index d7c7cf8..52b205b 100644 --- a/msyd/pyxfiles/realignment.pyx +++ b/msyd/pyxfiles/realignment.pyx @@ -74,6 +74,12 @@ cpdef construct_mts(merasyns, old, syn): offsetdict = {org:rng.end for org, rng in old.ranges_dict.items()} # stores the current offset in each org for merasyn in merasyns: + # mark private regions as covered, if they are + if merasyn.is_private(): + if merasyn.ref.org != 'ref': + offsetdict[merasyn.ref.org] = merasyn.ref.end + continue + # iterate through all multisyns found so far for org, rng in merasyn.ranges_dict.items(): #print(f"{offsetdict[org]}, {posdict[org]}, {rng}, {mappingtrees[org]}") @@ -129,7 +135,7 @@ cpdef mt_to_privates(mt, org, chrom): :chrom: chromosome we are working on """ cdef ret = deque() - for entry in mappingtree: + for entry in mt: if entry.end - entry.start >= priv.MIN_PRIV_THRESH: ret.append(Private(org, chrom, entry.data, entry.data + entry.end - entry.start)) @@ -645,20 +651,27 @@ cdef process_gaps(df, qrynames, fastas, mp_preset='asm20', ncores=1, annotate_pr logger.debug("Running syri") syns = syri_get_syntenic(ref, alns) + # no synteny found; can be genuine, or an issue with alignment if len(syns) == 0: logger.info(f"No synteny to {ref} was found!") # debug: emit non-aligning sequences #if refend - refstart > 1000: # print(f"\n===\n>{ref} {list(reftree)}\n{refseq}") # print('\n'.join([f">{id} {list(mappingtrees[id])}\n{seq}" for id, seq in seqdict.items()])) + # draw out remaining sequences as private from the mappingtree + # if there is none, there wouldn't have been any to call as large enough anyway + if annotate_private and ref in mappingtrees: + merasyns[ref] = mt_to_privates(mappingtrees[ref], ref, syn.ranges_dict[org].chr) continue ## Find merasyn in the syri calls - multisyns = intersection.reduce_find_overlaps(list(syns.values()), cores=ncores, annotate_private=annotate_private) + multisyns = intersection.reduce_find_overlaps(list(syns.values()), cores=ncores) # no need to recalculate the tree if no multisynteny was found if multisyns is None or multisyns.empty: logger.info("No multisynteny was found in this round!") + if annotate_private and ref in mappingtrees: + merasyns[ref] = mt_to_privates(mappingtrees[ref], ref, syn.ranges_dict[org].chr) continue # Add all merasyns with alphabetical sorting by reference name