From faf6bf0fbce006ba8bbc863d4c755497c6012522 Mon Sep 17 00:00:00 2001 From: ksahlin Date: Mon, 26 Jun 2023 10:58:40 +0200 Subject: [PATCH 1/2] added fix to issue #25 --- NGSpeciesID | 2 +- modules/consensus.py | 34 ++++++++++++++++++++++------------ setup.py | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/NGSpeciesID b/NGSpeciesID index c857fc1..8bb8f99 100755 --- a/NGSpeciesID +++ b/NGSpeciesID @@ -183,7 +183,7 @@ def write_fastq(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action='version', version='%(prog)s 0.1.3') + parser.add_argument('--version', action='version', version='%(prog)s 0.3.0') parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)') parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)') diff --git a/modules/consensus.py b/modules/consensus.py index 4647aa0..601f793 100644 --- a/modules/consensus.py +++ b/modules/consensus.py @@ -136,6 +136,25 @@ def run_racon(reads_to_center, center_file, outfolder, cores, racon_iter): # consensus.run_medaka( " medaka_consensus -i ~/tmp/stefan_isonclust/mixed_b1_b3_b4_b5.fastq -d ~/tmp/stefan_isonclust/mixed_b1_b3_b4_b5_isonclust/consensus_references.fasta -o ~/tmp/stefan_isonclust/mixed_b1_b3_b4_b5_medaka/ -t 1 -m r941_min_high_g303") +def highest_aln_identity(seq, seq2): + # RC + seq2_rc = reverse_complement(seq2) + seq_aln_rc, seq2_aln_rc, cigar_string_rc, cigar_tuples_rc, alignment_score_rc = parasail_alignment(seq, seq2_rc) + nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln_rc, seq2_aln_rc) if n1 != n2]) + total_pos_rc = len(seq_aln_rc) + aln_identity_rc = (total_pos_rc - nr_mismatching_pos) / float(total_pos_rc) + print('Rec comp orientation identity %: 'aln_identity_rc) + + # FW + seq_aln, seq2_aln, cigar_string, cigar_tuples, alignment_score = parasail_alignment(seq, seq2) + nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln, seq2_aln) if n1 != n2]) + total_pos = len(seq_aln) + aln_identity_fw = (total_pos - nr_mismatching_pos) / float(total_pos) + print('Forward orientation identity %: 'aln_identity_fw) + aln_identity = max([aln_identity_fw, aln_identity_rc]) + return aln_identity + + def detect_reverse_complements(centers, rc_identity_threshold): filtered_centers = [] already_removed = set() @@ -151,23 +170,14 @@ def detect_reverse_complements(centers, rc_identity_threshold): print("has already been merged, skipping") continue - elif i == len(centers) - 1: # last sequence and it is not in already removed + elif i == len(centers) - 1: # last sequence and it is not in already_removed filtered_centers.append( [merged_nr_reads, c_id, seq, all_reads ] ) else: for j, (nr_reads_in_cl2, c_id2, seq2, reads_path) in enumerate(centers[i+1:]): - seq2_rc = reverse_complement(seq2) - seq_aln, seq2_rc_aln, cigar_string, cigar_tuples, alignment_score = parasail_alignment(seq, seq2_rc) - # print(i,j) - # print(seq_aln) - # print(seq2_rc_aln) - nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln, seq2_rc_aln) if n1 != n2]) - total_pos = len(seq_aln) - aln_identity = (total_pos - nr_mismatching_pos) / float(total_pos) - print(aln_identity) - + aln_identity = highest_aln_identity(seq, seq2) if aln_identity >= rc_identity_threshold: - print("Detected alignment identidy above threchold for reverse complement. Keeping center with the most read support and adding rc reads to supporting reads.") + print("Detected two consensus sequences with alignment identidy above threshold (from either reverse complement or split clusters). Keeping center with the most read support and merging reads.") merged_nr_reads += nr_reads_in_cl2 already_removed.add(c_id2) diff --git a/setup.py b/setup.py index d9ad987..cbbf8e4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='NGSpeciesID', # Required - version='0.1.3', # Required + version='0.3.0', # Required description='Reconstructs viral consensus sequences from a set of ONT reads.', # Required long_description=long_description, # Optional url='https://github.com/ksahlin/NGSpeciesID', # Optional From f0bfb73a2c6fa4ab137cc2c055147f2a42a4fcd9 Mon Sep 17 00:00:00 2001 From: ksahlin Date: Mon, 26 Jun 2023 11:02:13 +0200 Subject: [PATCH 2/2] syntax fix --- modules/consensus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/consensus.py b/modules/consensus.py index 601f793..9818502 100644 --- a/modules/consensus.py +++ b/modules/consensus.py @@ -143,14 +143,14 @@ def highest_aln_identity(seq, seq2): nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln_rc, seq2_aln_rc) if n1 != n2]) total_pos_rc = len(seq_aln_rc) aln_identity_rc = (total_pos_rc - nr_mismatching_pos) / float(total_pos_rc) - print('Rec comp orientation identity %: 'aln_identity_rc) + print('Rec comp orientation identity %: ', aln_identity_rc) # FW seq_aln, seq2_aln, cigar_string, cigar_tuples, alignment_score = parasail_alignment(seq, seq2) nr_mismatching_pos = len([1 for n1, n2 in zip(seq_aln, seq2_aln) if n1 != n2]) total_pos = len(seq_aln) aln_identity_fw = (total_pos - nr_mismatching_pos) / float(total_pos) - print('Forward orientation identity %: 'aln_identity_fw) + print('Forward orientation identity %: ', aln_identity_fw) aln_identity = max([aln_identity_fw, aln_identity_rc]) return aln_identity