From b064f8eaada3c4433dc8c55cdea0366ef42916f2 Mon Sep 17 00:00:00 2001 From: Kristoffer Sahlin Date: Wed, 13 Oct 2021 21:12:18 +0200 Subject: [PATCH] added new parameter to control number of reads sent to spoa for draft consensus --- NGSpeciesID | 2 ++ modules/consensus.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/NGSpeciesID b/NGSpeciesID index 0780c51..0d68d8a 100755 --- a/NGSpeciesID +++ b/NGSpeciesID @@ -197,6 +197,8 @@ if __name__ == '__main__': parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.') parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)') parser.add_argument('--rc_identity_threshold', type=float, default=0.9, help='Threshold for --consensus algorithm. Define a reverse complement if identity is over this threshold (default 0.9)') + parser.add_argument('--max_seqs_for_consensus', type=int, default=-1, help='Maximum number of seqs to form consensus with spoa [INT] (default = -1, which means to use all sequences available regardless of cluster size).') + group = parser.add_mutually_exclusive_group() group.add_argument('--medaka', action="store_true", help='Run final medaka polishing algorithm.') group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.') diff --git a/modules/consensus.py b/modules/consensus.py index 985d76a..86c3532 100644 --- a/modules/consensus.py +++ b/modules/consensus.py @@ -204,6 +204,7 @@ def polish_sequences(centers, args): f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center)) f.close() + nr_reads_used = 0 all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id)) f = open(all_reads_file, "w") for fasta_file in all_reads: @@ -211,11 +212,12 @@ def polish_sequences(centers, args): for acc, (seq, qual) in reads.items(): acc_tmp = acc.split()[0] f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc_tmp, seq, "+", qual)) + nr_reads_used += 1 f.close() # to_polishing.append( (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) ) if args.medaka: - print("running medaka on spoa reference {0}.".format(c_id)) + print("running medaka on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used)) # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing: polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id)) help_functions.mkdir_p(polishing_outfolder) @@ -225,7 +227,7 @@ def polish_sequences(centers, args): center_polished = l[1].strip() centers[i][2] = center_polished elif args.racon: - print("running racon on spoa reference {0}.".format(c_id)) + print("running racon on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used)) # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing: polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id)) help_functions.mkdir_p(polishing_outfolder) @@ -247,13 +249,16 @@ def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, wor nr_reads_in_cluster = len(all_read_acc) # print("nr_reads_in_cluster", nr_reads_in_cluster) if nr_reads_in_cluster >= abundance_cutoff: - for acc in all_read_acc: + for i, acc in enumerate(all_read_acc): + if (args.max_seqs_for_consensus) >=0 and (i >= args.max_seqs_for_consensus): + break seq, qual = reads[acc] reads_path.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual)) # reads_path.write(">{0}\n{1}\n".format(str(q_id)+str(pos1)+str(pos2), seq)) reads_path.close() # spoa_ref = create_augmented_reference.run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa") - print("creating center of {0} sequences.".format(nr_reads_in_cluster)) + tmp_param = args.max_seqs_for_consensus if args.max_seqs_for_consensus > 0 else 2**32 + print("creating center of {0} sequences.".format(min(nr_reads_in_cluster, tmp_param))) center = run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa") centers.append( [nr_reads_in_cluster, c_id, center, reads_path.name]) return centers