Merge branch 'develop'

ksahlin · Oct 13, 2021 · e368f66 · e368f66
2 parents 68ca12f + 1423c1c
commit e368f66
Show file tree

Hide file tree

Showing 3 changed files with 13 additions and 6 deletions.
diff --git a/NGSpeciesID b/NGSpeciesID
@@ -180,7 +180,7 @@ def write_fastq(args):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--version', action='version', version='%(prog)s 0.1.2.1')
+    parser.add_argument('--version', action='version', version='%(prog)s 0.1.2.2')
 
     parser.add_argument('--fastq', type=str,  default=False, help='Path to consensus fastq file(s)')
     parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
@@ -197,6 +197,8 @@ if __name__ == '__main__':
     parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
     parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)')
     parser.add_argument('--rc_identity_threshold', type=float, default=0.9, help='Threshold for --consensus algorithm. Define a reverse complement if identity is over this threshold (default 0.9)')
+    parser.add_argument('--max_seqs_for_consensus', type=int, default=-1,  help='Maximum number of seqs to form consensus with spoa [INT] (default = -1, which means to use all sequences available regardless of cluster size).')
+
     group = parser.add_mutually_exclusive_group()
     group.add_argument('--medaka', action="store_true", help='Run final medaka polishing algorithm.')
     group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')

diff --git a/modules/consensus.py b/modules/consensus.py
@@ -204,18 +204,20 @@ def polish_sequences(centers, args):
         f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))
         f.close()
 
+        nr_reads_used = 0
         all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
         f = open(all_reads_file, "w")
         for fasta_file in all_reads: 
             reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
             for acc, (seq, qual) in reads.items():
                 acc_tmp = acc.split()[0]
                 f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc_tmp, seq, "+", qual))
+                nr_reads_used += 1
         f.close()
         # to_polishing.append( (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) )
 
         if args.medaka:
-            print("running medaka on spoa reference {0}.".format(c_id))
+            print("running medaka on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
             # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
             polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
             help_functions.mkdir_p(polishing_outfolder)
@@ -225,7 +227,7 @@ def polish_sequences(centers, args):
             center_polished = l[1].strip()
             centers[i][2] = center_polished
         elif args.racon:
-            print("running racon on spoa reference {0}.".format(c_id))
+            print("running racon on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
             # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
             polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
             help_functions.mkdir_p(polishing_outfolder)
@@ -247,13 +249,16 @@ def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, wor
         nr_reads_in_cluster = len(all_read_acc)
         # print("nr_reads_in_cluster", nr_reads_in_cluster)
         if nr_reads_in_cluster >= abundance_cutoff:
-            for acc in all_read_acc:
+            for i, acc in enumerate(all_read_acc):
+                if (args.max_seqs_for_consensus) >=0 and (i >= args.max_seqs_for_consensus):
+                    break
                 seq, qual = reads[acc]
                 reads_path.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
                 # reads_path.write(">{0}\n{1}\n".format(str(q_id)+str(pos1)+str(pos2), seq))
             reads_path.close()
             # spoa_ref = create_augmented_reference.run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
-            print("creating center of {0} sequences.".format(nr_reads_in_cluster))
+            tmp_param = args.max_seqs_for_consensus if args.max_seqs_for_consensus > 0 else 2**32
+            print("creating center of {0} sequences.".format(min(nr_reads_in_cluster, tmp_param)))
             center = run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
             centers.append( [nr_reads_in_cluster, c_id, center, reads_path.name])
     return centers
diff --git a/setup.py b/setup.py
@@ -19,7 +19,7 @@
 setup(
 
     name='NGSpeciesID',  # Required
-    version='0.1.2.1',  # Required
+    version='0.1.2.2',  # Required
     description='Reconstructs viral consensus sequences from a set of ONT reads.',  # Required
     long_description=long_description,  # Optional
     url='https://github.com/ksahlin/NGSpeciesID',  # Optional