Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
ksahlin committed Oct 13, 2021
2 parents 68ca12f + 1423c1c commit e368f66
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
4 changes: 3 additions & 1 deletion NGSpeciesID
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def write_fastq(args):

if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Reference-free clustering and consensus forming of targeted ONT or PacBio reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s 0.1.2.1')
parser.add_argument('--version', action='version', version='%(prog)s 0.1.2.2')

parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)')
parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
Expand All @@ -197,6 +197,8 @@ if __name__ == '__main__':
parser.add_argument('--consensus', action="store_true", help='After clustering, (1) run spoa on all clusters, (2) detect reverse complements, (3) run medaka.')
parser.add_argument('--abundance_ratio', type=float, default=0.1, help='Threshold for --consensus algorithm. Consider only clusters larger than a fraction of number of total reads (default 0.1)')
parser.add_argument('--rc_identity_threshold', type=float, default=0.9, help='Threshold for --consensus algorithm. Define a reverse complement if identity is over this threshold (default 0.9)')
parser.add_argument('--max_seqs_for_consensus', type=int, default=-1, help='Maximum number of seqs to form consensus with spoa [INT] (default = -1, which means to use all sequences available regardless of cluster size).')

group = parser.add_mutually_exclusive_group()
group.add_argument('--medaka', action="store_true", help='Run final medaka polishing algorithm.')
group.add_argument('--racon', action="store_true", help='Run final racon polishing algorithm.')
Expand Down
13 changes: 9 additions & 4 deletions modules/consensus.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,18 +204,20 @@ def polish_sequences(centers, args):
f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))
f.close()

nr_reads_used = 0
all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
f = open(all_reads_file, "w")
for fasta_file in all_reads:
reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
for acc, (seq, qual) in reads.items():
acc_tmp = acc.split()[0]
f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc_tmp, seq, "+", qual))
nr_reads_used += 1
f.close()
# to_polishing.append( (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) )

if args.medaka:
print("running medaka on spoa reference {0}.".format(c_id))
print("running medaka on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
# for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
help_functions.mkdir_p(polishing_outfolder)
Expand All @@ -225,7 +227,7 @@ def polish_sequences(centers, args):
center_polished = l[1].strip()
centers[i][2] = center_polished
elif args.racon:
print("running racon on spoa reference {0}.".format(c_id))
print("running racon on spoa reference {0} using {1} reads for polishing.".format(c_id, nr_reads_used))
# for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
help_functions.mkdir_p(polishing_outfolder)
Expand All @@ -247,13 +249,16 @@ def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, wor
nr_reads_in_cluster = len(all_read_acc)
# print("nr_reads_in_cluster", nr_reads_in_cluster)
if nr_reads_in_cluster >= abundance_cutoff:
for acc in all_read_acc:
for i, acc in enumerate(all_read_acc):
if (args.max_seqs_for_consensus) >=0 and (i >= args.max_seqs_for_consensus):
break
seq, qual = reads[acc]
reads_path.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
# reads_path.write(">{0}\n{1}\n".format(str(q_id)+str(pos1)+str(pos2), seq))
reads_path.close()
# spoa_ref = create_augmented_reference.run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
print("creating center of {0} sequences.".format(nr_reads_in_cluster))
tmp_param = args.max_seqs_for_consensus if args.max_seqs_for_consensus > 0 else 2**32
print("creating center of {0} sequences.".format(min(nr_reads_in_cluster, tmp_param)))
center = run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
centers.append( [nr_reads_in_cluster, c_id, center, reads_path.name])
return centers
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
setup(

name='NGSpeciesID', # Required
version='0.1.2.1', # Required
version='0.1.2.2', # Required
description='Reconstructs viral consensus sequences from a set of ONT reads.', # Required
long_description=long_description, # Optional
url='https://github.com/ksahlin/NGSpeciesID', # Optional
Expand Down

0 comments on commit e368f66

Please sign in to comment.