
Commit

Changed standard output of isONform to fasta. Added --write_fastq to force output in fastq format
aljpetri committed Nov 14, 2023
1 parent 1f511d3 commit 526427f
Showing 5 changed files with 51 additions and 32 deletions.
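The change threads a single `write_fastq` boolean from the command line down to every output helper. A minimal sketch of the record-writing logic the diffs below implement (function and parameter names are simplified stand-ins; the placeholder `"+" * len(seq)` quality string mirrors the committed code):

```python
def write_record(handle, record_id, seq, write_fastq):
    """Sketch: write one consensus record as FASTQ or FASTA."""
    if write_fastq:
        # FASTQ record; the committed code emits a placeholder quality
        # string made of '+' characters rather than real qualities.
        handle.write("@{0}\n{1}\n+\n{2}\n".format(record_id, seq, "+" * len(seq)))
    else:
        # FASTA record; this is the new default output format.
        handle.write(">{0}\n{1}\n".format(record_id, seq))
```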
14 changes: 9 additions & 5 deletions isONform_parallel.py
@@ -294,8 +294,12 @@ def main(args):
if args.split_wrt_batches:
print("STILLSPLITWRTBATCHES")
file_handling = time()
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance)
Parallelization_side_functions.generate_full_output(args.outfolder)
if args.write_fastq:
write_fastq = True
else:
write_fastq = False
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance, write_fastq)
Parallelization_side_functions.generate_full_output(args.outfolder,write_fastq)
Parallelization_side_functions.remove_folders(args.outfolder)
shutil.rmtree(split_directory)
print("Joined back batched files in:", time() - file_handling)
@@ -306,7 +310,7 @@ def main(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="De novo reconstruction of long-read transcriptome reads",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
parser.add_argument('--fastq_folder', type=str, default=False,
help='Path to input fastq folder with reads in clusters')
parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
@@ -332,14 +336,14 @@ def main(args):
parser.add_argument('--delta',type=float,default=0.1, help='diversity rate used to compare sequences')
parser.add_argument('--max_seqs_to_spoa', type=int, default=200, help='Maximum number of seqs to spoa')
parser.add_argument('--verbose', action="store_true", help='Print various developer stats.')
parser.add_argument('--iso_abundance', type=int, default=1,
parser.add_argument('--iso_abundance', type=int, default=5,
help='Cutoff parameter: abundance of reads that have to support an isoform to show in results')
parser.add_argument('--delta_iso_len_3', type=int, default=30,
help='Cutoff parameter: maximum length difference at 3prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--delta_iso_len_5', type=int, default=50,
help='Cutoff parameter: maximum length difference at 5prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--tmpdir', type=str,default=None, help='OPTIONAL PARAMETER: Absolute path to custom folder in which to store temporary files. If tmpdir is not specified, isONform will attempt to write the temporary files into the tmp folder on your system. It is advised to only use this parameter if the symlinking does not work on your system.')

parser.add_argument('--write_fastq', action="store_true", help='Indicates that we want to output the final transcriptome as a fastq file (new default: fasta)')
args = parser.parse_args()
print(len(sys.argv))
if len(sys.argv) == 1:
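Since `--write_fastq` is declared with `action="store_true"`, `args.write_fastq` is already a boolean, so the committed `if/else` above is equivalent to forwarding the attribute directly. A self-contained illustration (assumed argparse behavior, not code from this commit):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--write_fastq', action="store_true",
                    help='Output the final transcriptome as fastq (default: fasta)')

# A store_true flag defaults to False and flips to True when passed,
# so the value can be handed to the output helpers as-is.
args = parser.parse_args([])
assert args.write_fastq is False
args = parser.parse_args(['--write_fastq'])
assert args.write_fastq is True
```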
2 changes: 1 addition & 1 deletion main.py
@@ -586,7 +586,7 @@ def main(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="De novo error correction of long-read transcriptome reads",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
parser.add_argument('--fastq', type=str, default=False, help='Path to input fastq file with reads')

parser.add_argument('--k', type=int, default=20, help='Kmer size')
34 changes: 23 additions & 11 deletions modules/Parallelization_side_functions.py
@@ -58,16 +58,23 @@ def generate_single_mapping(outfolder):
for line in g:
# append content to second file
f.write(line)
def generate_single_output(outfolder):
def generate_single_output(outfolder,write_fastq):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
print("Generating transcriptome.fastq")
f = open(os.path.join(outfolder,"transcriptome.fastq"), "w")
if write_fastq:
print("Generating transcriptome.fastq")
f = open(os.path.join(outfolder,"transcriptome.fastq"), "w")
else:
print("Generating transcriptome.fasta")
f = open(os.path.join(outfolder, "transcriptome.fasta"), "w")
for subfolder in subfolders:
#print("subfolder",subfolder)
actual_folder=subfolder.split("/")[-1]
#print(actual_folder)
if actual_folder.isdigit():
fname=os.path.join(outfolder,"cluster"+str(actual_folder)+"_merged.fq")
if write_fastq:
fname=os.path.join(outfolder, "cluster"+str(actual_folder)+"_merged.fq")
else:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged.fa")
#print(fname)
if os.path.isfile(fname):
#print("True")
@@ -80,16 +87,21 @@ def generate_single_output(outfolder):
# append content to second file
f.write(line)

def generate_low_abundance_output(outfolder):
def generate_low_abundance_output(outfolder,write_fastq):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w")
if write_fastq:
f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w")
else:
f = open(os.path.join(outfolder, "transcriptome_low.fasta"), "w")
for subfolder in subfolders:
actual_folder = subfolder.split("/")[-1]
# print(actual_folder)
if actual_folder.isdigit():
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fastq")
if write_fastq:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fq")
# print(fname)

else:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fa")
if os.path.isfile(fname):
g = open(fname, "r")
# read content from first file
@@ -113,9 +125,9 @@ def remove_folders(outfolder):
for subfolder in subfolders:
shutil.rmtree(os.path.join(outfolder,subfolder))

def generate_full_output(outfolder):
generate_single_output(outfolder)
generate_low_abundance_output(outfolder)
def generate_full_output(outfolder,write_fastq):
generate_single_output(outfolder,write_fastq)
generate_low_abundance_output(outfolder, write_fastq)
generate_single_mapping(outfolder)
generate_low_abundance_mapping(outfolder)
generate_single_support(outfolder)
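The helpers above share one pattern: scan the numeric cluster subfolders and append each per-cluster file into a single combined output. A condensed sketch of that pattern (a hypothetical simplification, using context managers unlike the committed code, so file handles are closed deterministically):

```python
import os

def concatenate_cluster_files(outfolder, suffix, target_name):
    # Append every cluster<N>_merged<suffix> file into outfolder/target_name.
    subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
    with open(os.path.join(outfolder, target_name), "w") as out:
        for subfolder in subfolders:
            cluster_id = os.path.basename(subfolder)
            if not cluster_id.isdigit():
                continue
            fname = os.path.join(outfolder, "cluster" + cluster_id + "_merged" + suffix)
            if os.path.isfile(fname):
                with open(fname, "r") as part:
                    out.write(part.read())

# e.g. concatenate_cluster_files(outfolder, ".fa", "transcriptome.fasta")
#      concatenate_cluster_files(outfolder, "_low_abundance.fa", "transcriptome_low.fasta")
```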
31 changes: 17 additions & 14 deletions modules/batch_merging_parallel.py
@@ -82,12 +82,16 @@ def read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir):
all_infos_dict[batch_id] = {}


def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder):
def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq):
write_low_abundance = False
support_name = "support_" + str(folder) + ".txt"
other_support_name = "support_" + str(folder) + "low_abundance.txt"
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
if write_fastq:
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
else:
consensus_name = "cluster" + str(folder) + "_merged.fa"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
mapping_name = "cluster" + str(folder) + "_mapping.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
support_file = open(os.path.join(outfolder, support_name), "w")
@@ -103,14 +107,20 @@ def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder)
if not infos.merged:
if len(infos.reads) >= iso_abundance or iso_abundance == 1:
mapping_file.write(">{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].reads))
consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
if write_fastq:
consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
"+" * len(all_infos_dict[batchid][id].sequence)))
else:
consensus_file.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence))
support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[batchid][id].reads)))
else:
if write_low_abundance:
other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
if write_fastq:
other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
"+" * len(
all_infos_dict[batchid][id].sequence)))
else:
other_consensus.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence))
if new_id in all_infos_dict:
other_support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[new_id].reads)))
else:
Expand Down Expand Up @@ -196,7 +206,7 @@ def actual_merging_process(all_infos_dict, delta, delta_len,


def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
delta_iso_len_5, max_seqs_to_spoa, iso_abundance):
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq):
print("Batch Merging")
unique_cl_ids = set()
subfolders = [f.path for f in os.scandir(outdir) if f.is_dir()]
@@ -233,13 +243,6 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
map_name = "mapping" + str(batch_id)
map_file = open(os.path.join(cl_dir, map_name), 'rb')
batch_mappings[batch_id] = pickle.load(map_file)
#print(all_reads_dict)
#print(batch_reads)
#print(batch_mappings)
# read all files needed to perform the batch merging and store the respective infos into all_infos_dict as well as all_reads_dict
# read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir)
# batch_reads[batch_id]=read_spoa_file(batch_id,cl_dir)
# batch_mappings[batch_id]=read_mapping_file(batch_id,cl_dir)
# collect the information so that we have one data structure containing all infos
for b_id, value in batch_reads.items():
all_infos_batch = {}
@@ -263,7 +266,7 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
for c_id, c_infos in b_infos.items():
if not c_infos.merged:
nr_reads += len(c_infos.reads)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq)
#shutil.rmtree(os.path.join(outdir,cl_id))

DEBUG = False
2 changes: 1 addition & 1 deletion setup.py
@@ -19,7 +19,7 @@
setup(

name='isONform', # Required
version='0.3.0', # Required
version='0.3.1', # Required
description='De novo construction of isoforms from long-read data ', # Required
long_description=long_description, # Optional
long_description_content_type='text/markdown',
