From 526427f140cc57489563433c9c3d7e46bdc4fa3b Mon Sep 17 00:00:00 2001 From: aljpetri Date: Tue, 14 Nov 2023 12:21:45 +0200 Subject: [PATCH] Changed standard output of isONform to fasta. Added --write_fastq to force output in fastq format --- isONform_parallel.py | 14 ++++++---- main.py | 2 +- modules/Parallelization_side_functions.py | 34 +++++++++++++++-------- modules/batch_merging_parallel.py | 31 +++++++++++---------- setup.py | 2 +- 5 files changed, 51 insertions(+), 32 deletions(-) diff --git a/isONform_parallel.py b/isONform_parallel.py index d6a2004..d5dc265 100755 --- a/isONform_parallel.py +++ b/isONform_parallel.py @@ -294,8 +294,12 @@ def main(args): if args.split_wrt_batches: print("STILLSPLITWRTBATCHES") file_handling = time() - batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance) - Parallelization_side_functions.generate_full_output(args.outfolder) + if args.write_fastq: + write_fastq = True + else: + write_fastq = False + batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance, write_fastq) + Parallelization_side_functions.generate_full_output(args.outfolder,write_fastq) Parallelization_side_functions.remove_folders(args.outfolder) shutil.rmtree(split_directory) print("Joined back batched files in:", time() - file_handling) @@ -306,7 +310,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description="De novo reconstruction of long-read transcriptome reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action='version', version='%(prog)s 0.3.0') + parser.add_argument('--version', action='version', version='%(prog)s 0.3.1') parser.add_argument('--fastq_folder', type=str, default=False, help='Path to input fastq folder with reads in clusters') parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering') @@ -332,14 +336,14 @@ def main(args): parser.add_argument('--delta',type=float,default=0.1, help='diversity rate used to compare sequences') parser.add_argument('--max_seqs_to_spoa', type=int, default=200, help='Maximum number of seqs to spoa') parser.add_argument('--verbose', action="store_true", help='Print various developer stats.') - parser.add_argument('--iso_abundance', type=int, default=1, + parser.add_argument('--iso_abundance', type=int, default=5, help='Cutoff parameter: abundance of reads that have to support an isoform to show in results') parser.add_argument('--delta_iso_len_3', type=int, default=30, help='Cutoff parameter: maximum length difference at 3prime end, for which subisoforms are still merged into longer isoforms') parser.add_argument('--delta_iso_len_5', type=int, default=50, help='Cutoff parameter: maximum length difference at 5prime end, for which subisoforms are still merged into longer isoforms') parser.add_argument('--tmpdir', type=str,default=None, help='OPTIONAL PARAMETER: Absolute path to custom folder in which to store temporary files. If tmpdir is not specified, isONform will attempt to write the temporary files into the tmp folder on your system. It is advised to only use this parameter if the symlinking does not work on your system.') - + parser.add_argument('--write_fastq', action="store_true", help=' Indicates that we want to ouptut the final output (transcriptome) as fastq file (New standard: fasta)') args = parser.parse_args() print(len(sys.argv)) if len(sys.argv) == 1: diff --git a/main.py b/main.py index bcb9e33..03fb601 100644 --- a/main.py +++ b/main.py @@ -586,7 +586,7 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description="De novo error correction of long-read transcriptome reads", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action='version', version='%(prog)s 0.3.0') + parser.add_argument('--version', action='version', version='%(prog)s 0.3.1') parser.add_argument('--fastq', type=str, default=False, help='Path to input fastq file with reads') parser.add_argument('--k', type=int, default=20, help='Kmer size') diff --git a/modules/Parallelization_side_functions.py b/modules/Parallelization_side_functions.py index f4f46d3..85e244b 100644 --- a/modules/Parallelization_side_functions.py +++ b/modules/Parallelization_side_functions.py @@ -58,16 +58,23 @@ def generate_single_mapping(outfolder): for line in g: # append content to second file f.write(line) -def generate_single_output(outfolder): +def generate_single_output(outfolder,write_fastq): subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()] - print("Generating transcriptome.fastq") - f = open(os.path.join(outfolder,"transcriptome.fastq"), "w") + if write_fastq: + print("Generating transcriptome.fastq") + f = open(os.path.join(outfolder,"transcriptome.fastq"), "w") + else: + print("Generating transcriptome.fasta") + f = open(os.path.join(outfolder, "transcriptome.fasta"), "w") for subfolder in subfolders: #print("subfolder",subfolder) actual_folder=subfolder.split("/")[-1] #print(actual_folder) if actual_folder.isdigit(): - fname=os.path.join(outfolder,"cluster"+str(actual_folder)+"_merged.fq") + if write_fastq: + fname=os.path.join(outfolder, "cluster"+str(actual_folder)+"_merged.fq") + else: + fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged.fa") #print(fname) if os.path.isfile(fname): #print("True") @@ -80,16 +87,21 @@ def generate_single_output(outfolder): # append content to second file f.write(line) -def generate_low_abundance_output(outfolder): +def generate_low_abundance_output(outfolder,write_fastq): subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()] - f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w") + if write_fastq: + f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w") + else: + f = open(os.path.join(outfolder, "transcriptome_low.fasta"), "w") for subfolder in subfolders: actual_folder = subfolder.split("/")[-1] # print(actual_folder) if actual_folder.isdigit(): - fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fastq") + if write_fastq: + fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fq") # print(fname) - + else: + fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fa") if os.path.isfile(fname): g = open(fname, "r") # read content from first file @@ -113,9 +125,9 @@ def remove_folders(outfolder): for subfolder in subfolders: shutil.rmtree(os.path.join(outfolder,subfolder)) -def generate_full_output(outfolder): - generate_single_output(outfolder) - generate_low_abundance_output(outfolder) +def generate_full_output(outfolder,write_fastq): + generate_single_output(outfolder,write_fastq) + generate_low_abundance_output(outfolder, write_fastq) generate_single_mapping(outfolder) generate_low_abundance_mapping(outfolder) generate_single_support(outfolder) \ No newline at end of file diff --git a/modules/batch_merging_parallel.py b/modules/batch_merging_parallel.py index 2ffac28..a9b715f 100644 --- a/modules/batch_merging_parallel.py +++ b/modules/batch_merging_parallel.py @@ -82,12 +82,16 @@ def read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir): all_infos_dict[batch_id] = {} -def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder): +def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq): write_low_abundance = False support_name = "support_" + str(folder) + ".txt" other_support_name = "support_" + str(folder) + "low_abundance.txt" - consensus_name = "cluster" + str(folder) + "_merged.fq" - other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq" + if write_fastq: + consensus_name = "cluster" + str(folder) + "_merged.fq" + other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq" + else: + consensus_name = "cluster" + str(folder) + "_merged.fa" + other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa" mapping_name = "cluster" + str(folder) + "_mapping.txt" other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt" support_file = open(os.path.join(outfolder, support_name), "w") @@ -103,14 +107,20 @@ def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder) if not infos.merged: if len(infos.reads) >= iso_abundance or iso_abundance == 1: mapping_file.write(">{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].reads)) - consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence, + if write_fastq: + consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence, "+" * len(all_infos_dict[batchid][id].sequence))) + else: + consensus_file.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence)) support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[batchid][id].reads))) else: if write_low_abundance: - other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence, + if write_fastq: + other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence, "+" * len( all_infos_dict[batchid][id].sequence))) + else: + other_consensus.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence)) if new_id in all_infos_dict: other_support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[new_id].reads))) else: @@ -196,7 +206,7 @@ def actual_merging_process(all_infos_dict, delta, delta_len, def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3, - delta_iso_len_5, max_seqs_to_spoa, iso_abundance): + delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq): print("Batch Merging") unique_cl_ids = set() subfolders = [f.path for f in os.scandir(outdir) if f.is_dir()] @@ -233,13 +243,6 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3, map_name = "mapping" + str(batch_id) map_file = open(os.path.join(cl_dir, map_name), 'rb') batch_mappings[batch_id] = pickle.load(map_file) - #print(all_reads_dict) - #print(batch_reads) - #print(batch_mappings) - # read all files needed to perform the batch merging and store the respective infos into all_infos_dict as well as all_reads_dict - # read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir) - # batch_reads[batch_id]=read_spoa_file(batch_id,cl_dir) - # batch_mappings[batch_id]=read_mapping_file(batch_id,cl_dir) # collect the information so that we have one data structure containing all infos for b_id, value in batch_reads.items(): all_infos_batch = {} @@ -263,7 +266,7 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3, for c_id, c_infos in b_infos.items(): if not c_infos.merged: nr_reads += len(c_infos.reads) - write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id) + write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq) #shutil.rmtree(os.path.join(outdir,cl_id)) DEBUG = False diff --git a/setup.py b/setup.py index d2f2ef4..3379b67 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ setup( name='isONform', # Required - version='0.3.0', # Required + version='0.3.1', # Required description='De novo construction of isoforms from long-read data ', # Required long_description=long_description, # Optional long_description_content_type='text/markdown',