
Commit

Changed standard output of isONform to fasta. Added --write_fastq to force output in fastq format
aljpetri committed Nov 14, 2023
1 parent 1f511d3 commit 526427f
Showing 5 changed files with 51 additions and 32 deletions.
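The change threads a single `write_fastq` boolean from the command line down to every output helper. A minimal sketch of the record-writing logic the diffs below implement (function and parameter names are simplified stand-ins; the placeholder `"+" * len(seq)` quality string mirrors the committed code):

```python
def write_record(handle, record_id, seq, write_fastq):
    """Sketch: write one consensus record as FASTQ or FASTA."""
    if write_fastq:
        # FASTQ record; the committed code emits a placeholder quality
        # string made of '+' characters rather than real qualities.
        handle.write("@{0}\n{1}\n+\n{2}\n".format(record_id, seq, "+" * len(seq)))
    else:
        # FASTA record; this is the new default output format.
        handle.write(">{0}\n{1}\n".format(record_id, seq))
```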
14 changes: 9 additions & 5 deletions isONform_parallel.py
@@ -294,8 +294,12 @@ def main(args):
if args.split_wrt_batches:
print("STILLSPLITWRTBATCHES")
file_handling = time()
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance)
Parallelization_side_functions.generate_full_output(args.outfolder)
if args.write_fastq:
write_fastq = True
else:
write_fastq = False
batch_merging_parallel.join_back_via_batch_merging(args.outfolder, args.delta, args.delta_len, args.delta_iso_len_3, args.delta_iso_len_5, args.max_seqs_to_spoa,args.iso_abundance, write_fastq)
Parallelization_side_functions.generate_full_output(args.outfolder,write_fastq)
Parallelization_side_functions.remove_folders(args.outfolder)
shutil.rmtree(split_directory)
print("Joined back batched files in:", time() - file_handling)
@@ -306,7 +310,7 @@ def main(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="De novo reconstruction of long-read transcriptome reads",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
parser.add_argument('--fastq_folder', type=str, default=False,
help='Path to input fastq folder with reads in clusters')
parser.add_argument('--t', dest="nr_cores", type=int, default=8, help='Number of cores allocated for clustering')
@@ -332,14 +336,14 @@ def main(args):
parser.add_argument('--delta',type=float,default=0.1, help='diversity rate used to compare sequences')
parser.add_argument('--max_seqs_to_spoa', type=int, default=200, help='Maximum number of seqs to spoa')
parser.add_argument('--verbose', action="store_true", help='Print various developer stats.')
parser.add_argument('--iso_abundance', type=int, default=1,
parser.add_argument('--iso_abundance', type=int, default=5,
help='Cutoff parameter: abundance of reads that have to support an isoform to show in results')
parser.add_argument('--delta_iso_len_3', type=int, default=30,
help='Cutoff parameter: maximum length difference at 3prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--delta_iso_len_5', type=int, default=50,
help='Cutoff parameter: maximum length difference at 5prime end, for which subisoforms are still merged into longer isoforms')
parser.add_argument('--tmpdir', type=str,default=None, help='OPTIONAL PARAMETER: Absolute path to custom folder in which to store temporary files. If tmpdir is not specified, isONform will attempt to write the temporary files into the tmp folder on your system. It is advised to only use this parameter if the symlinking does not work on your system.')

parser.add_argument('--write_fastq', action="store_true", help='Indicates that we want to output the final transcriptome as a fastq file (new default: fasta)')
args = parser.parse_args()
print(len(sys.argv))
if len(sys.argv) == 1:
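Since `--write_fastq` is declared with `action="store_true"`, `args.write_fastq` is already a boolean, so the committed `if/else` above is equivalent to forwarding the attribute directly. A self-contained illustration (assumed argparse behavior, not code from this commit):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--write_fastq', action="store_true",
                    help='Output the final transcriptome as fastq (default: fasta)')

# A store_true flag defaults to False and flips to True when passed,
# so the value can be handed to the output helpers as-is.
args = parser.parse_args([])
assert args.write_fastq is False
args = parser.parse_args(['--write_fastq'])
assert args.write_fastq is True
```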
2 changes: 1 addition & 1 deletion main.py
@@ -586,7 +586,7 @@ def main(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="De novo error correction of long-read transcriptome reads",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', action='version', version='%(prog)s 0.3.0')
parser.add_argument('--version', action='version', version='%(prog)s 0.3.1')
parser.add_argument('--fastq', type=str, default=False, help='Path to input fastq file with reads')

parser.add_argument('--k', type=int, default=20, help='Kmer size')
34 changes: 23 additions & 11 deletions modules/Parallelization_side_functions.py
@@ -58,16 +58,23 @@ def generate_single_mapping(outfolder):
for line in g:
# append content to second file
f.write(line)
def generate_single_output(outfolder):
def generate_single_output(outfolder,write_fastq):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
print("Generating transcriptome.fastq")
f = open(os.path.join(outfolder,"transcriptome.fastq"), "w")
if write_fastq:
print("Generating transcriptome.fastq")
f = open(os.path.join(outfolder,"transcriptome.fastq"), "w")
else:
print("Generating transcriptome.fasta")
f = open(os.path.join(outfolder, "transcriptome.fasta"), "w")
for subfolder in subfolders:
#print("subfolder",subfolder)
actual_folder=subfolder.split("/")[-1]
#print(actual_folder)
if actual_folder.isdigit():
fname=os.path.join(outfolder,"cluster"+str(actual_folder)+"_merged.fq")
if write_fastq:
fname=os.path.join(outfolder, "cluster"+str(actual_folder)+"_merged.fq")
else:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged.fa")
#print(fname)
if os.path.isfile(fname):
#print("True")
@@ -80,16 +87,21 @@ def generate_single_output(outfolder):
# append content to second file
f.write(line)

def generate_low_abundance_output(outfolder):
def generate_low_abundance_output(outfolder,write_fastq):
subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w")
if write_fastq:
f = open(os.path.join(outfolder, "transcriptome_low.fastq"), "w")
else:
f = open(os.path.join(outfolder, "transcriptome_low.fasta"), "w")
for subfolder in subfolders:
actual_folder = subfolder.split("/")[-1]
# print(actual_folder)
if actual_folder.isdigit():
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fastq")
if write_fastq:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fq")
# print(fname)

else:
fname = os.path.join(outfolder, "cluster" + str(actual_folder) + "_merged_low_abundance.fa")
if os.path.isfile(fname):
g = open(fname, "r")
# read content from first file
@@ -113,9 +125,9 @@ def remove_folders(outfolder):
for subfolder in subfolders:
shutil.rmtree(os.path.join(outfolder,subfolder))

def generate_full_output(outfolder):
generate_single_output(outfolder)
generate_low_abundance_output(outfolder)
def generate_full_output(outfolder,write_fastq):
generate_single_output(outfolder,write_fastq)
generate_low_abundance_output(outfolder, write_fastq)
generate_single_mapping(outfolder)
generate_low_abundance_mapping(outfolder)
generate_single_support(outfolder)
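The helpers above share one pattern: scan the numeric cluster subfolders and append each per-cluster file into a single combined output. A condensed sketch of that pattern (a hypothetical simplification, using context managers unlike the committed code, so file handles are closed deterministically):

```python
import os

def concatenate_cluster_files(outfolder, suffix, target_name):
    # Append every cluster<N>_merged<suffix> file into outfolder/target_name.
    subfolders = [f.path for f in os.scandir(outfolder) if f.is_dir()]
    with open(os.path.join(outfolder, target_name), "w") as out:
        for subfolder in subfolders:
            cluster_id = os.path.basename(subfolder)
            if not cluster_id.isdigit():
                continue
            fname = os.path.join(outfolder, "cluster" + cluster_id + "_merged" + suffix)
            if os.path.isfile(fname):
                with open(fname, "r") as part:
                    out.write(part.read())

# e.g. concatenate_cluster_files(outfolder, ".fa", "transcriptome.fasta")
#      concatenate_cluster_files(outfolder, "_low_abundance.fa", "transcriptome_low.fasta")
```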
31 changes: 17 additions & 14 deletions modules/batch_merging_parallel.py
@@ -82,12 +82,16 @@ def read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir):
all_infos_dict[batch_id] = {}


def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder):
def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder, write_fastq):
write_low_abundance = False
support_name = "support_" + str(folder) + ".txt"
other_support_name = "support_" + str(folder) + "low_abundance.txt"
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
if write_fastq:
consensus_name = "cluster" + str(folder) + "_merged.fq"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fq"
else:
consensus_name = "cluster" + str(folder) + "_merged.fa"
other_consensus_name = "cluster" + str(folder) + "_merged_low_abundance.fa"
mapping_name = "cluster" + str(folder) + "_mapping.txt"
other_mapping_name = "cluster" + str(folder) + "_mapping_low_abundance.txt"
support_file = open(os.path.join(outfolder, support_name), "w")
@@ -103,14 +107,20 @@ def write_final_output(all_infos_dict, outfolder, iso_abundance, cl_dir, folder)
if not infos.merged:
if len(infos.reads) >= iso_abundance or iso_abundance == 1:
mapping_file.write(">{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].reads))
consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
if write_fastq:
consensus_file.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
"+" * len(all_infos_dict[batchid][id].sequence)))
else:
consensus_file.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence))
support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[batchid][id].reads)))
else:
if write_low_abundance:
other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
if write_fastq:
other_consensus.write("@{0}\n{1}\n+\n{2}\n".format(new_id, all_infos_dict[batchid][id].sequence,
"+" * len(
all_infos_dict[batchid][id].sequence)))
else:
other_consensus.write("@{0}\n{1}\n".format(new_id, all_infos_dict[batchid][id].sequence))
if new_id in all_infos_dict:
other_support_file.write("{0}: {1}\n".format(new_id, len(all_infos_dict[new_id].reads)))
else:
Expand Down Expand Up @@ -196,7 +206,7 @@ def actual_merging_process(all_infos_dict, delta, delta_len,


def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
delta_iso_len_5, max_seqs_to_spoa, iso_abundance):
delta_iso_len_5, max_seqs_to_spoa, iso_abundance,write_fastq):
print("Batch Merging")
unique_cl_ids = set()
subfolders = [f.path for f in os.scandir(outdir) if f.is_dir()]
@@ -233,13 +243,6 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
map_name = "mapping" + str(batch_id)
map_file = open(os.path.join(cl_dir, map_name), 'rb')
batch_mappings[batch_id] = pickle.load(map_file)
#print(all_reads_dict)
#print(batch_reads)
#print(batch_mappings)
# read all files needed to perform the batch merging and store the respective infos into all_infos_dict as well as all_reads_dict
# read_batch_file(batch_id, all_infos_dict, all_reads_dict, cl_dir)
# batch_reads[batch_id]=read_spoa_file(batch_id,cl_dir)
# batch_mappings[batch_id]=read_mapping_file(batch_id,cl_dir)
# collect the information so that we have one data structure containing all infos
for b_id, value in batch_reads.items():
all_infos_batch = {}
@@ -263,7 +266,7 @@ def join_back_via_batch_merging(outdir, delta, delta_len, delta_iso_len_3,
for c_id, c_infos in b_infos.items():
if not c_infos.merged:
nr_reads += len(c_infos.reads)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id)
write_final_output(all_infos_dict, outdir, iso_abundance, cl_dir, cl_id, write_fastq)
#shutil.rmtree(os.path.join(outdir,cl_id))

DEBUG = False
2 changes: 1 addition & 1 deletion setup.py
@@ -19,7 +19,7 @@
setup(

name='isONform', # Required
version='0.3.0', # Required
version='0.3.1', # Required
description='De novo construction of isoforms from long-read data ', # Required
long_description=long_description, # Optional
long_description_content_type='text/markdown',
