gzip temporary files

genotoul-bioinfo · Dec 18, 2024 · 5649c64
1 parent f3780e8
commit 5649c64
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 13 deletions.
diff --git a/binette/cds.py b/binette/cds.py
@@ -72,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
 
     """
     logging.info("Writing predicted protein sequences.")
-    with open(outfaa, "w") as fl:
+    with gzip.open(outfaa, "wt") as fl:
         for contig_id, genes in contig_to_genes:
             genes.write_translations(fl, contig_id)
 
@@ -237,12 +237,6 @@ def filter_faa_file(
     originating from contigs listed in `contigs_to_keep`, and writes the filtered
     sequences to a new FASTA file. The output file supports optional `.gz` compression.
 
-    Metrics computed and logged:
-    - Total number of contigs in `contigs_to_keep`.
-    - Number and proportion of contigs with at least one protein-coding gene.
-    - Number and proportion of contigs without any protein-coding genes.
-    - Number of contigs from the input FASTA file that are not in `contigs_to_keep`.
-
     :param contigs_to_keep: A set of contig names to retain in the output FASTA file.
     :param input_faa_file: Path to the input FASTA file containing protein sequences.
     :param filtered_faa_file: Path to the output FASTA file for filtered sequences.

diff --git a/binette/diamond.py b/binette/diamond.py
@@ -92,6 +92,7 @@ def run(
         f"-o {output} "
         f"--threads {threads} "
         f"--db {db} "
+        f"--compress 1 "
         f"--query-cover {query_cover} "
         f"--subject-cover {subject_cover} "
         f"--id {percent_id} "

diff --git a/binette/main.py b/binette/main.py
@@ -315,9 +315,9 @@ def manage_protein_alignement(
 
     else:
         contigs_iterator = (
-            s
-            for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
-            if s.name in contigs_in_bins
+            seq
+            for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
+            if seq.name in contigs_in_bins
         )
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
 
@@ -330,7 +330,10 @@ def manage_protein_alignement(
         else:
             raise FileNotFoundError(checkm2_db)
 
-        diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log"
+        diamond_log = (
+            diamond_result_file.parents[0]
+            / f"{diamond_result_file.stem.split('.')[0]}.log"
+        )
 
         diamond.run(
             faa_file.as_posix(),
@@ -482,9 +485,9 @@ def main():
 
     use_existing_protein_file = False
 
-    faa_file = out_tmp_dir / "assembly_proteins.faa"
+    faa_file = out_tmp_dir / "assembly_proteins.faa.gz"
 
-    diamond_result_file = out_tmp_dir / "diamond_result.tsv"
+    diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"
 
     # Output files #
     final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv"