From f3780e8ae2be121920f151d006477f65476f0a5d Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Dec 2024 18:30:58 +0100
Subject: [PATCH 1/5] filter input prot to only use gene of interest

---
 binette/cds.py    | 75 ++++++++++++++++++++++++++++++++++++++++++++++-
 binette/main.py   | 17 +++++++----
 tests/cds_test.py | 64 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 149 insertions(+), 7 deletions(-)

diff --git a/binette/cds.py b/binette/cds.py
index cc125c5..a5d1389 100644
--- a/binette/cds.py
+++ b/binette/cds.py
@@ -2,11 +2,13 @@
 import multiprocessing.pool
 import logging
 from collections import Counter, defaultdict
-from typing import Dict, List, Iterator, Tuple, Any, Union
+from typing import Dict, List, Iterator, Tuple, Any, Union, Set
 
 import pyfastx
 import pyrodigal
 from tqdm import tqdm
+from pathlib import Path
+import gzip
 
 
 def get_contig_from_cds_name(cds_name: str) -> str:
@@ -207,3 +209,74 @@ def get_contig_cds_metadata(
     }
 
     return contig_info
+
+
+from typing import Set
+from pathlib import Path
+import gzip
+import pyfastx
+import logging
+
+from typing import Set
+from pathlib import Path
+import gzip
+import pyfastx
+import logging
+
+
+def filter_faa_file(
+    contigs_to_keep: Set[str],
+    input_faa_file: Path,
+    filtered_faa_file: Path,
+):
+    """
+    Filters a FASTA file containing protein sequences to only include sequences
+    from contigs present in the provided set of contigs (`contigs_to_keep`).
+
+    This function processes the input FASTA file, identifies protein sequences
+    originating from contigs listed in `contigs_to_keep`, and writes the filtered
+    sequences to a new FASTA file. The output file supports optional `.gz` compression.
+
+    Metrics computed and logged:
+    - Total number of contigs in `contigs_to_keep`.
+    - Number and proportion of contigs with at least one protein-coding gene.
+    - Number and proportion of contigs without any protein-coding genes.
+    - Number of contigs from the input FASTA file that are not in `contigs_to_keep`.
+
+    :param contigs_to_keep: A set of contig names to retain in the output FASTA file.
+    :param input_faa_file: Path to the input FASTA file containing protein sequences.
+    :param filtered_faa_file: Path to the output FASTA file for filtered sequences.
+                              If the filename ends with `.gz`, the output will be compressed.
+    """
+    # Determine whether the output file should be compressed
+    proper_open = gzip.open if str(filtered_faa_file).endswith(".gz") else open
+
+    # Initialize tracking sets for metrics
+    contigs_with_genes = set()
+    contigs_parsed = set()
+
+    # Process the input FASTA file and filter sequences based on contigs_to_keep
+    with proper_open(filtered_faa_file, "wt") as fl:
+        for name, seq in pyfastx.Fastx(input_faa_file):
+            contig = get_contig_from_cds_name(name)
+            contigs_parsed.add(contig)
+            if contig in contigs_to_keep:
+                contigs_with_genes.add(contig)
+                fl.write(f">{name}\n{seq}\n")
+
+    # Calculate metrics
+    total_contigs = len(contigs_to_keep)
+    contigs_with_no_genes = total_contigs - len(contigs_with_genes)
+    contigs_not_in_keep_list = len(contigs_parsed - contigs_to_keep)
+
+    # Log the computed metrics
+    logging.info(f"Processing protein sequences from '{input_faa_file}'.")
+    logging.info(
+        f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included the input bins."
+    )
+    logging.info(
+        f"Found {contigs_with_no_genes} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes."
+    )
+    logging.debug(
+        f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list."
+    )
diff --git a/binette/main.py b/binette/main.py
index 7f1ad60..e47a67d 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -482,12 +482,7 @@ def main():
 
     use_existing_protein_file = False
 
-    if args.proteins:
-        logging.info(f"Using the provided protein sequences file: {args.proteins}")
-        faa_file = args.proteins
-        use_existing_protein_file = True
-    else:
-        faa_file = out_tmp_dir / "assembly_proteins.faa"
+    faa_file = out_tmp_dir / "assembly_proteins.faa"
 
     diamond_result_file = out_tmp_dir / "diamond_result.tsv"
 
@@ -506,6 +501,16 @@ def main():
         fasta_extensions=set(args.fasta_extensions),
     )
 
+    if args.proteins and not args.resume:
+        logging.info(f"Using the provided protein sequences file: {args.proteins}")
+        use_existing_protein_file = True
+
+        cds.filter_faa_file(
+            contigs_in_bins,
+            input_faa_file=args.proteins,
+            filtered_faa_file=faa_file,
+        )
+
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
         faa_file=faa_file,
         contigs_fasta=args.contigs,
diff --git a/tests/cds_test.py b/tests/cds_test.py
index 97380fc..32b6580 100644
--- a/tests/cds_test.py
+++ b/tests/cds_test.py
@@ -5,6 +5,8 @@
 from pathlib import Path
 from unittest.mock import mock_open, patch
 
+import gzip
+
 
 class MockContig:
     def __init__(self, name, seq):
@@ -226,3 +228,65 @@ def test_is_nucleic_acid():
 
     # Amino acid sequence
     assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False  # Numbers are invalid
+
+
+def test_filter_faa_file_basic(tmp_path):
+    """Test basic functionality of filtering a FASTA file."""
+    # Create input data
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered.faa"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig1", "contig3"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the filtered output
+    expected_output = ">contig1_gene1\nATGCGT\n>contig3_gene1\nATGAAA\n"
+    assert filtered_faa.read_text() == expected_output
+
+
+def test_filter_faa_file_gz_output(tmp_path):
+    """Test filtering with gzipped output."""
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered.faa.gz"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig2"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the filtered output
+    with gzip.open(filtered_faa, "rt") as f:
+        filtered_content = f.read()
+    expected_output = ">contig2_gene1\nATGCCG\n"
+    assert filtered_content == expected_output
+
+
+def test_filter_faa_file_no_matching_contigs(tmp_path):
+    """Test filtering when no contigs match the input list."""
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered_no_match.faa"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig4", "contig5"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the output file is empty
+    assert filtered_faa.read_text() == ""

From 5649c647d9cc527b9d9eb5a1bd6e7381928d2e58 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Dec 2024 18:59:13 +0100
Subject: [PATCH 2/5] gzip temporary files

---
 binette/cds.py     |  8 +-------
 binette/diamond.py |  1 +
 binette/main.py    | 15 +++++++++------
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/binette/cds.py b/binette/cds.py
index a5d1389..e60762f 100644
--- a/binette/cds.py
+++ b/binette/cds.py
@@ -72,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
 
     """
     logging.info("Writing predicted protein sequences.")
-    with open(outfaa, "w") as fl:
+    with gzip.open(outfaa, "wt") as fl:
         for contig_id, genes in contig_to_genes:
             genes.write_translations(fl, contig_id)
 
@@ -237,12 +237,6 @@ def filter_faa_file(
     originating from contigs listed in `contigs_to_keep`, and writes the filtered
     sequences to a new FASTA file. The output file supports optional `.gz` compression.
 
-    Metrics computed and logged:
-    - Total number of contigs in `contigs_to_keep`.
-    - Number and proportion of contigs with at least one protein-coding gene.
-    - Number and proportion of contigs without any protein-coding genes.
-    - Number of contigs from the input FASTA file that are not in `contigs_to_keep`.
-
     :param contigs_to_keep: A set of contig names to retain in the output FASTA file.
     :param input_faa_file: Path to the input FASTA file containing protein sequences.
     :param filtered_faa_file: Path to the output FASTA file for filtered sequences.
diff --git a/binette/diamond.py b/binette/diamond.py
index 33fc834..3d34cbb 100644
--- a/binette/diamond.py
+++ b/binette/diamond.py
@@ -92,6 +92,7 @@ def run(
         f"-o {output} "
         f"--threads {threads} "
         f"--db {db} "
+        f"--compress 1 "
         f"--query-cover {query_cover} "
         f"--subject-cover {subject_cover} "
         f"--id {percent_id} "
diff --git a/binette/main.py b/binette/main.py
index e47a67d..5f95cb4 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -315,9 +315,9 @@ def manage_protein_alignement(
 
     else:
         contigs_iterator = (
-            s
-            for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
-            if s.name in contigs_in_bins
+            seq
+            for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
+            if seq.name in contigs_in_bins
         )
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
 
@@ -330,7 +330,10 @@ def manage_protein_alignement(
         else:
             raise FileNotFoundError(checkm2_db)
 
-        diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log"
+        diamond_log = (
+            diamond_result_file.parents[0]
+            / f"{diamond_result_file.stem.split('.')[0]}.log"
+        )
 
         diamond.run(
             faa_file.as_posix(),
@@ -482,9 +485,9 @@ def main():
 
     use_existing_protein_file = False
 
-    faa_file = out_tmp_dir / "assembly_proteins.faa"
+    faa_file = out_tmp_dir / "assembly_proteins.faa.gz"
 
-    diamond_result_file = out_tmp_dir / "diamond_result.tsv"
+    diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"
 
     # Output files #
     final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv"

From 788f168d5fa24b1896315132f03f7a9cc3b76ef2 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Dec 2024 19:09:21 +0100
Subject: [PATCH 3/5] pyfastx index stored in temporary file dir

---
 binette/contig_manager.py |  4 ++--
 binette/main.py           | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/binette/contig_manager.py b/binette/contig_manager.py
index fc783f5..91cdf6e 100644
--- a/binette/contig_manager.py
+++ b/binette/contig_manager.py
@@ -2,7 +2,7 @@
 from typing import Dict, Iterable, Tuple, Set, Any, Union
 
 
-def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta:
+def parse_fasta_file(fasta_file: str, index_file: str) -> pyfastx.Fasta:
     """
     Parse a FASTA file and return a pyfastx.Fasta object.
 
@@ -10,7 +10,7 @@ def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta:
 
     :return: A pyfastx.Fasta object representing the parsed FASTA file.
     """
-    fa = pyfastx.Fasta(fasta_file, build_index=True)
+    fa = pyfastx.Fasta(fasta_file, build_index=True, index_file=index_file)
     return fa
 
 
diff --git a/binette/main.py b/binette/main.py
index 5f95cb4..8533f4e 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -213,6 +213,7 @@ def parse_input_files(
     bin_dirs: List[Path],
     contig2bin_tables: List[Path],
     contigs_fasta: Path,
+    temporary_dir: Path,
     fasta_extensions: Set[str] = {".fasta", ".fna", ".fa"},
 ) -> Tuple[
     Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]
@@ -254,7 +255,10 @@ def parse_input_files(
     original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values())
 
     logging.info(f"Parsing contig fasta file: {contigs_fasta}")
-    contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix())
+    index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
+    contigs_object = contig_manager.parse_fasta_file(
+        contigs_fasta.as_posix(), index_file=index_file.as_posix()
+    )
 
     unexpected_contigs = {
         contig for contig in contigs_in_bins if contig not in contigs_object
@@ -276,6 +280,7 @@ def parse_input_files(
 def manage_protein_alignement(
     faa_file: Path,
     contigs_fasta: Path,
+    temporary_dir: Path,
     contig_to_length: Dict[str, int],
     contigs_in_bins: Set[str],
     diamond_result_file: Path,
@@ -314,9 +319,12 @@ def manage_protein_alignement(
         )
 
     else:
+        index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
         contigs_iterator = (
             seq
-            for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
+            for seq in contig_manager.parse_fasta_file(
+                contigs_fasta.as_posix(), index_file=index_file.as_posix()
+            )
             if seq.name in contigs_in_bins
         )
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
@@ -502,6 +510,7 @@ def main():
         args.contig2bin_tables,
         args.contigs,
         fasta_extensions=set(args.fasta_extensions),
+        temporary_dir=out_tmp_dir,
     )
 
     if args.proteins and not args.resume:
@@ -517,6 +526,7 @@ def main():
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
         faa_file=faa_file,
         contigs_fasta=args.contigs,
+        temporary_dir=out_tmp_dir,
         contig_to_length=contig_to_length,
         contigs_in_bins=contigs_in_bins,
         diamond_result_file=diamond_result_file,

From af582a7147d33df32f63ceaca6bc5a375016a86d Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Dec 2024 14:40:41 +0100
Subject: [PATCH 4/5] adjust logging

---
 binette/cds.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/binette/cds.py b/binette/cds.py
index e60762f..27d2e64 100644
--- a/binette/cds.py
+++ b/binette/cds.py
@@ -266,10 +266,10 @@ def filter_faa_file(
     # Log the computed metrics
     logging.info(f"Processing protein sequences from '{input_faa_file}'.")
     logging.info(
-        f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included the input bins."
+        f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included in the input bins."
     )
-    logging.info(
-        f"Found {contigs_with_no_genes} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes."
+    logging.debug(
+        f"Found {contigs_with_no_genes}/{total_contigs}  contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes."
     )
     logging.debug(
         f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list."

From 98cdde5e0e16e375a64d4af96e57ff346155a699 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Dec 2024 14:41:16 +0100
Subject: [PATCH 5/5] update tests

---
 tests/cds_test.py            |  4 ++--
 tests/contig_manager_test.py |  9 +++++----
 tests/diamonds_test.py       |  2 +-
 tests/main_binette_test.py   | 11 +++++++----
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/cds_test.py b/tests/cds_test.py
index 32b6580..285a9d1 100644
--- a/tests/cds_test.py
+++ b/tests/cds_test.py
@@ -118,14 +118,14 @@ def test_write_faa(contig1, orf_finder):
 
     predicted_genes = orf_finder.find_genes(contig1.seq)
     contig_name = "contig"
-    output_file = "tests/tmp_file.faa"
+    output_file = "tests/tmp_file.faa.gz"
 
     cds.write_faa(output_file, [(contig_name, predicted_genes)])
 
     # Check if the file was created and first seq starts
     # with the contig name as expected
     assert Path(output_file).exists()
-    with open(output_file, "r") as f:
+    with gzip.open(output_file, "rt") as f:
         assert f.read().startswith(f">{contig_name}")
 
 
diff --git a/tests/contig_manager_test.py b/tests/contig_manager_test.py
index 38aca8b..5214232 100644
--- a/tests/contig_manager_test.py
+++ b/tests/contig_manager_test.py
@@ -4,12 +4,13 @@
 
 
 # Parses a valid FASTA file and returns a pyfastx.Fasta object.
-def test_valid_fasta_file():
+def test_valid_fasta_file(tmp_path):
     # Arrange
     fasta_file = "tests/contigs.fasta"
 
-    # Act
-    result = contig_manager.parse_fasta_file(fasta_file)
+    index_file = "tests/contigs.fasta.fxi"
+
+    result = contig_manager.parse_fasta_file(fasta_file, str(index_file))
 
     # Assert
     assert isinstance(result, pyfastx.Fasta)
@@ -22,7 +23,7 @@ def test_invalid_fasta_file():
 
     # Act and Assert
     with pytest.raises(RuntimeError):
-        contig_manager.parse_fasta_file(fasta_file)
+        contig_manager.parse_fasta_file(fasta_file, "./index.fxi")
 
 
 # The function returns a tuple containing two dictionaries.
diff --git a/tests/diamonds_test.py b/tests/diamonds_test.py
index eda2975..8d628e2 100644
--- a/tests/diamonds_test.py
+++ b/tests/diamonds_test.py
@@ -155,7 +155,7 @@ def __init__(self, returncode):
         # Simulating successful run of diamond command
         if (
             args[0]
-            == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
+            == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --compress 1 --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
         ):
             return CompletedProcess(0)
 
diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py
index 0b2aba7..c519693 100644
--- a/tests/main_binette_test.py
+++ b/tests/main_binette_test.py
@@ -136,6 +136,7 @@ def test_manage_protein_alignement_resume(tmp_path):
         contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
             faa_file=Path(faa_file),
             contigs_fasta=Path("contigs_fasta"),
+            temporary_dir=Path(tmp_path),
             contig_to_length=contig_to_length,
             contigs_in_bins=set(),
             diamond_result_file=Path("diamond_result_file"),
@@ -179,6 +180,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path):
         contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
             faa_file=Path(faa_file),
             contigs_fasta=Path(contigs_fasta),
+            temporary_dir=Path(tmp_path),
             contig_to_length=contig_to_length,
             contigs_in_bins=set(),
             diamond_result_file=Path(diamond_result_file),
@@ -208,7 +210,7 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path):
 
     # Call the function and capture the return values
     original_bins, contigs_in_bins, contig_to_length = parse_input_files(
-        None, [bin_set1, bin_set2], fasta_file
+        None, [bin_set1, bin_set2], fasta_file, tmp_path
     )
 
     # # Perform assertions on the returned values
@@ -230,7 +232,7 @@ def test_parse_input_files_with_contig2bin_tables_with_unknown_contig(tmp_path):
     fasta_file.write_text(fasta_file_content)
 
     with pytest.raises(ValueError):
-        parse_input_files(None, [bin_set3], fasta_file)
+        parse_input_files(None, [bin_set3], fasta_file, tmp_path)
 
 
 def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path):
@@ -247,7 +249,7 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path):
 
     # Call the function and capture the return values
     original_bins, contigs_in_bins, contig_to_length = parse_input_files(
-        bin_dirs, contig2bin_tables, fasta_file
+        bin_dirs, contig2bin_tables, fasta_file, temporary_dir=tmp_path
     )
 
     # # Perform assertions on the returned values
@@ -384,6 +386,7 @@ def test_manage_protein_alignment_no_resume(tmp_path):
         contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
             faa_file,
             contigs_fasta,
+            tmp_path,
             contig_to_length,
             contigs_in_bins,
             diamond_result_file,
@@ -395,7 +398,7 @@ def test_manage_protein_alignment_no_resume(tmp_path):
         )
 
         # Assertions to check if functions were called
-        mock_parse_fasta_file.assert_called_once_with(contigs_fasta.as_posix())
+        mock_parse_fasta_file.assert_called_once()
         mock_predict.assert_called_once()
         mock_diamond_get_contig_to_kegg_id.assert_called_once()
         mock_diamond_run.assert_called_once_with(