genotoul-bioinfo · JeanMainguy · Dec 18, 2024 · Dec 18, 2024 · Dec 18, 2024 · Dec 19, 2024
diff --git a/binette/cds.py b/binette/cds.py
@@ -2,11 +2,13 @@
 import multiprocessing.pool
 import logging
 from collections import Counter, defaultdict
-from typing import Dict, List, Iterator, Tuple, Any, Union
+from typing import Dict, List, Iterator, Tuple, Any, Union, Set
 
 import pyfastx
 import pyrodigal
 from tqdm import tqdm
+from pathlib import Path
+import gzip
 
 
 def get_contig_from_cds_name(cds_name: str) -> str:
@@ -70,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
 
     """
     logging.info("Writing predicted protein sequences.")
-    with open(outfaa, "w") as fl:
+    with gzip.open(outfaa, "wt") as fl:
         for contig_id, genes in contig_to_genes:
             genes.write_translations(fl, contig_id)
 
@@ -207,3 +209,68 @@ def get_contig_cds_metadata(
     }
 
     return contig_info
+
+
+from typing import Set
+from pathlib import Path
+import gzip
+import pyfastx
+import logging
+
+from typing import Set
+from pathlib import Path
+import gzip
+import pyfastx
+import logging
+
+
+def filter_faa_file(
+    contigs_to_keep: Set[str],
+    input_faa_file: Path,
+    filtered_faa_file: Path,
+):
+    """
+    Filters a FASTA file containing protein sequences to only include sequences
+    from contigs present in the provided set of contigs (`contigs_to_keep`).
+
+    This function processes the input FASTA file, identifies protein sequences
+    originating from contigs listed in `contigs_to_keep`, and writes the filtered
+    sequences to a new FASTA file. The output file supports optional `.gz` compression.
+
+    :param contigs_to_keep: A set of contig names to retain in the output FASTA file.
+    :param input_faa_file: Path to the input FASTA file containing protein sequences.
+    :param filtered_faa_file: Path to the output FASTA file for filtered sequences.
+                              If the filename ends with `.gz`, the output will be compressed.
+    """
+    # Determine whether the output file should be compressed
+    proper_open = gzip.open if str(filtered_faa_file).endswith(".gz") else open
+
+    # Initialize tracking sets for metrics
+    contigs_with_genes = set()
+    contigs_parsed = set()
+
+    # Process the input FASTA file and filter sequences based on contigs_to_keep
+    with proper_open(filtered_faa_file, "wt") as fl:
+        for name, seq in pyfastx.Fastx(input_faa_file):
+            contig = get_contig_from_cds_name(name)
+            contigs_parsed.add(contig)
+            if contig in contigs_to_keep:
+                contigs_with_genes.add(contig)
+                fl.write(f">{name}\n{seq}\n")
+
+    # Calculate metrics
+    total_contigs = len(contigs_to_keep)
+    contigs_with_no_genes = total_contigs - len(contigs_with_genes)
+    contigs_not_in_keep_list = len(contigs_parsed - contigs_to_keep)
+
+    # Log the computed metrics
+    logging.info(f"Processing protein sequences from '{input_faa_file}'.")
+    logging.info(
+        f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included in the input bins."
+    )
+    logging.debug(
+        f"Found {contigs_with_no_genes}/{total_contigs}  contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes."
+    )
+    logging.debug(
+        f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list."
+    )
diff --git a/binette/contig_manager.py b/binette/contig_manager.py
@@ -2,15 +2,15 @@
 from typing import Dict, Iterable, Tuple, Set, Any, Union
 
 
-def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta:
+def parse_fasta_file(fasta_file: str, index_file: str) -> pyfastx.Fasta:
     """
     Parse a FASTA file and return a pyfastx.Fasta object.
 
     :param fasta_file: The path to the FASTA file.
 
     :return: A pyfastx.Fasta object representing the parsed FASTA file.
     """
-    fa = pyfastx.Fasta(fasta_file, build_index=True)
+    fa = pyfastx.Fasta(fasta_file, build_index=True, index_file=index_file)
     return fa
 
 

diff --git a/binette/diamond.py b/binette/diamond.py
@@ -92,6 +92,7 @@ def run(
         f"-o {output} "
         f"--threads {threads} "
         f"--db {db} "
+        f"--compress 1 "
         f"--query-cover {query_cover} "
         f"--subject-cover {subject_cover} "
         f"--id {percent_id} "

diff --git a/binette/main.py b/binette/main.py
@@ -213,6 +213,7 @@ def parse_input_files(
     bin_dirs: List[Path],
     contig2bin_tables: List[Path],
     contigs_fasta: Path,
+    temporary_dir: Path,
     fasta_extensions: Set[str] = {".fasta", ".fna", ".fa"},
 ) -> Tuple[
     Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]
@@ -254,7 +255,10 @@ def parse_input_files(
     original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values())
 
     logging.info(f"Parsing contig fasta file: {contigs_fasta}")
-    contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix())
+    index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
+    contigs_object = contig_manager.parse_fasta_file(
+        contigs_fasta.as_posix(), index_file=index_file.as_posix()
+    )
 
     unexpected_contigs = {
         contig for contig in contigs_in_bins if contig not in contigs_object
@@ -276,6 +280,7 @@ def parse_input_files(
 def manage_protein_alignement(
     faa_file: Path,
     contigs_fasta: Path,
+    temporary_dir: Path,
     contig_to_length: Dict[str, int],
     contigs_in_bins: Set[str],
     diamond_result_file: Path,
@@ -314,10 +319,13 @@ def manage_protein_alignement(
         )
 
     else:
+        index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
         contigs_iterator = (
-            s
-            for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
-            if s.name in contigs_in_bins
+            seq
+            for seq in contig_manager.parse_fasta_file(
+                contigs_fasta.as_posix(), index_file=index_file.as_posix()
+            )
+            if seq.name in contigs_in_bins
         )
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
 
@@ -330,7 +338,10 @@ def manage_protein_alignement(
         else:
             raise FileNotFoundError(checkm2_db)
 
-        diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log"
+        diamond_log = (
+            diamond_result_file.parents[0]
+            / f"{diamond_result_file.stem.split('.')[0]}.log"
+        )
 
         diamond.run(
             faa_file.as_posix(),
@@ -482,14 +493,9 @@ def main():
 
     use_existing_protein_file = False
 
-    if args.proteins:
-        logging.info(f"Using the provided protein sequences file: {args.proteins}")
-        faa_file = args.proteins
-        use_existing_protein_file = True
-    else:
-        faa_file = out_tmp_dir / "assembly_proteins.faa"
+    faa_file = out_tmp_dir / "assembly_proteins.faa.gz"
 
-    diamond_result_file = out_tmp_dir / "diamond_result.tsv"
+    diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"
 
     # Output files #
     final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv"
@@ -504,11 +510,23 @@ def main():
         args.contig2bin_tables,
         args.contigs,
         fasta_extensions=set(args.fasta_extensions),
+        temporary_dir=out_tmp_dir,
     )
 
+    if args.proteins and not args.resume:
+        logging.info(f"Using the provided protein sequences file: {args.proteins}")
+        use_existing_protein_file = True
+
+        cds.filter_faa_file(
+            contigs_in_bins,
+            input_faa_file=args.proteins,
+            filtered_faa_file=faa_file,
+        )
+
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
         faa_file=faa_file,
         contigs_fasta=args.contigs,
+        temporary_dir=out_tmp_dir,
         contig_to_length=contig_to_length,
         contigs_in_bins=contigs_in_bins,
         diamond_result_file=diamond_result_file,

diff --git a/tests/cds_test.py b/tests/cds_test.py
@@ -5,6 +5,8 @@
 from pathlib import Path
 from unittest.mock import mock_open, patch
 
+import gzip
+
 
 class MockContig:
     def __init__(self, name, seq):
@@ -116,14 +118,14 @@ def test_write_faa(contig1, orf_finder):
 
     predicted_genes = orf_finder.find_genes(contig1.seq)
     contig_name = "contig"
-    output_file = "tests/tmp_file.faa"
+    output_file = "tests/tmp_file.faa.gz"
 
     cds.write_faa(output_file, [(contig_name, predicted_genes)])
 
     # Check if the file was created and first seq starts
     # with the contig name as expected
     assert Path(output_file).exists()
-    with open(output_file, "r") as f:
+    with gzip.open(output_file, "rt") as f:
         assert f.read().startswith(f">{contig_name}")
 
 
@@ -226,3 +228,65 @@ def test_is_nucleic_acid():
 
     # Amino acid sequence
     assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False  # Numbers are invalid
+
+
+def test_filter_faa_file_basic(tmp_path):
+    """Test basic functionality of filtering a FASTA file."""
+    # Create input data
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered.faa"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig1", "contig3"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the filtered output
+    expected_output = ">contig1_gene1\nATGCGT\n>contig3_gene1\nATGAAA\n"
+    assert filtered_faa.read_text() == expected_output
+
+
+def test_filter_faa_file_gz_output(tmp_path):
+    """Test filtering with gzipped output."""
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered.faa.gz"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig2"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the filtered output
+    with gzip.open(filtered_faa, "rt") as f:
+        filtered_content = f.read()
+    expected_output = ">contig2_gene1\nATGCCG\n"
+    assert filtered_content == expected_output
+
+
+def test_filter_faa_file_no_matching_contigs(tmp_path):
+    """Test filtering when no contigs match the input list."""
+    input_faa = tmp_path / "input.faa"
+    filtered_faa = tmp_path / "filtered_no_match.faa"
+
+    input_faa.write_text(
+        ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
+    )
+
+    # Contigs to keep
+    contigs_to_keep = {"contig4", "contig5"}
+
+    # Run the function
+    cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)
+
+    # Check the output file is empty
+    assert filtered_faa.read_text() == ""
diff --git a/tests/contig_manager_test.py b/tests/contig_manager_test.py
@@ -4,12 +4,13 @@
 
 
 # Parses a valid FASTA file and returns a pyfastx.Fasta object.
-def test_valid_fasta_file():
+def test_valid_fasta_file(tmp_path):
     # Arrange
     fasta_file = "tests/contigs.fasta"
 
-    # Act
-    result = contig_manager.parse_fasta_file(fasta_file)
+    index_file = "tests/contigs.fasta.fxi"
+
+    result = contig_manager.parse_fasta_file(fasta_file, str(index_file))
 
     # Assert
     assert isinstance(result, pyfastx.Fasta)
@@ -22,7 +23,7 @@ def test_invalid_fasta_file():
 
     # Act and Assert
     with pytest.raises(RuntimeError):
-        contig_manager.parse_fasta_file(fasta_file)
+        contig_manager.parse_fasta_file(fasta_file, "./index.fxi")
 
 
 # The function returns a tuple containing two dictionaries.

diff --git a/tests/diamonds_test.py b/tests/diamonds_test.py
@@ -155,7 +155,7 @@ def __init__(self, returncode):
         # Simulating successful run of diamond command
         if (
             args[0]
-            == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
+            == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --compress 1 --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
         ):
             return CompletedProcess(0)