From 5d468c106afa406e1f4f74c14a517e2548c5ec7b Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 13:58:17 +0100
Subject: [PATCH 01/12] add proteins input arg

---
 binette/main.py | 50 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/binette/main.py b/binette/main.py
index 1c92a38..befbaee 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -10,6 +10,7 @@
 
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, Action, Namespace
 
+from email.parser import Parser
 import sys
 import logging
 import os
@@ -68,6 +69,21 @@ def __call__(
         setattr(namespace, self.dest, values)
 
 
+def is_valid_file(parser: ArgumentParser, arg: str) -> Path:
+    """
+    Validates that the provided input path (file or directory) exists.
+
+    :param parser: The ArgumentParser instance handling command-line arguments.
+    :param arg: The path to the file provided as an argument.
+    :return: A Path object representing the valid file.
+    """
+    path_arg = Path(arg)
+
+    # Check if the file exists at the provided path
+    if not path_arg.exists():
+        parser.error(f"Error: The specified file '{arg}' does not exist.")
+        
+    return path_arg
 
 def parse_arguments(args):
     """Parse script arguments."""
@@ -85,7 +101,7 @@ def parse_arguments(args):
         "-d",
         "--bin_dirs",
         nargs="+",
-        type=Path,
+        type=lambda x: is_valid_file(parser, x),
         action=UniqueStore,
         help="List of bin folders containing each bin in a fasta file.",
     )
@@ -95,12 +111,22 @@ def parse_arguments(args):
         "--contig2bin_tables",
         nargs="+",
         action=UniqueStore,
-        type=Path,
+        type=lambda x: is_valid_file(parser, x),
         help="List of contig2bin table with two columns separated\
             with a tabulation: contig, bin",
     )
 
-    input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.")
+    input_group.add_argument("-c", "--contigs", required=True, 
+                             type=lambda x: is_valid_file(parser, x),
+                               help="Contigs in fasta format.")
+
+    input_group.add_argument(
+        "-p", "--proteins",
+        type=lambda x: is_valid_file(parser, x),
+        help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). "
+            "Skips the gene prediction step if provided."
+    )
+      
 
     # Other parameters category
     other_group = parser.add_argument_group('Other Arguments')
@@ -211,7 +237,7 @@ def parse_input_files(bin_dirs: List[Path],
 
 def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int],
                                 contigs_in_bins: Set[str], diamond_result_file: Path,
-                                checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
+                                checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
     """
     Predicts or reuses proteins prediction and runs diamond on them.
     
@@ -222,14 +248,14 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
     :param diamond_result_file: The path to the diamond result file.
     :param checkm2_db: The path to the CheckM2 database.
     :param threads: Number of threads for parallel processing.
-    :param resume: Boolean indicating whether to resume the process.
+    :param use_existing_protein_file: Boolean indicating whether to use an existing protein file.
     :param low_mem: Boolean indicating whether to use low memory mode.
 
     :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
     """
 
     # Predict or reuse proteins prediction and run diamond on them
-    if resume:
+    if use_existing_protein_file:
         logging.info(f"Parsing faa file: {faa_file}.")
         contig_to_genes = cds.parse_faa_file(faa_file.as_posix())
         io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix())
@@ -360,8 +386,15 @@ def main():
     # Temporary files #
     out_tmp_dir:Path = args.outdir / "temporary_files"
     os.makedirs(out_tmp_dir, exist_ok=True)
+    
+    use_existing_protein_file = False
+
+    if args.proteins:
+        faa_file = args.protein
+        use_existing_protein_file = True
+    else:
+        faa_file = out_tmp_dir / "assembly_proteins.faa"
 
-    faa_file = out_tmp_dir / "assembly_proteins.faa"
     diamond_result_file = out_tmp_dir / "diamond_result.tsv"
 
     # Output files #
@@ -370,13 +403,14 @@ def main():
 
     if args.resume:
         io.check_resume_file(faa_file, diamond_result_file)
+        use_existing_protein_file = True
 
     bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions))
 
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
                                                                         contigs_in_bins=contigs_in_bins,
                                                                         diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
-                                                                        threads=args.threads, resume=args.resume, low_mem=args.low_mem)
+                                                                        threads=args.threads, use_existing_protein_file=use_existing_protein_file, low_mem=args.low_mem)
     
     # Use contig index instead of contig name to save memory
     contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)

From 814c83402a18aa331e24ef43ece8442065a9be6a Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 13:58:38 +0100
Subject: [PATCH 02/12] add and update tests with proteins args

---
 tests/main_binette_test.py | 92 +++++++++++++++++++++++++++++---------
 1 file changed, 71 insertions(+), 21 deletions(-)

diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py
index 0e20acd..6729638 100644
--- a/tests/main_binette_test.py
+++ b/tests/main_binette_test.py
@@ -1,7 +1,7 @@
 
 import pytest
 import logging
-from binette.main import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main, UniqueStore
+from binette.main import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main, UniqueStore, is_valid_file
 from binette.bin_manager import Bin
 from binette import diamond, contig_manager, cds
 import os
@@ -13,6 +13,21 @@
 from argparse import ArgumentParser
 from pathlib import Path
 
+@pytest.fixture
+def test_environment(tmp_path: Path):
+    """
+    Fixture to set up a test environment with required directories and files.
+    """
+    folder1 = tmp_path / "folder1"
+    folder2 = tmp_path / "folder2"
+    contigs_file = tmp_path / "contigs.fasta"
+
+    folder1.mkdir()
+    folder2.mkdir()
+    contigs_file.write_text(">contig1\nATCG")  # Sample content for the FASTA file
+
+    return folder1, folder2, contigs_file
+
 @pytest.fixture
 def bins():
     b1 = Bin(contigs={"contig1"}, origin="set1", name="bin1")
@@ -111,7 +126,7 @@ def test_manage_protein_alignement_resume(tmp_path):
             diamond_result_file=Path("diamond_result_file"),
             checkm2_db=None,
             threads=1,
-            resume=True,
+            use_existing_protein_file=True,
             low_mem=False
         )
 
@@ -156,7 +171,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path):
             diamond_result_file=Path(diamond_result_file),
             checkm2_db=None,
             threads=1,
-            resume=True,
+            use_existing_protein_file=True,
             low_mem=False
         )
 
@@ -254,17 +269,32 @@ def test_argument_used_multiple_times():
         parser.parse_args(['--example', 'value', '--example', 'value2'])
 
 
-def test_parse_arguments_required_arguments():
-    # Test when only required arguments are provided
-    args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta"])
-    assert args.bin_dirs == [Path("folder1"), Path("folder2")]
-    assert args.contigs == Path("contigs.fasta")
+def test_parse_arguments_required_arguments(test_environment):
+    """
+    Test parsing when only required arguments are provided.
+    Ensure that input arguments exist before parsing.
+    """
+    # Create temporary directories and files
+    folder1, folder2, contigs_file = test_environment
+
+    # Parse arguments with existing files and directories
+    args = parse_arguments(["-d", str(folder1), str(folder2), "-c", str(contigs_file)])
 
-def test_parse_arguments_optional_arguments():
+    # Assert that the parsed arguments match the expected paths
+    assert args.bin_dirs == [folder1, folder2]
+    assert args.contigs == contigs_file
+
+
+def test_parse_arguments_optional_arguments(test_environment):
     # Test when required and optional arguments are provided
-    args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta", "--threads", "4", "--outdir", "output"])
-    assert args.bin_dirs == [Path("folder1"), Path("folder2")]
-    assert args.contigs == Path("contigs.fasta")
+    
+    # Create temporary directories and files
+    folder1, folder2, contigs_file = test_environment   
+
+    # Parse arguments with existing files and directories
+    args = parse_arguments(["-d", str(folder1), str(folder2), "-c", str(contigs_file), "--threads", "4", "--outdir", "output"])
+    assert args.bin_dirs == [folder1, folder2]
+    assert args.contigs == contigs_file
     assert args.threads == 4
     assert args.outdir == Path("output")
 
@@ -333,13 +363,12 @@ def test_manage_protein_alignment_no_resume(tmp_path):
             faa_file.as_posix(), diamond_result_file.as_posix(), checkm2_db.as_posix(), f"{os.path.splitext(diamond_result_file.as_posix())[0]}.log", threads, low_mem=low_mem
         )
 
-def test_main_resume_when_not_possible(monkeypatch):
+def test_main_resume_when_not_possible(monkeypatch, test_environment):
     # Define or mock the necessary inputs/arguments
+    folder1, folder2, contigs_file = test_environment
 
     # Mock sys.argv to use test_args
-    test_args = [
-        "-d", "bin_dir1", "bin_dir2",
-        "-c", "contigs.fasta",
+    test_args = ["-d", str(folder1), str(folder2), "-c", str(contigs_file), 
         # ... more arguments as required ...
         "--debug",
         "--resume"
@@ -350,13 +379,12 @@ def test_main_resume_when_not_possible(monkeypatch):
     with pytest.raises(FileNotFoundError):
         main()
 
-def test_main(monkeypatch):
+def test_main(monkeypatch, test_environment):
     # Define or mock the necessary inputs/arguments
-
+    folder1, folder2, contigs_file = test_environment
     # Mock sys.argv to use test_args
     test_args = [
-        "-d", "bin_dir1", "bin_dir2",
-        "-c", "contigs.fasta",
+        "-d", str(folder1), str(folder2), "-c", str(contigs_file), 
         # ... more arguments as required ...
         "--debug"
     ]
@@ -399,4 +427,26 @@ def test_main(monkeypatch):
         mock_write_original_bin_metrics.assert_called_once()
 
         assert mock_apply_contig_index.call_count == 3
-        assert mock_add_bin_metrics.call_count == 2
\ No newline at end of file
+        assert mock_add_bin_metrics.call_count == 2
+
+
+def test_is_valid_file_existing_file(tmp_path: Path):
+    """Test is_valid_file with a file that exists."""
+    # Create a temporary file
+    test_file = tmp_path / "test_file.txt"
+    test_file.write_text("Sample content")
+
+    parser = ArgumentParser()
+
+    # Assert that the function correctly returns the file path
+    result = is_valid_file(parser, str(test_file))
+    assert result == test_file
+
+def test_is_valid_file_non_existing_file():
+    """Test is_valid_file with a file that does not exist."""
+    parser = ArgumentParser()
+    non_existing_file = "non_existing_file.txt"
+
+    # Expect the function to call parser.error, which will raise a SystemExit exception
+    with pytest.raises(SystemExit):
+        is_valid_file(parser, non_existing_file)
\ No newline at end of file

From 5d2e5fba54877dd50b078f219a292357aaaba29a Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 15:17:18 +0100
Subject: [PATCH 03/12] add some check on the input faa file type and pytests
 accordingly

---
 binette/cds.py    | 40 +++++++++++++++++++++++++++++++++++
 tests/cds_test.py | 53 ++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/binette/cds.py b/binette/cds.py
index 8845abe..8294641 100644
--- a/binette/cds.py
+++ b/binette/cds.py
@@ -71,19 +71,59 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
         for contig_id, genes in contig_to_genes:
             genes.write_translations(fl, contig_id)
 
+
+def is_nucleic_acid(sequence: str) -> bool:
+    """
+    Determines whether the given sequence is a DNA or RNA sequence.
+
+    :param sequence: The sequence to check.
+    :return: True if every character is one of A, T, C, G, N or U in either case (also True for an empty string), False otherwise.
+    """
+    # Define nucleotidic bases (DNA and RNA)
+    nucleotidic_bases = set('ATCGNUatcgnu')
+    
+    # Check if all characters in the sequence are valid nucleotidic bases (DNA or RNA)
+    if all(base in nucleotidic_bases for base in sequence):
+        return True
+    
+    # If any character is invalid, return False
+    return False
+
+ 
+
 def parse_faa_file(faa_file: str) -> Dict[str, List[str]]:
     """
     Parse a FASTA file containing protein sequences and organize them by contig.
 
     :param faa_file: Path to the input FASTA file.
     :return: A dictionary mapping contig names to lists of protein sequences.
+    :raises ValueError: If the first sequences of the file (up to 20) contain only nucleotide characters instead of protein sequences.
     """
     contig_to_genes = defaultdict(list)
+    checked_sequences = []
+
+    # Iterate through the FASTA file and parse sequences
     for name, seq in pyfastx.Fastx(faa_file):
         contig = get_contig_from_cds_name(name)
         contig_to_genes[contig].append(seq)
+        
+        # Keep up to the first 20 sequences for the nucleic acid check done after the loop
+        if len(checked_sequences) < 20:
+            checked_sequences.append(seq)
+
+    # Concatenate all checked sequences for a more reliable nucleic acid check
+    concatenated_seq = "".join(checked_sequences)
+
+    # Check if the concatenated sequence appears to be nucleic acid
+    if is_nucleic_acid(concatenated_seq):
+        raise ValueError(
+            f"The file '{faa_file}' appears to contain nucleotide sequences. "
+            "Ensure that the file contains valid protein sequences in FASTA format."
+        )
 
     return dict(contig_to_genes)
+   
+
 
 def get_aa_composition(genes: List[str]) -> Counter:
     """
diff --git a/tests/cds_test.py b/tests/cds_test.py
index 92db12c..20eb787 100644
--- a/tests/cds_test.py
+++ b/tests/cds_test.py
@@ -103,9 +103,6 @@ def test_extract_contig_name_from_cds_name():
     assert result == "contig1"
 
 
-# Import the functions write_faa and parse_faa_file here
-
-
 def test_write_faa(contig1, orf_finder):
     
     predicted_genes = orf_finder.find_genes(contig1.seq)
@@ -122,10 +119,11 @@ def test_write_faa(contig1, orf_finder):
 
 
 def test_parse_faa_file(tmp_path):
-    # Mock a FASTA file
+    # Mock a FASTA file of protein sequences
+    # Include at least one real protein sequence so the nucleotide check does not trigger an error
     fasta_content = (
         ">contig1_gene1\n"
-        "AAAAAAAAAAA\n"
+        "MPPPAOSKNSKSS\n" 
         ">contig1_gene2\n"
         "CCCCCCCCCCC\n"
         ">contig2_gene1\n"
@@ -139,11 +137,32 @@ def test_parse_faa_file(tmp_path):
 
     # Check if the output matches the expected dictionary
     expected_result = {
-        'contig1': ['AAAAAAAAAAA', 'CCCCCCCCCCC'],
+        'contig1': ['MPPPAOSKNSKSS', 'CCCCCCCCCCC'],
         'contig2': ['TTTTTTTTTTTT']
     }
     assert result == expected_result
 
+
+def test_parse_faa_file_raises_error_for_dna(tmp_path):
+    # Mock a DNA FASTA file
+    fasta_content = (
+        ">contig1_gene1\n"
+        "AAAAAAAAAAA\n"
+        ">contig1_gene2\n"
+        "CCCCCCCCCCC\n"
+        ">contig2_gene1\n"
+        "TTTTTTTTTTTT\n"
+    )
+    fna_file = tmp_path / "mock_file.fna"
+    fna_file.write_text(fasta_content)
+
+    
+    # Check that ValueError is raised when DNA sequences are encountered
+    with pytest.raises(ValueError):
+        cds.parse_faa_file(fna_file)
+
+
+
 def test_get_aa_composition():
 
     genes = ['AAAA',
@@ -175,4 +194,24 @@ def test_get_contig_cds_metadata():
     
     assert contig_metadata['contig_to_cds_count'] == {"c1":3, "c2":2}
     assert contig_metadata['contig_to_aa_counter'] == {"c1": {'A': 4, 'G': 4, "C":4} , "c2":{'C': 4, 'T': 4}}
-    assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
\ No newline at end of file
+    assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
+
+
+
+# Test function
+def test_is_nucleic_acid():
+    # Valid DNA sequence
+    assert cds.is_nucleic_acid("ATCG") is True
+    assert cds.is_nucleic_acid("ATCNNNNNG") is True # N can be found in DNA seq
+    # Valid RNA sequence
+    assert cds.is_nucleic_acid("AUGCAUGC") is True
+    
+    # Mixed case
+    assert cds.is_nucleic_acid("AtCg") is True
+    
+    # Invalid sequence (contains characters not part of DNA or RNA)
+    assert cds.is_nucleic_acid("ATCX") is False  # 'X' is not a valid base
+    assert cds.is_nucleic_acid("AUG#C") is False  # '#' is not a valid base
+    
+    # Amino acid sequence
+    assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False  # M, S, I, R and V are not nucleotide codes

From 6749ae1a289e5c1a4c35c617c90caeb02996b179 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 15:18:21 +0100
Subject: [PATCH 04/12] rm wrong import

---
 binette/main.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/binette/main.py b/binette/main.py
index befbaee..63288c5 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -10,7 +10,6 @@
 
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, Action, Namespace
 
-from email.parser import Parser
 import sys
 import logging
 import os

From 10cef1cd0b72efaaad5011e80b69177220ad39cb Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 16:20:28 +0100
Subject: [PATCH 05/12] fix typo in arg name and add log

---
 binette/main.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/binette/main.py b/binette/main.py
index 63288c5..d4b983d 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -389,11 +389,13 @@ def main():
     use_existing_protein_file = False
 
     if args.proteins:
-        faa_file = args.protein
+        logging.info(f"Using the provided protein sequences file: {args.proteins}")
+        faa_file = args.proteins
         use_existing_protein_file = True
     else:
         faa_file = out_tmp_dir / "assembly_proteins.faa"
 
+
     diamond_result_file = out_tmp_dir / "diamond_result.tsv"
 
     # Output files #

From 29b81dfa3123a058fd582bce317a50f85f79fc5b Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 16:33:52 +0100
Subject: [PATCH 06/12] add doc explaining precomputed protein input

---
 docs/usage.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/docs/usage.md b/docs/usage.md
index 9b37065..60b3b13 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -58,6 +58,20 @@ For example, consider the following two `contig2bin_tables`:
 
 In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`.
 
+
+## Providing Protein Sequences
+
+You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its corresponding contig.  
+
+By using this option, the gene prediction step is skipped.  
+
+### Example  
+If your contig is named `contig_A`, the gene identifiers should follow this pattern:  
+- `contig_A_1`  
+- `contig_A_2`  
+- `contig_A_3`  
+
+
 ## Outputs
 
 Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option.

From 2abff6dc7671461abd584ee9b5a985eb4e214edb Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 16:35:12 +0100
Subject: [PATCH 07/12] improve doc

---
 docs/usage.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/usage.md b/docs/usage.md
index 60b3b13..9cec137 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -59,9 +59,9 @@ For example, consider the following two `contig2bin_tables`:
 In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`.
 
 
-## Providing Protein Sequences
+## Providing Precomputed Protein Sequences
 
-You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its corresponding contig.  
+You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its contig.  
 
 By using this option, the gene prediction step is skipped.  
 

From 199660a833766d2b84899c8395f3e545a4e043d6 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 18:14:40 +0100
Subject: [PATCH 08/12] fix diamond resume and using external faa

---
 binette/main.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/binette/main.py b/binette/main.py
index d4b983d..8bd6bac 100755
--- a/binette/main.py
+++ b/binette/main.py
@@ -236,7 +236,9 @@ def parse_input_files(bin_dirs: List[Path],
 
 def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int],
                                 contigs_in_bins: Set[str], diamond_result_file: Path,
-                                checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
+                                checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, 
+                                resume_diamond:bool,
+                                low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
     """
     Predicts or reuses proteins prediction and runs diamond on them.
     
@@ -248,6 +250,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
     :param checkm2_db: The path to the CheckM2 database.
     :param threads: Number of threads for parallel processing.
     :param use_existing_protein_file: Boolean indicating whether to use an existing protein file.
+    :param resume_diamond: Boolean indicating whether to resume diamond alignment.
     :param low_mem: Boolean indicating whether to use low memory mode.
 
     :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
@@ -263,6 +266,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
         contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins)
         contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)
 
+    if not resume_diamond:
         if checkm2_db is None:
             # get checkm2 db stored in checkm2 install
             diamond_db_path = diamond.get_checkm2_db()
@@ -411,7 +415,8 @@ def main():
     contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
                                                                         contigs_in_bins=contigs_in_bins,
                                                                         diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
-                                                                        threads=args.threads, use_existing_protein_file=use_existing_protein_file, low_mem=args.low_mem)
+                                                                        threads=args.threads, use_existing_protein_file=use_existing_protein_file, 
+                                                                        resume_diamond=args.resume, low_mem=args.low_mem)
     
     # Use contig index instead of contig name to save memory
     contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)

From 59b6a9320ab359317e41800d05dfa059583cdbcb Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 18:17:44 +0100
Subject: [PATCH 09/12] adjust tests with diamond resume flag

---
 tests/main_binette_test.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py
index 6729638..6c7b2cd 100644
--- a/tests/main_binette_test.py
+++ b/tests/main_binette_test.py
@@ -100,7 +100,7 @@ def test_manage_protein_alignement_resume(tmp_path):
 
     faa_file = tmp_path / "proteins.faa"
     faa_file_content = (
-    ">contig1_1\nACGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n"
+    ">contig1_1\nMCGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n"
     )
 
     contig_to_length={"contig1":40, "contig2":80, "contig3":20}
@@ -127,6 +127,7 @@ def test_manage_protein_alignement_resume(tmp_path):
             checkm2_db=None,
             threads=1,
             use_existing_protein_file=True,
+            resume_diamond=True,
             low_mem=False
         )
 
@@ -141,7 +142,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path):
 
     faa_file = tmp_path / "proteins.faa"
     faa_file_content = (
-    ">contig1_1\nACGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n"
+    ">contig1_1\nMLKPACGT\n>contig2_1\nMMMKPTGCA\n>contig2_2\nMMMAAAA\n>contig3_1\nMLPALP\n"
     )
 
     contig_to_length={"contig1":40, "contig2":80, "contig3":20}
@@ -172,6 +173,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path):
             checkm2_db=None,
             threads=1,
             use_existing_protein_file=True,
+            resume_diamond=True,
             low_mem=False
         )
 
@@ -352,7 +354,7 @@ def test_manage_protein_alignment_no_resume(tmp_path):
         # Call the function
         contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
             faa_file, contigs_fasta, contig_to_length, contigs_in_bins,
-            diamond_result_file, checkm2_db, threads, resume, low_mem
+            diamond_result_file, checkm2_db, threads, resume, resume, low_mem
         )
         
         # Assertions to check if functions were called

From d21c6c7ae69614d2a4ba9ce8fb66034824966fc3 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 18:23:38 +0100
Subject: [PATCH 10/12] binette/io_manager.py

improve contig consistency error message
---
 binette/io_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/binette/io_manager.py b/binette/io_manager.py
index 04213e4..7dc1ca4 100644
--- a/binette/io_manager.py
+++ b/binette/io_manager.py
@@ -167,8 +167,8 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str],
 
     issue_countigs = len(set(contigs_from_elsewhere) - set(contigs_from_assembly))
     
-    message = f"{issue_countigs} contigs found in file {elsewhere_file} \
-                were not found in assembly_file ({assembly_file})."
+    message = (f"{issue_countigs} contigs found in file '{elsewhere_file}' "
+    f"were not found in assembly_file '{assembly_file}'")
     assert are_contigs_consistent, message
 
 

From 445e69057d0874fc5353ef38cbe932629a8fea11 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 18:24:14 +0100
Subject: [PATCH 11/12] add test for proteins input

---
 .github/workflows/binette_ci.yml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml
index ac98620..aff372b 100644
--- a/.github/workflows/binette_ci.yml
+++ b/.github/workflows/binette_ci.yml
@@ -96,3 +96,16 @@ jobs:
         python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv
 
 
+    - name: Run simple test case from bin dirs and with proteins input
+      run: |
+        cd test_data
+        binette -d binning_results/A/ binning_results/B/ binning_results/C/ \
+                --contigs all_contigs.fna --checkm2_db checkm2_tiny_db/checkm2_tiny_db.dmnd  -v -o test_results_from_dirs_and_prot_input --proteins proteins.faa
+
+    - name: Compare results from bin dirs with expectation
+      run: |
+        cd test_data
+        head  expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv
+        python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
+
+

From a04a6c015b1dca51f84931c3c721210221f615bb Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 27 Nov 2024 18:38:44 +0100
Subject: [PATCH 12/12] fix comparison of result file in CI

---
 .github/workflows/binette_ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml
index aff372b..4a8e9be 100644
--- a/.github/workflows/binette_ci.yml
+++ b/.github/workflows/binette_ci.yml
@@ -105,7 +105,7 @@ jobs:
     - name: Compare results from bin dirs with expectation
       run: |
         cd test_data
-        head  expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv
+        head  expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
         python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv