From 5d468c106afa406e1f4f74c14a517e2548c5ec7b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 13:58:17 +0100 Subject: [PATCH 01/12] add proteins input arg --- binette/main.py | 50 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/binette/main.py b/binette/main.py index 1c92a38..befbaee 100755 --- a/binette/main.py +++ b/binette/main.py @@ -10,6 +10,7 @@ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, Action, Namespace +from email.parser import Parser import sys import logging import os @@ -68,6 +69,21 @@ def __call__( setattr(namespace, self.dest, values) +def is_valid_file(parser: ArgumentParser, arg: str) -> Path: + """ + Validates that the provided input file exists. + + :param parser: The ArgumentParser instance handling command-line arguments. + :param arg: The path to the file provided as an argument. + :return: A Path object representing the valid file. + """ + path_arg = Path(arg) + + # Check if the file exists at the provided path + if not path_arg.exists(): + parser.error(f"Error: The specified file '{arg}' does not exist.") + + return path_arg def parse_arguments(args): """Parse script arguments.""" @@ -85,7 +101,7 @@ def parse_arguments(args): "-d", "--bin_dirs", nargs="+", - type=Path, + type=lambda x: is_valid_file(parser, x), action=UniqueStore, help="List of bin folders containing each bin in a fasta file.", ) @@ -95,12 +111,22 @@ def parse_arguments(args): "--contig2bin_tables", nargs="+", action=UniqueStore, - type=Path, + type=lambda x: is_valid_file(parser, x), help="List of contig2bin table with two columns separated\ with a tabulation: contig, bin", ) - input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.") + input_group.add_argument("-c", "--contigs", required=True, + type=lambda x: is_valid_file(parser, x), + help="Contigs in fasta format.") + + input_group.add_argument( + "-p", "--proteins", + 
type=lambda x: is_valid_file(parser, x), + help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). " + "Skips the gene prediction step if provided." + ) + # Other parameters category other_group = parser.add_argument_group('Other Arguments') @@ -211,7 +237,7 @@ def parse_input_files(bin_dirs: List[Path], def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int], contigs_in_bins: Set[str], diamond_result_file: Path, - checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: + checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: """ Predicts or reuses proteins prediction and runs diamond on them. @@ -222,14 +248,14 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len :param diamond_result_file: The path to the diamond result file. :param checkm2_db: The path to the CheckM2 database. :param threads: Number of threads for parallel processing. - :param resume: Boolean indicating whether to resume the process. + :param use_existing_protein_file: Boolean indicating whether to use an existing protein file. :param low_mem: Boolean indicating whether to use low memory mode. :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes. 
""" # Predict or reuse proteins prediction and run diamond on them - if resume: + if use_existing_protein_file: logging.info(f"Parsing faa file: {faa_file}.") contig_to_genes = cds.parse_faa_file(faa_file.as_posix()) io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix()) @@ -360,8 +386,15 @@ def main(): # Temporary files # out_tmp_dir:Path = args.outdir / "temporary_files" os.makedirs(out_tmp_dir, exist_ok=True) + + use_existing_protein_file = False + + if args.proteins: + faa_file = args.protein + use_existing_protein_file = True + else: + faa_file = out_tmp_dir / "assembly_proteins.faa" - faa_file = out_tmp_dir / "assembly_proteins.faa" diamond_result_file = out_tmp_dir / "diamond_result.tsv" # Output files # @@ -370,13 +403,14 @@ def main(): if args.resume: io.check_resume_file(faa_file, diamond_result_file) + use_existing_protein_file = True bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions)) contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length, contigs_in_bins=contigs_in_bins, diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db, - threads=args.threads, resume=args.resume, low_mem=args.low_mem) + threads=args.threads, use_existing_protein_file=use_existing_protein_file, low_mem=args.low_mem) # Use contig index instead of contig name to save memory contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins) From 814c83402a18aa331e24ef43ece8442065a9be6a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 13:58:38 +0100 Subject: [PATCH 02/12] add and update tests with proteins args --- tests/main_binette_test.py | 92 +++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 21 deletions(-) diff --git 
a/tests/main_binette_test.py b/tests/main_binette_test.py index 0e20acd..6729638 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -1,7 +1,7 @@ import pytest import logging -from binette.main import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main, UniqueStore +from binette.main import log_selected_bin_info, select_bins_and_write_them, manage_protein_alignement, parse_input_files, parse_arguments, init_logging, main, UniqueStore, is_valid_file from binette.bin_manager import Bin from binette import diamond, contig_manager, cds import os @@ -13,6 +13,21 @@ from argparse import ArgumentParser from pathlib import Path +@pytest.fixture +def test_environment(tmp_path: Path): + """ + Fixture to set up a test environment with required directories and files. + """ + folder1 = tmp_path / "folder1" + folder2 = tmp_path / "folder2" + contigs_file = tmp_path / "contigs.fasta" + + folder1.mkdir() + folder2.mkdir() + contigs_file.write_text(">contig1\nATCG") # Sample content for the FASTA file + + return folder1, folder2, contigs_file + @pytest.fixture def bins(): b1 = Bin(contigs={"contig1"}, origin="set1", name="bin1") @@ -111,7 +126,7 @@ def test_manage_protein_alignement_resume(tmp_path): diamond_result_file=Path("diamond_result_file"), checkm2_db=None, threads=1, - resume=True, + use_existing_protein_file=True, low_mem=False ) @@ -156,7 +171,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path): diamond_result_file=Path(diamond_result_file), checkm2_db=None, threads=1, - resume=True, + use_existing_protein_file=True, low_mem=False ) @@ -254,17 +269,32 @@ def test_argument_used_multiple_times(): parser.parse_args(['--example', 'value', '--example', 'value2']) -def test_parse_arguments_required_arguments(): - # Test when only required arguments are provided - args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta"]) - assert 
args.bin_dirs == [Path("folder1"), Path("folder2")] - assert args.contigs == Path("contigs.fasta") +def test_parse_arguments_required_arguments(test_environment): + """ + Test parsing when only required arguments are provided. + Ensure that input arguments exist before parsing. + """ + # Create temporary directories and files + folder1, folder2, contigs_file = test_environment + + # Parse arguments with existing files and directories + args = parse_arguments(["-d", str(folder1), str(folder2), "-c", str(contigs_file)]) -def test_parse_arguments_optional_arguments(): + # Assert that the parsed arguments match the expected paths + assert args.bin_dirs == [folder1, folder2] + assert args.contigs == contigs_file + + +def test_parse_arguments_optional_arguments(test_environment): # Test when required and optional arguments are provided - args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta", "--threads", "4", "--outdir", "output"]) - assert args.bin_dirs == [Path("folder1"), Path("folder2")] - assert args.contigs == Path("contigs.fasta") + + # Create temporary directories and files + folder1, folder2, contigs_file = test_environment + + # Parse arguments with existing files and directories + args = parse_arguments(["-d", str(folder1), str(folder2), "-c", str(contigs_file), "--threads", "4", "--outdir", "output"]) + assert args.bin_dirs == [folder1, folder2] + assert args.contigs == contigs_file assert args.threads == 4 assert args.outdir == Path("output") @@ -333,13 +363,12 @@ def test_manage_protein_alignment_no_resume(tmp_path): faa_file.as_posix(), diamond_result_file.as_posix(), checkm2_db.as_posix(), f"{os.path.splitext(diamond_result_file.as_posix())[0]}.log", threads, low_mem=low_mem ) -def test_main_resume_when_not_possible(monkeypatch): +def test_main_resume_when_not_possible(monkeypatch, test_environment): # Define or mock the necessary inputs/arguments + folder1, folder2, contigs_file = test_environment # Mock sys.argv to use test_args - 
test_args = [ - "-d", "bin_dir1", "bin_dir2", - "-c", "contigs.fasta", + test_args = ["-d", str(folder1), str(folder2), "-c", str(contigs_file), # ... more arguments as required ... "--debug", "--resume" @@ -350,13 +379,12 @@ def test_main_resume_when_not_possible(monkeypatch): with pytest.raises(FileNotFoundError): main() -def test_main(monkeypatch): +def test_main(monkeypatch, test_environment): # Define or mock the necessary inputs/arguments - + folder1, folder2, contigs_file = test_environment # Mock sys.argv to use test_args test_args = [ - "-d", "bin_dir1", "bin_dir2", - "-c", "contigs.fasta", + "-d", str(folder1), str(folder2), "-c", str(contigs_file), # ... more arguments as required ... "--debug" ] @@ -399,4 +427,26 @@ def test_main(monkeypatch): mock_write_original_bin_metrics.assert_called_once() assert mock_apply_contig_index.call_count == 3 - assert mock_add_bin_metrics.call_count == 2 \ No newline at end of file + assert mock_add_bin_metrics.call_count == 2 + + +def test_is_valid_file_existing_file(tmp_path: Path): + """Test is_valid_file with a file that exists.""" + # Create a temporary file + test_file = tmp_path / "test_file.txt" + test_file.write_text("Sample content") + + parser = ArgumentParser() + + # Assert that the function correctly returns the file path + result = is_valid_file(parser, str(test_file)) + assert result == test_file + +def test_is_valid_file_non_existing_file(): + """Test is_valid_file with a file that does not exist.""" + parser = ArgumentParser() + non_existing_file = "non_existing_file.txt" + + # Expect the function to call parser.error, which will raise a SystemExit exception + with pytest.raises(SystemExit): + is_valid_file(parser, non_existing_file) \ No newline at end of file From 5d2e5fba54877dd50b078f219a292357aaaba29a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 15:17:18 +0100 Subject: [PATCH 03/12] add some check on the input faa file type and pytests accordingly --- binette/cds.py | 40 
+++++++++++++++++++++++++++++++++++ tests/cds_test.py | 53 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 86 insertions(+), 7 deletions(-) diff --git a/binette/cds.py b/binette/cds.py index 8845abe..8294641 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -71,19 +71,59 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) - for contig_id, genes in contig_to_genes: genes.write_translations(fl, contig_id) + +def is_nucleic_acid(sequence: str) -> bool: + """ + Determines whether the given sequence is a DNA or RNA sequence. + + :param sequence: The sequence to check. + :return: True if the sequence is a DNA or RNA sequence, False otherwise. + """ + # Define nucleotidic bases (DNA and RNA) + nucleotidic_bases = set('ATCGNUatcgnu') + + # Check if all characters in the sequence are valid nucleotidic bases (DNA or RNA) + if all(base in nucleotidic_bases for base in sequence): + return True + + # If any character is invalid, return False + return False + + + def parse_faa_file(faa_file: str) -> Dict[str, List[str]]: """ Parse a FASTA file containing protein sequences and organize them by contig. :param faa_file: Path to the input FASTA file. :return: A dictionary mapping contig names to lists of protein sequences. + :raises ValueError: If the file contains nucleotidic sequences instead of protein sequences. 
""" contig_to_genes = defaultdict(list) + checked_sequences = [] + + # Iterate through the FASTA file and parse sequences for name, seq in pyfastx.Fastx(faa_file): contig = get_contig_from_cds_name(name) contig_to_genes[contig].append(seq) + + # Concatenate up to the first 20 sequences for validation + if len(checked_sequences) < 20: + checked_sequences.append(seq) + + # Concatenate all checked sequences for a more reliable nucleic acid check + concatenated_seq = "".join(checked_sequences) + + # Check if the concatenated sequence appears to be nucleic acid + if is_nucleic_acid(concatenated_seq): + raise ValueError( + f"The file '{faa_file}' appears to contain nucleotide sequences. " + "Ensure that the file contains valid protein sequences in FASTA format." + ) return dict(contig_to_genes) + + def get_aa_composition(genes: List[str]) -> Counter: """ diff --git a/tests/cds_test.py b/tests/cds_test.py index 92db12c..20eb787 100644 --- a/tests/cds_test.py +++ b/tests/cds_test.py @@ -103,9 +103,6 @@ def test_extract_contig_name_from_cds_name(): assert result == "contig1" -# Import the functions write_faa and parse_faa_file here - - def test_write_faa(contig1, orf_finder): predicted_genes = orf_finder.find_genes(contig1.seq) @@ -122,10 +119,11 @@ def test_write_faa(contig1, orf_finder): def test_parse_faa_file(tmp_path): - # Mock a FASTA file + # Mock a FASTA file of protein sequences + # at least one protein sequence to not triger the error fasta_content = ( ">contig1_gene1\n" - "AAAAAAAAAAA\n" + "MPPPAOSKNSKSS\n" ">contig1_gene2\n" "CCCCCCCCCCC\n" ">contig2_gene1\n" @@ -139,11 +137,32 @@ def test_parse_faa_file(tmp_path): # Check if the output matches the expected dictionary expected_result = { - 'contig1': ['AAAAAAAAAAA', 'CCCCCCCCCCC'], + 'contig1': ['MPPPAOSKNSKSS', 'CCCCCCCCCCC'], 'contig2': ['TTTTTTTTTTTT'] } assert result == expected_result + +def test_parse_faa_file_raises_error_for_dna(tmp_path): + # Mock a DNA FASTA file + fasta_content = ( + 
">contig1_gene1\n" + "AAAAAAAAAAA\n" + ">contig1_gene2\n" + "CCCCCCCCCCC\n" + ">contig2_gene1\n" + "TTTTTTTTTTTT\n" + ) + fna_file = tmp_path / "mock_file.fna" + fna_file.write_text(fasta_content) + + + # Check that ValueError is raised when DNA sequences are encountered + with pytest.raises(ValueError): + cds.parse_faa_file(fna_file) + + + def test_get_aa_composition(): genes = ['AAAA', @@ -175,4 +194,24 @@ def test_get_contig_cds_metadata(): assert contig_metadata['contig_to_cds_count'] == {"c1":3, "c2":2} assert contig_metadata['contig_to_aa_counter'] == {"c1": {'A': 4, 'G': 4, "C":4} , "c2":{'C': 4, 'T': 4}} - assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8} \ No newline at end of file + assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8} + + + +# Test function +def test_is_nucleic_acid(): + # Valid DNA sequence + assert cds.is_nucleic_acid("ATCG") is True + assert cds.is_nucleic_acid("ATCNNNNNG") is True # N can be found in DNA seq + # Valid RNA sequence + assert cds.is_nucleic_acid("AUGCAUGC") is True + + # Mixed case + assert cds.is_nucleic_acid("AtCg") is True + + # Invalid sequence (contains characters not part of DNA or RNA) + assert cds.is_nucleic_acid("ATCX") is False # 'X' is not a valid base + assert cds.is_nucleic_acid("AUG#C") is False # '#' is not a valid base + + # Amino acid sequence + assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False # Numbers are invalid From 6749ae1a289e5c1a4c35c617c90caeb02996b179 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 15:18:21 +0100 Subject: [PATCH 04/12] rm wrong import --- binette/main.py | 1 - 1 file changed, 1 deletion(-) diff --git a/binette/main.py b/binette/main.py index befbaee..63288c5 100755 --- a/binette/main.py +++ b/binette/main.py @@ -10,7 +10,6 @@ from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter, Action, Namespace -from email.parser import Parser import sys import logging import os From 10cef1cd0b72efaaad5011e80b69177220ad39cb Mon 
Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 16:20:28 +0100 Subject: [PATCH 05/12] fix typo in arg name and add log --- binette/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/binette/main.py b/binette/main.py index 63288c5..d4b983d 100755 --- a/binette/main.py +++ b/binette/main.py @@ -389,11 +389,13 @@ def main(): use_existing_protein_file = False if args.proteins: - faa_file = args.protein + logging.info(f"Using the provided protein sequences file: {args.proteins}") + faa_file = args.proteins use_existing_protein_file = True else: faa_file = out_tmp_dir / "assembly_proteins.faa" + diamond_result_file = out_tmp_dir / "diamond_result.tsv" # Output files # From 29b81dfa3123a058fd582bce317a50f85f79fc5b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 16:33:52 +0100 Subject: [PATCH 06/12] add doc explaining precomputed protein input --- docs/usage.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 9b37065..60b3b13 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -58,6 +58,20 @@ For example, consider the following two `contig2bin_tables`: In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`. + +## Providing Protein Sequences + +You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<geneID>`. This naming format ensures proper mapping of each gene to its corresponding contig. + +By using this option, the gene prediction step is skipped.
+ +### Example +If your contig is named `contig_A`, the gene identifiers should follow this pattern: +- `contig_A_1` +- `contig_A_2` +- `contig_A_3` + + ## Outputs Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option. From 2abff6dc7671461abd584ee9b5a985eb4e214edb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 16:35:12 +0100 Subject: [PATCH 07/12] improve doc --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 60b3b13..9cec137 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -59,9 +59,9 @@ For example, consider the following two `contig2bin_tables`: In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`. -## Providing Protein Sequences +## Providing Precomputed Protein Sequences -You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<geneID>`. This naming format ensures proper mapping of each gene to its corresponding contig. +You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<geneID>`. This naming format ensures proper mapping of each gene to its contig. By using this option, the gene prediction step is skipped.
From 199660a833766d2b84899c8395f3e545a4e043d6 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 18:14:40 +0100 Subject: [PATCH 08/12] fix diamond resume and using external faa --- binette/main.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/binette/main.py b/binette/main.py index d4b983d..8bd6bac 100755 --- a/binette/main.py +++ b/binette/main.py @@ -236,7 +236,9 @@ def parse_input_files(bin_dirs: List[Path], def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int], contigs_in_bins: Set[str], diamond_result_file: Path, - checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: + checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool, + resume_diamond:bool, + low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: """ Predicts or reuses proteins prediction and runs diamond on them. @@ -248,6 +250,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len :param checkm2_db: The path to the CheckM2 database. :param threads: Number of threads for parallel processing. :param use_existing_protein_file: Boolean indicating whether to use an existing protein file. + :param resume_diamond: Boolean indicating whether to resume diamond alignement. :param low_mem: Boolean indicating whether to use low memory mode. :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes. 
@@ -263,6 +266,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins) contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads) + if not resume_diamond: if checkm2_db is None: # get checkm2 db stored in checkm2 install diamond_db_path = diamond.get_checkm2_db() @@ -411,7 +415,8 @@ def main(): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length, contigs_in_bins=contigs_in_bins, diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db, - threads=args.threads, use_existing_protein_file=use_existing_protein_file, low_mem=args.low_mem) + threads=args.threads, use_existing_protein_file=use_existing_protein_file, + resume_diamond=args.resume, low_mem=args.low_mem) # Use contig index instead of contig name to save memory contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins) From 59b6a9320ab359317e41800d05dfa059583cdbcb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 18:17:44 +0100 Subject: [PATCH 09/12] adjust tests with diamond resume flag --- tests/main_binette_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index 6729638..6c7b2cd 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -100,7 +100,7 @@ def test_manage_protein_alignement_resume(tmp_path): faa_file = tmp_path / "proteins.faa" faa_file_content = ( - ">contig1_1\nACGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n" + ">contig1_1\nMCGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n" ) contig_to_length={"contig1":40, "contig2":80, "contig3":20} @@ -127,6 +127,7 @@ def test_manage_protein_alignement_resume(tmp_path): checkm2_db=None, threads=1, 
use_existing_protein_file=True, + resume_diamond=True, low_mem=False ) @@ -141,7 +142,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path): faa_file = tmp_path / "proteins.faa" faa_file_content = ( - ">contig1_1\nACGT\n>contig2_1\nTGCA\n>contig2_2\nAAAA\n>contig3_1\nCCCC\n" + ">contig1_1\nMLKPACGT\n>contig2_1\nMMMKPTGCA\n>contig2_2\nMMMAAAA\n>contig3_1\nMLPALP\n" ) contig_to_length={"contig1":40, "contig2":80, "contig3":20} @@ -172,6 +173,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path): checkm2_db=None, threads=1, use_existing_protein_file=True, + resume_diamond=True, low_mem=False ) @@ -352,7 +354,7 @@ def test_manage_protein_alignment_no_resume(tmp_path): # Call the function contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file, contigs_fasta, contig_to_length, contigs_in_bins, - diamond_result_file, checkm2_db, threads, resume, low_mem + diamond_result_file, checkm2_db, threads, resume, resume, low_mem ) # Assertions to check if functions were called From d21c6c7ae69614d2a4ba9ce8fb66034824966fc3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 18:23:38 +0100 Subject: [PATCH 10/12] binette/io_manager.py improve contig consistency error message --- binette/io_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/binette/io_manager.py b/binette/io_manager.py index 04213e4..7dc1ca4 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -167,8 +167,8 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str], issue_countigs = len(set(contigs_from_elsewhere) - set(contigs_from_assembly)) - message = f"{issue_countigs} contigs found in file {elsewhere_file} \ - were not found in assembly_file ({assembly_file})." 
+ message = (f"{issue_countigs} contigs found in file '{elsewhere_file}' " + f"were not found in assembly_file '{assembly_file}'") assert are_contigs_consistent, message From 445e69057d0874fc5353ef38cbe932629a8fea11 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 18:24:14 +0100 Subject: [PATCH 11/12] add test for proteins input --- .github/workflows/binette_ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index ac98620..aff372b 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -96,3 +96,16 @@ jobs: python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv + - name: Run simple test case from bin dirs and with proteins input + run: | + cd test_data + binette -d binning_results/A/ binning_results/B/ binning_results/C/ \ + --contigs all_contigs.fna --checkm2_db checkm2_tiny_db/checkm2_tiny_db.dmnd -v -o test_results_from_dirs_and_prot_input --proteins proteins.faa + + - name: Compare results from bin dirs with expectation + run: | + cd test_data + head expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv + python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv + + From a04a6c015b1dca51f84931c3c721210221f615bb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 27 Nov 2024 18:38:44 +0100 Subject: [PATCH 12/12] fix comparison of result file in CI --- .github/workflows/binette_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index aff372b..4a8e9be 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -105,7 +105,7 @@ jobs: - name: Compare results from bin dirs with expectation run: | 
cd test_data - head expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv + head expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv