From f3780e8ae2be121920f151d006477f65476f0a5d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 18:30:58 +0100 Subject: [PATCH 1/5] filter input prot to only use gene of interest --- binette/cds.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++- binette/main.py | 17 +++++++---- tests/cds_test.py | 64 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+), 7 deletions(-) diff --git a/binette/cds.py b/binette/cds.py index cc125c5..a5d1389 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -2,11 +2,13 @@ import multiprocessing.pool import logging from collections import Counter, defaultdict -from typing import Dict, List, Iterator, Tuple, Any, Union +from typing import Dict, List, Iterator, Tuple, Any, Union, Set import pyfastx import pyrodigal from tqdm import tqdm +from pathlib import Path +import gzip def get_contig_from_cds_name(cds_name: str) -> str: @@ -207,3 +209,74 @@ def get_contig_cds_metadata( } return contig_info + + +from typing import Set +from pathlib import Path +import gzip +import pyfastx +import logging + +from typing import Set +from pathlib import Path +import gzip +import pyfastx +import logging + + +def filter_faa_file( + contigs_to_keep: Set[str], + input_faa_file: Path, + filtered_faa_file: Path, +): + """ + Filters a FASTA file containing protein sequences to only include sequences + from contigs present in the provided set of contigs (`contigs_to_keep`). + + This function processes the input FASTA file, identifies protein sequences + originating from contigs listed in `contigs_to_keep`, and writes the filtered + sequences to a new FASTA file. The output file supports optional `.gz` compression. + + Metrics computed and logged: + - Total number of contigs in `contigs_to_keep`. + - Number and proportion of contigs with at least one protein-coding gene. + - Number and proportion of contigs without any protein-coding genes. + - Number of contigs from the input FASTA file that are not in `contigs_to_keep`. + + :param contigs_to_keep: A set of contig names to retain in the output FASTA file. + :param input_faa_file: Path to the input FASTA file containing protein sequences. + :param filtered_faa_file: Path to the output FASTA file for filtered sequences. + If the filename ends with `.gz`, the output will be compressed. + """ + # Determine whether the output file should be compressed + proper_open = gzip.open if str(filtered_faa_file).endswith(".gz") else open + + # Initialize tracking sets for metrics + contigs_with_genes = set() + contigs_parsed = set() + + # Process the input FASTA file and filter sequences based on contigs_to_keep + with proper_open(filtered_faa_file, "wt") as fl: + for name, seq in pyfastx.Fastx(input_faa_file): + contig = get_contig_from_cds_name(name) + contigs_parsed.add(contig) + if contig in contigs_to_keep: + contigs_with_genes.add(contig) + fl.write(f">{name}\n{seq}\n") + + # Calculate metrics + total_contigs = len(contigs_to_keep) + contigs_with_no_genes = total_contigs - len(contigs_with_genes) + contigs_not_in_keep_list = len(contigs_parsed - contigs_to_keep) + + # Log the computed metrics + logging.info(f"Processing protein sequences from '{input_faa_file}'.") + logging.info( + f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included the input bins." + ) + logging.info( + f"Found {contigs_with_no_genes} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes." + ) + logging.debug( + f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list." + ) diff --git a/binette/main.py b/binette/main.py index 7f1ad60..e47a67d 100755 --- a/binette/main.py +++ b/binette/main.py @@ -482,12 +482,7 @@ def main(): use_existing_protein_file = False - if args.proteins: - logging.info(f"Using the provided protein sequences file: {args.proteins}") - faa_file = args.proteins - use_existing_protein_file = True - else: - faa_file = out_tmp_dir / "assembly_proteins.faa" + faa_file = out_tmp_dir / "assembly_proteins.faa" diamond_result_file = out_tmp_dir / "diamond_result.tsv" @@ -506,6 +501,16 @@ def main(): fasta_extensions=set(args.fasta_extensions), ) + if args.proteins and not args.resume: + logging.info(f"Using the provided protein sequences file: {args.proteins}") + use_existing_protein_file = True + + cds.filter_faa_file( + contigs_in_bins, + input_faa_file=args.proteins, + filtered_faa_file=faa_file, + ) + contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file=faa_file, contigs_fasta=args.contigs, diff --git a/tests/cds_test.py b/tests/cds_test.py index 97380fc..32b6580 100644 --- a/tests/cds_test.py +++ b/tests/cds_test.py @@ -5,6 +5,8 @@ from pathlib import Path from unittest.mock import mock_open, patch +import gzip + class MockContig: def __init__(self, name, seq): @@ -226,3 +228,65 @@ def test_is_nucleic_acid(): # Amino acid sequence assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False # Numbers are invalid + + +def test_filter_faa_file_basic(tmp_path): + """Test basic functionality of filtering a FASTA file.""" + # Create input data + input_faa = tmp_path / "input.faa" + filtered_faa = tmp_path / "filtered.faa" + + input_faa.write_text( + ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n" + ) + + # Contigs to keep + contigs_to_keep = {"contig1", "contig3"} + + # Run the function + cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa) + + # Check the filtered output + expected_output = ">contig1_gene1\nATGCGT\n>contig3_gene1\nATGAAA\n" + assert filtered_faa.read_text() == expected_output + + +def test_filter_faa_file_gz_output(tmp_path): + """Test filtering with gzipped output.""" + input_faa = tmp_path / "input.faa" + filtered_faa = tmp_path / "filtered.faa.gz" + + input_faa.write_text( + ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n" + ) + + # Contigs to keep + contigs_to_keep = {"contig2"} + + # Run the function + cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa) + + # Check the filtered output + with gzip.open(filtered_faa, "rt") as f: + filtered_content = f.read() + expected_output = ">contig2_gene1\nATGCCG\n" + assert filtered_content == expected_output + + +def test_filter_faa_file_no_matching_contigs(tmp_path): + """Test filtering when no contigs match the input list.""" + input_faa = tmp_path / "input.faa" + filtered_faa = tmp_path / "filtered_no_match.faa" + + input_faa.write_text( + ">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n" + ) + + # Contigs to keep + contigs_to_keep = {"contig4", "contig5"} + + # Run the function + cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa) + + # Check the output file is empty + assert filtered_faa.read_text() == "" From 5649c647d9cc527b9d9eb5a1bd6e7381928d2e58 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 18:59:13 +0100 Subject: [PATCH 2/5] gzip temporary files --- binette/cds.py | 8 +------- binette/diamond.py | 1 + binette/main.py | 15 +++++++++------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/binette/cds.py b/binette/cds.py index a5d1389..e60762f 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -72,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) - """ logging.info("Writing predicted protein sequences.") - with open(outfaa, "w") as fl: + with gzip.open(outfaa, "wt") as fl: for contig_id, genes in contig_to_genes: genes.write_translations(fl, contig_id) @@ -237,12 +237,6 @@ def filter_faa_file( originating from contigs listed in `contigs_to_keep`, and writes the filtered sequences to a new FASTA file. The output file supports optional `.gz` compression. - Metrics computed and logged: - - Total number of contigs in `contigs_to_keep`. - - Number and proportion of contigs with at least one protein-coding gene. - - Number and proportion of contigs without any protein-coding genes. - - Number of contigs from the input FASTA file that are not in `contigs_to_keep`. - :param contigs_to_keep: A set of contig names to retain in the output FASTA file. :param input_faa_file: Path to the input FASTA file containing protein sequences. :param filtered_faa_file: Path to the output FASTA file for filtered sequences. diff --git a/binette/diamond.py b/binette/diamond.py index 33fc834..3d34cbb 100644 --- a/binette/diamond.py +++ b/binette/diamond.py @@ -92,6 +92,7 @@ def run( f"-o {output} " f"--threads {threads} " f"--db {db} " + f"--compress 1 " f"--query-cover {query_cover} " f"--subject-cover {subject_cover} " f"--id {percent_id} " diff --git a/binette/main.py b/binette/main.py index e47a67d..5f95cb4 100755 --- a/binette/main.py +++ b/binette/main.py @@ -315,9 +315,9 @@ def manage_protein_alignement( else: contigs_iterator = ( - s - for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) - if s.name in contigs_in_bins + seq + for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) + if seq.name in contigs_in_bins ) contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads) @@ -330,7 +330,10 @@ def manage_protein_alignement( else: raise FileNotFoundError(checkm2_db) - diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log" + diamond_log = ( + diamond_result_file.parents[0] + / f"{diamond_result_file.stem.split('.')[0]}.log" + ) diamond.run( faa_file.as_posix(), @@ -482,9 +485,9 @@ def main(): use_existing_protein_file = False - faa_file = out_tmp_dir / "assembly_proteins.faa" + faa_file = out_tmp_dir / "assembly_proteins.faa.gz" - diamond_result_file = out_tmp_dir / "diamond_result.tsv" + diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz" # Output files # final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv" From 788f168d5fa24b1896315132f03f7a9cc3b76ef2 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 19:09:21 +0100 Subject: [PATCH 3/5] pyfastx index stored in temporary file dir --- binette/contig_manager.py | 4 ++-- binette/main.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/binette/contig_manager.py b/binette/contig_manager.py index fc783f5..91cdf6e 100644 --- a/binette/contig_manager.py +++ b/binette/contig_manager.py @@ -2,7 +2,7 @@ from typing import Dict, Iterable, Tuple, Set, Any, Union -def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: +def parse_fasta_file(fasta_file: str, index_file: str) -> pyfastx.Fasta: """ Parse a FASTA file and return a pyfastx.Fasta object. @@ -10,7 +10,7 @@ def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: :return: A pyfastx.Fasta object representing the parsed FASTA file. """ - fa = pyfastx.Fasta(fasta_file, build_index=True) + fa = pyfastx.Fasta(fasta_file, build_index=True, index_file=index_file) return fa diff --git a/binette/main.py b/binette/main.py index 5f95cb4..8533f4e 100755 --- a/binette/main.py +++ b/binette/main.py @@ -213,6 +213,7 @@ def parse_input_files( bin_dirs: List[Path], contig2bin_tables: List[Path], contigs_fasta: Path, + temporary_dir: Path, fasta_extensions: Set[str] = {".fasta", ".fna", ".fa"}, ) -> Tuple[ Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int] @@ -254,7 +255,10 @@ def parse_input_files( original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values()) logging.info(f"Parsing contig fasta file: {contigs_fasta}") - contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix()) + index_file = temporary_dir / f"{contigs_fasta.name}.fxi" + contigs_object = contig_manager.parse_fasta_file( + contigs_fasta.as_posix(), index_file=index_file.as_posix() + ) unexpected_contigs = { contig for contig in contigs_in_bins if contig not in contigs_object @@ -276,6 +280,7 @@ def parse_input_files( def manage_protein_alignement( faa_file: Path, contigs_fasta: Path, + temporary_dir: Path, contig_to_length: Dict[str, int], contigs_in_bins: Set[str], diamond_result_file: Path, @@ -314,9 +319,12 @@ def manage_protein_alignement( ) else: + index_file = temporary_dir / f"{contigs_fasta.name}.fxi" contigs_iterator = ( seq - for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) + for seq in contig_manager.parse_fasta_file( + contigs_fasta.as_posix(), index_file=index_file.as_posix() + ) if seq.name in contigs_in_bins ) contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads) @@ -502,6 +510,7 @@ def main(): args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions), + temporary_dir=out_tmp_dir, ) if args.proteins and not args.resume: @@ -517,6 +526,7 @@ def main(): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file=faa_file, contigs_fasta=args.contigs, + temporary_dir=out_tmp_dir, contig_to_length=contig_to_length, contigs_in_bins=contigs_in_bins, diamond_result_file=diamond_result_file, From af582a7147d33df32f63ceaca6bc5a375016a86d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Dec 2024 14:40:41 +0100 Subject: [PATCH 4/5] adjust logging --- binette/cds.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/binette/cds.py b/binette/cds.py index e60762f..27d2e64 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -266,10 +266,10 @@ def filter_faa_file( # Log the computed metrics logging.info(f"Processing protein sequences from '{input_faa_file}'.") logging.info( - f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included the input bins." + f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included in the input bins." ) - logging.info( - f"Found {contigs_with_no_genes} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes." + logging.debug( + f"Found {contigs_with_no_genes}/{total_contigs} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes." ) logging.debug( f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list." From 98cdde5e0e16e375a64d4af96e57ff346155a699 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Dec 2024 14:41:16 +0100 Subject: [PATCH 5/5] update tests --- tests/cds_test.py | 4 ++-- tests/contig_manager_test.py | 9 +++++---- tests/diamonds_test.py | 2 +- tests/main_binette_test.py | 11 +++++++---- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/cds_test.py b/tests/cds_test.py index 32b6580..285a9d1 100644 --- a/tests/cds_test.py +++ b/tests/cds_test.py @@ -118,14 +118,14 @@ def test_write_faa(contig1, orf_finder): predicted_genes = orf_finder.find_genes(contig1.seq) contig_name = "contig" - output_file = "tests/tmp_file.faa" + output_file = "tests/tmp_file.faa.gz" cds.write_faa(output_file, [(contig_name, predicted_genes)]) # Check if the file was created and first seq starts # with the contig name as expected assert Path(output_file).exists() - with open(output_file, "r") as f: + with gzip.open(output_file, "rt") as f: assert f.read().startswith(f">{contig_name}") diff --git a/tests/contig_manager_test.py b/tests/contig_manager_test.py index 38aca8b..5214232 100644 --- a/tests/contig_manager_test.py +++ b/tests/contig_manager_test.py @@ -4,12 +4,13 @@ # Parses a valid FASTA file and returns a pyfastx.Fasta object. -def test_valid_fasta_file(): +def test_valid_fasta_file(tmp_path): # Arrange fasta_file = "tests/contigs.fasta" - # Act - result = contig_manager.parse_fasta_file(fasta_file) + index_file = "tests/contigs.fasta.fxi" + + result = contig_manager.parse_fasta_file(fasta_file, str(index_file)) # Assert assert isinstance(result, pyfastx.Fasta) @@ -22,7 +23,7 @@ def test_invalid_fasta_file(): # Act and Assert with pytest.raises(RuntimeError): - contig_manager.parse_fasta_file(fasta_file) + contig_manager.parse_fasta_file(fasta_file, "./index.fxi") # The function returns a tuple containing two dictionaries. diff --git a/tests/diamonds_test.py b/tests/diamonds_test.py index eda2975..8d628e2 100644 --- a/tests/diamonds_test.py +++ b/tests/diamonds_test.py @@ -155,7 +155,7 @@ def __init__(self, returncode): # Simulating successful run of diamond command if ( args[0] - == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt" + == "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --compress 1 --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt" ): return CompletedProcess(0) diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index 0b2aba7..c519693 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -136,6 +136,7 @@ def test_manage_protein_alignement_resume(tmp_path): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file=Path(faa_file), contigs_fasta=Path("contigs_fasta"), + temporary_dir=Path(tmp_path), contig_to_length=contig_to_length, contigs_in_bins=set(), diamond_result_file=Path("diamond_result_file"), @@ -179,6 +180,7 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file=Path(faa_file), contigs_fasta=Path(contigs_fasta), + temporary_dir=Path(tmp_path), contig_to_length=contig_to_length, contigs_in_bins=set(), diamond_result_file=Path(diamond_result_file), @@ -208,7 +210,7 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path): # Call the function and capture the return values original_bins, contigs_in_bins, contig_to_length = parse_input_files( - None, [bin_set1, bin_set2], fasta_file + None, [bin_set1, bin_set2], fasta_file, tmp_path ) # # Perform assertions on the returned values @@ -230,7 +232,7 @@ def test_parse_input_files_with_contig2bin_tables_with_unknown_contig(tmp_path): fasta_file.write_text(fasta_file_content) with pytest.raises(ValueError): - parse_input_files(None, [bin_set3], fasta_file) + parse_input_files(None, [bin_set3], fasta_file, tmp_path) def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): @@ -247,7 +249,7 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): # Call the function and capture the return values original_bins, contigs_in_bins, contig_to_length = parse_input_files( - bin_dirs, contig2bin_tables, fasta_file + bin_dirs, contig2bin_tables, fasta_file, temporary_dir=tmp_path ) # # Perform assertions on the returned values @@ -384,6 +386,7 @@ def test_manage_protein_alignment_no_resume(tmp_path): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file, contigs_fasta, + tmp_path, contig_to_length, contigs_in_bins, diamond_result_file, @@ -395,7 +398,7 @@ def test_manage_protein_alignment_no_resume(tmp_path): ) # Assertions to check if functions were called - mock_parse_fasta_file.assert_called_once_with(contigs_fasta.as_posix()) + mock_parse_fasta_file.assert_called_once() mock_predict.assert_called_once() mock_diamond_get_contig_to_kegg_id.assert_called_once() mock_diamond_run.assert_called_once_with(