From 9ffe382e38b2ad1667e0320edd80bd4018ef5e6a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 16:42:45 +0100 Subject: [PATCH 1/2] fix wrong management with identical input bins --- binette/bin_manager.py | 27 +++++++++------------------ binette/io_manager.py | 18 ++++++++++-------- binette/main.py | 19 ++++++++----------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/binette/bin_manager.py b/binette/bin_manager.py index 6cc30c7..65da134 100644 --- a/binette/bin_manager.py +++ b/binette/bin_manager.py @@ -342,26 +342,19 @@ def get_bins_from_contig2bin_table(contig2bin_table: Path, set_name: str) -> Lis return bins -def from_bin_sets_to_bin_graph( - bin_name_to_bin_set: Mapping[str, Iterable[Bin]] -) -> nx.Graph: +def from_bins_to_bin_graph(bins) -> nx.Graph: """ - Creates a bin graph from a dictionary of bin sets. + Creates a bin graph made of overlapping bins from a set of bins. - :param bin_name_to_bin_set: A dictionary mapping bin names to their respective bin sets. + :param bins: a set of bins :return: A networkx Graph representing the bin graph of overlapping bins. """ G = nx.Graph() - for set1_name, set2_name in itertools.combinations(bin_name_to_bin_set, 2): - set1 = bin_name_to_bin_set[set1_name] - set2 = bin_name_to_bin_set[set2_name] - - for bin1, bin2 in itertools.product(set1, set2): - - if bin1.overlaps_with(bin2): - G.add_edge(bin1, bin2) + for bin1, bin2 in itertools.combinations(bins, 2): + if bin1.overlaps_with(bin2): + G.add_edge(bin1, bin2) return G @@ -618,18 +611,16 @@ def rename_bin_contigs(bins: Iterable[Bin], contig_to_index: dict): b.hash = hash(str(sorted(b.contigs))) -def create_intermediate_bins( - bin_set_name_to_bins: Mapping[str, Iterable[Bin]] -) -> Set[Bin]: +def create_intermediate_bins(original_bins: Set[Bin]) -> Set[Bin]: """ Creates intermediate bins from a dictionary of bin sets. - :param bin_set_name_to_bins: A dictionary mapping bin set names to corresponding bins. 
+ :param original_bins: Set of input bins. :return: A set of intermediate bins created from intersections, differences, and unions. """ logging.info("Making bin graph...") - connected_bins_graph = from_bin_sets_to_bin_graph(bin_set_name_to_bins) + connected_bins_graph = from_bins_to_bin_graph(original_bins) logging.info("Creating intersection bins...") intersection_bins = get_intersection_bins(connected_bins_graph) diff --git a/binette/io_manager.py b/binette/io_manager.py index 2c1aafe..1a5fa3f 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -1,3 +1,4 @@ +from collections import defaultdict import logging import pyfastx from typing import Iterable, List, Dict, Tuple, Set @@ -234,23 +235,24 @@ def check_resume_file(faa_file: Path, diamond_result_file: Path) -> None: raise FileNotFoundError(error_msg) -def write_original_bin_metrics( - bin_set_name_to_bins: Dict[str, Set[Bin]], original_bin_report_dir: Path -): +def write_original_bin_metrics(original_bins: Set[Bin], original_bin_report_dir: Path): """ Write metrics of original input bins to a specified directory. - This function takes a dictionary mapping bin set names to sets of bins and writes - the metrics for each bin set to a TSV file in the specified directory. Each bin set - will have its own TSV file named according to its set name. + This function writes the metrics for each bin set to a TSV file in the specified directory. + Each bin set will have its own TSV file named according to its set name. - :param bin_set_name_to_bins: A dictionary where the keys are bin set names (str) and - the values are sets of Bin objects representing bins. + :param original_bins: A set containing input bins :param original_bin_report_dir: The directory path (Path) where the bin metrics will be saved. 
""" original_bin_report_dir.mkdir(parents=True, exist_ok=True) + bin_set_name_to_bins = defaultdict(set) + for bin_obj in original_bins: + for origin in bin_obj.origin: + bin_set_name_to_bins[origin].add(bin_obj) + for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())): bins_metric_file = ( original_bin_report_dir diff --git a/binette/main.py b/binette/main.py index bd528d5..7f1ad60 100755 --- a/binette/main.py +++ b/binette/main.py @@ -226,7 +226,6 @@ def parse_input_files( :fasta_extensions: Possible fasta extensions to look for in the bin directory. :return: A tuple containing: - - Dictionary mapping bin set names to lists of bins. - List of original bins. - Dictionary mapping bins to lists of contigs. - Dictionary mapping contig names to their lengths. @@ -271,7 +270,7 @@ def parse_input_files( seq.name: len(seq) for seq in contigs_object if seq.name in contigs_in_bins } - return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length + return original_bins, contigs_in_bins, contig_to_length def manage_protein_alignement( @@ -500,13 +499,11 @@ def main(): io.check_resume_file(faa_file, diamond_result_file) use_existing_protein_file = True - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = ( - parse_input_files( - args.bin_dirs, - args.contig2bin_tables, - args.contigs, - fasta_extensions=set(args.fasta_extensions), - ) + original_bins, contigs_in_bins, contig_to_length = parse_input_files( + args.bin_dirs, + args.contig2bin_tables, + args.contigs, + fasta_extensions=set(args.fasta_extensions), ) contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( @@ -552,10 +549,10 @@ def main(): logging.info( f"Writting original input bin metrics to directory: {original_bin_report_dir}" ) - io.write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir) + io.write_original_bin_metrics(original_bins, original_bin_report_dir) logging.info("Create intermediate bins:") - new_bins = 
bin_manager.create_intermediate_bins(bin_set_name_to_bins) + new_bins = bin_manager.create_intermediate_bins(original_bins) logging.info("Assess quality for supplementary intermediate bins.") new_bins = bin_quality.add_bin_metrics( From 570150616177ad2047695250512528125c47ce51 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 16:57:40 +0100 Subject: [PATCH 2/2] update test --- binette/bin_quality.py | 2 +- tests/bin_manager_test.py | 22 ++++++++++------------ tests/io_manager_test.py | 11 +++++------ tests/main_binette_test.py | 14 +++++--------- 4 files changed, 21 insertions(+), 28 deletions(-) diff --git a/binette/bin_quality.py b/binette/bin_quality.py index 154660d..e84f8ab 100644 --- a/binette/bin_quality.py +++ b/binette/bin_quality.py @@ -242,7 +242,7 @@ def assess_bins_quality_by_chunk( for i, chunk_bins_iter in enumerate(chunks(bins, chunk_size)): chunk_bins = set(chunk_bins_iter) - logging.debug(f"chunk {i}: assessing quality of {len(chunk_bins)}") + logging.debug(f"chunk {i}: assessing quality of {len(chunk_bins)} bins") assess_bins_quality( bins=chunk_bins, contig_to_kegg_counter=contig_to_kegg_counter, diff --git a/tests/bin_manager_test.py b/tests/bin_manager_test.py index cac472d..7a036ce 100644 --- a/tests/bin_manager_test.py +++ b/tests/bin_manager_test.py @@ -256,11 +256,11 @@ def test_select_best_bins_with_equality(): # The function should create intersection bins when there are overlapping contigs between bins. 
def test_intersection_bins_created(): - set1 = [ + set1 = { bin_manager.Bin(contigs={"1", "2"}, origin="A", name="bin1"), bin_manager.Bin(contigs={"3", "4"}, origin="A", name="bin2"), bin_manager.Bin(contigs={"5"}, origin="A", name="bin2"), - ] + } # need to defined completeness and conta # because when too low the bin is not used in all operation for b in set1: @@ -270,14 +270,12 @@ def test_intersection_bins_created(): binA = bin_manager.Bin(contigs={"1", "3"}, origin="B", name="binA") binA.contamination = 0 binA.completeness = 100 - set2 = [ + set2 = { binA, - ] - bin_set_name_to_bins = {"set1": set1, "set2": set2} + } + input_bins = set1 | set2 - intermediate_bins_result = bin_manager.create_intermediate_bins( - bin_set_name_to_bins - ) + intermediate_bins_result = bin_manager.create_intermediate_bins(input_bins) expected_intermediate_bins = { bin_manager.Bin(contigs={"1", "2", "3"}, origin="bin1 | binA ", name="NA"), @@ -337,19 +335,19 @@ def test_dereplicate_bin_sets(): assert derep_bins_result == {b1, b2, b3} -def test_from_bin_sets_to_bin_graph(): +def test_from_bins_to_bin_graph(): bin1 = bin_manager.Bin(contigs={"1", "2"}, origin="A", name="bin1") bin2 = bin_manager.Bin(contigs={"3", "4"}, origin="A", name="bin2") bin3 = bin_manager.Bin(contigs={"5"}, origin="A", name="bin3") - set1 = [bin1, bin2, bin3] + set1 = {bin1, bin2, bin3} binA = bin_manager.Bin(contigs={"1", "3"}, origin="B", name="binA") - set2 = [binA] + set2 = {binA} - result_graph = bin_manager.from_bin_sets_to_bin_graph({"B": set2, "A": set1}) + result_graph = bin_manager.from_bins_to_bin_graph(set1 | set2) assert result_graph.number_of_edges() == 2 # bin3 is not connected to any bin so it is not in the graph diff --git a/tests/io_manager_test.py b/tests/io_manager_test.py index 3d857ab..5562140 100644 --- a/tests/io_manager_test.py +++ b/tests/io_manager_test.py @@ -350,17 +350,16 @@ def test_write_original_bin_metrics(mock_write_bin_info, bin1, bin2, tmp_path): temp_directory = tmp_path / 
"test_output" - mock_bins = {"set1": {bin1}, "set2": {bin2}} # Call the function with mock data - io_manager.write_original_bin_metrics(mock_bins, temp_directory) + io_manager.write_original_bin_metrics({bin1, bin2}, temp_directory) # Check if the output directory was created assert temp_directory.exists(), "Output directory should be created." # Check that the correct files are created expected_files = [ - temp_directory / "input_bins_1.set1.tsv", - temp_directory / "input_bins_2.set2.tsv", + temp_directory / "input_bins_1.origin1.tsv", + temp_directory / "input_bins_2.origin2.tsv", ] assert ( @@ -373,5 +372,5 @@ def test_write_original_bin_metrics(mock_write_bin_info, bin1, bin2, tmp_path): ), "write_bin_info should be called once for each bin set." # Verify the specific calls to `write_bin_info` - mock_write_bin_info.assert_any_call(mock_bins["set1"], expected_files[0]) - mock_write_bin_info.assert_any_call(mock_bins["set2"], expected_files[1]) + mock_write_bin_info.assert_any_call({bin1}, expected_files[0]) + mock_write_bin_info.assert_any_call({bin2}, expected_files[1]) diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index 8de8238..0b2aba7 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -207,17 +207,15 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path): fasta_file.write_text(fasta_file_content) # Call the function and capture the return values - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = ( - parse_input_files(None, [bin_set1, bin_set2], fasta_file) + original_bins, contigs_in_bins, contig_to_length = parse_input_files( + None, [bin_set1, bin_set2], fasta_file ) # # Perform assertions on the returned values - assert isinstance(bin_set_name_to_bins, dict) assert isinstance(original_bins, set) assert isinstance(contigs_in_bins, set) assert isinstance(contig_to_length, dict) - assert set(bin_set_name_to_bins) == {"bin_set1", "bin_set2"} assert len(original_bins) == 4 assert 
contigs_in_bins == {"contig1", "contig2", "contig3", "contig4"} assert len(contig_to_length) == 4 @@ -248,17 +246,15 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): fasta_file.write_text(fasta_file_content) # Call the function and capture the return values - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = ( - parse_input_files(bin_dirs, contig2bin_tables, fasta_file) + original_bins, contigs_in_bins, contig_to_length = parse_input_files( + bin_dirs, contig2bin_tables, fasta_file ) # # Perform assertions on the returned values - assert isinstance(bin_set_name_to_bins, dict) assert isinstance(original_bins, set) assert isinstance(contigs_in_bins, set) assert isinstance(contig_to_length, dict) - assert set(bin_set_name_to_bins) == {"set1", "set2"} assert len(original_bins) == 3 assert contigs_in_bins == { "contig1", @@ -471,7 +467,7 @@ def test_main(monkeypatch, test_environment): ) as mock_select_bins_and_write_them: # Set return values for mocked functions if needed - mock_parse_input_files.return_value = (None, None, None, None) + mock_parse_input_files.return_value = (None, None, None) mock_manage_protein_alignement.return_value = ( {"contig1": 1}, {"contig1": ["gene1"]},