From 9ffe382e38b2ad1667e0320edd80bd4018ef5e6a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 16:42:45 +0100 Subject: [PATCH] fix wrong management with identical input bins --- binette/bin_manager.py | 27 +++++++++------------------ binette/io_manager.py | 18 ++++++++++-------- binette/main.py | 19 ++++++++----------- 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/binette/bin_manager.py b/binette/bin_manager.py index 6cc30c7..65da134 100644 --- a/binette/bin_manager.py +++ b/binette/bin_manager.py @@ -342,26 +342,19 @@ def get_bins_from_contig2bin_table(contig2bin_table: Path, set_name: str) -> Lis return bins -def from_bin_sets_to_bin_graph( - bin_name_to_bin_set: Mapping[str, Iterable[Bin]] -) -> nx.Graph: +def from_bins_to_bin_graph(bins) -> nx.Graph: """ - Creates a bin graph from a dictionary of bin sets. + Creates a bin graph of overlapping bins from a set of bins. - :param bin_name_to_bin_set: A dictionary mapping bin names to their respective bin sets. + :param bins: a set of bins :return: A networkx Graph representing the bin graph of overlapping bins. """ G = nx.Graph() - for set1_name, set2_name in itertools.combinations(bin_name_to_bin_set, 2): - set1 = bin_name_to_bin_set[set1_name] - set2 = bin_name_to_bin_set[set2_name] - - for bin1, bin2 in itertools.product(set1, set2): - - if bin1.overlaps_with(bin2): - G.add_edge(bin1, bin2) + for bin1, bin2 in itertools.combinations(bins, 2): + if bin1.overlaps_with(bin2): + G.add_edge(bin1, bin2) return G @@ -618,18 +611,16 @@ def rename_bin_contigs(bins: Iterable[Bin], contig_to_index: dict): b.hash = hash(str(sorted(b.contigs))) -def create_intermediate_bins( - bin_set_name_to_bins: Mapping[str, Iterable[Bin]] -) -> Set[Bin]: +def create_intermediate_bins(original_bins: Set[Bin]) -> Set[Bin]: """ Creates intermediate bins from a dictionary of bin sets. - :param bin_set_name_to_bins: A dictionary mapping bin set names to corresponding bins. 
+ :param original_bins: Set of input bins. :return: A set of intermediate bins created from intersections, differences, and unions. """ logging.info("Making bin graph...") - connected_bins_graph = from_bin_sets_to_bin_graph(bin_set_name_to_bins) + connected_bins_graph = from_bins_to_bin_graph(original_bins) logging.info("Creating intersection bins...") intersection_bins = get_intersection_bins(connected_bins_graph) diff --git a/binette/io_manager.py b/binette/io_manager.py index 2c1aafe..1a5fa3f 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -1,3 +1,4 @@ +from collections import defaultdict import logging import pyfastx from typing import Iterable, List, Dict, Tuple, Set @@ -234,23 +235,24 @@ def check_resume_file(faa_file: Path, diamond_result_file: Path) -> None: raise FileNotFoundError(error_msg) -def write_original_bin_metrics( - bin_set_name_to_bins: Dict[str, Set[Bin]], original_bin_report_dir: Path -): +def write_original_bin_metrics(original_bins: Set[Bin], original_bin_report_dir: Path): """ Write metrics of original input bins to a specified directory. - This function takes a dictionary mapping bin set names to sets of bins and writes - the metrics for each bin set to a TSV file in the specified directory. Each bin set - will have its own TSV file named according to its set name. + This function writes the metrics for each bin set to a TSV file in the specified directory. + Each bin set will have its own TSV file named according to its set name. - :param bin_set_name_to_bins: A dictionary where the keys are bin set names (str) and - the values are sets of Bin objects representing bins. + :param original_bins: A set containing input bins :param original_bin_report_dir: The directory path (Path) where the bin metrics will be saved. 
""" original_bin_report_dir.mkdir(parents=True, exist_ok=True) + bin_set_name_to_bins = defaultdict(set) + for bin_obj in original_bins: + for origin in bin_obj.origin: + bin_set_name_to_bins[origin].add(bin_obj) + for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())): bins_metric_file = ( original_bin_report_dir diff --git a/binette/main.py b/binette/main.py index bd528d5..7f1ad60 100755 --- a/binette/main.py +++ b/binette/main.py @@ -226,7 +226,6 @@ def parse_input_files( :fasta_extensions: Possible fasta extensions to look for in the bin directory. :return: A tuple containing: - - Dictionary mapping bin set names to lists of bins. - List of original bins. - Dictionary mapping bins to lists of contigs. - Dictionary mapping contig names to their lengths. @@ -271,7 +270,7 @@ def parse_input_files( seq.name: len(seq) for seq in contigs_object if seq.name in contigs_in_bins } - return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length + return original_bins, contigs_in_bins, contig_to_length def manage_protein_alignement( @@ -500,13 +499,11 @@ def main(): io.check_resume_file(faa_file, diamond_result_file) use_existing_protein_file = True - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = ( - parse_input_files( - args.bin_dirs, - args.contig2bin_tables, - args.contigs, - fasta_extensions=set(args.fasta_extensions), - ) + original_bins, contigs_in_bins, contig_to_length = parse_input_files( + args.bin_dirs, + args.contig2bin_tables, + args.contigs, + fasta_extensions=set(args.fasta_extensions), ) contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( @@ -552,10 +549,10 @@ def main(): logging.info( f"Writting original input bin metrics to directory: {original_bin_report_dir}" ) - io.write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir) + io.write_original_bin_metrics(original_bins, original_bin_report_dir) logging.info("Create intermediate bins:") - new_bins = 
bin_manager.create_intermediate_bins(bin_set_name_to_bins) + new_bins = bin_manager.create_intermediate_bins(original_bins) logging.info("Assess quality for supplementary intermediate bins.") new_bins = bin_quality.add_bin_metrics(