Skip to content

Commit

Permalink
Merge pull request #36 from genotoul-bioinfo/dev
Browse files Browse the repository at this point in the history
Fix missing score in input bins
  • Loading branch information
JeanMainguy authored Dec 19, 2024
2 parents 92e78f0 + 5701506 commit 476d7e8
Show file tree
Hide file tree
Showing 7 changed files with 48 additions and 65 deletions.
27 changes: 9 additions & 18 deletions binette/bin_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,26 +342,19 @@ def get_bins_from_contig2bin_table(contig2bin_table: Path, set_name: str) -> Lis
return bins


def from_bin_sets_to_bin_graph(
bin_name_to_bin_set: Mapping[str, Iterable[Bin]]
) -> nx.Graph:
def from_bins_to_bin_graph(bins) -> nx.Graph:
"""
Creates a bin graph from a dictionary of bin sets.
Creates a graph of overlapping bins from a set of bins.
:param bin_name_to_bin_set: A dictionary mapping bin names to their respective bin sets.
:param bins: a set of bins
:return: A networkx Graph representing the bin graph of overlapping bins.
"""
G = nx.Graph()

for set1_name, set2_name in itertools.combinations(bin_name_to_bin_set, 2):
set1 = bin_name_to_bin_set[set1_name]
set2 = bin_name_to_bin_set[set2_name]

for bin1, bin2 in itertools.product(set1, set2):

if bin1.overlaps_with(bin2):
G.add_edge(bin1, bin2)
for bin1, bin2 in itertools.combinations(bins, 2):
if bin1.overlaps_with(bin2):
G.add_edge(bin1, bin2)
return G


Expand Down Expand Up @@ -618,18 +611,16 @@ def rename_bin_contigs(bins: Iterable[Bin], contig_to_index: dict):
b.hash = hash(str(sorted(b.contigs)))


def create_intermediate_bins(
bin_set_name_to_bins: Mapping[str, Iterable[Bin]]
) -> Set[Bin]:
def create_intermediate_bins(original_bins: Set[Bin]) -> Set[Bin]:
"""
Creates intermediate bins from a dictionary of bin sets.
:param bin_set_name_to_bins: A dictionary mapping bin set names to corresponding bins.
:param original_bins: Set of input bins.
:return: A set of intermediate bins created from intersections, differences, and unions.
"""
logging.info("Making bin graph...")
connected_bins_graph = from_bin_sets_to_bin_graph(bin_set_name_to_bins)
connected_bins_graph = from_bins_to_bin_graph(original_bins)

logging.info("Creating intersection bins...")
intersection_bins = get_intersection_bins(connected_bins_graph)
Expand Down
2 changes: 1 addition & 1 deletion binette/bin_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def assess_bins_quality_by_chunk(

for i, chunk_bins_iter in enumerate(chunks(bins, chunk_size)):
chunk_bins = set(chunk_bins_iter)
logging.debug(f"chunk {i}: assessing quality of {len(chunk_bins)}")
logging.debug(f"chunk {i}: assessing quality of {len(chunk_bins)} bins")
assess_bins_quality(
bins=chunk_bins,
contig_to_kegg_counter=contig_to_kegg_counter,
Expand Down
18 changes: 10 additions & 8 deletions binette/io_manager.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import defaultdict
import logging
import pyfastx
from typing import Iterable, List, Dict, Tuple, Set
Expand Down Expand Up @@ -234,23 +235,24 @@ def check_resume_file(faa_file: Path, diamond_result_file: Path) -> None:
raise FileNotFoundError(error_msg)


def write_original_bin_metrics(
bin_set_name_to_bins: Dict[str, Set[Bin]], original_bin_report_dir: Path
):
def write_original_bin_metrics(original_bins: Set[Bin], original_bin_report_dir: Path):
"""
Write metrics of original input bins to a specified directory.
This function takes a dictionary mapping bin set names to sets of bins and writes
the metrics for each bin set to a TSV file in the specified directory. Each bin set
will have its own TSV file named according to its set name.
This function writes the metrics for each bin set to a TSV file in the specified directory.
Each bin set will have its own TSV file named according to its set name.
:param bin_set_name_to_bins: A dictionary where the keys are bin set names (str) and
the values are sets of Bin objects representing bins.
:param original_bins: A set containing input bins
:param original_bin_report_dir: The directory path (Path) where the bin metrics will be saved.
"""

original_bin_report_dir.mkdir(parents=True, exist_ok=True)

bin_set_name_to_bins = defaultdict(set)
for bin_obj in original_bins:
for origin in bin_obj.origin:
bin_set_name_to_bins[origin].add(bin_obj)

for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())):
bins_metric_file = (
original_bin_report_dir
Expand Down
19 changes: 8 additions & 11 deletions binette/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ def parse_input_files(
:fasta_extensions: Possible fasta extensions to look for in the bin directory.
:return: A tuple containing:
- Dictionary mapping bin set names to lists of bins.
- Set of original bins.
- Set of names of the contigs found in the bins.
- Dictionary mapping contig names to their lengths.
Expand Down Expand Up @@ -271,7 +270,7 @@ def parse_input_files(
seq.name: len(seq) for seq in contigs_object if seq.name in contigs_in_bins
}

return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length
return original_bins, contigs_in_bins, contig_to_length


def manage_protein_alignement(
Expand Down Expand Up @@ -500,13 +499,11 @@ def main():
io.check_resume_file(faa_file, diamond_result_file)
use_existing_protein_file = True

bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = (
parse_input_files(
args.bin_dirs,
args.contig2bin_tables,
args.contigs,
fasta_extensions=set(args.fasta_extensions),
)
original_bins, contigs_in_bins, contig_to_length = parse_input_files(
args.bin_dirs,
args.contig2bin_tables,
args.contigs,
fasta_extensions=set(args.fasta_extensions),
)

contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
Expand Down Expand Up @@ -552,10 +549,10 @@ def main():
logging.info(
f"Writting original input bin metrics to directory: {original_bin_report_dir}"
)
io.write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir)
io.write_original_bin_metrics(original_bins, original_bin_report_dir)

logging.info("Create intermediate bins:")
new_bins = bin_manager.create_intermediate_bins(bin_set_name_to_bins)
new_bins = bin_manager.create_intermediate_bins(original_bins)

logging.info("Assess quality for supplementary intermediate bins.")
new_bins = bin_quality.add_bin_metrics(
Expand Down
22 changes: 10 additions & 12 deletions tests/bin_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,11 @@ def test_select_best_bins_with_equality():

# The function should create intersection bins when there are overlapping contigs between bins.
def test_intersection_bins_created():
set1 = [
set1 = {
bin_manager.Bin(contigs={"1", "2"}, origin="A", name="bin1"),
bin_manager.Bin(contigs={"3", "4"}, origin="A", name="bin2"),
bin_manager.Bin(contigs={"5"}, origin="A", name="bin2"),
]
}
# need to define completeness and contamination
# because when too low the bin is not used in all operations
for b in set1:
Expand All @@ -270,14 +270,12 @@ def test_intersection_bins_created():
binA = bin_manager.Bin(contigs={"1", "3"}, origin="B", name="binA")
binA.contamination = 0
binA.completeness = 100
set2 = [
set2 = {
binA,
]
bin_set_name_to_bins = {"set1": set1, "set2": set2}
}
input_bins = set1 | set2

intermediate_bins_result = bin_manager.create_intermediate_bins(
bin_set_name_to_bins
)
intermediate_bins_result = bin_manager.create_intermediate_bins(input_bins)

expected_intermediate_bins = {
bin_manager.Bin(contigs={"1", "2", "3"}, origin="bin1 | binA ", name="NA"),
Expand Down Expand Up @@ -337,19 +335,19 @@ def test_dereplicate_bin_sets():
assert derep_bins_result == {b1, b2, b3}


def test_from_bin_sets_to_bin_graph():
def test_from_bins_to_bin_graph():

bin1 = bin_manager.Bin(contigs={"1", "2"}, origin="A", name="bin1")
bin2 = bin_manager.Bin(contigs={"3", "4"}, origin="A", name="bin2")
bin3 = bin_manager.Bin(contigs={"5"}, origin="A", name="bin3")

set1 = [bin1, bin2, bin3]
set1 = {bin1, bin2, bin3}

binA = bin_manager.Bin(contigs={"1", "3"}, origin="B", name="binA")

set2 = [binA]
set2 = {binA}

result_graph = bin_manager.from_bin_sets_to_bin_graph({"B": set2, "A": set1})
result_graph = bin_manager.from_bins_to_bin_graph(set1 | set2)

assert result_graph.number_of_edges() == 2
# bin3 is not connected to any bin so it is not in the graph
Expand Down
11 changes: 5 additions & 6 deletions tests/io_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,17 +350,16 @@ def test_write_original_bin_metrics(mock_write_bin_info, bin1, bin2, tmp_path):

temp_directory = tmp_path / "test_output"

mock_bins = {"set1": {bin1}, "set2": {bin2}}
# Call the function with mock data
io_manager.write_original_bin_metrics(mock_bins, temp_directory)
io_manager.write_original_bin_metrics({bin1, bin2}, temp_directory)

# Check if the output directory was created
assert temp_directory.exists(), "Output directory should be created."

# Check that the correct files are created
expected_files = [
temp_directory / "input_bins_1.set1.tsv",
temp_directory / "input_bins_2.set2.tsv",
temp_directory / "input_bins_1.origin1.tsv",
temp_directory / "input_bins_2.origin2.tsv",
]

assert (
Expand All @@ -373,5 +372,5 @@ def test_write_original_bin_metrics(mock_write_bin_info, bin1, bin2, tmp_path):
), "write_bin_info should be called once for each bin set."

# Verify the specific calls to `write_bin_info`
mock_write_bin_info.assert_any_call(mock_bins["set1"], expected_files[0])
mock_write_bin_info.assert_any_call(mock_bins["set2"], expected_files[1])
mock_write_bin_info.assert_any_call({bin1}, expected_files[0])
mock_write_bin_info.assert_any_call({bin2}, expected_files[1])
14 changes: 5 additions & 9 deletions tests/main_binette_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,17 +207,15 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path):
fasta_file.write_text(fasta_file_content)

# Call the function and capture the return values
bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = (
parse_input_files(None, [bin_set1, bin_set2], fasta_file)
original_bins, contigs_in_bins, contig_to_length = parse_input_files(
None, [bin_set1, bin_set2], fasta_file
)

# # Perform assertions on the returned values
assert isinstance(bin_set_name_to_bins, dict)
assert isinstance(original_bins, set)
assert isinstance(contigs_in_bins, set)
assert isinstance(contig_to_length, dict)

assert set(bin_set_name_to_bins) == {"bin_set1", "bin_set2"}
assert len(original_bins) == 4
assert contigs_in_bins == {"contig1", "contig2", "contig3", "contig4"}
assert len(contig_to_length) == 4
Expand Down Expand Up @@ -248,17 +246,15 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path):
fasta_file.write_text(fasta_file_content)

# Call the function and capture the return values
bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = (
parse_input_files(bin_dirs, contig2bin_tables, fasta_file)
original_bins, contigs_in_bins, contig_to_length = parse_input_files(
bin_dirs, contig2bin_tables, fasta_file
)

# # Perform assertions on the returned values
assert isinstance(bin_set_name_to_bins, dict)
assert isinstance(original_bins, set)
assert isinstance(contigs_in_bins, set)
assert isinstance(contig_to_length, dict)

assert set(bin_set_name_to_bins) == {"set1", "set2"}
assert len(original_bins) == 3
assert contigs_in_bins == {
"contig1",
Expand Down Expand Up @@ -471,7 +467,7 @@ def test_main(monkeypatch, test_environment):
) as mock_select_bins_and_write_them:

# Set return values for mocked functions if needed
mock_parse_input_files.return_value = (None, None, None, None)
mock_parse_input_files.return_value = (None, None, None)
mock_manage_protein_alignement.return_value = (
{"contig1": 1},
{"contig1": ["gene1"]},
Expand Down

0 comments on commit 476d7e8

Please sign in to comment.