From 25144570229ef27a5637d7152401c840e943df42 Mon Sep 17 00:00:00 2001 From: Jean Mainguy Date: Wed, 10 Jul 2024 15:18:01 +0200 Subject: [PATCH 01/36] Run CI on macOS runner as well --- .github/workflows/binette_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index 6972dee..98bf1b8 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -13,7 +13,7 @@ on: jobs: build: - runs-on: ubuntu-latest + os: ['ubuntu-latest', 'macos-13'] defaults: run: shell: bash -el {0} From 9e3098e8851657e8d5a46f479b289bc65d6f4119 Mon Sep 17 00:00:00 2001 From: Jean Mainguy Date: Wed, 10 Jul 2024 15:20:41 +0200 Subject: [PATCH 02/36] fix improper typo --- .github/workflows/binette_ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index 98bf1b8..6485dc6 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -12,14 +12,14 @@ on: jobs: build: - - os: ['ubuntu-latest', 'macos-13'] defaults: run: shell: bash -el {0} + runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: + os: ['ubuntu-latest', 'macos-13'] python-version: [3.8] #["3.8", "3.9", "3.10"] steps: From 8b4f80f3459ac6369188d3310b11c22a16f6ea5b Mon Sep 17 00:00:00 2001 From: Jean Mainguy Date: Wed, 10 Jul 2024 15:30:11 +0200 Subject: [PATCH 03/36] Update setup-miniconda --- .github/workflows/binette_ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/binette_ci.yml b/.github/workflows/binette_ci.yml index 6485dc6..ac98620 100644 --- a/.github/workflows/binette_ci.yml +++ b/.github/workflows/binette_ci.yml @@ -24,10 +24,10 @@ jobs: steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # Install requirements - - uses: 
conda-incubator/setup-miniconda@v2 + - uses: conda-incubator/setup-miniconda@v3 with: mamba-version: "*" python-version: ${{ matrix.python-version }} From 7fe9dd7fc33e160a8f2a9809913a7995ca14b0f2 Mon Sep 17 00:00:00 2001 From: Jean Mainguy Date: Mon, 22 Jul 2024 10:18:28 +0200 Subject: [PATCH 04/36] improve clarity as suggested in the JOSS review see : https://github.com/openjournals/joss-reviews/issues/6782#issuecomment-2187193217 --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index b88ec81..3213cde 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -41,7 +41,7 @@ Binette is a Python reimplementation and enhanced version of the bin refinement ![**Overview of Binette Steps**. **(A) Intermediate Bin Creation Example**: Bins are represented as square shapes, each containing colored lines representing the contigs they contain. Creation of intermediate bins involves the initial bins sharing at least one contig. Set operations are applied to the contigs within the bins to generate these intermediate bins. **(B) Binette Workflow Overview**: Input bins serve as the basis for generating intermediate bins. Each bin undergoes a scoring process utilizing quality metrics provided by CheckM2. Subsequently, the bins are sorted based on their scores, and a selection process is executed to retain non-redundant bins.\label{fig:overview}](./binette_overview.pdf) -Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1 as in the metaWRAP pipeline. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. 
This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that uses Cython to provide bindings to Prodigal [@hyatt2010prodigal]. The intermediate Checkm2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process. +Bin completeness and contamination are assessed using CheckM2 [@chklovski2023checkm2]. Bins are scored using the following scoring function: $completeness - weight * contamination$, with the default weight set to 2. These scored bins are then sorted, facilitating the selection of a final new set of non-redundant bins (\autoref{fig:overview}.B). The ability to score bins is based on CheckM2 rather than CheckM1, which is what the metaWRAP pipeline uses. CheckM2 uses a novel approach to evaluate bin quality based on machine learning techniques. This approach improves speed and also provides better results than CheckM1. Binette initiates CheckM2 processing by running its initial steps once for all contigs within the input bins. These initial steps involve gene prediction using Prodigal and alignment against the CheckM2 database using Diamond [@buchfink2015diamond]. Binette uses Pyrodigal [@larralde2022pyrodigal], a Python module that uses Cython to provide bindings to Prodigal [@hyatt2010prodigal]. The intermediate Checkm2 results are then used to assess the quality of individual bins, eliminating redundant calculations and speeding up the refinement process. 
Binette serves as the bin refinement tool within the [metagWGS](https://forgemia.inra.fr/genotoul-bioinfo/metagwgs) metagenomic analysis pipeline [@metagWGS_inprep], providing a robust and faster alternative to the bin refinement module of the metaWRAP pipeline as well as other similar bin refinement tools. From 56bcd7f25b89ee3febe443e175af41bb752be618 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 28 Aug 2024 17:49:55 +0200 Subject: [PATCH 05/36] fix type pylance warning --- binette/bin_manager.py | 63 ++++++++++++++++++++++++++++++++------- binette/bin_quality.py | 25 ++++++++-------- binette/cds.py | 10 +++---- binette/contig_manager.py | 6 ++-- binette/diamond.py | 6 ++-- binette/io_manager.py | 10 +++---- binette/main.py | 54 ++++++++++++++++++++++++++------- 7 files changed, 125 insertions(+), 49 deletions(-) diff --git a/binette/bin_manager.py b/binette/bin_manager.py index 06d25c7..8f4fd6c 100644 --- a/binette/bin_manager.py +++ b/binette/bin_manager.py @@ -7,12 +7,12 @@ import itertools import networkx as nx -from typing import List, Dict, Iterable, Tuple, Set +from typing import List, Dict, Iterable, Tuple, Set, Mapping class Bin: counter = 0 - def __init__(self, contigs: Iterable[str], origin: str, name: str) -> None: + def __init__(self, contigs: Iterable[str], origin: str, name: str, is_original:bool=False) -> None: """ Initialize a Bin object. @@ -35,6 +35,8 @@ def __init__(self, contigs: Iterable[str], origin: str, name: str) -> None: self.contamination = None self.score = None + self.is_original = is_original + def __eq__(self, other: 'Bin') -> bool: """ Compare the Bin object with another object for equality. @@ -163,6 +165,47 @@ def union(self, *others: 'Bin') -> 'Bin': return Bin(contigs, origin, name) + def is_complete_enough(self, min_completeness: float) -> bool: + """ + Determine if a bin is complete enough based on completeness threshold. + + :param min_completeness: The minimum completeness required for a bin. 
+ + :raises ValueError: If completeness has not been set (is None). + + :return: True if the bin meets the min_completeness threshold; False otherwise. + """ + + if self.completeness is None: + raise ValueError( + f"The bin '{self.name}' with ID '{self.id}' has not been evaluated for completeness or contamination, " + "and therefore cannot be assessed." + ) + + return self.completeness >= min_completeness + + + def is_high_quality(self, min_completeness: float, max_contamination: float) -> bool: + """ + Determine if a bin is considered high quality based on completeness and contamination thresholds. + + :param min_completeness: The minimum completeness required for a bin to be considered high quality. + :param max_contamination: The maximum allowed contamination for a bin to be considered high quality. + + :raises ValueError: If either completeness or contamination has not been set (is None). + + :return: True if the bin meets the high quality criteria; False otherwise. + """ + if self.completeness is None or self.contamination is None: + raise ValueError( + f"The bin '{self.name}' with ID '{self.id}' has not been evaluated for completeness or contamination, " + "and therefore cannot be assessed for high quality." + ) + + return self.completeness >= min_completeness and self.contamination <= max_contamination + + + def get_bins_from_directory(bin_dir: str, set_name: str, fasta_extensions: Set[str]) -> List[Bin]: """ Retrieves a list of Bin objects from a directory containing bin FASTA files. 
@@ -239,7 +282,7 @@ def get_bins_from_contig2bin_table(contig2bin_table: str, set_name: str) -> List if line.startswith("#") or line.startswith("@"): logging.debug(f"Ignoring a line from {contig2bin_table}: {line}") continue - contig_name = line.strip().split("\t")[0] + contig_name = line.strip().split()[0] bin_name = line.strip().split("\t")[1] bin_name2contigs[bin_name].add(contig_name) @@ -250,7 +293,7 @@ def get_bins_from_contig2bin_table(contig2bin_table: str, set_name: str) -> List return bins -def from_bin_sets_to_bin_graph(bin_name_to_bin_set: Dict[str, set]) -> nx.Graph: +def from_bin_sets_to_bin_graph(bin_name_to_bin_set: Mapping[str, Iterable[Bin]]) -> nx.Graph: """ Creates a bin graph from a dictionary of bin sets. @@ -272,7 +315,7 @@ def from_bin_sets_to_bin_graph(bin_name_to_bin_set: Dict[str, set]) -> nx.Graph: -def get_all_possible_combinations(clique: Iterable) -> Iterable[Tuple]: +def get_all_possible_combinations(clique: List) -> Iterable[Tuple]: """ Generates all possible combinations of elements from a given clique. @@ -366,7 +409,7 @@ def get_union_bins(G: nx.Graph, max_conta: int = 50) -> Set[Bin]: return union_bins -def select_best_bins(bins: List[Bin]) -> List[Bin]: +def select_best_bins(bins: Set[Bin]) -> List[Bin]: """ Selects the best bins from a list of bins based on their scores, N50 values, and IDs. @@ -392,7 +435,7 @@ def select_best_bins(bins: List[Bin]) -> List[Bin]: return selected_bins -def dereplicate_bin_sets(bin_sets): +def dereplicate_bin_sets(bin_sets) -> Set[Bin]: """ Dereplicates bins from different bin sets to obtain a non-redundant bin set. @@ -403,7 +446,7 @@ def dereplicate_bin_sets(bin_sets): return set().union(*bin_sets) -def get_contigs_in_bins(bins: List[Bin]) -> Set[str]: +def get_contigs_in_bins(bins: Iterable[Bin]) -> Set[str]: """ Retrieves all contigs present in the given list of bins. 
@@ -414,7 +457,7 @@ def get_contigs_in_bins(bins: List[Bin]) -> Set[str]: return set().union(*(b.contigs for b in bins)) -def rename_bin_contigs(bins: List[Bin], contig_to_index: dict): +def rename_bin_contigs(bins: Iterable[Bin], contig_to_index: dict): """ Renames the contigs in the bins based on the provided mapping. @@ -425,7 +468,7 @@ def rename_bin_contigs(bins: List[Bin], contig_to_index: dict): b.contigs = {contig_to_index[contig] for contig in b.contigs} b.hash = hash(str(sorted(b.contigs))) -def create_intermediate_bins(bin_set_name_to_bins: Dict[str, Set[Bin]]) -> Set[Bin]: +def create_intermediate_bins(bin_set_name_to_bins: Mapping[str, Iterable[Bin]]) -> Set[Bin]: """ Creates intermediate bins from a dictionary of bin sets. diff --git a/binette/bin_quality.py b/binette/bin_quality.py index b9303cf..4722188 100644 --- a/binette/bin_quality.py +++ b/binette/bin_quality.py @@ -1,10 +1,9 @@ #!/usr/bin/env python3 -import concurrent.futures as cf import logging import os from collections import Counter from itertools import islice -from typing import Dict, Iterable, List, Tuple, Iterator +from typing import Dict, Iterable, Optional, Tuple, Iterator, Set import numpy as np import pandas as pd @@ -17,7 +16,7 @@ from checkm2 import keggData, modelPostprocessing, modelProcessing from binette.bin_manager import Bin -def get_bins_metadata_df(bins: List, contig_to_cds_count: Dict[str, int], contig_to_aa_counter: Dict[str, Counter], contig_to_aa_length: Dict[str, int]) -> pd.DataFrame: +def get_bins_metadata_df(bins: Iterable[Bin], contig_to_cds_count: Dict[str, int], contig_to_aa_counter: Dict[str, Counter], contig_to_aa_length: Dict[str, int]) -> pd.DataFrame: """ Generate a DataFrame containing metadata for a list of bins. 
@@ -56,7 +55,7 @@ def get_bins_metadata_df(bins: List, contig_to_cds_count: Dict[str, int], contig metadata_df = metadata_df.set_index("Name", drop=False) return metadata_df -def get_diamond_feature_per_bin_df(bins: List, contig_to_kegg_counter: Dict[str, Counter]) -> Tuple[pd.DataFrame, int]: +def get_diamond_feature_per_bin_df(bins: Iterable[Bin], contig_to_kegg_counter: Dict[str, Counter]) -> Tuple[pd.DataFrame, int]: """ Generate a DataFrame containing Diamond feature counts per bin and completeness information for pathways, categories, and modules. @@ -135,11 +134,11 @@ def add_bin_size_and_N50(bins: Iterable[Bin], contig_to_size: Dict[str,int]): bin_obj.add_N50(n50) -def add_bin_metrics(bins: List, contig_info: Dict, contamination_weight: float, threads: int = 1): +def add_bin_metrics(bins: Set[Bin], contig_info: Dict, contamination_weight: float, threads: int = 1): """ - Add metrics to a list of bins. + Add metrics to a Set of bins. - :param bins: List of bin objects. + :param bins: Set of bin objects. :param contig_info: Dictionary containing contig information. :param contamination_weight: Weight for contamination assessment. :param threads: Number of threads for parallel processing (default is 1). 
@@ -183,13 +182,13 @@ def chunks(iterable: Iterable, size: int) -> Iterator[Tuple]: return iter(lambda: tuple(islice(it, size)), ()) -def assess_bins_quality_by_chunk(bins: List, +def assess_bins_quality_by_chunk(bins: Iterable[Bin], contig_to_kegg_counter: Dict, contig_to_cds_count: Dict, contig_to_aa_counter: Dict, contig_to_aa_length: Dict, contamination_weight: float, - postProcessor:modelPostprocessing.modelProcessor = None, + postProcessor:Optional[modelPostprocessing.modelProcessor] = None, threads: int = 1, chunk_size: int = 2500): """ @@ -223,13 +222,13 @@ def assess_bins_quality_by_chunk(bins: List, ) def assess_bins_quality( - bins: List, + bins: Iterable[Bin], contig_to_kegg_counter: Dict, contig_to_cds_count: Dict, contig_to_aa_counter: Dict, contig_to_aa_length: Dict, contamination_weight: float, - postProcessor: modelPostprocessing.modelProcessor = None, + postProcessor: Optional[modelPostprocessing.modelProcessor] = None, threads: int = 1,): """ Assess the quality of bins. 
@@ -284,7 +283,7 @@ def assess_bins_quality( final_results["Contamination"] = np.round(final_cont, 2) for bin_obj in bins: - completeness = final_results.loc[bin_obj.id, "Completeness"] - contamination = final_results.loc[bin_obj.id, "Contamination"] + completeness = final_results.at[bin_obj.id, "Completeness"] + contamination = final_results.at[bin_obj.id, "Contamination"] bin_obj.add_quality(completeness, contamination, contamination_weight) diff --git a/binette/cds.py b/binette/cds.py index cebb76e..c665b70 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -3,7 +3,7 @@ import multiprocessing.pool import logging from collections import Counter, defaultdict -from typing import Dict, List, Iterator, Tuple +from typing import Dict, List, Iterator, Tuple, Any import pyfastx import pyrodigal @@ -52,13 +52,13 @@ def predict(contigs_iterator: Iterator, outfaa: str, threads: int =1) -> Dict[st return contig_to_genes -def predict_genes(find_genes, seq): +def predict_genes(find_genes, seq) -> Tuple[str, pyrodigal.Genes]: return (seq.name, find_genes(seq.seq) ) -def write_faa(outfaa: str, contig_to_genes: Dict[str, List[str]]) -> None: +def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -> None: """ Write predicted protein sequences to a FASTA file. @@ -71,7 +71,7 @@ def write_faa(outfaa: str, contig_to_genes: Dict[str, List[str]]) -> None: for contig_id, genes in contig_to_genes: genes.write_translations(fl, contig_id) -def parse_faa_file(faa_file: str) -> Dict[str, List]: +def parse_faa_file(faa_file: str) -> Dict[str, List[str]]: """ Parse a FASTA file containing protein sequences and organize them by contig. 
@@ -115,7 +115,7 @@ def get_contig_cds_metadata_flat(contig_to_genes: Dict[str, List[str]]) -> Tuple return contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length -def get_contig_cds_metadata(contig_to_genes: Dict[str, List[str]], threads: int) -> Tuple[Dict[str, int], Dict[str, Counter], Dict[str, int]]: +def get_contig_cds_metadata(contig_to_genes: Dict[int, Any | List[Any]], threads: int) -> Dict[str, Dict]: """ Calculate metadata for contigs in parallel, including CDS count, amino acid composition, and total amino acid length. diff --git a/binette/contig_manager.py b/binette/contig_manager.py index c6f4f65..4b43733 100644 --- a/binette/contig_manager.py +++ b/binette/contig_manager.py @@ -1,5 +1,5 @@ import pyfastx -from typing import Dict, Tuple +from typing import Dict, Iterable, Tuple, Set, Any, Union def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: @@ -14,7 +14,7 @@ def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: return fa -def make_contig_index(contigs: list) -> Tuple[Dict[str, int], Dict[int, str]]: +def make_contig_index(contigs: Set[str]) -> Tuple[Dict[str, int], Dict[int, str]]: """ Create an index mapping for contigs. @@ -27,7 +27,7 @@ def make_contig_index(contigs: list) -> Tuple[Dict[str, int], Dict[int, str]]: return contig_to_index, index_to_contig -def apply_contig_index(contig_to_index: Dict[str, int], contig_to_info: Dict[str, str]) -> Dict[int, str]: +def apply_contig_index(contig_to_index: Dict[str, int], contig_to_info: Dict[str, Any]) -> Dict[int, Union[Any,Iterable[Any]]]: """ Apply the contig index mapping to the contig info dictionary. 
diff --git a/binette/diamond.py b/binette/diamond.py index b5ee166..500f090 100644 --- a/binette/diamond.py +++ b/binette/diamond.py @@ -27,11 +27,11 @@ def get_checkm2_db() -> str: reg_result = re.search("INFO: (/.*.dmnd)", checkm2_database_raw.stderr) - try: - db_path = reg_result.group(1) - except AttributeError: + if reg_result is None: logging.error(f"Something went wrong when retrieving checkm2 db path:\n{checkm2_database_raw.stderr}") sys.exit(1) + else: + db_path = reg_result.group(1) return db_path diff --git a/binette/io_manager.py b/binette/io_manager.py index 467bcc3..bb6c38d 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -1,7 +1,7 @@ import logging import os import pyfastx -from typing import List, Dict +from typing import Iterable, List, Dict import csv from binette.bin_manager import Bin @@ -14,7 +14,7 @@ def infer_bin_name_from_bin_inputs(input_bins: List[str]) -> Dict[str, str]: :param input_bins: List of input bin directories. :return: Dictionary mapping inferred bin names to their corresponding directories. """ - logging.debug(f"Inferring bin names from input bins:") + logging.debug("Inferring bin names from input bins:") commonprefix_len = len(os.path.commonprefix(input_bins)) reversed_strings = [s[::-1] for s in input_bins] @@ -30,7 +30,7 @@ def infer_bin_name_from_bin_inputs(input_bins: List[str]) -> Dict[str, str]: return bin_name_to_bin_dir -def write_bin_info(bins: List[Bin], output: str, add_contigs: bool = False): +def write_bin_info(bins: Iterable[Bin], output: str, add_contigs: bool = False): """ Write bin information to a TSV file. 
@@ -86,8 +86,8 @@ def write_bins_fasta(selected_bins: List[Bin], contigs_fasta: str, outdir: str): outfl.write("\n".join(sequences) + "\n") -def check_contig_consistency(contigs_from_assembly: List[str], - contigs_from_elsewhere: List[str], +def check_contig_consistency(contigs_from_assembly: Iterable[str], + contigs_from_elsewhere: Iterable[str], assembly_file: str, elsewhere_file: str ): """ diff --git a/binette/main.py b/binette/main.py index d4eb9f8..9051fdb 100755 --- a/binette/main.py +++ b/binette/main.py @@ -16,7 +16,7 @@ import binette from binette import contig_manager, cds, diamond, bin_quality, bin_manager, io_manager as io -from typing import List, Dict, Set, Tuple +from typing import List, Dict, Optional, Set, Tuple, Union, Sequence, Any def init_logging(verbose, debug): @@ -39,12 +39,19 @@ def init_logging(verbose, debug): f'command line: {" ".join(sys.argv)}', ) + class UniqueStore(Action): """ Custom argparse action to ensure an argument is provided only once. """ - def __call__(self, parser: ArgumentParser, namespace: Namespace, values: str, option_string: str = None) -> None: + def __call__( + self, + parser: ArgumentParser, + namespace: Namespace, + values: Union[str, Sequence[Any], None], + option_string: Optional[str] = None + ) -> None: """ Ensures the argument is only used once. Raises an error if the argument appears multiple times. 
@@ -61,6 +68,7 @@ def __call__(self, parser: ArgumentParser, namespace: Namespace, values: str, op setattr(namespace, self.dest, values) + def parse_arguments(args): """Parse script arguments.""" @@ -149,7 +157,10 @@ def parse_arguments(args): args = parser.parse_args(args) return args -def parse_input_files(bin_dirs: List[str], contig2bin_tables: List[str], contigs_fasta: str, fasta_extensions:Set[str] = {".fasta", ".fna", ".fa"}) -> Tuple[Dict[str, List], List, Dict[str, List], Dict[str, int]]: +def parse_input_files(bin_dirs: List[str], + contig2bin_tables: List[str], + contigs_fasta: str, + fasta_extensions:Set[str] = {".fasta", ".fna", ".fa"}) -> Tuple[Dict[str, List[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]]: """ Parses input files to retrieve information related to bins and contigs. @@ -195,9 +206,9 @@ def parse_input_files(bin_dirs: List[str], contig2bin_tables: List[str], contigs return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length -def manage_protein_alignement(faa_file: str, contigs_fasta: str, contig_to_length: Dict[str, List], - contigs_in_bins: Dict[str, List], diamond_result_file: str, - checkm2_db: str, threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, int]]: +def manage_protein_alignement(faa_file: str, contigs_fasta: str, contig_to_length: Dict[str, int], + contigs_in_bins: Set[str], diamond_result_file: str, + checkm2_db: str, threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: """ Predicts or reuses proteins prediction and runs diamond on them. 
@@ -285,7 +296,7 @@ def select_bins_and_write_them(all_bins: Set[bin_manager.Bin], contigs_fasta: st logging.info(f"Bin Selection: {len(selected_bins)} selected bins") logging.info(f"Filtering bins: only bins with completeness >= {min_completeness} are kept") - selected_bins = [b for b in selected_bins if b.completeness >= min_completeness] + selected_bins = [b for b in selected_bins if b.is_complete_enough(min_completeness)] logging.info(f"Filtering bins: {len(selected_bins)} selected bins") @@ -317,11 +328,11 @@ def log_selected_bin_info(selected_bins: List[bin_manager.Bin], hq_min_completen # Log completeness and contamination in debug log logging.debug("High quality bins:") for sb in selected_bins: - if sb.completeness >= hq_min_completeness and sb.contamination <= hq_max_conta: + if sb.is_high_quality(min_completeness=hq_min_completeness, max_contamination=hq_max_conta): logging.debug(f"> {sb} completeness={sb.completeness}, contamination={sb.contamination}") # Count high-quality bins and single-contig high-quality bins - hq_bins = len([sb for sb in selected_bins if sb.completeness >= hq_min_completeness and sb.contamination <= hq_max_conta]) + hq_bins = len([sb for sb in selected_bins if sb.is_high_quality(min_completeness=hq_min_completeness, max_contamination=hq_max_conta)]) # Log information about high-quality bins thresholds = f"(completeness >= {hq_min_completeness} and contamination <= {hq_max_conta})" @@ -348,7 +359,7 @@ def main(): # Output files # final_bin_report = os.path.join(args.outdir, "final_bins_quality_reports.tsv") - + original_bin_report = os.path.join(args.outdir, "original_bins_quality_reports.tsv") if args.resume: io.check_resume_file(faa_file, diamond_result_file) @@ -381,6 +392,29 @@ def main(): logging.info("Add size and assess quality of input bins") bin_quality.add_bin_metrics(original_bins, contig_metadat, args.contamination_weight, args.threads) + + # for bin_set, bins in bin_set_name_to_bins.items(): + # print(bin_set) + # 
bin_set_name = bin_set.replace("/", "_") + # original_bin_report = os.path.join(args.outdir, f"{bin_set_name}_bins_quality_reports.tsv") + # bins_with_metric = [] + # for bin_obj in bins: + # print(bin_obj.id, bin_obj.score, bin_obj.N50) + # if bin_obj.score is None: + + # matching_bins = [bin_with_metric for bin_with_metric in original_bins if bin_obj == bin_with_metric] + # assert len(matching_bins) == 1, len(matching_bins) + # bins_with_metric.append(matching_bins[0]) + # print("HAS NOT USE MATCHING BIN IN ORIGINAL SET",matching_bins[0].id, matching_bins[0].score, matching_bins[0].N50) + + # else: + # print("has score") + # print(bin_obj.id, bin_obj.score, bin_obj.N50) + # bins_with_metric.append(bin_obj) + + + # io.write_bin_info(bins_with_metric, original_bin_report) + logging.info("Create intermediate bins:") new_bins = bin_manager.create_intermediate_bins(bin_set_name_to_bins) From 79bbbe7e4e9808ecfdf226beb93d7c5f26c3311b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 28 Aug 2024 18:25:53 +0200 Subject: [PATCH 06/36] silence import warning --- binette/bin_quality.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/binette/bin_quality.py b/binette/bin_quality.py index 4722188..5a46ab2 100644 --- a/binette/bin_quality.py +++ b/binette/bin_quality.py @@ -7,14 +7,16 @@ import numpy as np import pandas as pd +from binette.bin_manager import Bin + # Suppress unnecessary TensorFlow warnings os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" logging.getLogger("tensorflow").setLevel(logging.FATAL) -from checkm2 import keggData, modelPostprocessing, modelProcessing -from binette.bin_manager import Bin +from checkm2 import keggData, modelPostprocessing, modelProcessing # noqa: E402 + def get_bins_metadata_df(bins: Iterable[Bin], contig_to_cds_count: Dict[str, int], contig_to_aa_counter: Dict[str, Counter], contig_to_aa_length: Dict[str, int]) -> pd.DataFrame: """ From 31263bdf5ab945841cbe50a57ababd813c4ccc5e Mon Sep 17 00:00:00 2001 From: 
JeanMainguy Date: Wed, 28 Aug 2024 18:29:09 +0200 Subject: [PATCH 07/36] use union instead of pipe for compatibility reason --- binette/cds.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/binette/cds.py b/binette/cds.py index c665b70..8845abe 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -3,7 +3,7 @@ import multiprocessing.pool import logging from collections import Counter, defaultdict -from typing import Dict, List, Iterator, Tuple, Any +from typing import Dict, List, Iterator, Tuple, Any, Union import pyfastx import pyrodigal @@ -33,9 +33,9 @@ def predict(contigs_iterator: Iterator, outfaa: str, threads: int =1) -> Dict[st """ try: # for version >=3 of pyrodigal - orf_finder = pyrodigal.GeneFinder(meta="meta") + orf_finder = pyrodigal.GeneFinder(meta="meta") # type: ignore except AttributeError: - orf_finder = pyrodigal.OrfFinder(meta="meta") + orf_finder = pyrodigal.OrfFinder(meta="meta") # type: ignore logging.info(f"Predicting cds sequences with Pyrodigal using {threads} threads.") @@ -115,7 +115,7 @@ def get_contig_cds_metadata_flat(contig_to_genes: Dict[str, List[str]]) -> Tuple return contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length -def get_contig_cds_metadata(contig_to_genes: Dict[int, Any | List[Any]], threads: int) -> Dict[str, Dict]: +def get_contig_cds_metadata(contig_to_genes: Dict[int, Union[Any, List[Any]]], threads: int) -> Dict[str, Dict]: """ Calculate metadata for contigs in parallel, including CDS count, amino acid composition, and total amino acid length. 
From 55f7f5592c5a76ec8c3062a0b2a691f9eb4c763f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 28 Aug 2024 20:58:53 +0200 Subject: [PATCH 08/36] add some warning if contig are duplicated in an input set and use id in bin name --- binette/bin_manager.py | 175 ++++++++++++++++++++++++++++++-------- binette/main.py | 8 +- tests/bin_manager_test.py | 97 +++++++++++++++++---- 3 files changed, 225 insertions(+), 55 deletions(-) diff --git a/binette/bin_manager.py b/binette/bin_manager.py index 8f4fd6c..457423b 100644 --- a/binette/bin_manager.py +++ b/binette/bin_manager.py @@ -1,5 +1,4 @@ import logging -import os from collections import defaultdict from pathlib import Path @@ -22,7 +21,7 @@ def __init__(self, contigs: Iterable[str], origin: str, name: str, is_original:b """ Bin.counter += 1 - self.origin = origin + self.origin = {origin} self.name = name self.id = Bin.counter self.contigs = set(contigs) @@ -37,7 +36,7 @@ def __init__(self, contigs: Iterable[str], origin: str, name: str, is_original:b self.is_original = is_original - def __eq__(self, other: 'Bin') -> bool: + def __eq__(self, other) -> bool: """ Compare the Bin object with another object for equality. @@ -60,7 +59,7 @@ def __str__(self) -> str: :return: The string representation of the Bin object. """ - return f"{self.origin}_{self.id} ({len(self.contigs)} contigs)" + return f"Bin {self.id} from {';'.join(self.origin)} ({len(self.contigs)} contigs)" def overlaps_with(self, other: 'Bin') -> Set[str]: """ @@ -71,18 +70,18 @@ def overlaps_with(self, other: 'Bin') -> Set[str]: """ return self.contigs & other.contigs - def __and__(self, other: 'Bin') -> 'Bin': - """ - Perform a logical AND operation between this bin and another bin. + # def __and__(self, other: 'Bin') -> 'Bin': + # """ + # Perform a logical AND operation between this bin and another bin. - :param other: The other Bin object. - :return: A new Bin object representing the intersection of the bins. 
- """ - contigs = self.contigs & other.contigs - name = f"{self.name} & {other.name}" - origin = f"{self.origin} & {other.origin}" + # :param other: The other Bin object. + # :return: A new Bin object representing the intersection of the bins. + # """ + # contigs = self.contigs & other.contigs + # name = f"{self.name} & {other.name}" + # origin = "intersection" - return Bin(contigs, origin, name) + # return Bin(contigs, origin, name) def add_length(self, length: int) -> None: @@ -131,7 +130,7 @@ def intersection(self, *others: 'Bin') -> 'Bin': """ other_contigs = (o.contigs for o in others) contigs = self.contigs.intersection(*other_contigs) - name = f"{self.name} & {' & '.join([other.name for other in others])}" + name = f"{self.id} & {' & '.join([str(other.id) for other in others])}" origin = "intersec" return Bin(contigs, origin, name) @@ -145,7 +144,7 @@ def difference(self, *others: 'Bin') -> 'Bin': """ other_contigs = (o.contigs for o in others) contigs = self.contigs.difference(*other_contigs) - name = f"{self.name} - {' - '.join([other.name for other in others])}" + name = f"{self.id} - {' - '.join([str(other.id) for other in others])}" origin = "diff" return Bin(contigs, origin, name) @@ -159,7 +158,7 @@ def union(self, *others: 'Bin') -> 'Bin': """ other_contigs = (o.contigs for o in others) contigs = self.contigs.union(*other_contigs) - name = f"{self.name} | {' | '.join([other.name for other in others])}" + name = f"{self.id} | {' | '.join([str(other.id) for other in others])}" origin = "union" return Bin(contigs, origin, name) @@ -234,7 +233,7 @@ def get_bins_from_directory(bin_dir: str, set_name: str, fasta_extensions: Set[s -def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str], fasta_extensions:Set[str]) -> Dict[str, list]: +def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str], fasta_extensions:Set[str]) -> Dict[str, Set[Bin]]: """ Parses multiple bin directories and returns a dictionary mapping bin names to a list of Bin objects. 
@@ -243,28 +242,58 @@ def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str], fasta_extensions: :return: A dictionary mapping bin names to a list of Bin objects created from the bin directories. """ - bin_name_to_bins = {} + bin_set_name_to_bins = {} for name, bin_dir in bin_name_to_bin_dir.items(): - bin_name_to_bins[name] = get_bins_from_directory(bin_dir, name, fasta_extensions) + bins = get_bins_from_directory(bin_dir, name, fasta_extensions) + set_of_bins = set(bins) + + # Calculate the number of duplicates + num_duplicates = len(bins) - len(set_of_bins) + + if num_duplicates > 0: + logging.warning( + f'{num_duplicates} bins with identical contig compositions detected in bin set "{name}". ' + 'These bins were merged to ensure uniqueness.' + ) + + # Store the unique set of bins + bin_set_name_to_bins[name] = set_of_bins - return bin_name_to_bins + return bin_set_name_to_bins -def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, str]) -> Dict[str, list]: +def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, str]) -> Dict[str, Set['Bin']]: """ - Parses multiple contig-to-bin tables and returns a dictionary mapping bin names to a list of Bin objects. + Parses multiple contig-to-bin tables and returns a dictionary mapping bin names to a set of unique Bin objects. - :param bin_name_to_bin_tables: A dictionary mapping bin names to their respective contig-to-bin tables. + Logs a warning if duplicate bins are detected within a bin set. - :return: A dictionary mapping bin names to a list of Bin objects created from the contig-to-bin tables. + :param bin_name_to_bin_tables: A dictionary where keys are bin set names and values are file paths or identifiers + for contig-to-bin tables. Each table is parsed to extract Bin objects. + + :return: A dictionary where keys are bin set names and values are sets of Bin objects. Duplicates are removed based + on contig composition. 
""" - bin_name_to_bins = {} + bin_set_name_to_bins = {} for name, contig2bin_table in bin_name_to_bin_tables.items(): - bin_name_to_bins[name] = get_bins_from_contig2bin_table(contig2bin_table, name) + bins = get_bins_from_contig2bin_table(contig2bin_table, name) + set_of_bins = set(bins) + + # Calculate the number of duplicates + num_duplicates = len(bins) - len(set_of_bins) + + if num_duplicates > 0: + logging.warning( + f'{num_duplicates*2} bins with identical contig compositions detected in bin set "{name}". ' + 'These bins were merged to ensure uniqueness.' + ) - return bin_name_to_bins + # Store the unique set of bins + bin_set_name_to_bins[name] = set_of_bins + + return bin_set_name_to_bins def get_bins_from_contig2bin_table(contig2bin_table: str, set_name: str) -> List[Bin]: @@ -434,27 +463,101 @@ def select_best_bins(bins: Set[Bin]) -> List[Bin]: logging.info(f"Selected {len(selected_bins)} bins") return selected_bins +def group_identical_bins(bins:Iterable[Bin]) -> List[List[Bin]]: + """ + Group identical bins together + + :param bins: list of bins + + return List of list of identical bins + """ + binhash_to_bins = defaultdict(list) + + # Collect bins by their hash values + for bin_obj in bins: + binhash_to_bins[bin_obj.hash].append(bin_obj) + + return list(binhash_to_bins.values()) -def dereplicate_bin_sets(bin_sets) -> Set[Bin]: + +def dereplicate_bin_sets(bin_sets: Iterable[Set['Bin']]) -> Set['Bin']: + """ + Consolidate bins from multiple bin sets into a single set of non-redundant bins. + + Bins with the same hash are considered duplicates. For each group of duplicates, + the origins are merged, and only one representative bin is kept. + + :param bin_sets: An iterable of sets, where each set contains `Bin` objects. These sets are merged + into a single set of unique bins by consolidating bins with the same hash. + + :return: A set of `Bin` objects with duplicates removed. 
Each `Bin` in the resulting set has + merged origins from the bins it was consolidated with. """ - Dereplicates bins from different bin sets to obtain a non-redundant bin set. + all_bins = (bin_obj for bins in bin_sets for bin_obj in bins) + list_of_identical_bins = group_identical_bins(all_bins) - :param bin_sets: A list of bin sets. + dereplicated_bins = set() - :return: A set of non-redundant bins. + # Merge bins with the same hash + for identical_bins in list_of_identical_bins: + # Select the first bin as the representative + selected_bin = identical_bins[0] + for bin_obj in identical_bins[1:]: + # Merge origins of all bins with the same hash + selected_bin.origin |= bin_obj.origin + + # Add the representative bin to the result set + dereplicated_bins.add(selected_bin) + + return dereplicated_bins + +def get_contigs_in_bin_sets(bin_set_name_to_bins: Dict[str, Set[Bin]]) -> Set[str]: """ - return set().union(*bin_sets) + Processes bin sets to check for duplicated contigs and logs detailed information about each bin set. + + :param bin_set_name_to_bins: A dictionary where keys are bin set names and values are sets of Bin objects. + + :return: A set of contig names found in bin sets + """ + # To track all unique contigs across bin sets + all_contigs_in_bins = set() + + for bin_set_name, bins in bin_set_name_to_bins.items(): + list_contigs_in_bin_sets = get_contigs_in_bins(bins) + + # Count duplicates + contig_counts = {contig: list_contigs_in_bin_sets.count(contig) for contig in list_contigs_in_bin_sets} + duplicated_contigs = {contig: count for contig, count in contig_counts.items() if count > 1} + + if duplicated_contigs: + logging.warning( + f"Bin set '{bin_set_name}' contains {len(duplicated_contigs)} duplicated contigs. 
" + "Details: " + ", ".join(f"{contig} (found {count} times)" for contig, count in duplicated_contigs.items()) + ) + + # Unique contigs in current bin set + unique_contigs_in_bin_set = set(list_contigs_in_bin_sets) + + # Update global contig tracker + all_contigs_in_bins |= unique_contigs_in_bin_set + + # Log summary for the current bin set + logging.debug( + f"Bin set '{bin_set_name}': {len(bins)} bins, {len(unique_contigs_in_bin_set)} unique contigs." + ) + + return all_contigs_in_bins -def get_contigs_in_bins(bins: Iterable[Bin]) -> Set[str]: +def get_contigs_in_bins(bins: Iterable[Bin]) -> List[str]: """ Retrieves all contigs present in the given list of bins. :param bins: A list of Bin objects. - :return: A set of contigs present in the bins. + :return: A list of contigs present in the bins. """ - return set().union(*(b.contigs for b in bins)) + return [contig for b in bins for contig in b.contigs] def rename_bin_contigs(bins: Iterable[Bin], contig_to_index: dict): diff --git a/binette/main.py b/binette/main.py index 9051fdb..055a74d 100755 --- a/binette/main.py +++ b/binette/main.py @@ -160,7 +160,7 @@ def parse_arguments(args): def parse_input_files(bin_dirs: List[str], contig2bin_tables: List[str], contigs_fasta: str, - fasta_extensions:Set[str] = {".fasta", ".fna", ".fa"}) -> Tuple[Dict[str, List[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]]: + fasta_extensions:Set[str] = {".fasta", ".fna", ".fa"}) -> Tuple[Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]]: """ Parses input files to retrieve information related to bins and contigs. 
@@ -185,12 +185,12 @@ def parse_input_files(bin_dirs: List[str], bin_name_to_bin_table = io.infer_bin_name_from_bin_inputs(contig2bin_tables) bin_set_name_to_bins = bin_manager.parse_contig2bin_tables(bin_name_to_bin_table) - logging.info(f"{len(bin_set_name_to_bins)} bin sets processed:") + logging.info(f"Processing {len(bin_set_name_to_bins)} bin sets.") for bin_set_id, bins in bin_set_name_to_bins.items(): logging.info(f" {bin_set_id} - {len(bins)} bins") + contigs_in_bins = bin_manager.get_contigs_in_bin_sets(bin_set_name_to_bins) original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values()) - contigs_in_bins = bin_manager.get_contigs_in_bins(original_bins) logging.info(f"Parsing contig fasta file: {contigs_fasta}") contigs_object = contig_manager.parse_fasta_file(contigs_fasta) @@ -405,7 +405,7 @@ def main(): # matching_bins = [bin_with_metric for bin_with_metric in original_bins if bin_obj == bin_with_metric] # assert len(matching_bins) == 1, len(matching_bins) # bins_with_metric.append(matching_bins[0]) - # print("HAS NOT USE MATCHING BIN IN ORIGINAL SET",matching_bins[0].id, matching_bins[0].score, matching_bins[0].N50) + # print("HAS NOT USE MATCHING BIN IN ORIGINAL SET", matching_bins[0].id, matching_bins[0].score, matching_bins[0].N50) # else: # print("has score") diff --git a/tests/bin_manager_test.py b/tests/bin_manager_test.py index 4b57f92..f5c5829 100644 --- a/tests/bin_manager_test.py +++ b/tests/bin_manager_test.py @@ -8,6 +8,8 @@ from binette import bin_manager import networkx as nx +import logging + def test_get_all_possible_combinations(): input_list = ["2", "3", "4"] expected_list = [("2", "3"), ("2", "4"), ("3", "4"), ("2", "3", "4")] @@ -21,6 +23,7 @@ def example_bin_set1(): bin2 = bin_manager.Bin(contigs={"3", "4"}, origin="test1", name="bin2") bin3 = bin_manager.Bin(contigs={"5"}, origin="test1", name="bin2") return {bin1, bin2, bin3} + @pytest.fixture def example_bin_set2(): bin1 = bin_manager.Bin(contigs={"1", "2", 
"3"}, origin="test2", name="binA") @@ -96,13 +99,13 @@ def test_add_quality(): -def test_two_bin_intersection(): - bin1 = bin_manager.Bin(contigs={"1", "2", "e", "987"}, origin="test1", name="bin1") - bin2 = bin_manager.Bin(contigs={"1", "e", "2", "33"}, origin="test2", name="binA") +# def test_two_bin_intersection(): +# bin1 = bin_manager.Bin(contigs={"1", "2", "e", "987"}, origin="test1", name="bin1") +# bin2 = bin_manager.Bin(contigs={"1", "e", "2", "33"}, origin="test2", name="binA") - bin_intersection = bin1 & bin2 +# bin_intersection = bin1 & bin2 - assert bin_intersection == bin_manager.Bin({"1", "2", "e"}, "", "") +# assert bin_intersection == bin_manager.Bin({"1", "2", "e"}, "", "") def test_multiple_bins_intersection(): @@ -159,7 +162,7 @@ def test_bin_union2(): # Check the result expected_contigs = {'contig1', 'contig2', 'contig3', 'contig4', 'contig5'} expected_name = 'bin1 | bin2 | bin3' - expected_origin = 'union' + expected_origin = {'union'} assert union_bin.contigs == expected_contigs assert union_bin.name == expected_name @@ -308,7 +311,7 @@ def test_get_contigs_in_bins(): contigs = bin_manager.get_contigs_in_bins(bin_set) - assert contigs == {"c1", "c2", "c3", "c4", "c18"} + assert set(contigs) == {"c1", "c2", "c3", "c4", "c18"} def test_dereplicate_bin_sets(): @@ -467,12 +470,32 @@ def test_parse_contig2bin_tables(tmp_path): for name, expected in expected_bins.items(): assert name in result_bin_dict assert len(result_bin_dict[name]) == len(expected) - for result_bin, expected_bin in zip(result_bin_dict[name], expected): - assert result_bin.contigs == expected_bin.contigs - assert result_bin.name == expected_bin.name - assert result_bin.origin == expected_bin.origin + for result_bin in result_bin_dict[name]: + assert result_bin in expected + + +def test_parse_contig2bin_tables_with_duplicated_bins(tmp_path, caplog): + # Create temporary contig-to-bin tables for testing + test_tables = { + "set1": [ + "# Sample contig-to-bin table for bin1", + 
"contig1\tbin1", + "contig2\tbin1", + "contig3\tbin2", + "contig3\tbin3", + ] + } + # Create temporary files for contig-to-bin tables + for name, content in test_tables.items(): + table_path = tmp_path / f"test_{name}_contig2bin_table.txt" + table_path.write_text("\n".join(content)) + # Call the function to parse contig-to-bin tables + bin_manager.parse_contig2bin_tables({name: str(tmp_path / f"test_{name}_contig2bin_table.txt") for name in test_tables}) + expected_log_message = ('2 bins with identical contig compositions detected in bin set "set1". ' + 'These bins were merged to ensure uniqueness.') + assert expected_log_message in caplog.text @pytest.fixture @@ -518,8 +541,8 @@ def test_get_bins_from_directory(create_temp_bin_files): assert isinstance(bins[1], bin_manager.Bin) assert bins[1].contigs in [{"contig1", "contig2"}, {"contig3", "contig4"}] assert bins[0].contigs in [{"contig1", "contig2"}, {"contig3", "contig4"}] - assert bins[0].origin == set_name - assert bins[1].origin == set_name + assert bins[0].origin == {set_name} + assert bins[1].origin == {set_name} assert bins[1].name in ["bin2.fasta", "bin1.fasta"] assert bins[0].name in ["bin2.fasta", "bin1.fasta"] @@ -551,10 +574,54 @@ def test_parse_bin_directories(create_temp_bin_directories): assert len(bins) == 2 # Ensure that the correct number of bin directories is parsed # Check if the Bin objects are created with the correct contigs, set name, and bin names - assert isinstance(bins["set1"][0], bin_manager.Bin) - assert isinstance(bins["set2"][0], bin_manager.Bin) + assert isinstance(list(bins["set1"])[0], bin_manager.Bin) + assert isinstance(list(bins["set2"])[0], bin_manager.Bin) assert len(bins["set2"]) == 1 assert len(bins["set1"]) == 2 +def test_get_contigs_in_bin_sets(example_bin_set1, example_bin_set2, caplog): + """ + Test the get_contigs_in_bin_sets function for correct behavior. + + :param mock_bins: The mock_bins fixture providing test bin data. 
+ :param caplog: The pytest caplog fixture to capture logging output. + """ + + bin_set_name_to_bins = {"set1":example_bin_set1, + "set2":example_bin_set2} + + # Test the function with valid data + with caplog.at_level(logging.WARNING): + result = bin_manager.get_contigs_in_bin_sets(bin_set_name_to_bins) + + # Expected unique contigs + expected_contigs = {"1", "2", "3", "4", "5"} + + # Check if the result matches expected contigs + assert result == expected_contigs, "The returned set of contigs is incorrect." + +def test_get_contigs_in_bin_sets_with_duplicated_warning(example_bin_set1, caplog): + + bin1 = bin_manager.Bin(contigs={"contig1", "2"}, origin="test1", name="bin1") + bin2 = bin_manager.Bin(contigs={"contig1"}, origin="test1", name="binA") + + bin_set_name_to_bins = { + "set1":example_bin_set1, + "set_dup":{bin1, bin2}, + } + + # Test the function with valid data + with caplog.at_level(logging.WARNING): + result = bin_manager.get_contigs_in_bin_sets(bin_set_name_to_bins) + + # Expected unique contigs + expected_contigs = {"1", "2", "3", "4", "5", "contig1"} + + # Check if the result matches expected contigs + assert result == expected_contigs, "The returned set of contigs is incorrect." + + # Check for expected warnings about duplicate contigs + duplicate_warning = "Bin set 'set_dup' contains 1 duplicated contigs. Details: contig1 (found 2 times)" + assert duplicate_warning in caplog.text, "The warning for duplicate contigs was not logged correctly." 
From bff2899189841ba5e0324949b47d24dbcb4214f9 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 28 Aug 2024 21:04:58 +0200 Subject: [PATCH 09/36] update test with new bin name --- tests/bin_manager_test.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/bin_manager_test.py b/tests/bin_manager_test.py index f5c5829..f08d6a1 100644 --- a/tests/bin_manager_test.py +++ b/tests/bin_manager_test.py @@ -144,10 +144,11 @@ def test_bin_union(): bin1 = bin_manager.Bin(contigs={"13", "21"}, origin="test1", name="bin1") bin2 = bin_manager.Bin(contigs={"1", "e", "2", "33"}, origin="test2", name="binA") - union_bin = bin_manager.Bin(contigs={"13", "21", "1", "e", "2", "33"}, origin="", name="") + expected_union_bin = bin_manager.Bin(contigs={"13", "21", "1", "e", "2", "33"}, origin="", name="") + union_bin = bin1.union(bin2) - assert bin1.union(bin2) == union_bin - assert bin1.union(bin2).name == "bin1 | binA" + assert union_bin == expected_union_bin + assert union_bin.name == f"{bin1.id} | {bin2.id}" def test_bin_union2(): @@ -161,11 +162,9 @@ def test_bin_union2(): # Check the result expected_contigs = {'contig1', 'contig2', 'contig3', 'contig4', 'contig5'} - expected_name = 'bin1 | bin2 | bin3' expected_origin = {'union'} assert union_bin.contigs == expected_contigs - assert union_bin.name == expected_name assert union_bin.origin == expected_origin @@ -179,7 +178,7 @@ def test_bin_difference(): assert bin1.difference(bin2, bin3) == diff_bin1_23 assert bin1.difference(bin2) == diff_bin1_2 - assert bin1.difference(bin2, bin3).name == "bin1 - bin2 - bin3" + assert bin1.difference(bin2, bin3).name == f"{bin1.id} - {bin2.id} - {bin3.id}" def test_bin_intersection(): @@ -192,7 +191,7 @@ def test_bin_intersection(): assert bin1.intersection(bin2, bin3) == inter_bin123 assert bin1.intersection(bin2) == iner_bin1_2 - assert bin1.intersection(bin2, bin3).name == "bin1 & bin2 & bin3" + assert bin1.intersection(bin2, bin3).name == f"{bin1.id} & 
{bin2.id} & {bin3.id}" def test_select_best_bins_simple(): From 99fafaa517173f45ec7b5c305a6a78a5956ab1c0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 29 Aug 2024 00:45:09 +0200 Subject: [PATCH 10/36] improve fct to infer bin set name --- binette/io_manager.py | 105 +++++++++++++++++++++++++------- tests/io_manager_test.py | 128 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 203 insertions(+), 30 deletions(-) diff --git a/binette/io_manager.py b/binette/io_manager.py index bb6c38d..0a268e5 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -1,36 +1,97 @@ import logging -import os import pyfastx -from typing import Iterable, List, Dict +from typing import Iterable, List, Dict, Tuple import csv from binette.bin_manager import Bin +from pathlib import Path -def infer_bin_name_from_bin_inputs(input_bins: List[str]) -> Dict[str, str]: +def get_paths_common_prefix_suffix(paths: List[Path]) -> Tuple[List[str], List[str], List[str]]: """ - Infer bin names from a list of bin input directories. + Determine the common prefix parts, suffix parts, and common extensions of the last part of a list of pathlib.Path objects. - :param input_bins: List of input bin directories. - :return: Dictionary mapping inferred bin names to their corresponding directories. + :param paths: List of pathlib.Path objects. + :return: A tuple containing three lists: + - The common prefix parts. + - The common suffix parts. + - The common extensions of the last part of the paths. 
""" - logging.debug("Inferring bin names from input bins:") + # Extract parts for all paths + parts = [list(path.parts) for path in paths] + + # Find the common prefix + if not parts: + return [], [], [] + + # Initialize common prefix and suffix lists + common_prefix = list(parts[0]) + common_suffix = list(parts[0]) + # Determine common prefix + for part_tuple in parts[1:]: + common_prefix_length = min(len(common_prefix), len(part_tuple)) + common_prefix = [common_prefix[i] for i in range(common_prefix_length) if common_prefix[:i+1] == part_tuple[:i+1]] + if not common_prefix: + break + + # Determine common suffix + for part_tuple in parts[1:]: + common_suffix_length = min(len(common_suffix), len(part_tuple)) + common_suffix = [common_suffix[-i] for i in range(1, common_suffix_length + 1) if common_suffix[-i:] == part_tuple[-i:]] + if not common_suffix: + break + if len(parts) > 1: + common_suffix.reverse() + + # Determine common extensions of the last part of the paths + if len(paths) == 1: + common_extensions = paths[0].suffixes + else: + common_extensions = list(paths[0].suffixes) + for path in paths[1:]: + common_extension_length = min(len(common_extensions), len(path.suffixes)) + common_extensions = [common_extensions[i] for i in range(common_extension_length) if common_extensions[i] == path.suffixes[i]] + if not common_extensions: + break + + return common_prefix, common_suffix, common_extensions + +def infer_bin_set_names_from_input_paths(input_bins: List[Path]) -> Dict[str, Path]: + """ + Infer bin set names from a list of bin input directories or files. + + :param input_bins: List of input bin directories or files. + :return: Dictionary mapping inferred bin names to their corresponding directories or files. 
+ """ + bin_name_to_bin_dir = {} + + common_prefix, common_suffix, common_extensions = get_paths_common_prefix_suffix(input_bins) + print(common_prefix, common_suffix, common_extensions ) + for path in input_bins: + + specific_parts = path.parts[len(common_prefix):len(path.parts)-len(common_suffix)] + + if not common_suffix and common_extensions: + last_specific_part = specific_parts[-1].split('.')[:-len(common_extensions)] + specific_parts = list(specific_parts[:-1]) + last_specific_part + - commonprefix_len = len(os.path.commonprefix(input_bins)) - reversed_strings = [s[::-1] for s in input_bins] - commonsufix_len = len(os.path.commonprefix(reversed_strings)) + bin_set_name = '/'.join(specific_parts) + if bin_set_name == "": + bin_set_name = path.as_posix() - bin_name_to_bin_dir = {d[commonprefix_len: len(d) - commonsufix_len]: d for d in input_bins} + bin_name_to_bin_dir[bin_set_name] = path - logging.debug(f"Input bins: {' '.join(input_bins)}") - logging.debug(f"Common prefix to remove: {os.path.commonprefix(reversed_strings)[::-1]}") - logging.debug(f"Common suffix to remove: {os.path.commonprefix(input_bins)}") + logging.debug(f"Input bins: {' '.join([path.as_posix() for path in input_bins])}") + logging.debug(f"Common prefix to remove: {common_prefix}") + logging.debug(f"Common suffix to remove: {common_suffix}") + logging.debug(f"Common extension to remove: {common_suffix}") logging.debug(f"bin_name_to_bin_dir: {bin_name_to_bin_dir}") return bin_name_to_bin_dir -def write_bin_info(bins: Iterable[Bin], output: str, add_contigs: bool = False): +def write_bin_info(bins: Iterable[Bin], output: Path, add_contigs: bool = False): """ Write bin information to a TSV file. 
@@ -67,7 +128,7 @@ def write_bin_info(bins: Iterable[Bin], output: str, add_contigs: bool = False): writer.writerows(bin_infos) -def write_bins_fasta(selected_bins: List[Bin], contigs_fasta: str, outdir: str): +def write_bins_fasta(selected_bins: List[Bin], contigs_fasta: Path, outdir: Path): """ Write selected bins' contigs to separate FASTA files. @@ -76,10 +137,10 @@ def write_bins_fasta(selected_bins: List[Bin], contigs_fasta: str, outdir: str): :param outdir: Output directory to save the individual bin FASTA files. """ - fa = pyfastx.Fasta(contigs_fasta, build_index=True) + fa = pyfastx.Fasta(contigs_fasta.as_posix(), build_index=True) for sbin in selected_bins: - outfile = os.path.join(outdir, f"bin_{sbin.id}.fa") + outfile = outdir / f"bin_{sbin.id}.fa" with open(outfile, "w") as outfl: sequences = (f">{c}\n{fa[c]}" for c in sbin.contigs) @@ -111,7 +172,7 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str], assert are_contigs_consistent, message -def check_resume_file(faa_file: str, diamond_result_file: str) -> None: +def check_resume_file(faa_file: Path, diamond_result_file: Path) -> None: """ Check the existence of files required for resuming the process. @@ -120,15 +181,15 @@ def check_resume_file(faa_file: str, diamond_result_file: str) -> None: :raises FileNotFoundError: If the required files don't exist for resuming. """ - if os.path.isfile(faa_file) and os.path.isfile(diamond_result_file): + if faa_file.exists() and diamond_result_file.exists(): return - if not os.path.isfile(faa_file): + if not faa_file.exists(): error_msg = f"Protein file '{faa_file}' does not exist. Resuming is not possible." logging.error(error_msg) raise FileNotFoundError(error_msg) - if not os.path.isfile(diamond_result_file): + if not diamond_result_file.exists(): error_msg = f"Diamond result file '{diamond_result_file}' does not exist. Resuming is not possible." 
logging.error(error_msg) raise FileNotFoundError(error_msg) diff --git a/tests/io_manager_test.py b/tests/io_manager_test.py index 01909c2..aa1856c 100644 --- a/tests/io_manager_test.py +++ b/tests/io_manager_test.py @@ -36,18 +36,130 @@ def test_infer_bin_name_from_bin_inputs(): ] # Call the function - result = io_manager.infer_bin_name_from_bin_inputs(input_bins) + result = io_manager.infer_bin_set_names_from_input_paths(list(map(Path, input_bins))) # Define the expected output expected_result = { - '1': '/path/to/bin1', - '2': '/path/to/bin2', - '3': '/path/to/bin3' + 'bin1': Path('/path/to/bin1'), + 'bin2': Path('/path/to/bin2'), + 'bin3': Path('/path/to/bin3') } # Check if the output matches the expected dictionary assert result == expected_result +def test_infer_bin_name_from_single_path(): + # Mock input data + input_bins = [ + '/path/to/bin1', + ] + + # Call the function + result = io_manager.infer_bin_set_names_from_input_paths(list(map(Path, input_bins))) + + # Define the expected output + expected_result = { + '/path/to/bin1': Path('/path/to/bin1'), + } + + # Check if the output matches the expected dictionary + assert result == expected_result + + +def test_infer_bin_name_from_bin_table_inputs(): + # Mock input data + input_bins = [ + '/path/to/bin1.tsv', + '/path/to/bin2.tsv', + '/path/to/bin3.tsv' + ] + + # Call the function + result = io_manager.infer_bin_set_names_from_input_paths(list(map(Path, input_bins))) + + # Define the expected output + expected_result = { + 'bin1': Path('/path/to/bin1.tsv'), + 'bin2': Path('/path/to/bin2.tsv'), + 'bin3': Path('/path/to/bin3.tsv') + } + + # Check if the output matches the expected dictionary + assert result == expected_result + + +def test_infer_bin_name_from_bin_table_with_different_ext(): + # Mock input data + input_bins = [ + '/path/to/bin1.tsv', + '/path/to/bin2.tsv', + '/path/to/bin3.txt' + ] + + # Call the function + result = io_manager.infer_bin_set_names_from_input_paths(list(map(Path, input_bins))) + 
+ # Define the expected output + expected_result = { + 'bin1.tsv': Path('/path/to/bin1.tsv'), + 'bin2.tsv': Path('/path/to/bin2.tsv'), + 'bin3.txt': Path('/path/to/bin3.txt') + } + + # Check if the output matches the expected dictionary + assert result == expected_result + +def test_infer_bin_name_from_bin_table_with_different_dir(): + # Mock input data + input_bins = [ + '/path/to/bins', + '/path2/result_bins', + '/path2/result/bins', + ] + + # Call the function + result = io_manager.infer_bin_set_names_from_input_paths(list(map(Path, input_bins))) + + # Define the expected output + expected_result = { + 'path/to/bins' : Path('/path/to/bins'), + 'path2/result_bins': Path('/path2/result_bins'), + 'path2/result/bins': Path('/path2/result/bins'), + } + + # Check if the output matches the expected dictionary + assert result == expected_result + +def test_get_paths_common_prefix_suffix(): + # Test case 1: No paths provided + assert io_manager.get_paths_common_prefix_suffix([]) == ([], [], []) + + # # Test case 2: Single path + assert io_manager.get_paths_common_prefix_suffix([Path('/home/user/project')]) == (['/', 'home', 'user', 'project'], ['/', 'home', 'user', 'project'], []) + + # Test case 3: Multiple paths with common prefix and suffix + paths = [Path('/home/user/project/src'), Path('/home/user/project/docs'), Path('/home/user/project/tests')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/', 'home', 'user', 'project'], [], []) + + # Test case 4: Multiple paths with no common prefix or suffix + paths = [Path('/var/log/syslog'), Path('/usr/local/bin/python'), Path('/etc/nginx/nginx.conf')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/'], [], []) + + # Test case 5: Multiple paths with common suffix + paths = [Path('/home/user/docs/report.txt'), Path('/home/admin/docs/report.txt')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/', 'home'], ['docs', 'report.txt'], ['.txt']) + + # Test case 6: Paths with a deeper 
common prefix and suffix + paths = [Path('/data/project_a/results/output.txt'), Path('/data/project_b/results/output.txt')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/', 'data'], ['results', 'output.txt'], ['.txt']) + + # Test case 7: Paths with only the root as common prefix and different suffix + paths = [Path('/project_a/output.txt'), Path('/project_b/output.txt')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/'], ['output.txt'], ['.txt']) + + # Test case 8: Paths with only the root as common prefix and different suffix + paths = [Path('/project_a/output.txt'), Path('/project_a/output.tsv')] + assert io_manager.get_paths_common_prefix_suffix(paths) == (['/', 'project_a'], [], []) def test_write_bin_info(tmp_path, bin1, bin2): # Mock input data @@ -103,7 +215,7 @@ def test_write_bins_fasta(tmp_path, bin1, bin2): outdir.mkdir() # Call the function - io_manager.write_bins_fasta(selected_bins, str(contigs_fasta), str(outdir)) + io_manager.write_bins_fasta(selected_bins, contigs_fasta, outdir) # Check if the files were created and their content matches the expected output assert (outdir / "bin_1.fa").exists() @@ -152,7 +264,7 @@ def temp_files(tmp_path): def test_check_resume_file_exists(temp_files, caplog): # Test when both files exist faa_file, diamond_result_file = temp_files - io_manager.check_resume_file(faa_file, diamond_result_file) + io_manager.check_resume_file(Path(faa_file), Path(diamond_result_file)) assert "Protein file" not in caplog.text assert "Diamond result file" not in caplog.text @@ -160,7 +272,7 @@ def test_check_resume_file_missing_faa(temp_files, caplog): # Test when faa_file is missing _, diamond_result_file = temp_files with pytest.raises(FileNotFoundError): - io_manager.check_resume_file("nonexistent.faa", diamond_result_file) + io_manager.check_resume_file(Path("nonexistent.faa"), Path(diamond_result_file)) assert "Protein file" in caplog.text assert "Diamond result file" not in caplog.text @@ -168,6 
+280,6 @@ def test_check_resume_file_missing_diamond(temp_files, caplog): # Test when diamond_result_file is missing faa_file, _ = temp_files with pytest.raises(FileNotFoundError): - io_manager.check_resume_file(faa_file, "nonexistent_diamond_result.txt") + io_manager.check_resume_file(Path(faa_file), Path("nonexistent_diamond_result.txt")) assert "Protein file" not in caplog.text assert "Diamond result file" in caplog.text From b7ee3f77517e85e362280a20ade957419d711a6e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 29 Aug 2024 10:12:45 +0200 Subject: [PATCH 11/36] apply pathlib in tests --- tests/main_binette_test.py | 58 ++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index d4ceba9..f2129db 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -11,7 +11,7 @@ from collections import Counter from tests.bin_manager_test import create_temp_bin_directories, create_temp_bin_files from argparse import ArgumentParser - +from pathlib import Path @pytest.fixture def bins(): @@ -62,7 +62,7 @@ def test_select_bins_and_write_them(tmp_path, tmpdir, bins): # Run the function with test data selected_bins = select_bins_and_write_them( - set(bins), str(contigs_fasta), final_bin_report, min_completeness=60, index_to_contig=index_to_contig, outdir=str(outdir), debug=True + set(bins), contigs_fasta, Path(final_bin_report), min_completeness=60, index_to_contig=index_to_contig, outdir=outdir, debug=True ) # Assertions to check the function output or file existence @@ -104,11 +104,11 @@ def test_manage_protein_alignement_resume(tmp_path): # Run the function with test data contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( - faa_file=str(faa_file), - contigs_fasta="contigs_fasta", + faa_file=Path(faa_file), + contigs_fasta=Path("contigs_fasta"), contig_to_length=contig_to_length, - contigs_in_bins={}, - 
diamond_result_file="diamond_result_file", + contigs_in_bins=set(), + diamond_result_file=Path("diamond_result_file"), checkm2_db=None, threads=1, resume=True, @@ -149,11 +149,11 @@ def test_manage_protein_alignement_not_resume(tmpdir, tmp_path): # Call the function contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( - faa_file=str(faa_file), - contigs_fasta=contigs_fasta, + faa_file=Path(faa_file), + contigs_fasta=Path(contigs_fasta), contig_to_length=contig_to_length, - contigs_in_bins={}, - diamond_result_file=diamond_result_file, + contigs_in_bins=set(), + diamond_result_file=Path(diamond_result_file), checkm2_db=None, threads=1, resume=True, @@ -180,7 +180,7 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path): fasta_file.write_text(fasta_file_content) # Call the function and capture the return values - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(None, [str(bin_set1), str(bin_set2)], str(fasta_file)) + bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(None, [bin_set1, bin_set2], fasta_file) # # Perform assertions on the returned values @@ -190,7 +190,7 @@ def test_parse_input_files_with_contig2bin_tables(tmp_path): assert isinstance(contig_to_length, dict) - assert set(bin_set_name_to_bins) == {'1', "2"} + assert set(bin_set_name_to_bins) == {'bin_set1', "bin_set2"} assert len(original_bins) == 4 assert contigs_in_bins == {"contig1","contig2", "contig3","contig4"} assert len(contig_to_length) == 4 @@ -206,12 +206,12 @@ def test_parse_input_files_with_contig2bin_tables_with_unknown_contig(tmp_path): fasta_file.write_text(fasta_file_content) with pytest.raises(ValueError): - parse_input_files(None, [str(bin_set3)], str(fasta_file)) + parse_input_files(None, [bin_set3], fasta_file) def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): - bin_dirs = list(create_temp_bin_directories.values()) + bin_dirs = [Path(d) for d in 
create_temp_bin_directories.values()] contig2bin_tables = [] @@ -224,7 +224,7 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): fasta_file.write_text(fasta_file_content) # Call the function and capture the return values - bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(bin_dirs, contig2bin_tables, str(fasta_file)) + bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(bin_dirs, contig2bin_tables, fasta_file) # # Perform assertions on the returned values assert isinstance(bin_set_name_to_bins, dict) @@ -233,7 +233,7 @@ def test_parse_input_files_bin_dirs(create_temp_bin_directories, tmp_path): assert isinstance(contig_to_length, dict) - assert set(bin_set_name_to_bins) == {'1', "2"} + assert set(bin_set_name_to_bins) == {'set1', 'set2'} assert len(original_bins) == 3 assert contigs_in_bins == {"contig1","contig2", "contig3","contig4","contig5",} assert len(contig_to_length) == 5 @@ -257,16 +257,16 @@ def test_argument_used_multiple_times(): def test_parse_arguments_required_arguments(): # Test when only required arguments are provided args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta"]) - assert args.bin_dirs == ["folder1", "folder2"] - assert args.contigs == "contigs.fasta" + assert args.bin_dirs == [Path("folder1"), Path("folder2")] + assert args.contigs == Path("contigs.fasta") def test_parse_arguments_optional_arguments(): # Test when required and optional arguments are provided args = parse_arguments(["-d", "folder1", "folder2", "-c", "contigs.fasta", "--threads", "4", "--outdir", "output"]) - assert args.bin_dirs == ["folder1", "folder2"] - assert args.contigs == "contigs.fasta" + assert args.bin_dirs == [Path("folder1"), Path("folder2")] + assert args.contigs == Path("contigs.fasta") assert args.threads == 4 - assert args.outdir == "output" + assert args.outdir == Path("output") def test_parse_arguments_invalid_arguments(): # 
Test when invalid arguments are provided @@ -294,14 +294,16 @@ def test_init_logging_command_line(caplog): # @patch('diamond.run') -def test_manage_protein_alignment_no_resume(): +def test_manage_protein_alignment_no_resume(tmp_path): # Set up the input parameters - faa_file = "test.faa" - contigs_fasta = "test.fasta" + faa_file = Path("test.faa") + contigs_fasta = Path("test.fasta") contig_to_length = {"contig1": [1000]} contigs_in_bins = {"bin1": ["contig1"]} - diamond_result_file = "test_diamond_result.txt" - checkm2_db = "checkm2_db" + diamond_result_file = Path("test_diamond_result.txt") + checkm2_db = tmp_path / "checkm2_db" + with open(checkm2_db, "w"): + pass threads = 4 resume = False low_mem = False @@ -324,11 +326,11 @@ def test_manage_protein_alignment_no_resume(): ) # Assertions to check if functions were called - mock_parse_fasta_file.assert_called_once_with(contigs_fasta) + mock_parse_fasta_file.assert_called_once_with(contigs_fasta.as_posix()) mock_predict.assert_called_once() mock_diamond_get_contig_to_kegg_id.assert_called_once() mock_diamond_run.assert_called_once_with( - faa_file, diamond_result_file, "checkm2_db", f"{os.path.splitext(diamond_result_file)[0]}.log", threads, low_mem=low_mem + faa_file.as_posix(), diamond_result_file.as_posix(), checkm2_db.as_posix(), f"{os.path.splitext(diamond_result_file.as_posix())[0]}.log", threads, low_mem=low_mem ) def test_main_resume_when_not_possible(monkeypatch): From f3abada350662016b6d477389a9ef06e3d5a6677 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 29 Aug 2024 10:13:49 +0200 Subject: [PATCH 12/36] output original bin metric files --- binette/bin_manager.py | 10 ++-- binette/io_manager.py | 2 +- binette/main.py | 118 +++++++++++++++++++++-------------------- 3 files changed, 67 insertions(+), 63 deletions(-) diff --git a/binette/bin_manager.py b/binette/bin_manager.py index 457423b..6891ac5 100644 --- a/binette/bin_manager.py +++ b/binette/bin_manager.py @@ -205,7 +205,7 @@ def 
is_high_quality(self, min_completeness: float, max_contamination: float) -> -def get_bins_from_directory(bin_dir: str, set_name: str, fasta_extensions: Set[str]) -> List[Bin]: +def get_bins_from_directory(bin_dir: Path, set_name: str, fasta_extensions: Set[str]) -> List[Bin]: """ Retrieves a list of Bin objects from a directory containing bin FASTA files. @@ -217,7 +217,7 @@ def get_bins_from_directory(bin_dir: str, set_name: str, fasta_extensions: Set[s """ bins = [] fasta_extensions |= {f".{ext}" for ext in fasta_extensions if not ext.startswith(".")} # adding a dot in case given extension are lacking one - bin_fasta_files = (fasta_file for fasta_file in Path(bin_dir).glob("*") if set(fasta_file.suffixes) & fasta_extensions) + bin_fasta_files = (fasta_file for fasta_file in bin_dir.glob("*") if set(fasta_file.suffixes) & fasta_extensions) for bin_fasta_path in bin_fasta_files: @@ -233,7 +233,7 @@ def get_bins_from_directory(bin_dir: str, set_name: str, fasta_extensions: Set[s -def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str], fasta_extensions:Set[str]) -> Dict[str, Set[Bin]]: +def parse_bin_directories(bin_name_to_bin_dir: Dict[str, Path], fasta_extensions:Set[str]) -> Dict[str, Set[Bin]]: """ Parses multiple bin directories and returns a dictionary mapping bin names to a list of Bin objects. @@ -263,7 +263,7 @@ def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str], fasta_extensions: return bin_set_name_to_bins -def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, str]) -> Dict[str, Set['Bin']]: +def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, Path]) -> Dict[str, Set['Bin']]: """ Parses multiple contig-to-bin tables and returns a dictionary mapping bin names to a set of unique Bin objects. 
@@ -296,7 +296,7 @@ def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, str]) -> Dict[str, return bin_set_name_to_bins -def get_bins_from_contig2bin_table(contig2bin_table: str, set_name: str) -> List[Bin]: +def get_bins_from_contig2bin_table(contig2bin_table: Path, set_name: str) -> List[Bin]: """ Retrieves a list of Bin objects from a contig-to-bin table. diff --git a/binette/io_manager.py b/binette/io_manager.py index 0a268e5..5ea3041 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -108,7 +108,7 @@ def write_bin_info(bins: Iterable[Bin], output: Path, add_contigs: bool = False) for bin_obj in sorted(bins, key=lambda x: (x.score, x.N50, -x.id), reverse=True): bin_info = [ bin_obj.id, - bin_obj.origin, + ';'.join(bin_obj.origin), bin_obj.name, bin_obj.completeness, bin_obj.contamination, diff --git a/binette/main.py b/binette/main.py index 055a74d..3044fda 100755 --- a/binette/main.py +++ b/binette/main.py @@ -17,7 +17,7 @@ import binette from binette import contig_manager, cds, diamond, bin_quality, bin_manager, io_manager as io from typing import List, Dict, Optional, Set, Tuple, Union, Sequence, Any - +from pathlib import Path def init_logging(verbose, debug): """Initialise logging.""" @@ -85,6 +85,7 @@ def parse_arguments(args): "-d", "--bin_dirs", nargs="+", + type=Path, action=UniqueStore, help="List of bin folders containing each bin in a fasta file.", ) @@ -94,11 +95,12 @@ def parse_arguments(args): "--contig2bin_tables", nargs="+", action=UniqueStore, + type=Path, help="List of contig2bin table with two columns separated\ with a tabulation: contig, bin", ) - input_group.add_argument("-c", "--contigs", required=True, help="Contigs in fasta format.") + input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.") # Other parameters category other_group = parser.add_argument_group('Other Arguments') @@ -113,7 +115,7 @@ def parse_arguments(args): other_group.add_argument("-t", "--threads", 
default=1, type=int, help="Number of threads to use.") - other_group.add_argument("-o", "--outdir", default="results", help="Output directory.") + other_group.add_argument("-o", "--outdir", default=Path("results"), type=Path, help="Output directory.") other_group.add_argument( "-w", @@ -135,8 +137,9 @@ def parse_arguments(args): other_group.add_argument( "--checkm2_db", + type=Path, help="Provide a path for the CheckM2 diamond database. " - "By default the database set via is used.", + "By default the database set via is used." ) other_group.add_argument("--low_mem", help="Use low mem mode when running diamond", action="store_true") @@ -157,9 +160,9 @@ def parse_arguments(args): args = parser.parse_args(args) return args -def parse_input_files(bin_dirs: List[str], - contig2bin_tables: List[str], - contigs_fasta: str, +def parse_input_files(bin_dirs: List[Path], + contig2bin_tables: List[Path], + contigs_fasta: Path, fasta_extensions:Set[str] = {".fasta", ".fna", ".fa"}) -> Tuple[Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]]: """ Parses input files to retrieve information related to bins and contigs. 
@@ -178,11 +181,11 @@ def parse_input_files(bin_dirs: List[str], if bin_dirs: logging.info("Parsing bin directories.") - bin_name_to_bin_dir = io.infer_bin_name_from_bin_inputs(bin_dirs) + bin_name_to_bin_dir = io.infer_bin_set_names_from_input_paths(bin_dirs) bin_set_name_to_bins = bin_manager.parse_bin_directories(bin_name_to_bin_dir, fasta_extensions) else: logging.info("Parsing bin2contig files.") - bin_name_to_bin_table = io.infer_bin_name_from_bin_inputs(contig2bin_tables) + bin_name_to_bin_table = io.infer_bin_set_names_from_input_paths(contig2bin_tables) bin_set_name_to_bins = bin_manager.parse_contig2bin_tables(bin_name_to_bin_table) logging.info(f"Processing {len(bin_set_name_to_bins)} bin sets.") @@ -193,7 +196,7 @@ def parse_input_files(bin_dirs: List[str], original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values()) logging.info(f"Parsing contig fasta file: {contigs_fasta}") - contigs_object = contig_manager.parse_fasta_file(contigs_fasta) + contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix()) unexpected_contigs = {contig for contig in contigs_in_bins if contig not in contigs_object} @@ -206,9 +209,9 @@ def parse_input_files(bin_dirs: List[str], return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length -def manage_protein_alignement(faa_file: str, contigs_fasta: str, contig_to_length: Dict[str, int], - contigs_in_bins: Set[str], diamond_result_file: str, - checkm2_db: str, threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: +def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int], + contigs_in_bins: Set[str], diamond_result_file: Path, + checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]: """ Predicts or reuses proteins prediction and runs diamond on them. 
@@ -228,41 +231,45 @@ def manage_protein_alignement(faa_file: str, contigs_fasta: str, contig_to_lengt # Predict or reuse proteins prediction and run diamond on them if resume: logging.info(f"Parsing faa file: {faa_file}.") - contig_to_genes = cds.parse_faa_file(faa_file) - io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta, faa_file) + contig_to_genes = cds.parse_faa_file(faa_file.as_posix()) + io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix()) else: - contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta) if s.name in contigs_in_bins) - contig_to_genes = cds.predict(contigs_iterator, faa_file, threads) + contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins) + contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads) - if checkm2_db: - diamond_db_path = checkm2_db - else: + if checkm2_db is None: # get checkm2 db stored in checkm2 install diamond_db_path = diamond.get_checkm2_db() - - diamond_log = f"{os.path.splitext(diamond_result_file)[0]}.log" + elif checkm2_db.exists(): + diamond_db_path = checkm2_db.as_posix() + else: + raise FileNotFoundError(checkm2_db) + + diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log" diamond.run( - faa_file, - diamond_result_file, + faa_file.as_posix(), + diamond_result_file.as_posix(), diamond_db_path, - diamond_log, + diamond_log.as_posix(), threads, low_mem=low_mem, ) logging.info("Parsing diamond results.") - contig_to_kegg_counter = diamond.get_contig_to_kegg_id(diamond_result_file) + contig_to_kegg_counter = diamond.get_contig_to_kegg_id(diamond_result_file.as_posix()) # Check contigs from diamond vs input assembly consistency - io.check_contig_consistency(contig_to_length, contig_to_kegg_counter, contigs_fasta, diamond_result_file) + io.check_contig_consistency(contig_to_length, contig_to_kegg_counter, 
contigs_fasta.as_posix(), diamond_result_file.as_posix()) return contig_to_kegg_counter, contig_to_genes -def select_bins_and_write_them(all_bins: Set[bin_manager.Bin], contigs_fasta: str, final_bin_report: str, min_completeness: float, - index_to_contig: dict, outdir: str, debug: bool) -> List[bin_manager.Bin]: +def select_bins_and_write_them(all_bins: Set[bin_manager.Bin], + contigs_fasta: Path, + final_bin_report: Path, min_completeness: float, + index_to_contig: dict, outdir: Path, debug: bool) -> List[bin_manager.Bin]: """ Selects and writes bins based on specific criteria. @@ -276,12 +283,12 @@ def select_bins_and_write_them(all_bins: Set[bin_manager.Bin], contigs_fasta: st :return: Selected bins that meet the completeness threshold. """ - outdir_final_bin_set = os.path.join(outdir, "final_bins") + outdir_final_bin_set = outdir / "final_bins" os.makedirs(outdir_final_bin_set, exist_ok=True) if debug: all_bins_for_debug = set(all_bins) - all_bin_compo_file = os.path.join(outdir, "all_bins_quality_reports.tsv") + all_bin_compo_file = outdir / "all_bins_quality_reports.tsv" logging.info(f"Writing all bins in {all_bin_compo_file}") @@ -338,6 +345,23 @@ def log_selected_bin_info(selected_bins: List[bin_manager.Bin], hq_min_completen thresholds = f"(completeness >= {hq_min_completeness} and contamination <= {hq_max_conta})" logging.info(f"{hq_bins}/{len(selected_bins)} selected bins have a high quality {thresholds}.") +def write_original_bin_metrics(bin_set_name_to_bins:Dict[str, Set[bin_manager.Bin]], original_bin_report_dir:Path): + """ + + """ + + logging.info(f"Writing original input bins metrics in {original_bin_report_dir}") + + + original_bin_report_dir.mkdir(parents=True, exist_ok=True) + + for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())): + bins_metric_file = original_bin_report_dir / f"input_bins_{i+1}.{set_name.replace('/', '_')}.tsv" + + logging.info(f"Writing bin_set {set_name} input bins metrics in {bins_metric_file}") + 
io.write_bin_info(bins, bins_metric_file) + + def main(): "Orchestrate the execution of the program" @@ -351,15 +375,15 @@ def main(): hq_min_completeness = 90 # Temporary files # - out_tmp_dir = os.path.join(args.outdir, "temporary_files") + out_tmp_dir:Path = args.outdir / "temporary_files" os.makedirs(out_tmp_dir, exist_ok=True) - faa_file = os.path.join(out_tmp_dir, "assembly_proteins.faa") - diamond_result_file = os.path.join(out_tmp_dir, "diamond_result.tsv") + faa_file = out_tmp_dir / "assembly_proteins.faa" + diamond_result_file = out_tmp_dir / "diamond_result.tsv" # Output files # - final_bin_report = os.path.join(args.outdir, "final_bins_quality_reports.tsv") - original_bin_report = os.path.join(args.outdir, "original_bins_quality_reports.tsv") + final_bin_report:Path = args.outdir / "final_bins_quality_reports.tsv" + original_bin_report_dir:Path = args.outdir / "input_bins_quality_reports" if args.resume: io.check_resume_file(faa_file, diamond_result_file) @@ -392,28 +416,8 @@ def main(): logging.info("Add size and assess quality of input bins") bin_quality.add_bin_metrics(original_bins, contig_metadat, args.contamination_weight, args.threads) + write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir) - # for bin_set, bins in bin_set_name_to_bins.items(): - # print(bin_set) - # bin_set_name = bin_set.replace("/", "_") - # original_bin_report = os.path.join(args.outdir, f"{bin_set_name}_bins_quality_reports.tsv") - # bins_with_metric = [] - # for bin_obj in bins: - # print(bin_obj.id, bin_obj.score, bin_obj.N50) - # if bin_obj.score is None: - - # matching_bins = [bin_with_metric for bin_with_metric in original_bins if bin_obj == bin_with_metric] - # assert len(matching_bins) == 1, len(matching_bins) - # bins_with_metric.append(matching_bins[0]) - # print("HAS NOT USE MATCHING BIN IN ORIGINAL SET", matching_bins[0].id, matching_bins[0].score, matching_bins[0].N50) - - # else: - # print("has score") - # print(bin_obj.id, bin_obj.score, 
bin_obj.N50) - # bins_with_metric.append(bin_obj) - - - # io.write_bin_info(bins_with_metric, original_bin_report) logging.info("Create intermediate bins:") new_bins = bin_manager.create_intermediate_bins(bin_set_name_to_bins) From 14bc19481978945a7f1b80a8990d517ff073065e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 29 Aug 2024 10:43:52 +0200 Subject: [PATCH 13/36] update tests --- binette/io_manager.py | 24 +++++++++++++++++++++++- binette/main.py | 22 ++++------------------ tests/bin_manager_test.py | 9 +++++---- tests/io_manager_test.py | 36 +++++++++++++++++++++++++++++++++--- tests/main_binette_test.py | 3 +++ 5 files changed, 68 insertions(+), 26 deletions(-) diff --git a/binette/io_manager.py b/binette/io_manager.py index 5ea3041..5899722 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -1,6 +1,6 @@ import logging import pyfastx -from typing import Iterable, List, Dict, Tuple +from typing import Iterable, List, Dict, Tuple, Set import csv from binette.bin_manager import Bin @@ -195,3 +195,25 @@ def check_resume_file(faa_file: Path, diamond_result_file: Path) -> None: raise FileNotFoundError(error_msg) +def write_original_bin_metrics(bin_set_name_to_bins: Dict[str, Set[Bin]], original_bin_report_dir: Path): + """ + Write metrics of original input bins to a specified directory. + + This function takes a dictionary mapping bin set names to sets of bins and writes + the metrics for each bin set to a TSV file in the specified directory. Each bin set + will have its own TSV file named according to its set name. + + :param bin_set_name_to_bins: A dictionary where the keys are bin set names (str) and + the values are sets of Bin objects representing bins. + :param original_bin_report_dir: The directory path (Path) where the bin metrics will be saved. 
+ """ + + original_bin_report_dir.mkdir(parents=True, exist_ok=True) + + for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())): + bins_metric_file = original_bin_report_dir / f"input_bins_{i + 1}.{set_name.replace('/', '_')}.tsv" + + logging.debug(f"Writing metrics for bin set '{set_name}' to file: {bins_metric_file}") + write_bin_info(bins, bins_metric_file) + + logging.debug("Completed writing all original input bin metrics.") diff --git a/binette/main.py b/binette/main.py index 3044fda..1c92a38 100755 --- a/binette/main.py +++ b/binette/main.py @@ -345,23 +345,6 @@ def log_selected_bin_info(selected_bins: List[bin_manager.Bin], hq_min_completen thresholds = f"(completeness >= {hq_min_completeness} and contamination <= {hq_max_conta})" logging.info(f"{hq_bins}/{len(selected_bins)} selected bins have a high quality {thresholds}.") -def write_original_bin_metrics(bin_set_name_to_bins:Dict[str, Set[bin_manager.Bin]], original_bin_report_dir:Path): - """ - - """ - - logging.info(f"Writing original input bins metrics in {original_bin_report_dir}") - - - original_bin_report_dir.mkdir(parents=True, exist_ok=True) - - for i, (set_name, bins) in enumerate(sorted(bin_set_name_to_bins.items())): - bins_metric_file = original_bin_report_dir / f"input_bins_{i+1}.{set_name.replace('/', '_')}.tsv" - - logging.info(f"Writing bin_set {set_name} input bins metrics in {bins_metric_file}") - io.write_bin_info(bins, bins_metric_file) - - def main(): "Orchestrate the execution of the program" @@ -416,7 +399,10 @@ def main(): logging.info("Add size and assess quality of input bins") bin_quality.add_bin_metrics(original_bins, contig_metadat, args.contamination_weight, args.threads) - write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir) + + + logging.info(f"Writting original input bin metrics to directory: {original_bin_report_dir}") + io.write_original_bin_metrics(bin_set_name_to_bins, original_bin_report_dir) logging.info("Create intermediate 
bins:") diff --git a/tests/bin_manager_test.py b/tests/bin_manager_test.py index f08d6a1..f939b53 100644 --- a/tests/bin_manager_test.py +++ b/tests/bin_manager_test.py @@ -9,6 +9,7 @@ import networkx as nx import logging +from pathlib import Path def test_get_all_possible_combinations(): input_list = ["2", "3", "4"] @@ -524,14 +525,14 @@ def create_temp_bin_directories(tmpdir, create_temp_bin_files): bin2 = bin_dir2.join("binA.fasta") bin2.write(">contig3\nTTAG\n>contig4\nCGAT\n>contig5\nCGGC") - return {"set1": str(bin_dir1), "set2": str(bin_dir2)} + return {"set1": Path(bin_dir1), "set2": Path(bin_dir2)} def test_get_bins_from_directory(create_temp_bin_files): bin_dir = create_temp_bin_files set_name = "TestSet" - bins = bin_manager.get_bins_from_directory(str(bin_dir), set_name, fasta_extensions={'.fasta'}) + bins = bin_manager.get_bins_from_directory(Path(bin_dir), set_name, fasta_extensions={'.fasta'}) assert len(bins) == 2 # Ensure that the correct number of Bin objects is returned @@ -546,7 +547,7 @@ def test_get_bins_from_directory(create_temp_bin_files): assert bins[0].name in ["bin2.fasta", "bin1.fasta"] def test_get_bins_from_directory_no_files(tmpdir): - bin_dir = str(tmpdir.mkdir("empty_bins")) + bin_dir = Path(tmpdir.mkdir("empty_bins")) set_name = "EmptySet" bins = bin_manager.get_bins_from_directory(bin_dir, set_name, fasta_extensions={'.fasta'}) @@ -554,7 +555,7 @@ def test_get_bins_from_directory_no_files(tmpdir): assert len(bins) == 0 # Ensure that no Bin objects are returned for an empty directory def test_get_bins_from_directory_no_wrong_extensions(create_temp_bin_files): - bin_dir = create_temp_bin_files + bin_dir = Path(create_temp_bin_files) set_name = "TestSet" bins = bin_manager.get_bins_from_directory(bin_dir, set_name, fasta_extensions={'.fna'}) diff --git a/tests/io_manager_test.py b/tests/io_manager_test.py index aa1856c..3a44368 100644 --- a/tests/io_manager_test.py +++ b/tests/io_manager_test.py @@ -1,7 +1,7 @@ import pytest from 
binette import io_manager from pathlib import Path - +from unittest.mock import patch @@ -9,7 +9,7 @@ class Bin: def __init__(self, bin_id, origin, name, completeness, contamination, score, length, N50, contigs): self.id = bin_id - self.origin = origin + self.origin = {origin} self.name = name self.completeness = completeness self.contamination = contamination @@ -65,7 +65,7 @@ def test_infer_bin_name_from_single_path(): # Check if the output matches the expected dictionary assert result == expected_result - + def test_infer_bin_name_from_bin_table_inputs(): # Mock input data input_bins = [ @@ -283,3 +283,33 @@ def test_check_resume_file_missing_diamond(temp_files, caplog): io_manager.check_resume_file(Path(faa_file), Path("nonexistent_diamond_result.txt")) assert "Protein file" not in caplog.text assert "Diamond result file" in caplog.text + + +@patch('binette.io_manager.write_bin_info') +def test_write_original_bin_metrics(mock_write_bin_info, bin1,bin2, tmp_path): + # Test that `write_original_bin_metrics` correctly writes bin metrics to files + + temp_directory = tmp_path / "test_output" + + mock_bins = {"set1":{bin1}, + "set2":{bin2}} + # Call the function with mock data + io_manager.write_original_bin_metrics(mock_bins, temp_directory) + + # Check if the output directory was created + assert temp_directory.exists(), "Output directory should be created." + + # Check that the correct files are created + expected_files = [ + temp_directory / "input_bins_1.set1.tsv", + temp_directory / "input_bins_2.set2.tsv" + ] + + assert temp_directory.exists(), f"Expected temp_directory {temp_directory} was not created." + + # Check if `write_bin_info` was called correctly + assert mock_write_bin_info.call_count == 2, "write_bin_info should be called once for each bin set." 
+ + # Verify the specific calls to `write_bin_info` + mock_write_bin_info.assert_any_call(mock_bins['set1'], expected_files[0]) + mock_write_bin_info.assert_any_call(mock_bins['set2'], expected_files[1]) \ No newline at end of file diff --git a/tests/main_binette_test.py b/tests/main_binette_test.py index f2129db..0e20acd 100644 --- a/tests/main_binette_test.py +++ b/tests/main_binette_test.py @@ -371,6 +371,7 @@ def test_main(monkeypatch): patch('binette.bin_quality.add_bin_metrics') as mock_add_bin_metrics, \ patch('binette.main.log_selected_bin_info') as mock_log_selected_bin_info, \ patch('binette.contig_manager.make_contig_index') as mock_make_contig_index, \ + patch('binette.io_manager.write_original_bin_metrics') as mock_write_original_bin_metrics, \ patch('binette.main.select_bins_and_write_them') as mock_select_bins_and_write_them: # Set return values for mocked functions if needed @@ -395,5 +396,7 @@ def test_main(monkeypatch): mock_log_selected_bin_info.assert_called_once() mock_select_bins_and_write_them.assert_called_once() + mock_write_original_bin_metrics.assert_called_once() + assert mock_apply_contig_index.call_count == 3 assert mock_add_bin_metrics.call_count == 2 \ No newline at end of file From 956cb1773142db6bddbc0b55a1cdb0d384f7a18d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 14:56:25 +0200 Subject: [PATCH 14/36] add first draft of the tutorial --- docs/conf.py | 56 +- docs/index.md | 6 +- docs/tutorial/analyse_binette_result.ipynb | 1688 ++++++++++++++++++++ docs/tutorial/assembly.md | 43 + docs/tutorial/binette.md | 7 + docs/tutorial/binning.md | 67 + docs/tutorial/set_env_and_get_data.md | 78 + docs/tutorial/tutorial_main.md | 52 + pyproject.toml | 7 +- 9 files changed, 1996 insertions(+), 8 deletions(-) create mode 100644 docs/tutorial/analyse_binette_result.ipynb create mode 100644 docs/tutorial/assembly.md create mode 100644 docs/tutorial/binette.md create mode 100644 docs/tutorial/binning.md create mode 100644 
docs/tutorial/set_env_and_get_data.md create mode 100644 docs/tutorial/tutorial_main.md diff --git a/docs/conf.py b/docs/conf.py index b8e839a..bb2e303 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,28 +18,37 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ - "myst_parser", + # # "sphinxcontrib.jquery", "sphinx.ext.duration", "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", - 'sphinx_search.extension' + 'sphinx_search.extension', + # "myst_nb", + "myst_parser", + 'nbsphinx', + 'nbsphinx_link', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + "myst_parser", + ] source_suffix = { - '.md': 'markdown' + '.md': 'markdown', } templates_path = ['_templates'] - +nb_execution_mode = "off" +nbsphinx_execute = 'never' # Prefix document path to section labels, to use: # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api"] @@ -55,3 +64,40 @@ + + +# Include the Plotly JavaScript in the HTML output +nbsphinx_requirejs_path = "" + +# Ensures that the `require.js` is loaded for Plotly to function correctly +nbsphinx_requirejs_options = { + 'paths': { + 'plotly': 'https://cdn.plot.ly/plotly-latest.min' + }, + 'shim': { + 'plotly': { + 'exports': 'Plotly' + } + } +} + +# Specify the default language for syntax highlighting in Sphinx +highlight_language = 'python' + +# -- Options for HTML output ------------------------------------------------- + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add plotly renderer options +nbsphinx_prolog = r""" +.. 
raw:: html + + +""" + + + diff --git a/docs/index.md b/docs/index.md index 45dd4a6..e7ec04b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,14 +31,18 @@ Binette is inspired from the metaWRAP bin-refinement tool but it effectively sol - Enhanced Speed: Binette significantly improves the speed of the refinement process. It achieves this by launching the initial steps of CheckM2, such as Prodigal and Diamond runs, only once on all contigs. These intermediate results are then utilized to assess the quality of any given bin, eliminating redundant computations and accelerating the refinement process. - No Limit on Input Bin Sets: Unlike its predecessor, Binette is not constrained by the number of input bin sets. It can handle and process multiple bin sets simultaneously. + + + ```{toctree} :caption: 'Documentation' :maxdepth: 2 installation usage +tutorial/tutorial_main contributing -tests.md +tests api/api_ref ``` diff --git a/docs/tutorial/analyse_binette_result.ipynb b/docs/tutorial/analyse_binette_result.ipynb new file mode 100644 index 0000000..30247a8 --- /dev/null +++ b/docs/tutorial/analyse_binette_result.ipynb @@ -0,0 +1,1688 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "52e7f39c", + "metadata": {}, + "source": [ + "## Analyse Binette results" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e6a1e1ee-681d-4823-b974-7027bafd2ba9", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from pathlib import Path\n", + "import plotly.express as px\n", + "import plotly.io as pio\n", + "pio.renderers.default = \"sphinx_gallery\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "34e80119-f59b-41b0-b0e5-de2d6ed0c6a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bin_idoriginnamecompletenesscontaminationscoresizeN50contig_counttoolindex
017075diff44 - 10100.000.0599.9046726658208493binette0
139427diff36 - 699.900.2099.5027966054115198binette1
247060union58 | 3398.590.8396.93460133641016165binette2
347177union91 | 25 | 5596.100.3495.42259871811891312binette3
421248diff65 - 8 - 2891.981.7188.5617680959976250binette4
544137diff76 - 13 - 2892.632.4187.8137262545669850binette5
631703diff31 - 7 - 6181.730.8480.0516652338518248binette6
713475diff47 - 3772.892.3968.1112418295061252binette7
847926union75 | 3074.314.2665.79329394929541262binette8
946775union42 | 10262.942.7557.4412935713783419binette9
1033569diff83 - 7 - 38 - 3159.182.2454.7020425274437514binette10
1139350diff57 - 16 - 7552.161.3149.5426012825332509binette11
1239558diff78 - 6 - 4364.638.0348.57185821014301293binette12
1351082union120 | 152.335.0642.216888791446472binette13
1419689diff118 - 18 - 61 - 3148.228.2331.76178267614021265binette14
\n", + "
" + ], + "text/plain": [ + " bin_id origin name completeness contamination score \\\n", + "0 17075 diff 44 - 10 100.00 0.05 99.90 \n", + "1 39427 diff 36 - 6 99.90 0.20 99.50 \n", + "2 47060 union 58 | 33 98.59 0.83 96.93 \n", + "3 47177 union 91 | 25 | 55 96.10 0.34 95.42 \n", + "4 21248 diff 65 - 8 - 28 91.98 1.71 88.56 \n", + "5 44137 diff 76 - 13 - 28 92.63 2.41 87.81 \n", + "6 31703 diff 31 - 7 - 61 81.73 0.84 80.05 \n", + "7 13475 diff 47 - 37 72.89 2.39 68.11 \n", + "8 47926 union 75 | 30 74.31 4.26 65.79 \n", + "9 46775 union 42 | 102 62.94 2.75 57.44 \n", + "10 33569 diff 83 - 7 - 38 - 31 59.18 2.24 54.70 \n", + "11 39350 diff 57 - 16 - 75 52.16 1.31 49.54 \n", + "12 39558 diff 78 - 6 - 43 64.63 8.03 48.57 \n", + "13 51082 union 120 | 1 52.33 5.06 42.21 \n", + "14 19689 diff 118 - 18 - 61 - 31 48.22 8.23 31.76 \n", + "\n", + " size N50 contig_count tool index \n", + "0 4672665 82084 93 binette 0 \n", + "1 2796605 41151 98 binette 1 \n", + "2 4601336 41016 165 binette 2 \n", + "3 2598718 11891 312 binette 3 \n", + "4 1768095 9976 250 binette 4 \n", + "5 3726254 5669 850 binette 5 \n", + "6 1665233 8518 248 binette 6 \n", + "7 1241829 5061 252 binette 7 \n", + "8 3293949 2954 1262 binette 8 \n", + "9 1293571 3783 419 binette 9 \n", + "10 2042527 4437 514 binette 10 \n", + "11 2601282 5332 509 binette 11 \n", + "12 1858210 1430 1293 binette 12 \n", + "13 688879 1446 472 binette 13 \n", + "14 1782676 1402 1265 binette 14 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "binette_result_file = \"./binette_results/final_bins_quality_reports.tsv\"\n", + "df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", + "df_binette['tool'] = \"binette\"\n", + "df_binette['index'] = df_binette.index\n", + "df_binette" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "189038d3-77a0-435a-9590-4d8b3038341e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
completenesscontaminationtool
0100.000.05binette
199.900.20binette
298.590.83binette
396.100.34binette
491.981.71binette
............
208.280.01semibin2
218.120.02semibin2
227.740.01semibin2
236.180.00semibin2
244.410.13semibin2
\n", + "

140 rows Γ— 3 columns

\n", + "
" + ], + "text/plain": [ + " completeness contamination tool\n", + "0 100.00 0.05 binette\n", + "1 99.90 0.20 binette\n", + "2 98.59 0.83 binette\n", + "3 96.10 0.34 binette\n", + "4 91.98 1.71 binette\n", + ".. ... ... ...\n", + "20 8.28 0.01 semibin2\n", + "21 8.12 0.02 semibin2\n", + "22 7.74 0.01 semibin2\n", + "23 6.18 0.00 semibin2\n", + "24 4.41 0.13 semibin2\n", + "\n", + "[140 rows x 3 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "input_bins_quality_reports_dir = Path(\"binette_results/input_bins_quality_reports/\")\n", + "\n", + "df_input_bin_list = [df_binette]\n", + "for input_bin_metric_file in input_bins_quality_reports_dir.glob(\"*tsv\"):\n", + " tool = input_bin_metric_file.name.split('.')[1].split('_')[0]\n", + " df_input = pd.read_csv(input_bin_metric_file, sep='\\t')\n", + " df_input['index'] = df_input.index\n", + " df_input['tool'] = tool\n", + " df_input_bin_list.append(df_input)\n", + "\n", + "df_bins = pd.concat(df_input_bin_list)\n", + " \n", + "set(df_bins['tool'])\n", + "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", + "#df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", + "#df_binette\n", + "df_bins[[\"completeness\", \"contamination\", \"tool\"]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "911d598f-a6c7-4178-aff2-6059235e7fc4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = px.scatter(df_bins, x=\"completeness\",y=\"contamination\", color=\"High quality bin\", size=\"size\", facet_row=\"tool\")\n", + "fig.update_layout(\n", + " width=800,\n", + " height=800)\n", + " \n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "35c46beb-1ac9-4014-9672-91edcc1bf439", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2*df_bins['contamination']\n", + "fig = px.line(df_bins, x=\"index\",y='completeness - 2*contamination', color=\"tool\",markers=True)\n", + "fig.update_layout(\n", + " width=800,\n", + " height=500)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "af74bfb2-457c-4cf4-9c13-3ee9642be7ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bin_idoriginnamecompletenesscontaminationscoresizeN50contig_counttoolindexHigh quality bincompleteness - 2*contaminationContamination ≀ 10 and<br>Completeness
017075diff44 - 10100.000.0599.9046726658208493binette0True99.90> 90%
139427diff36 - 699.900.2099.5027966054115198binette1True99.50> 90%
247060union58 | 3398.590.8396.93460133641016165binette2True96.93> 90%
347177union91 | 25 | 5596.100.3495.42259871811891312binette3True95.42> 90%
421248diff65 - 8 - 2891.981.7188.5617680959976250binette4True88.56> 90%
544137diff76 - 13 - 2892.632.4187.8137262545669850binette5True87.81> 90%
631703diff31 - 7 - 6181.730.8480.0516652338518248binette6False80.05> 70% and ≀ 90%
713475diff47 - 3772.892.3968.1112418295061252binette7False68.11> 70% and ≀ 90%
847926union75 | 3074.314.2665.79329394929541262binette8False65.79> 70% and ≀ 90%
946775union42 | 10262.942.7557.4412935713783419binette9False57.44> 50% and ≀ 70%
1033569diff83 - 7 - 38 - 3159.182.2454.7020425274437514binette10False54.70> 50% and ≀ 70%
1139350diff57 - 16 - 7552.161.3149.5426012825332509binette11False49.54> 50% and ≀ 70%
1239558diff78 - 6 - 4364.638.0348.57185821014301293binette12False48.57> 50% and ≀ 70%
1351082union120 | 152.335.0642.216888791446472binette13False42.21> 50% and ≀ 70%
0125concoct/bins9.fa100.000.3899.24303358637523131concoct0True99.24> 90%
167concoct/bins41.fa100.000.4699.08476546682084101concoct1True99.08> 90%
291concoct/bins7.fa92.760.3492.08227495112187265concoct2True92.08> 90%
376concoct/bins6.fa92.633.4285.7937519505674855concoct3True85.79> 90%
465concoct/bins62.fa87.351.8083.75191785910911259concoct4False83.75> 70% and ≀ 90%
675concoct/bins48.fa73.354.2664.83328537429501261concoct6False64.83> 70% and ≀ 90%
022maxbin2maxbin2.001.fasta99.814.8190.19461681889436133maxbin20True90.19> 90%
114maxbin2maxbin2.002.fasta93.923.5386.86287437337523195maxbin21True86.86> 90%
35maxbin2maxbin2.009.fasta62.698.1446.4124384926141604maxbin23False46.41> 50% and ≀ 70%
036metabat2metabat2.14.fa99.900.2499.4227995724115199metabat20True99.42> 90%
125metabat2metabat2.8.fa93.170.2292.73214809712225226metabat21True92.73> 90%
233metabat2metabat2.12.fa93.520.9291.68426613439217157metabat22True91.68> 90%
327metabat2metabat2.11.fa84.401.5381.34190276111352218metabat23False81.34> 70% and ≀ 90%
437metabat2metabat2.1.fa84.992.7379.5329805266876502metabat24False79.53> 70% and ≀ 90%
531metabat2metabat2.2.fa83.213.1676.8918070287852274metabat25False76.89> 70% and ≀ 90%
635metabat2metabat2.4.fa76.530.1176.3134776368208471metabat26False76.31> 70% and ≀ 90%
729metabat2metabat2.7.fa71.785.7760.2413846534937292metabat27False60.24> 70% and ≀ 90%
824metabat2metabat2.3.fa51.752.9945.7717070784929362metabat28False45.77> 50% and ≀ 70%
044semibin2/output_binsSemiBin_27.fa.gz100.000.0999.8246813698208494semibin20True99.82> 90%
153semibin2/output_binsSemiBin_33.fa.gz99.920.2899.36293767837523113semibin21True99.36> 90%
250semibin2/output_binsSemiBin_10.fa.gz93.430.1493.15212929512519216semibin22True93.15> 90%
362semibin2/output_binsSemiBin_24.fa.gz92.130.0392.07416291140395139semibin23True92.07> 90%
438semibin2/output_binsSemiBin_26.fa.gz83.092.2578.5916741568389245semibin24False78.59> 70% and ≀ 90%
549semibin2/output_binsSemiBin_32.fa.gz81.871.6678.55182007311737205semibin25False78.55> 70% and ≀ 90%
660semibin2/output_binsSemiBin_22.fa.gz80.251.6376.9927909487117450semibin26False76.99> 70% and ≀ 90%
747semibin2/output_binsSemiBin_11.fa.gz72.572.4567.6712450315061253semibin27False67.67> 70% and ≀ 90%
861semibin2/output_binsSemiBin_3.fa.gz53.341.3350.6817286904913367semibin28False50.68> 50% and ≀ 70%
957semibin2/output_binsSemiBin_12.fa.gz51.921.3149.3026094515292511semibin29False49.30> 50% and ≀ 70%
\n", + "
" + ], + "text/plain": [ + " bin_id origin name completeness \\\n", + "0 17075 diff 44 - 10 100.00 \n", + "1 39427 diff 36 - 6 99.90 \n", + "2 47060 union 58 | 33 98.59 \n", + "3 47177 union 91 | 25 | 55 96.10 \n", + "4 21248 diff 65 - 8 - 28 91.98 \n", + "5 44137 diff 76 - 13 - 28 92.63 \n", + "6 31703 diff 31 - 7 - 61 81.73 \n", + "7 13475 diff 47 - 37 72.89 \n", + "8 47926 union 75 | 30 74.31 \n", + "9 46775 union 42 | 102 62.94 \n", + "10 33569 diff 83 - 7 - 38 - 31 59.18 \n", + "11 39350 diff 57 - 16 - 75 52.16 \n", + "12 39558 diff 78 - 6 - 43 64.63 \n", + "13 51082 union 120 | 1 52.33 \n", + "0 125 concoct/bins 9.fa 100.00 \n", + "1 67 concoct/bins 41.fa 100.00 \n", + "2 91 concoct/bins 7.fa 92.76 \n", + "3 76 concoct/bins 6.fa 92.63 \n", + "4 65 concoct/bins 62.fa 87.35 \n", + "6 75 concoct/bins 48.fa 73.35 \n", + "0 22 maxbin2 maxbin2.001.fasta 99.81 \n", + "1 14 maxbin2 maxbin2.002.fasta 93.92 \n", + "3 5 maxbin2 maxbin2.009.fasta 62.69 \n", + "0 36 metabat2 metabat2.14.fa 99.90 \n", + "1 25 metabat2 metabat2.8.fa 93.17 \n", + "2 33 metabat2 metabat2.12.fa 93.52 \n", + "3 27 metabat2 metabat2.11.fa 84.40 \n", + "4 37 metabat2 metabat2.1.fa 84.99 \n", + "5 31 metabat2 metabat2.2.fa 83.21 \n", + "6 35 metabat2 metabat2.4.fa 76.53 \n", + "7 29 metabat2 metabat2.7.fa 71.78 \n", + "8 24 metabat2 metabat2.3.fa 51.75 \n", + "0 44 semibin2/output_bins SemiBin_27.fa.gz 100.00 \n", + "1 53 semibin2/output_bins SemiBin_33.fa.gz 99.92 \n", + "2 50 semibin2/output_bins SemiBin_10.fa.gz 93.43 \n", + "3 62 semibin2/output_bins SemiBin_24.fa.gz 92.13 \n", + "4 38 semibin2/output_bins SemiBin_26.fa.gz 83.09 \n", + "5 49 semibin2/output_bins SemiBin_32.fa.gz 81.87 \n", + "6 60 semibin2/output_bins SemiBin_22.fa.gz 80.25 \n", + "7 47 semibin2/output_bins SemiBin_11.fa.gz 72.57 \n", + "8 61 semibin2/output_bins SemiBin_3.fa.gz 53.34 \n", + "9 57 semibin2/output_bins SemiBin_12.fa.gz 51.92 \n", + "\n", + " contamination score size N50 contig_count tool index \\\n", + "0 0.05 
99.90 4672665 82084 93 binette 0 \n", + "1 0.20 99.50 2796605 41151 98 binette 1 \n", + "2 0.83 96.93 4601336 41016 165 binette 2 \n", + "3 0.34 95.42 2598718 11891 312 binette 3 \n", + "4 1.71 88.56 1768095 9976 250 binette 4 \n", + "5 2.41 87.81 3726254 5669 850 binette 5 \n", + "6 0.84 80.05 1665233 8518 248 binette 6 \n", + "7 2.39 68.11 1241829 5061 252 binette 7 \n", + "8 4.26 65.79 3293949 2954 1262 binette 8 \n", + "9 2.75 57.44 1293571 3783 419 binette 9 \n", + "10 2.24 54.70 2042527 4437 514 binette 10 \n", + "11 1.31 49.54 2601282 5332 509 binette 11 \n", + "12 8.03 48.57 1858210 1430 1293 binette 12 \n", + "13 5.06 42.21 688879 1446 472 binette 13 \n", + "0 0.38 99.24 3033586 37523 131 concoct 0 \n", + "1 0.46 99.08 4765466 82084 101 concoct 1 \n", + "2 0.34 92.08 2274951 12187 265 concoct 2 \n", + "3 3.42 85.79 3751950 5674 855 concoct 3 \n", + "4 1.80 83.75 1917859 10911 259 concoct 4 \n", + "6 4.26 64.83 3285374 2950 1261 concoct 6 \n", + "0 4.81 90.19 4616818 89436 133 maxbin2 0 \n", + "1 3.53 86.86 2874373 37523 195 maxbin2 1 \n", + "3 8.14 46.41 2438492 6141 604 maxbin2 3 \n", + "0 0.24 99.42 2799572 41151 99 metabat2 0 \n", + "1 0.22 92.73 2148097 12225 226 metabat2 1 \n", + "2 0.92 91.68 4266134 39217 157 metabat2 2 \n", + "3 1.53 81.34 1902761 11352 218 metabat2 3 \n", + "4 2.73 79.53 2980526 6876 502 metabat2 4 \n", + "5 3.16 76.89 1807028 7852 274 metabat2 5 \n", + "6 0.11 76.31 3477636 82084 71 metabat2 6 \n", + "7 5.77 60.24 1384653 4937 292 metabat2 7 \n", + "8 2.99 45.77 1707078 4929 362 metabat2 8 \n", + "0 0.09 99.82 4681369 82084 94 semibin2 0 \n", + "1 0.28 99.36 2937678 37523 113 semibin2 1 \n", + "2 0.14 93.15 2129295 12519 216 semibin2 2 \n", + "3 0.03 92.07 4162911 40395 139 semibin2 3 \n", + "4 2.25 78.59 1674156 8389 245 semibin2 4 \n", + "5 1.66 78.55 1820073 11737 205 semibin2 5 \n", + "6 1.63 76.99 2790948 7117 450 semibin2 6 \n", + "7 2.45 67.67 1245031 5061 253 semibin2 7 \n", + "8 1.33 50.68 1728690 4913 367 semibin2 8 
\n", + "9 1.31 49.30 2609451 5292 511 semibin2 9 \n", + "\n", + " High quality bin completeness - 2*contamination \\\n", + "0 True 99.90 \n", + "1 True 99.50 \n", + "2 True 96.93 \n", + "3 True 95.42 \n", + "4 True 88.56 \n", + "5 True 87.81 \n", + "6 False 80.05 \n", + "7 False 68.11 \n", + "8 False 65.79 \n", + "9 False 57.44 \n", + "10 False 54.70 \n", + "11 False 49.54 \n", + "12 False 48.57 \n", + "13 False 42.21 \n", + "0 True 99.24 \n", + "1 True 99.08 \n", + "2 True 92.08 \n", + "3 True 85.79 \n", + "4 False 83.75 \n", + "6 False 64.83 \n", + "0 True 90.19 \n", + "1 True 86.86 \n", + "3 False 46.41 \n", + "0 True 99.42 \n", + "1 True 92.73 \n", + "2 True 91.68 \n", + "3 False 81.34 \n", + "4 False 79.53 \n", + "5 False 76.89 \n", + "6 False 76.31 \n", + "7 False 60.24 \n", + "8 False 45.77 \n", + "0 True 99.82 \n", + "1 True 99.36 \n", + "2 True 93.15 \n", + "3 True 92.07 \n", + "4 False 78.59 \n", + "5 False 78.55 \n", + "6 False 76.99 \n", + "7 False 67.67 \n", + "8 False 50.68 \n", + "9 False 49.30 \n", + "\n", + " Contamination ≀ 10 and
Completeness \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 90% \n", + "5 > 90% \n", + "6 > 70% and ≀ 90% \n", + "7 > 70% and ≀ 90% \n", + "8 > 70% and ≀ 90% \n", + "9 > 50% and ≀ 70% \n", + "10 > 50% and ≀ 70% \n", + "11 > 50% and ≀ 70% \n", + "12 > 50% and ≀ 70% \n", + "13 > 50% and ≀ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 70% and ≀ 90% \n", + "6 > 70% and ≀ 90% \n", + "0 > 90% \n", + "1 > 90% \n", + "3 > 50% and ≀ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 70% and ≀ 90% \n", + "4 > 70% and ≀ 90% \n", + "5 > 70% and ≀ 90% \n", + "6 > 70% and ≀ 90% \n", + "7 > 70% and ≀ 90% \n", + "8 > 50% and ≀ 70% \n", + "0 > 90% \n", + "1 > 90% \n", + "2 > 90% \n", + "3 > 90% \n", + "4 > 70% and ≀ 90% \n", + "5 > 70% and ≀ 90% \n", + "6 > 70% and ≀ 90% \n", + "7 > 70% and ≀ 90% \n", + "8 > 50% and ≀ 70% \n", + "9 > 50% and ≀ 70% " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contamination_cutoff = 10\n", + "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", + "high_completeness_filt = df_bins['completeness'] > 90\n", + "medium_completeness_filt = df_bins['completeness'] > 70\n", + "low_completeness_filt = df_bins['completeness'] > 50\n", + "\n", + "quality = f'Contamination ≀ {contamination_cutoff} and
Completeness'\n", + "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≀ 70%'\n", + "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≀ 90%'\n", + "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n", + "df_bins.loc[~df_bins[quality].isna()]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fa71ff37-9846-4826-a4bb-6c4b0069cea0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Contamination ≀ 10 and<br>Completenesstoolbin_count
0> 50% and ≀ 70%binette5
1> 50% and ≀ 70%maxbin21
2> 50% and ≀ 70%metabat21
3> 50% and ≀ 70%semibin22
4> 70% and ≀ 90%binette3
5> 70% and ≀ 90%concoct2
6> 70% and ≀ 90%metabat25
7> 70% and ≀ 90%semibin24
8> 90%binette6
9> 90%concoct4
10> 90%maxbin22
11> 90%metabat23
12> 90%semibin24
\n", + "
" + ], + "text/plain": [ + " Contamination ≀ 10 and
Completeness tool bin_count\n", + "0 > 50% and ≀ 70% binette 5\n", + "1 > 50% and ≀ 70% maxbin2 1\n", + "2 > 50% and ≀ 70% metabat2 1\n", + "3 > 50% and ≀ 70% semibin2 2\n", + "4 > 70% and ≀ 90% binette 3\n", + "5 > 70% and ≀ 90% concoct 2\n", + "6 > 70% and ≀ 90% metabat2 5\n", + "7 > 70% and ≀ 90% semibin2 4\n", + "8 > 90% binette 6\n", + "9 > 90% concoct 4\n", + "10 > 90% maxbin2 2\n", + "11 > 90% metabat2 3\n", + "12 > 90% semibin2 4" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_bins.groupby([quality, 'tool']).value_counts(ascending=True).reset_index()\n", + "\n", + "df_bins_quality_grouped = df_bins.groupby([quality, 'tool']).agg(bin_count=('bin_id', 'count')).reset_index()\n", + "df_bins_quality_grouped" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "250def29-167e-4a3b-8194-282f602945c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "color_discrete_map={\"> 90%\": px.colors.qualitative.Prism[4],\n", + " \"> 70% and ≀ 90%\": px.colors.qualitative.Prism[2],\n", + " \"> 50% and ≀ 70%\": px.colors.qualitative.Prism[6]}\n", + "\n", + "fig = px.bar(df_bins_quality_grouped, x='tool', y=\"bin_count\", color=quality,\n", + " barmode='stack', color_discrete_map=color_discrete_map, text=\"bin_count\",\n", + " category_orders={\"tool\":[\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", + " opacity = 0.9)#[ \"#008c8a\", px.colors.qualitative.Safe[4], '#2596be'])\n", + "\n", + "fig.update_layout(\n", + " width=800,\n", + " height=500,\n", + " legend=dict(\n", + " traceorder=\"reversed\",\n", + " ))\n", + "fig" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md new file mode 100644 index 0000000..cd51e76 --- /dev/null +++ b/docs/tutorial/assembly.md @@ -0,0 +1,43 @@ + + +## Assemble the reads + +We will use megahit to assemble the reads + +```bash + +cd /home/jmainguy/Analysis/Binette/Binette_tutorial/ncezid-biome_datasets/exec_tutorial_jupyter +``` + +```bash + +megahit -1 coal-metagenomics/Kickstart_1.fastq.gz -2 coal-metagenomics/Kickstart_2.fastq.gz --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 + +``` + + +This take 27m49,879s + +```{note} +We can use spade as well. It performs generally better that megahit but is generally longer and consume more memory than megahit. See cami benchmark ??? 
+``` + + + +## Align the reads over the assembly + +First we need to map the reads back against the assembly to get coverage information + +```bash + +mkdir -p alignments_bwa/ + +bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa + +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | samtools view -@ 12 -bS - | samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam + +samtools index alignments_bwa/Kickstart.bam + +``` + +This take around 12 minutes \ No newline at end of file diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md new file mode 100644 index 0000000..687398f --- /dev/null +++ b/docs/tutorial/binette.md @@ -0,0 +1,7 @@ +## Run Binette + +```{code-cell} bash + +binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ -c Kickstart.megahit/R1.contigs.fa --verbose -t 12 -o binette_results + +``` \ No newline at end of file diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md new file mode 100644 index 0000000..36efbed --- /dev/null +++ b/docs/tutorial/binning.md @@ -0,0 +1,67 @@ + +## Run binning tools + + +### metabat2 + +We first generate a depth file from the bam file using jgi_summarize_bam_contig_depths script from metabat2. This depth file will be used also with maxbin2. 
+```bash + +jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam +``` + +Now we can run metabat2: + +```bash + +metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 + +``` + + +### maxbin2 + +We use the depth file produced by `jgi_summarize_bam_contig_depths` + +```bash + +mkdir -p maxbin2 +run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 + +``` + +### concoct + +Then we can also run concoct with the folowing commands: + +```bash + +mkdir -p concoct/ + +cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 --overlap_size 0 --merge_last --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa + +concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv + +concoct --composition_file concoct/contigs_10K.fa --coverage_file concoct/coverage_table.tsv --basename concoct/bins --threads 12 + +merge_cutup_clustering.py concoct/bins_clustering_gt1000.csv > concoct/clustering_merge.csv + +mkdir -p concoct/bins + +extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.csv --output_path concoct/bins +``` + +### SemiBin2 + +We can launch semibin2 as well with its `single_easy_bin` command. + +```{note} +This take some time so it can be skipped. +``` + +```bash + +SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa -b alignments_bwa/Kickstart.bam -o semibin2/ -p 12 + +``` + diff --git a/docs/tutorial/set_env_and_get_data.md b/docs/tutorial/set_env_and_get_data.md new file mode 100644 index 0000000..b950a44 --- /dev/null +++ b/docs/tutorial/set_env_and_get_data.md @@ -0,0 +1,78 @@ + +## Set tutorial environment + +We will download necessary tool in a dedicated conda envrionnement. 
+ + + +Let's create a directory to run the tutorial: + + +```bash + +mamba env create -f binette_tutorial_env.yaml -n binette_tuto + +``` + + + +## Get the Data + +### Using ncezid-biome datasets tool + +I downloaded the metagenome Kickstart from the above dataset (SAMN05024035) that correspond to this sra SRR5058924 https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa + + +We will donwload the data of the Kickstart (SAMN05024035) dataset this repository that https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb + +We had use conda as detailed here https://github.com/ncezid-biome/datasets/blob/master/INSTALL.md#conda + +Now we can download the Kickstart dataset with the folowing commands. + +We first download the coal-metagenomic table from the github repository : https://github.com/ncezid-biome/datasets/blob/master/datasets/coal-metagenomics.tsv +ANd just select the line corresponding to the Kickstart dataset. + + + + +```bash +# download the coal-metagenomic tsv file from the github repository +wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv + +# select the header of the table as it is necessary for the download + +head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv +grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv + +GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics.tsv + +``` + +It takes around 16min to run + +You should hae the folowing structure +``` +β”œβ”€β”€ coal-metagenomics_Kickstart_only.tsv +└── data + β”œβ”€β”€ in.tsv + β”œβ”€β”€ Kickstart_1.fastq.gz + β”œβ”€β”€ Kickstart_1.fastq.sha256 + β”œβ”€β”€ Kickstart_2.fastq.gz + β”œβ”€β”€ Kickstart_2.fastq.sha256 + β”œβ”€β”€ Makefile + β”œβ”€β”€ prefetch.done + β”œβ”€β”€ sha256sum.log + β”œβ”€β”€ SRR5058924 + β”‚Β Β  └── SRR5058924.sra + └── tree.dnd + + +``` + +```{tip} +You can remove the SRA file `data/SRR5058924/SRR5058924.sra` as we do not need 
it anymore as we will exclusively use the fastq files. with `rm data/SRR5058924/SRR5058924.sra` +``` + +```{note} +You can also download the data using SRA toolkit which what the tool does in the background but add some check sum to ensure data integrity. After instaling sra toolkit (with conda for example : https://anaconda.org/bioconda/sra-tools) you can run the two commands folowing commands to retrived the data: `prefetch SRR5058924` and `fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra` +``` \ No newline at end of file diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md new file mode 100644 index 0000000..1d35a5a --- /dev/null +++ b/docs/tutorial/tutorial_main.md @@ -0,0 +1,52 @@ + +# Tutorial + +The goal of this tutorial is to show an example of commands on how Binette can be used on real data. We will start ou journey from metagenomics reads that we gonna download, then we will assemble these reads in contigs that we will bin with different binning tool. I finally we will use Binette to refine those bins. 
+ + + +```{toctree} +:caption: 'Tutorial' +:maxdepth: 2 + +set_env_and_get_data +assembly +binning +binette +analyse_binette_result.ipynb +``` + + + + + + diff --git a/pyproject.toml b/pyproject.toml index 8ff3ff8..06553ae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,11 @@ doc = [ "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", - "docutils==0.18.1" -] + "docutils==0.18.1", + "myst-nb", + "nbsphinx" + ] + dev = [ "pytest>=7.0.0", "pytest-cov" From a1f983c2fd8b60c6e8be6230f5b4f451ef910d7b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:00:04 +0200 Subject: [PATCH 15/36] adjust nbsphinx version in doc deps --- pyproject.toml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 06553ae..9bbee26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,11 +44,10 @@ doc = [ "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", - "docutils==0.18.1", - "myst-nb", - "nbsphinx" + "docutils==0.18.1", #"myst-nb", + "nbsphinx==0.9.5" ] - + dev = [ "pytest>=7.0.0", "pytest-cov" From 41ef85e7396a93b98543378c920ef275eea6d155 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:05:56 +0200 Subject: [PATCH 16/36] manage sphinx deps --- docs/conf.py | 4 ++-- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index bb2e303..eae8582 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,8 +28,8 @@ "myst_parser", 'nbsphinx', 'nbsphinx_link', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', + # 'sphinx.ext.napoleon', + # 'sphinx.ext.viewcode', "myst_parser", ] diff --git a/pyproject.toml b/pyproject.toml index 9bbee26..51802de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,8 @@ doc = [ "sphinx-autobuild==2021.3.14", "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", - "nbsphinx==0.9.5" + "nbsphinx==0.9.5", + nbsphinx_link==1.3.0 ] 
dev = [ From 6e670cf61b454cf1ea7c6b57a525146cbb864dab Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:07:08 +0200 Subject: [PATCH 17/36] add missing quote --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 51802de..6687291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ doc = [ "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", - nbsphinx_link==1.3.0 + "nbsphinx_link==1.3.0" ] dev = [ From 863352070f51d7eca1d50e96706d6b405192bde8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 15:25:33 +0200 Subject: [PATCH 18/36] test with sphinx-book-theme --- docs/conf.py | 3 ++- pyproject.toml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index eae8582..ff33a85 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,7 +55,8 @@ # -- Options for HTML output ------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output -html_theme = 'sphinx_rtd_theme' #'alabaster' # +# html_theme = 'sphinx_rtd_theme' #'alabaster' # +html_theme = 'sphinx_book_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/pyproject.toml b/pyproject.toml index 6687291..6fad059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,8 @@ doc = [ "myst-parser==1.0.0", "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", - "nbsphinx_link==1.3.0" + "nbsphinx_link==1.3.0", + "sphinx-book-theme==1.0.1" ] dev = [ From ba8c70b67de9b5b7ccd1fc85010d43a33de174b3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 17:59:14 +0200 Subject: [PATCH 19/36] improve tutorial --- docs/conf.py | 17 +++++- docs/tutorial/assembly.md | 56 ++++++++++--------- docs/tutorial/binette.md | 56 +++++++++++++++++-- docs/tutorial/binning.md | 70 ++++++++++++++---------- docs/tutorial/set_env_and_get_data.md | 78 --------------------------- docs/tutorial/set_environment.md | 27 ++++++++++ docs/tutorial/tutorial_main.md | 73 +++++++++++++++++++++++-- 7 files changed, 236 insertions(+), 141 deletions(-) delete mode 100644 docs/tutorial/set_env_and_get_data.md create mode 100644 docs/tutorial/set_environment.md diff --git a/docs/conf.py b/docs/conf.py index ff33a85..37654c6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -31,9 +31,22 @@ # 'sphinx.ext.napoleon', # 'sphinx.ext.viewcode', "myst_parser", - + 'sphinxcontrib.mermaid' +] +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "fieldlist", + "html_admonition", + "html_image", + "replacements", + "smartquotes", + "strikethrough", + "substitution", + "tasklist", ] - source_suffix = { '.md': 'markdown', diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index cd51e76..c7104ed 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -1,43 +1,49 @@ +## Assemble the Reads +We will use **MEGAHIT** to assemble the reads from our dataset. 
Run the following command: -## Assemble the reads - -We will use megahit to assemble the reads - -```bash - -cd /home/jmainguy/Analysis/Binette/Binette_tutorial/ncezid-biome_datasets/exec_tutorial_jupyter +```{code-block} bash +megahit -1 coal-metagenomics/Kickstart_1.fastq.gz \ + -2 coal-metagenomics/Kickstart_2.fastq.gz \ + --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 ``` -```bash +:::{admonition} βŒ› Expected Time +:class: note -megahit -1 coal-metagenomics/Kickstart_1.fastq.gz -2 coal-metagenomics/Kickstart_2.fastq.gz --out-dir Kickstart.megahit --out-prefix R1 --num-cpu-threads 12 - -``` +This process takes approximately 28 minutes to complete. +::: -This take 27m49,879s +```{admonition} Note +:class: note -```{note} -We can use spade as well. It performs generally better that megahit but is generally longer and consume more memory than megahit. See cami benchmark ??? +You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. Refer to the CAMI benchmark for a detailed comparison. ``` +## Align the Reads Over the Assembly +To get coverage information, we first need to map the reads back to the assembly. 
-## Align the reads over the assembly - -First we need to map the reads back against the assembly to get coverage information - -```bash - +```{code-block} bash +# Create a directory for the alignments mkdir -p alignments_bwa/ +# Index the contigs file using BWA-MEM2 bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa -bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | samtools view -@ 12 -bS - | samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam - -samtools index alignments_bwa/Kickstart.bam +# Map reads back to the assembly, convert to BAM format, and sort +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ +samtools view -@ 12 -bS - | \ +samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam +# Index the BAM file +samtools index alignments_bwa/Kickstart.bam ``` - -This take around 12 minutes \ No newline at end of file + + +:::{admonition} βŒ› Expected Time +:class: note + +This process takes approximately 12 minutes to complete. +::: diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md index 687398f..926ce3b 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -1,7 +1,55 @@ -## Run Binette -```{code-cell} bash +## Run Binette -binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ -c Kickstart.megahit/R1.contigs.fa --verbose -t 12 -o binette_results +Binette will use the previously computed bins to refine and improve them, generating a new set of higher-quality bins. 
+ +To run Binette, use the following command: + +```bash +binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ + -c Kickstart.megahit/R1.contigs.fa \ + --verbose -t 12 -o binette_results +``` + +Once Binette completes, the `binette_results` directory should have the following structure: + +```plaintext +binette_results/ +β”œβ”€β”€ final_bins +β”‚ β”œβ”€β”€ bin_13475.fa +β”‚ β”œβ”€β”€ bin_17075.fa +β”‚ β”œβ”€β”€ bin_19689.fa +β”‚ β”œβ”€β”€ bin_21248.fa +β”‚ β”œβ”€β”€ bin_31703.fa +β”‚ β”œβ”€β”€ bin_33569.fa +β”‚ β”œβ”€β”€ bin_39350.fa +β”‚ β”œβ”€β”€ bin_39427.fa +β”‚ β”œβ”€β”€ bin_39558.fa +β”‚ β”œβ”€β”€ bin_44137.fa +β”‚ β”œβ”€β”€ bin_46775.fa +β”‚ β”œβ”€β”€ bin_47060.fa +β”‚ β”œβ”€β”€ bin_47177.fa +β”‚ β”œβ”€β”€ bin_47926.fa +β”‚ └── bin_51082.fa +β”œβ”€β”€ final_bins_quality_reports.tsv +β”œβ”€β”€ input_bins_quality_reports +β”‚ β”œβ”€β”€ input_bins_1.concoct_bins.tsv +β”‚ β”œβ”€β”€ input_bins_2.maxbin2.tsv +β”‚ β”œβ”€β”€ input_bins_3.metabat2.tsv +β”‚ └── input_bins_4.semibin2_output_bins.tsv +└── temporary_files + β”œβ”€β”€ assembly_proteins.faa + β”œβ”€β”€ diamond_result.log + └── diamond_result.tsv +``` + +### Key Output Files: + +- **`final_bins/`**: Contains the refined bins in FASTA format. +- **`final_bins_quality_reports.tsv`**: A summary report containing CheckM2 metrics for the final bin selection. +- **`input_bins_quality_reports/`**: Quality reports for each of the input bin sets from MaxBin2, MetaBAT2, CONCOCT, and SemiBin2. + +### Next Steps + +In the next section, we will use `final_bins_quality_reports.tsv` along with the reports from `binette_results/input_bins_quality_reports` to visualize Binette's bins and compare them with the initial bin sets. 
-``` \ No newline at end of file diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index 36efbed..8624cf6 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,49 +1,62 @@ +## Run Binning Tools -## Run binning tools - +In this section, we'll use different binning tools to group contigs of assembly. -### metabat2 +### MetaBAT2 -We first generate a depth file from the bam file using jgi_summarize_bam_contig_depths script from metabat2. This depth file will be used also with maxbin2. -```bash +First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used for MaxBin2. -jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam +```bash +jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam ``` -Now we can run metabat2: +Now, run MetaBAT2 with the generated depth file: ```bash +metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 +``` -metabat2 --inFile Kickstart.megahit/R1.contigs.fa --abdFile depth_Kickstart.txt --outFile metabat2/metabat2 --numThreads 12 --seed 1 +### MaxBin2 +We will use the same depth file produced by `jgi_summarize_bam_contig_depths` for MetaBAT2: + +```bash +mkdir -p maxbin2 +run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa \ + -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 ``` +### CONCOCT -### maxbin2 +To run CONCOCT, follow these steps: -We use the depth file produced by `jgi_summarize_bam_contig_depths` +1. 
**Cut up the FASTA file** into chunks for processing: ```bash +mkdir -p concoct/ -mkdir -p maxbin2 -run_MaxBin.pl -contig Kickstart.megahit/R1.contigs.fa -abund depth_Kickstart.txt -thread 12 -out maxbin2/maxbin2 - +cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 \ + --overlap_size 0 --merge_last \ + --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa ``` -### concoct - -Then we can also run concoct with the folowing commands: +2. **Generate the coverage table** from the BAM file: ```bash +concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv +``` -mkdir -p concoct/ - -cut_up_fasta.py Kickstart.megahit/R1.contigs.fa --chunk_size 10000 --overlap_size 0 --merge_last --bedfile concoct/contigs_10K.bed > concoct/contigs_10K.fa +3. **Run CONCOCT** with the composition and coverage files: -concoct_coverage_table.py concoct/contigs_10K.bed alignments_bwa/Kickstart.bam > concoct/coverage_table.tsv +```bash +concoct --composition_file concoct/contigs_10K.fa \ + --coverage_file concoct/coverage_table.tsv \ + --basename concoct/bins --threads 12 +``` -concoct --composition_file concoct/contigs_10K.fa --coverage_file concoct/coverage_table.tsv --basename concoct/bins --threads 12 +4. **Merge the clustering results** and extract bins: +```bash merge_cutup_clustering.py concoct/bins_clustering_gt1000.csv > concoct/clustering_merge.csv mkdir -p concoct/bins @@ -53,15 +66,16 @@ extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.c ### SemiBin2 -We can launch semibin2 as well with its `single_easy_bin` command. +You can also run SemiBin2 with its `single_easy_bin` command: + +```{admonition} ⏳ Time Note +:class: note -```{note} -This take some time so it can be skipped. +This process can take some time, so it may be skipped. 
``` ```bash - -SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa -b alignments_bwa/Kickstart.bam -o semibin2/ -p 12 - +SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ + -b alignments_bwa/Kickstart.bam \ + -o semibin2/ -p 12 ``` - diff --git a/docs/tutorial/set_env_and_get_data.md b/docs/tutorial/set_env_and_get_data.md deleted file mode 100644 index b950a44..0000000 --- a/docs/tutorial/set_env_and_get_data.md +++ /dev/null @@ -1,78 +0,0 @@ - -## Set tutorial environment - -We will download necessary tool in a dedicated conda envrionnement. - - - -Let's create a directory to run the tutorial: - - -```bash - -mamba env create -f binette_tutorial_env.yaml -n binette_tuto - -``` - - - -## Get the Data - -### Using ncezid-biome datasets tool - -I downloaded the metagenome Kickstart from the above dataset (SAMN05024035) that correspond to this sra SRR5058924 https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa - - -We will donwload the data of the Kickstart (SAMN05024035) dataset this repository that https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb - -We had use conda as detailed here https://github.com/ncezid-biome/datasets/blob/master/INSTALL.md#conda - -Now we can download the Kickstart dataset with the folowing commands. - -We first download the coal-metagenomic table from the github repository : https://github.com/ncezid-biome/datasets/blob/master/datasets/coal-metagenomics.tsv -ANd just select the line corresponding to the Kickstart dataset. 
- - - - -```bash -# download the coal-metagenomic tsv file from the github repository -wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv - -# select the header of the table as it is necessary for the download - -head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv -grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv - -GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics.tsv - -``` - -It takes around 16min to run - -You should hae the folowing structure -``` -β”œβ”€β”€ coal-metagenomics_Kickstart_only.tsv -└── data - β”œβ”€β”€ in.tsv - β”œβ”€β”€ Kickstart_1.fastq.gz - β”œβ”€β”€ Kickstart_1.fastq.sha256 - β”œβ”€β”€ Kickstart_2.fastq.gz - β”œβ”€β”€ Kickstart_2.fastq.sha256 - β”œβ”€β”€ Makefile - β”œβ”€β”€ prefetch.done - β”œβ”€β”€ sha256sum.log - β”œβ”€β”€ SRR5058924 - β”‚Β Β  └── SRR5058924.sra - └── tree.dnd - - -``` - -```{tip} -You can remove the SRA file `data/SRR5058924/SRR5058924.sra` as we do not need it anymore as we will exclusively use the fastq files. with `rm data/SRR5058924/SRR5058924.sra` -``` - -```{note} -You can also download the data using SRA toolkit which what the tool does in the background but add some check sum to ensure data integrity. After instaling sra toolkit (with conda for example : https://anaconda.org/bioconda/sra-tools) you can run the two commands folowing commands to retrived the data: `prefetch SRR5058924` and `fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra` -``` \ No newline at end of file diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md new file mode 100644 index 0000000..52d6092 --- /dev/null +++ b/docs/tutorial/set_environment.md @@ -0,0 +1,27 @@ +## Set Up the Tutorial Environment + +To get started, we'll download the necessary tools and set them up in a dedicated Conda environment. 
+ +### Create a Conda Environment + +First, let's create a new Conda environment specifically for this tutorial: + +```{code-block} bash +mamba env create -f binette_tutorial_env.yaml -n binette_tuto +``` + +This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. + +### Activate the Environment + +After the environment is created, activate it by running: + +```{code-block} bash +conda activate binette_tuto +``` + +Below is the content of the `binette_tutorial_env.yaml` file: + +```{include} binette_tutorial_env.yaml +:code: yaml +``` diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 1d35a5a..d21931c 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,15 +1,80 @@ # Tutorial -The goal of this tutorial is to show an example of commands on how Binette can be used on real data. We will start ou journey from metagenomics reads that we gonna download, then we will assemble these reads in contigs that we will bin with different binning tool. I finally we will use Binette to refine those bins. +In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs. Finally, we'll use Binette to refine these bins and improve our results. 
+```{mermaid} +--- +title: "Tutorial Overview:" +align: center +--- + +%%{init: {'theme':'default'}}%% + +graph LR + + A[Download Metagenomics Reads] --> B + B[Assemble Reads into Contigs] --> c + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + + C[Bin Contigs with Binning Tools] --> D[Refine Bins with Binette] + + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + +```{mermaid} + +--- +title: "Tutorial Overview:" +align: center +--- + + +graph TD + + i[Get Metagenomics Reads] --> B[Assembly & Reads alignment] + + + B --> metabat2 --> r[Binette] + B --> maxbin2 --> r + B --> concoct --> r + B --> semibin2 --> r + + subgraph Binning + metabat2:::binning + maxbin2:::binning + concoct:::binning + semibin2:::binning + end + + + classDef binning fill:#d4ae40 + + +``` ```{toctree} -:caption: 'Tutorial' -:maxdepth: 2 +:caption: 'Tutorial steps' +:maxdepth: 1 -set_env_and_get_data +set_environment +get_dataset assembly binning binette From e285579e27147a28b97fc37ec2acd91e05c7601b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 2024 18:00:28 +0200 Subject: [PATCH 20/36] rm print --- binette/io_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binette/io_manager.py b/binette/io_manager.py index 5899722..04213e4 100644 --- a/binette/io_manager.py +++ b/binette/io_manager.py @@ -66,7 +66,7 @@ def infer_bin_set_names_from_input_paths(input_bins: List[Path]) -> Dict[str, Pa bin_name_to_bin_dir = {} common_prefix, common_suffix, common_extensions = get_paths_common_prefix_suffix(input_bins) - print(common_prefix, common_suffix, common_extensions ) + for path in input_bins: specific_parts = path.parts[len(common_prefix):len(path.parts)-len(common_suffix)] From de93964300e81baa9878aa390e10c0038f981d69 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 3 Sep 
2024 18:02:13 +0200 Subject: [PATCH 21/36] add mermaid deps --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6fad059..8f12e86 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,8 @@ doc = [ "docutils==0.18.1", #"myst-nb", "nbsphinx==0.9.5", "nbsphinx_link==1.3.0", - "sphinx-book-theme==1.0.1" + "sphinx-book-theme==1.0.1", + "sphinxcontrib.mermaid" ] dev = [ From 7f2c52f06a1c04b102dcffa0877802e0c6cdd222 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 12:04:41 +0200 Subject: [PATCH 22/36] improve tutorial --- docs/conf.py | 4 +- docs/tutorial/analyse_binette_result.ipynb | 1365 ++++------------- docs/tutorial/assembly.md | 21 +- .../final_bins_quality_reports.tsv | 16 + .../input_bins_1.concoct_bins.tsv | 64 + .../input_bins_2.maxbin2.tsv | 24 + .../input_bins_3.metabat2.tsv | 15 + .../input_bins_4.semibin2_output_bins.tsv | 26 + docs/tutorial/binning.md | 44 +- docs/tutorial/tutorial_main.md | 52 +- 10 files changed, 538 insertions(+), 1093 deletions(-) create mode 100644 docs/tutorial/binette_results/final_bins_quality_reports.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv create mode 100644 docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv diff --git a/docs/conf.py b/docs/conf.py index 37654c6..245781e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,7 +61,7 @@ # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api", "jupyter_execute"] @@ -69,7 +69,7 @@ # 
https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output # html_theme = 'sphinx_rtd_theme' #'alabaster' # -html_theme = 'sphinx_book_theme' +html_theme = 'sphinx_rtd_theme' #'sphinx_book_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/tutorial/analyse_binette_result.ipynb b/docs/tutorial/analyse_binette_result.ipynb index 30247a8..94d1fc8 100644 --- a/docs/tutorial/analyse_binette_result.ipynb +++ b/docs/tutorial/analyse_binette_result.ipynb @@ -2,31 +2,57 @@ "cells": [ { "cell_type": "markdown", - "id": "52e7f39c", + "id": "edcb3b82", "metadata": {}, "source": [ - "## Analyse Binette results" + "## Analyse Binette results\n", + "\n", + "Let's visualize the results from Binette and compare them to the initial bin sets used as input. " + ] + }, + { + "cell_type": "markdown", + "id": "dbe1d73b", + "metadata": {}, + "source": [ + "### Import Necessary Libraries\n", + "\n", + "First, we'll need to import the necessary libraries for our analysis and plotting:" ] }, { "cell_type": "code", "execution_count": 1, - "id": "e6a1e1ee-681d-4823-b974-7027bafd2ba9", + "id": "9e9153ef", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from pathlib import Path\n", "import plotly.express as px\n", + "\n", + "# This is needed to properly display Plotly graphs in the documentation\n", "import plotly.io as pio\n", "pio.renderers.default = \"sphinx_gallery\"" ] }, + { + "cell_type": "markdown", + "id": "b93e8a0e", + "metadata": {}, + "source": [ + "### Load Binette Results\n", + "\n", + "Now, let's load the final Binette quality report into a Pandas DataFrame:" + ] + }, { "cell_type": "code", "execution_count": 2, - "id": "34e80119-f59b-41b0-b0e5-de2d6ed0c6a3", - "metadata": {}, + "id": "d95ad45c", + "metadata": { + "lines_to_next_cell": 0 + }, "outputs": [ { "data": { @@ -321,15 +347,25 @@ "source": [ 
"binette_result_file = \"./binette_results/final_bins_quality_reports.tsv\"\n", "df_binette = pd.read_csv(binette_result_file, sep='\\t')\n", - "df_binette['tool'] = \"binette\"\n", - "df_binette['index'] = df_binette.index\n", + "df_binette['tool'] = \"binette\" # Add a column to label the tool\n", + "df_binette['index'] = df_binette.index # Add an index column\n", "df_binette" ] }, + { + "cell_type": "markdown", + "id": "c1372a73", + "metadata": {}, + "source": [ + "### Load and Combine Input Bin Quality Reports\n", + "\n", + "Next, we will load the quality reports of the input bin sets, computed by various tools and saved by Binette. We’ll combine these into a single DataFrame and add a column to indicate high-quality bins. We define a high-quality bin as one with contamination ≀ 5% and completeness β‰₯ 90%." + ] + }, { "cell_type": "code", "execution_count": 3, - "id": "189038d3-77a0-435a-9590-4d8b3038341e", + "id": "fcb016f2", "metadata": {}, "outputs": [ { @@ -353,98 +389,134 @@ " \n", " \n", " \n", + " tool\n", " completeness\n", " contamination\n", - " tool\n", + " size\n", + " N50\n", + " contig_count\n", " \n", " \n", " \n", " \n", " 0\n", + " binette\n", " 100.00\n", " 0.05\n", - " binette\n", + " 4672665\n", + " 82084\n", + " 93\n", " \n", " \n", " 1\n", + " binette\n", " 99.90\n", " 0.20\n", - " binette\n", + " 2796605\n", + " 41151\n", + " 98\n", " \n", " \n", " 2\n", + " binette\n", " 98.59\n", " 0.83\n", - " binette\n", + " 4601336\n", + " 41016\n", + " 165\n", " \n", " \n", " 3\n", + " binette\n", " 96.10\n", " 0.34\n", - " binette\n", + " 2598718\n", + " 11891\n", + " 312\n", " \n", " \n", " 4\n", + " binette\n", " 91.98\n", " 1.71\n", - " binette\n", + " 1768095\n", + " 9976\n", + " 250\n", " \n", " \n", " ...\n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", + " ...\n", " \n", " \n", " 20\n", + " semibin2\n", " 8.28\n", " 0.01\n", - " semibin2\n", + " 358822\n", + " 3296\n", + " 106\n", " \n", " \n", " 21\n", + " semibin2\n", " 8.12\n", 
" 0.02\n", - " semibin2\n", + " 353499\n", + " 3949\n", + " 90\n", " \n", " \n", " 22\n", + " semibin2\n", " 7.74\n", " 0.01\n", - " semibin2\n", + " 351540\n", + " 4284\n", + " 85\n", " \n", " \n", " 23\n", + " semibin2\n", " 6.18\n", " 0.00\n", - " semibin2\n", + " 250833\n", + " 3607\n", + " 66\n", " \n", " \n", " 24\n", + " semibin2\n", " 4.41\n", " 0.13\n", - " semibin2\n", + " 217541\n", + " 3425\n", + " 64\n", " \n", " \n", "\n", - "

140 rows Γ— 3 columns

\n", + "

140 rows Γ— 6 columns

\n", "" ], "text/plain": [ - " completeness contamination tool\n", - "0 100.00 0.05 binette\n", - "1 99.90 0.20 binette\n", - "2 98.59 0.83 binette\n", - "3 96.10 0.34 binette\n", - "4 91.98 1.71 binette\n", - ".. ... ... ...\n", - "20 8.28 0.01 semibin2\n", - "21 8.12 0.02 semibin2\n", - "22 7.74 0.01 semibin2\n", - "23 6.18 0.00 semibin2\n", - "24 4.41 0.13 semibin2\n", + " tool completeness contamination size N50 contig_count\n", + "0 binette 100.00 0.05 4672665 82084 93\n", + "1 binette 99.90 0.20 2796605 41151 98\n", + "2 binette 98.59 0.83 4601336 41016 165\n", + "3 binette 96.10 0.34 2598718 11891 312\n", + "4 binette 91.98 1.71 1768095 9976 250\n", + ".. ... ... ... ... ... ...\n", + "20 semibin2 8.28 0.01 358822 3296 106\n", + "21 semibin2 8.12 0.02 353499 3949 90\n", + "22 semibin2 7.74 0.01 351540 4284 85\n", + "23 semibin2 6.18 0.00 250833 3607 66\n", + "24 semibin2 4.41 0.13 217541 3425 64\n", "\n", - "[140 rows x 3 columns]" + "[140 rows x 6 columns]" ] }, "execution_count": 3, @@ -453,36 +525,51 @@ } ], "source": [ + "from pathlib import Path\n", + "\n", "input_bins_quality_reports_dir = Path(\"binette_results/input_bins_quality_reports/\")\n", "\n", + "# Initialize the list with Binette results\n", "df_input_bin_list = [df_binette]\n", + "\n", + "# Load each input bin quality report\n", "for input_bin_metric_file in input_bins_quality_reports_dir.glob(\"*tsv\"):\n", - " tool = input_bin_metric_file.name.split('.')[1].split('_')[0]\n", + " tool = input_bin_metric_file.name.split('.')[1].split('_')[0] # Extract tool name from file name\n", " df_input = pd.read_csv(input_bin_metric_file, sep='\\t')\n", " df_input['index'] = df_input.index\n", " df_input['tool'] = tool\n", " df_input_bin_list.append(df_input)\n", "\n", - "df_bins = pd.concat(df_input_bin_list)\n", - " \n", - "set(df_bins['tool'])\n", - "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", - "#df_binette = pd.read_csv(binette_result_file, 
sep='\\t')\n", - "#df_binette\n", - "df_bins[[\"completeness\", \"contamination\", \"tool\"]]\n" + "# Combine all DataFrames into one\n", + "df_bins = pd.concat(df_input_bin_list)\n", + "\n", + "# Add a column to indicate high-quality bins\n", + "df_bins[\"High quality bin\"] = (df_bins['completeness'] >= 90) & (df_bins['contamination'] <= 5)\n", + "\n", + "# Display relevant columns\n", + "df_bins[[ \"tool\", \"completeness\", \"contamination\", \"size\", \"N50\", \"contig_count\"]]\n" + ] + }, + { + "cell_type": "markdown", + "id": "80ef2544", + "metadata": {}, + "source": [ + "### Plot bin completeness and contamination\n", + "With the DataFrame containing both Binette’s final bins and the input bins, we can now create a scatter plot to visualize the results:" ] }, { "cell_type": "code", "execution_count": 4, - "id": "911d598f-a6c7-4178-aff2-6059235e7fc4", + "id": "277cb781", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
" + "
" ] }, "metadata": {}, @@ -490,25 +577,63 @@ } ], "source": [ - "fig = px.scatter(df_bins, x=\"completeness\",y=\"contamination\", color=\"High quality bin\", size=\"size\", facet_row=\"tool\")\n", + "import plotly.express as px\n", + "\n", + "# Create a scatter plot to visualize completeness and contamination\n", + "fig = px.scatter(df_bins, \n", + " x=\"completeness\", \n", + " y=\"contamination\", \n", + " color=\"High quality bin\", \n", + " size=\"size\", \n", + " facet_row=\"tool\",\n", + " title=\"Bin Quality Comparison\",\n", + " )\n", + "\n", + "# Update layout for better visibility\n", "fig.update_layout(\n", - " width=800,\n", - " height=800)\n", - " \n", + " width=600,\n", + " height=800,\n", + " legend_title=\"High Quality Bin\",\n", + " title=\"Comparison of Bin Quality Metrics\"\n", + ")\n", + "\n", + "# Show the plot\n", "fig.show()" ] }, + { + "cell_type": "markdown", + "id": "06a14412", + "metadata": {}, + "source": [ + "We can see that binette bins are the one displaying the most high quality bins (completeness β‰₯ 90% and contamination ≀ 5%).\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "323f5637", + "metadata": {}, + "source": [ + "### Comparing Binning Tools Using Bin Score Curves\n", + "\n", + "A common way to compare bin sets is by sorting the bins based on their scores and plotting them against their index.\n", + "\n", + "Here’s how we can create such a plot:" + ] + }, { "cell_type": "code", "execution_count": 5, - "id": "35c46beb-1ac9-4014-9672-91edcc1bf439", + "id": "79faaa3a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "
" + "
" ] }, "metadata": {}, @@ -516,18 +641,79 @@ } ], "source": [ - "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2*df_bins['contamination']\n", - "fig = px.line(df_bins, x=\"index\",y='completeness - 2*contamination', color=\"tool\",markers=True)\n", - "fig.update_layout(\n", - " width=800,\n", - " height=500)\n", + "# Calculate the score for each bin\n", + "df_bins['completeness - 2*contamination'] = df_bins['completeness'] - 2 * df_bins['contamination']\n", + "\n", + "# Plot the score against the bin index\n", + "fig = px.line(df_bins, x=\"index\", y='completeness - 2*contamination', color=\"tool\", markers=True)\n", + "fig.update_layout(width=600, height=500)\n", "fig.show()" ] }, + { + "cell_type": "markdown", + "id": "97aee4d0", + "metadata": {}, + "source": [ + "From the plot, you might notice that Concoct has a lot of bins with lower quality scores. Let’s zoom in to get a better look:" + ] + }, { "cell_type": "code", "execution_count": 6, - "id": "af74bfb2-457c-4cf4-9c13-3ee9642be7ce", + "id": "063974f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Adjust the plot view to zoom in\n", + "fig.update_layout(\n", + " xaxis_range=[-1, 20], # Zoom on x-axis\n", + " yaxis_range=[0, 100], # Zoom on y-axis\n", + " width=600,\n", + " height=500\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "136b17e4", + "metadata": {}, + "source": [ + "Binette line consistently appears above the other binning tools. This indicates that Binette produce higher-quality bins compared to the initial bin sets." + ] + }, + { + "cell_type": "markdown", + "id": "46f1b3d0", + "metadata": {}, + "source": [ + "### Plot Number of High-Quality Bins per Bin Set\n", + "\n", + "Let's plot the number of bins falling into different quality categories. We’ll focus on bins with a maximum of 10% contamination and classify them into three completeness categories:\n", + "\n", + "- **`> 50% and ≀ 70%`**\n", + "- **`> 70% and ≀ 90%`**\n", + "- **`> 90%`**\n", + "\n", + "First, let’s group and count the bins in each category:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "943f88b4", "metadata": {}, "outputs": [ { @@ -551,1006 +737,47 @@ " \n", " \n", " \n", - " bin_id\n", - " origin\n", - " name\n", - " completeness\n", - " contamination\n", - " score\n", - " size\n", - " N50\n", - " contig_count\n", - " tool\n", - " index\n", - " High quality bin\n", - " completeness - 2*contamination\n", " Contamination ≀ 10 and<br>Completeness\n", + " tool\n", + " bin_count\n", " \n", " \n", " \n", " \n", " 0\n", - " 17075\n", - " diff\n", - " 44 - 10\n", - " 100.00\n", - " 0.05\n", - " 99.90\n", - " 4672665\n", - " 82084\n", - " 93\n", + " > 50% and ≀ 70%\n", " binette\n", - " 0\n", - " True\n", - " 99.90\n", - " > 90%\n", + " 5\n", " \n", " \n", " 1\n", - " 39427\n", - " diff\n", - " 36 - 6\n", - " 99.90\n", - " 0.20\n", - " 99.50\n", - " 2796605\n", - " 41151\n", - " 98\n", - " binette\n", + " > 50% and ≀ 70%\n", + " maxbin2\n", " 
1\n", - " True\n", - " 99.50\n", - " > 90%\n", " \n", " \n", " 2\n", - " 47060\n", - " union\n", - " 58 | 33\n", - " 98.59\n", - " 0.83\n", - " 96.93\n", - " 4601336\n", - " 41016\n", - " 165\n", - " binette\n", - " 2\n", - " True\n", - " 96.93\n", - " > 90%\n", + " > 50% and ≀ 70%\n", + " metabat2\n", + " 1\n", " \n", " \n", " 3\n", - " 47177\n", - " union\n", - " 91 | 25 | 55\n", - " 96.10\n", - " 0.34\n", - " 95.42\n", - " 2598718\n", - " 11891\n", - " 312\n", - " binette\n", - " 3\n", - " True\n", - " 95.42\n", - " > 90%\n", + " > 50% and ≀ 70%\n", + " semibin2\n", + " 2\n", " \n", " \n", " 4\n", - " 21248\n", - " diff\n", - " 65 - 8 - 28\n", - " 91.98\n", - " 1.71\n", - " 88.56\n", - " 1768095\n", - " 9976\n", - " 250\n", + " > 70% and ≀ 90%\n", " binette\n", - " 4\n", - " True\n", - " 88.56\n", - " > 90%\n", + " 3\n", " \n", " \n", " 5\n", - " 44137\n", - " diff\n", - " 76 - 13 - 28\n", - " 92.63\n", - " 2.41\n", - " 87.81\n", - " 3726254\n", - " 5669\n", - " 850\n", - " binette\n", - " 5\n", - " True\n", - " 87.81\n", - " > 90%\n", - " \n", - " \n", - " 6\n", - " 31703\n", - " diff\n", - " 31 - 7 - 61\n", - " 81.73\n", - " 0.84\n", - " 80.05\n", - " 1665233\n", - " 8518\n", - " 248\n", - " binette\n", - " 6\n", - " False\n", - " 80.05\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 7\n", - " 13475\n", - " diff\n", - " 47 - 37\n", - " 72.89\n", - " 2.39\n", - " 68.11\n", - " 1241829\n", - " 5061\n", - " 252\n", - " binette\n", - " 7\n", - " False\n", - " 68.11\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 8\n", - " 47926\n", - " union\n", - " 75 | 30\n", - " 74.31\n", - " 4.26\n", - " 65.79\n", - " 3293949\n", - " 2954\n", - " 1262\n", - " binette\n", - " 8\n", - " False\n", - " 65.79\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 9\n", - " 46775\n", - " union\n", - " 42 | 102\n", - " 62.94\n", - " 2.75\n", - " 57.44\n", - " 1293571\n", - " 3783\n", - " 419\n", - " binette\n", - " 9\n", - " False\n", - " 57.44\n", - " > 50% and ≀ 70%\n", - " \n", - " 
\n", - " 10\n", - " 33569\n", - " diff\n", - " 83 - 7 - 38 - 31\n", - " 59.18\n", - " 2.24\n", - " 54.70\n", - " 2042527\n", - " 4437\n", - " 514\n", - " binette\n", - " 10\n", - " False\n", - " 54.70\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 11\n", - " 39350\n", - " diff\n", - " 57 - 16 - 75\n", - " 52.16\n", - " 1.31\n", - " 49.54\n", - " 2601282\n", - " 5332\n", - " 509\n", - " binette\n", - " 11\n", - " False\n", - " 49.54\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 12\n", - " 39558\n", - " diff\n", - " 78 - 6 - 43\n", - " 64.63\n", - " 8.03\n", - " 48.57\n", - " 1858210\n", - " 1430\n", - " 1293\n", - " binette\n", - " 12\n", - " False\n", - " 48.57\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 13\n", - " 51082\n", - " union\n", - " 120 | 1\n", - " 52.33\n", - " 5.06\n", - " 42.21\n", - " 688879\n", - " 1446\n", - " 472\n", - " binette\n", - " 13\n", - " False\n", - " 42.21\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 0\n", - " 125\n", - " concoct/bins\n", - " 9.fa\n", - " 100.00\n", - " 0.38\n", - " 99.24\n", - " 3033586\n", - " 37523\n", - " 131\n", - " concoct\n", - " 0\n", - " True\n", - " 99.24\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 67\n", - " concoct/bins\n", - " 41.fa\n", - " 100.00\n", - " 0.46\n", - " 99.08\n", - " 4765466\n", - " 82084\n", - " 101\n", - " concoct\n", - " 1\n", - " True\n", - " 99.08\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 91\n", - " concoct/bins\n", - " 7.fa\n", - " 92.76\n", - " 0.34\n", - " 92.08\n", - " 2274951\n", - " 12187\n", - " 265\n", - " concoct\n", - " 2\n", - " True\n", - " 92.08\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 76\n", - " concoct/bins\n", - " 6.fa\n", - " 92.63\n", - " 3.42\n", - " 85.79\n", - " 3751950\n", - " 5674\n", - " 855\n", - " concoct\n", - " 3\n", - " True\n", - " 85.79\n", - " > 90%\n", - " \n", - " \n", - " 4\n", - " 65\n", - " concoct/bins\n", - " 62.fa\n", - " 87.35\n", - " 1.80\n", - " 83.75\n", - " 1917859\n", - " 10911\n", - " 259\n", - " 
concoct\n", - " 4\n", - " False\n", - " 83.75\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 6\n", - " 75\n", - " concoct/bins\n", - " 48.fa\n", - " 73.35\n", - " 4.26\n", - " 64.83\n", - " 3285374\n", - " 2950\n", - " 1261\n", - " concoct\n", - " 6\n", - " False\n", - " 64.83\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 0\n", - " 22\n", - " maxbin2\n", - " maxbin2.001.fasta\n", - " 99.81\n", - " 4.81\n", - " 90.19\n", - " 4616818\n", - " 89436\n", - " 133\n", - " maxbin2\n", - " 0\n", - " True\n", - " 90.19\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 14\n", - " maxbin2\n", - " maxbin2.002.fasta\n", - " 93.92\n", - " 3.53\n", - " 86.86\n", - " 2874373\n", - " 37523\n", - " 195\n", - " maxbin2\n", - " 1\n", - " True\n", - " 86.86\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 5\n", - " maxbin2\n", - " maxbin2.009.fasta\n", - " 62.69\n", - " 8.14\n", - " 46.41\n", - " 2438492\n", - " 6141\n", - " 604\n", - " maxbin2\n", - " 3\n", - " False\n", - " 46.41\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 0\n", - " 36\n", - " metabat2\n", - " metabat2.14.fa\n", - " 99.90\n", - " 0.24\n", - " 99.42\n", - " 2799572\n", - " 41151\n", - " 99\n", - " metabat2\n", - " 0\n", - " True\n", - " 99.42\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 25\n", - " metabat2\n", - " metabat2.8.fa\n", - " 93.17\n", - " 0.22\n", - " 92.73\n", - " 2148097\n", - " 12225\n", - " 226\n", - " metabat2\n", - " 1\n", - " True\n", - " 92.73\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 33\n", - " metabat2\n", - " metabat2.12.fa\n", - " 93.52\n", - " 0.92\n", - " 91.68\n", - " 4266134\n", - " 39217\n", - " 157\n", - " metabat2\n", - " 2\n", - " True\n", - " 91.68\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 27\n", - " metabat2\n", - " metabat2.11.fa\n", - " 84.40\n", - " 1.53\n", - " 81.34\n", - " 1902761\n", - " 11352\n", - " 218\n", - " metabat2\n", - " 3\n", - " False\n", - " 81.34\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 4\n", - " 37\n", - " metabat2\n", - " 
metabat2.1.fa\n", - " 84.99\n", - " 2.73\n", - " 79.53\n", - " 2980526\n", - " 6876\n", - " 502\n", - " metabat2\n", - " 4\n", - " False\n", - " 79.53\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 5\n", - " 31\n", - " metabat2\n", - " metabat2.2.fa\n", - " 83.21\n", - " 3.16\n", - " 76.89\n", - " 1807028\n", - " 7852\n", - " 274\n", - " metabat2\n", - " 5\n", - " False\n", - " 76.89\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 6\n", - " 35\n", - " metabat2\n", - " metabat2.4.fa\n", - " 76.53\n", - " 0.11\n", - " 76.31\n", - " 3477636\n", - " 82084\n", - " 71\n", - " metabat2\n", - " 6\n", - " False\n", - " 76.31\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 7\n", - " 29\n", - " metabat2\n", - " metabat2.7.fa\n", - " 71.78\n", - " 5.77\n", - " 60.24\n", - " 1384653\n", - " 4937\n", - " 292\n", - " metabat2\n", - " 7\n", - " False\n", - " 60.24\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 8\n", - " 24\n", - " metabat2\n", - " metabat2.3.fa\n", - " 51.75\n", - " 2.99\n", - " 45.77\n", - " 1707078\n", - " 4929\n", - " 362\n", - " metabat2\n", - " 8\n", - " False\n", - " 45.77\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 0\n", - " 44\n", - " semibin2/output_bins\n", - " SemiBin_27.fa.gz\n", - " 100.00\n", - " 0.09\n", - " 99.82\n", - " 4681369\n", - " 82084\n", - " 94\n", - " semibin2\n", - " 0\n", - " True\n", - " 99.82\n", - " > 90%\n", - " \n", - " \n", - " 1\n", - " 53\n", - " semibin2/output_bins\n", - " SemiBin_33.fa.gz\n", - " 99.92\n", - " 0.28\n", - " 99.36\n", - " 2937678\n", - " 37523\n", - " 113\n", - " semibin2\n", - " 1\n", - " True\n", - " 99.36\n", - " > 90%\n", - " \n", - " \n", - " 2\n", - " 50\n", - " semibin2/output_bins\n", - " SemiBin_10.fa.gz\n", - " 93.43\n", - " 0.14\n", - " 93.15\n", - " 2129295\n", - " 12519\n", - " 216\n", - " semibin2\n", - " 2\n", - " True\n", - " 93.15\n", - " > 90%\n", - " \n", - " \n", - " 3\n", - " 62\n", - " semibin2/output_bins\n", - " SemiBin_24.fa.gz\n", - " 92.13\n", - " 0.03\n", - " 
92.07\n", - " 4162911\n", - " 40395\n", - " 139\n", - " semibin2\n", - " 3\n", - " True\n", - " 92.07\n", - " > 90%\n", - " \n", - " \n", - " 4\n", - " 38\n", - " semibin2/output_bins\n", - " SemiBin_26.fa.gz\n", - " 83.09\n", - " 2.25\n", - " 78.59\n", - " 1674156\n", - " 8389\n", - " 245\n", - " semibin2\n", - " 4\n", - " False\n", - " 78.59\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 5\n", - " 49\n", - " semibin2/output_bins\n", - " SemiBin_32.fa.gz\n", - " 81.87\n", - " 1.66\n", - " 78.55\n", - " 1820073\n", - " 11737\n", - " 205\n", - " semibin2\n", - " 5\n", - " False\n", - " 78.55\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 6\n", - " 60\n", - " semibin2/output_bins\n", - " SemiBin_22.fa.gz\n", - " 80.25\n", - " 1.63\n", - " 76.99\n", - " 2790948\n", - " 7117\n", - " 450\n", - " semibin2\n", - " 6\n", - " False\n", - " 76.99\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 7\n", - " 47\n", - " semibin2/output_bins\n", - " SemiBin_11.fa.gz\n", - " 72.57\n", - " 2.45\n", - " 67.67\n", - " 1245031\n", - " 5061\n", - " 253\n", - " semibin2\n", - " 7\n", - " False\n", - " 67.67\n", - " > 70% and ≀ 90%\n", - " \n", - " \n", - " 8\n", - " 61\n", - " semibin2/output_bins\n", - " SemiBin_3.fa.gz\n", - " 53.34\n", - " 1.33\n", - " 50.68\n", - " 1728690\n", - " 4913\n", - " 367\n", - " semibin2\n", - " 8\n", - " False\n", - " 50.68\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - " 9\n", - " 57\n", - " semibin2/output_bins\n", - " SemiBin_12.fa.gz\n", - " 51.92\n", - " 1.31\n", - " 49.30\n", - " 2609451\n", - " 5292\n", - " 511\n", - " semibin2\n", - " 9\n", - " False\n", - " 49.30\n", - " > 50% and ≀ 70%\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " bin_id origin name completeness \\\n", - "0 17075 diff 44 - 10 100.00 \n", - "1 39427 diff 36 - 6 99.90 \n", - "2 47060 union 58 | 33 98.59 \n", - "3 47177 union 91 | 25 | 55 96.10 \n", - "4 21248 diff 65 - 8 - 28 91.98 \n", - "5 44137 diff 76 - 13 - 28 92.63 \n", - "6 31703 diff 31 - 7 - 61 
81.73 \n", - "7 13475 diff 47 - 37 72.89 \n", - "8 47926 union 75 | 30 74.31 \n", - "9 46775 union 42 | 102 62.94 \n", - "10 33569 diff 83 - 7 - 38 - 31 59.18 \n", - "11 39350 diff 57 - 16 - 75 52.16 \n", - "12 39558 diff 78 - 6 - 43 64.63 \n", - "13 51082 union 120 | 1 52.33 \n", - "0 125 concoct/bins 9.fa 100.00 \n", - "1 67 concoct/bins 41.fa 100.00 \n", - "2 91 concoct/bins 7.fa 92.76 \n", - "3 76 concoct/bins 6.fa 92.63 \n", - "4 65 concoct/bins 62.fa 87.35 \n", - "6 75 concoct/bins 48.fa 73.35 \n", - "0 22 maxbin2 maxbin2.001.fasta 99.81 \n", - "1 14 maxbin2 maxbin2.002.fasta 93.92 \n", - "3 5 maxbin2 maxbin2.009.fasta 62.69 \n", - "0 36 metabat2 metabat2.14.fa 99.90 \n", - "1 25 metabat2 metabat2.8.fa 93.17 \n", - "2 33 metabat2 metabat2.12.fa 93.52 \n", - "3 27 metabat2 metabat2.11.fa 84.40 \n", - "4 37 metabat2 metabat2.1.fa 84.99 \n", - "5 31 metabat2 metabat2.2.fa 83.21 \n", - "6 35 metabat2 metabat2.4.fa 76.53 \n", - "7 29 metabat2 metabat2.7.fa 71.78 \n", - "8 24 metabat2 metabat2.3.fa 51.75 \n", - "0 44 semibin2/output_bins SemiBin_27.fa.gz 100.00 \n", - "1 53 semibin2/output_bins SemiBin_33.fa.gz 99.92 \n", - "2 50 semibin2/output_bins SemiBin_10.fa.gz 93.43 \n", - "3 62 semibin2/output_bins SemiBin_24.fa.gz 92.13 \n", - "4 38 semibin2/output_bins SemiBin_26.fa.gz 83.09 \n", - "5 49 semibin2/output_bins SemiBin_32.fa.gz 81.87 \n", - "6 60 semibin2/output_bins SemiBin_22.fa.gz 80.25 \n", - "7 47 semibin2/output_bins SemiBin_11.fa.gz 72.57 \n", - "8 61 semibin2/output_bins SemiBin_3.fa.gz 53.34 \n", - "9 57 semibin2/output_bins SemiBin_12.fa.gz 51.92 \n", - "\n", - " contamination score size N50 contig_count tool index \\\n", - "0 0.05 99.90 4672665 82084 93 binette 0 \n", - "1 0.20 99.50 2796605 41151 98 binette 1 \n", - "2 0.83 96.93 4601336 41016 165 binette 2 \n", - "3 0.34 95.42 2598718 11891 312 binette 3 \n", - "4 1.71 88.56 1768095 9976 250 binette 4 \n", - "5 2.41 87.81 3726254 5669 850 binette 5 \n", - "6 0.84 80.05 1665233 8518 248 binette 6 
\n", - "7 2.39 68.11 1241829 5061 252 binette 7 \n", - "8 4.26 65.79 3293949 2954 1262 binette 8 \n", - "9 2.75 57.44 1293571 3783 419 binette 9 \n", - "10 2.24 54.70 2042527 4437 514 binette 10 \n", - "11 1.31 49.54 2601282 5332 509 binette 11 \n", - "12 8.03 48.57 1858210 1430 1293 binette 12 \n", - "13 5.06 42.21 688879 1446 472 binette 13 \n", - "0 0.38 99.24 3033586 37523 131 concoct 0 \n", - "1 0.46 99.08 4765466 82084 101 concoct 1 \n", - "2 0.34 92.08 2274951 12187 265 concoct 2 \n", - "3 3.42 85.79 3751950 5674 855 concoct 3 \n", - "4 1.80 83.75 1917859 10911 259 concoct 4 \n", - "6 4.26 64.83 3285374 2950 1261 concoct 6 \n", - "0 4.81 90.19 4616818 89436 133 maxbin2 0 \n", - "1 3.53 86.86 2874373 37523 195 maxbin2 1 \n", - "3 8.14 46.41 2438492 6141 604 maxbin2 3 \n", - "0 0.24 99.42 2799572 41151 99 metabat2 0 \n", - "1 0.22 92.73 2148097 12225 226 metabat2 1 \n", - "2 0.92 91.68 4266134 39217 157 metabat2 2 \n", - "3 1.53 81.34 1902761 11352 218 metabat2 3 \n", - "4 2.73 79.53 2980526 6876 502 metabat2 4 \n", - "5 3.16 76.89 1807028 7852 274 metabat2 5 \n", - "6 0.11 76.31 3477636 82084 71 metabat2 6 \n", - "7 5.77 60.24 1384653 4937 292 metabat2 7 \n", - "8 2.99 45.77 1707078 4929 362 metabat2 8 \n", - "0 0.09 99.82 4681369 82084 94 semibin2 0 \n", - "1 0.28 99.36 2937678 37523 113 semibin2 1 \n", - "2 0.14 93.15 2129295 12519 216 semibin2 2 \n", - "3 0.03 92.07 4162911 40395 139 semibin2 3 \n", - "4 2.25 78.59 1674156 8389 245 semibin2 4 \n", - "5 1.66 78.55 1820073 11737 205 semibin2 5 \n", - "6 1.63 76.99 2790948 7117 450 semibin2 6 \n", - "7 2.45 67.67 1245031 5061 253 semibin2 7 \n", - "8 1.33 50.68 1728690 4913 367 semibin2 8 \n", - "9 1.31 49.30 2609451 5292 511 semibin2 9 \n", - "\n", - " High quality bin completeness - 2*contamination \\\n", - "0 True 99.90 \n", - "1 True 99.50 \n", - "2 True 96.93 \n", - "3 True 95.42 \n", - "4 True 88.56 \n", - "5 True 87.81 \n", - "6 False 80.05 \n", - "7 False 68.11 \n", - "8 False 65.79 \n", - "9 False 
57.44 \n", - "10 False 54.70 \n", - "11 False 49.54 \n", - "12 False 48.57 \n", - "13 False 42.21 \n", - "0 True 99.24 \n", - "1 True 99.08 \n", - "2 True 92.08 \n", - "3 True 85.79 \n", - "4 False 83.75 \n", - "6 False 64.83 \n", - "0 True 90.19 \n", - "1 True 86.86 \n", - "3 False 46.41 \n", - "0 True 99.42 \n", - "1 True 92.73 \n", - "2 True 91.68 \n", - "3 False 81.34 \n", - "4 False 79.53 \n", - "5 False 76.89 \n", - "6 False 76.31 \n", - "7 False 60.24 \n", - "8 False 45.77 \n", - "0 True 99.82 \n", - "1 True 99.36 \n", - "2 True 93.15 \n", - "3 True 92.07 \n", - "4 False 78.59 \n", - "5 False 78.55 \n", - "6 False 76.99 \n", - "7 False 67.67 \n", - "8 False 50.68 \n", - "9 False 49.30 \n", - "\n", - " Contamination ≀ 10 and
Completeness \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 90% \n", - "5 > 90% \n", - "6 > 70% and ≀ 90% \n", - "7 > 70% and ≀ 90% \n", - "8 > 70% and ≀ 90% \n", - "9 > 50% and ≀ 70% \n", - "10 > 50% and ≀ 70% \n", - "11 > 50% and ≀ 70% \n", - "12 > 50% and ≀ 70% \n", - "13 > 50% and ≀ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 70% and ≀ 90% \n", - "6 > 70% and ≀ 90% \n", - "0 > 90% \n", - "1 > 90% \n", - "3 > 50% and ≀ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 70% and ≀ 90% \n", - "4 > 70% and ≀ 90% \n", - "5 > 70% and ≀ 90% \n", - "6 > 70% and ≀ 90% \n", - "7 > 70% and ≀ 90% \n", - "8 > 50% and ≀ 70% \n", - "0 > 90% \n", - "1 > 90% \n", - "2 > 90% \n", - "3 > 90% \n", - "4 > 70% and ≀ 90% \n", - "5 > 70% and ≀ 90% \n", - "6 > 70% and ≀ 90% \n", - "7 > 70% and ≀ 90% \n", - "8 > 50% and ≀ 70% \n", - "9 > 50% and ≀ 70% " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "contamination_cutoff = 10\n", - "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", - "high_completeness_filt = df_bins['completeness'] > 90\n", - "medium_completeness_filt = df_bins['completeness'] > 70\n", - "low_completeness_filt = df_bins['completeness'] > 50\n", - "\n", - "quality = f'Contamination ≀ {contamination_cutoff} and
Completeness'\n", - "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≀ 70%'\n", - "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≀ 90%'\n", - "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n", - "df_bins.loc[~df_bins[quality].isna()]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "fa71ff37-9846-4826-a4bb-6c4b0069cea0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1621,23 +848,45 @@ } ], "source": [ - "df_bins.groupby([quality, 'tool']).value_counts(ascending=True).reset_index()\n", + "# Define the contamination cutoff\n", + "contamination_cutoff = 10\n", + "\n", + "# Create filters for completeness categories\n", + "low_contamination_filt = df_bins['contamination'] <= contamination_cutoff\n", + "high_completeness_filt = df_bins['completeness'] > 90\n", + "medium_completeness_filt = df_bins['completeness'] > 70\n", + "low_completeness_filt = df_bins['completeness'] > 50\n", "\n", + "# Define quality categories\n", + "quality = f'Contamination ≀ {contamination_cutoff} and
Completeness'\n",
+    "df_bins.loc[low_contamination_filt & low_completeness_filt, quality] = '> 50% and ≀ 70%'\n",
+    "df_bins.loc[low_contamination_filt & medium_completeness_filt, quality] = '> 70% and ≀ 90%'\n",
+    "df_bins.loc[low_contamination_filt & high_completeness_filt, quality] = '> 90%'\n",
+    "\n",
+    "# Group and count bins by quality category and tool\n",
     "df_bins_quality_grouped = df_bins.groupby([quality, 'tool']).agg(bin_count=('bin_id', 'count')).reset_index()\n",
     "df_bins_quality_grouped"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6eec391a",
+   "metadata": {},
+   "source": [
+    "Now, let’s create a bar plot to visualize the number of bins in each quality category for each bin set:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "250def29-167e-4a3b-8194-282f602945c8",
+   "id": "36ce51ac",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "
\n", - "
" + "
" ] }, "metadata": {}, @@ -1645,22 +894,42 @@ } ], "source": [ - "color_discrete_map={\"> 90%\": px.colors.qualitative.Prism[4],\n", - " \"> 70% and ≀ 90%\": px.colors.qualitative.Prism[2],\n", - " \"> 50% and ≀ 70%\": px.colors.qualitative.Prism[6]}\n", + "# Define colors for each completeness category\n", + "color_discrete_map = {\n", + " \"> 90%\": px.colors.qualitative.Prism[4],\n", + " \"> 70% and ≀ 90%\": px.colors.qualitative.Prism[2],\n", + " \"> 50% and ≀ 70%\": px.colors.qualitative.Prism[6]\n", + "}\n", "\n", - "fig = px.bar(df_bins_quality_grouped, x='tool', y=\"bin_count\", color=quality,\n", - " barmode='stack', color_discrete_map=color_discrete_map, text=\"bin_count\",\n", - " category_orders={\"tool\":[\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", - " opacity = 0.9)#[ \"#008c8a\", px.colors.qualitative.Safe[4], '#2596be'])\n", + "# Create the bar plot\n", + "fig = px.bar(\n", + " df_bins_quality_grouped, \n", + " x='tool', \n", + " y=\"bin_count\", \n", + " color=quality,\n", + " barmode='stack', \n", + " color_discrete_map=color_discrete_map, \n", + " text=\"bin_count\",\n", + " category_orders={\"tool\": [\"binette\", \"semibin2\", \"concoct\", \"metabat2\", \"maxbin2\"]},\n", + " opacity=0.9\n", + ")\n", "\n", + "# Update layout for better appearance\n", "fig.update_layout(\n", - " width=800,\n", - " height=500,\n", - " legend=dict(\n", - " traceorder=\"reversed\",\n", - " ))\n", - "fig" + " width=600,\n", + " height=500,\n", + " legend=dict(traceorder=\"reversed\")\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f78d0f29", + "metadata": {}, + "source": [ + "From the plot, you can see that Binette produces more high-quality bins compared to the initial bin sets! 
πŸŽ‰" ] } ], @@ -1680,7 +949,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.8.19" } }, "nbformat": 4, diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index c7104ed..d2084bb 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -18,9 +18,28 @@ This process takes approximately 28 minutes to complete. ```{admonition} Note :class: note -You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. Refer to the CAMI benchmark for a detailed comparison. +You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. ``` + +```{admonition} Best Practices +:class: tip + +Here are some general tips that might help improve your assembly results, depending on your data: + +- **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. + +- **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. + +- **Assembly Filtering:** After assembling, it’s often a good idea to filter out small or low-coverage contigs. + + +These steps aren’t mandatory, and since this tutorial focuses on binning and using Binette, we’ll skip them for now. + +``` + + + ## Align the Reads Over the Assembly To get coverage information, we first need to map the reads back to the assembly. 
diff --git a/docs/tutorial/binette_results/final_bins_quality_reports.tsv b/docs/tutorial/binette_results/final_bins_quality_reports.tsv new file mode 100644 index 0000000..f575212 --- /dev/null +++ b/docs/tutorial/binette_results/final_bins_quality_reports.tsv @@ -0,0 +1,16 @@ +bin_id origin name completeness contamination score size N50 contig_count +17075 diff 44 - 10 100.0 0.05 99.9 4672665 82084 93 +39427 diff 36 - 6 99.9 0.2 99.5 2796605 41151 98 +47060 union 58 | 33 98.59 0.83 96.93 4601336 41016 165 +47177 union 91 | 25 | 55 96.1 0.34 95.41999999999999 2598718 11891 312 +21248 diff 65 - 8 - 28 91.98 1.71 88.56 1768095 9976 250 +44137 diff 76 - 13 - 28 92.63 2.41 87.81 3726254 5669 850 +31703 diff 31 - 7 - 61 81.73 0.84 80.05 1665233 8518 248 +13475 diff 47 - 37 72.89 2.39 68.11 1241829 5061 252 +47926 union 75 | 30 74.31 4.26 65.79 3293949 2954 1262 +46775 union 42 | 102 62.94 2.75 57.44 1293571 3783 419 +33569 diff 83 - 7 - 38 - 31 59.18 2.24 54.7 2042527 4437 514 +39350 diff 57 - 16 - 75 52.16 1.31 49.54 2601282 5332 509 +39558 diff 78 - 6 - 43 64.63 8.03 48.56999999999999 1858210 1430 1293 +51082 union 120 | 1 52.33 5.06 42.21 688879 1446 472 +19689 diff 118 - 18 - 61 - 31 48.22 8.23 31.759999999999998 1782676 1402 1265 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv new file mode 100644 index 0000000..f4a995a --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_1.concoct_bins.tsv @@ -0,0 +1,64 @@ +bin_id origin name completeness contamination score size N50 contig_count +125 concoct/bins 9.fa 100.0 0.38 99.24 3033586 37523 131 +67 concoct/bins 41.fa 100.0 0.46 99.08 4765466 82084 101 +91 concoct/bins 7.fa 92.76 0.34 92.08 2274951 12187 265 +76 concoct/bins 6.fa 92.63 3.42 85.78999999999999 3751950 5674 855 +65 concoct/bins 62.fa 87.35 1.8 83.75 1917859 10911 259 +109 concoct/bins 
31.fa 100.0 14.25 71.5 7431952 25567 536 +75 concoct/bins 48.fa 73.35 4.26 64.83 3285374 2950 1261 +78 concoct/bins 2.fa 69.37 13.16 43.050000000000004 2385110 1471 1615 +113 concoct/bins 39.fa 46.66 3.76 39.14 809087 1511 527 +120 concoct/bins 29.fa 35.92 0.57 34.78 480789 1467 325 +86 concoct/bins 51.fa 45.9 7.36 31.18 4821129 1550 3079 +117 concoct/bins 20.fa 32.76 1.09 30.58 717576 1577 464 +118 concoct/bins 11.fa 47.24 9.31 28.62 1944164 1431 1340 +83 concoct/bins 26.fa 99.96 37.25 25.459999999999994 3984942 5809 809 +104 concoct/bins 38.fa 25.03 1.18 22.67 445168 1362 322 +119 concoct/bins 19.fa 35.1 7.86 19.380000000000003 2403536 1599 1464 +82 concoct/bins 59.fa 100.0 40.63 18.739999999999995 8287537 3951 2241 +89 concoct/bins 3.fa 87.66 34.5 18.659999999999997 6341799 1568 4092 +121 concoct/bins 30.fa 87.47 34.51 18.450000000000003 3480539 4299 901 +102 concoct/bins 0.fa 17.11 0.07 16.97 344299 1699 211 +97 concoct/bins 43.fa 12.5 0.15 12.2 345166 1384 238 +110 concoct/bins 35.fa 8.66 0.01 8.64 483789 1273 355 +108 concoct/bins 52.fa 6.65 0.0 6.65 89878 2232 40 +63 concoct/bins 27.fa 6.62 0.0 6.62 1016 1016 1 +101 concoct/bins 24.fa 6.45 0.0 6.45 3381 1107 3 +124 concoct/bins 18.fa 6.48 0.02 6.44 193358 1267 148 +106 concoct/bins 36.fa 6.43 0.0 6.43 12090 1997 7 +123 concoct/bins 14.fa 6.38 0.0 6.38 4193 3113 2 +114 concoct/bins 60.fa 6.38 0.0 6.38 8476 2499 5 +93 concoct/bins 5.fa 6.38 0.0 6.38 5082 1686 3 +87 concoct/bins 28.fa 6.38 0.0 6.38 5015 1668 3 +80 concoct/bins 13.fa 6.38 0.0 6.38 5338 1601 3 +112 concoct/bins 50.fa 6.38 0.0 6.38 13671 1587 9 +96 concoct/bins 54.fa 6.38 0.0 6.38 2727 1576 2 +107 concoct/bins 58.fa 6.38 0.0 6.38 1491 1491 1 +74 concoct/bins 45.fa 6.38 0.0 6.38 2475 1448 2 +70 concoct/bins 22.fa 6.38 0.0 6.38 1344 1344 1 +116 concoct/bins 10.fa 6.38 0.0 6.38 2524 1332 2 +98 concoct/bins 25.fa 6.38 0.0 6.38 10545 1304 8 +71 concoct/bins 32.fa 6.38 0.0 6.38 2290 1266 2 +92 concoct/bins 57.fa 6.38 0.0 6.38 4999 1246 4 +105 
concoct/bins 34.fa 6.38 0.0 6.38 1240 1240 1 +66 concoct/bins 23.fa 6.38 0.0 6.38 1236 1236 1 +72 concoct/bins 1.fa 6.38 0.0 6.38 12304 1223 10 +88 concoct/bins 53.fa 6.38 0.0 6.38 1160 1160 1 +68 concoct/bins 4.fa 6.38 0.0 6.38 6739 1136 6 +64 concoct/bins 37.fa 6.38 0.0 6.38 1123 1123 1 +94 concoct/bins 33.fa 6.38 0.0 6.38 1032 1032 1 +69 concoct/bins 17.fa 6.05 0.0 6.05 8012 1402 6 +85 concoct/bins 55.fa 5.85 0.0 5.85 117297 100818 12 +79 concoct/bins 40.fa 5.58 0.0 5.58 16429 1658 10 +99 concoct/bins 8.fa 5.35 0.0 5.35 98557 1192 80 +122 concoct/bins 61.fa 10.6 3.22 4.159999999999999 173292 1225 136 +77 concoct/bins 42.fa 3.74 0.0 3.74 122021 3383 50 +90 concoct/bins 15.fa 3.68 0.01 3.66 106174 24244 6 +111 concoct/bins 49.fa 3.6 0.02 3.56 75967 2458 39 +84 concoct/bins 46.fa 3.32 0.0 3.32 55857 1166 47 +115 concoct/bins 16.fa 3.21 0.0 3.21 36685 1138 31 +100 concoct/bins 21.fa 2.98 0.01 2.96 20489 1588 12 +95 concoct/bins 56.fa 2.73 0.0 2.73 28603 1276 21 +73 concoct/bins 47.fa 2.67 0.0 2.67 48903 2372 23 +103 concoct/bins 12.fa 2.53 0.0 2.53 41153 1182 34 +81 concoct/bins 44.fa 2.5 0.0 2.5 44603 1410 30 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv new file mode 100644 index 0000000..d515b9d --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_2.maxbin2.tsv @@ -0,0 +1,24 @@ +bin_id origin name completeness contamination score size N50 contig_count +22 maxbin2 maxbin2.001.fasta 99.81 4.81 90.19 4616818 89436 133 +14 maxbin2 maxbin2.002.fasta 93.92 3.53 86.86 2874373 37523 195 +11 maxbin2 maxbin2.006.fasta 75.2 12.31 50.58 2634516 12326 626 +5 maxbin2 maxbin2.009.fasta 62.69 8.14 46.41 2438492 6141 604 +18 maxbin2 maxbin2.012.fasta 56.93 14.12 28.69 3473782 2619 1410 +7 maxbin2 maxbin2.011.fasta 40.68 7.03 26.619999999999997 2087133 6988 510 +23 maxbin2 maxbin2.010.fasta 45.34 9.62 
26.100000000000005 4743354 1971 2401 +3 maxbin2 maxbin2.018.fasta 80.35 27.53 25.289999999999992 5331237 4487 1756 +13 maxbin2 maxbin2.013.fasta 69.31 22.06 25.190000000000005 3958158 5259 1353 +21 maxbin2 maxbin2.007.fasta 34.6 4.79 25.020000000000003 1586278 12519 451 +6 maxbin2 maxbin2.021.fasta 42.81 9.69 23.430000000000003 1690737 2715 767 +19 maxbin2 maxbin2.020.fasta 27.99 2.4 23.189999999999998 1033153 3328 310 +20 maxbin2 maxbin2.014.fasta 26.95 2.05 22.85 1112378 1806 570 +10 maxbin2 maxbin2.008.fasta 56.41 17.23 21.949999999999996 3237421 2381 1425 +2 maxbin2 maxbin2.003.fasta 23.72 1.41 20.9 1419869 2539 575 +17 maxbin2 maxbin2.019.fasta 76.19 29.54 17.11 2765576 3328 1163 +1 maxbin2 maxbin2.023.fasta 27.26 6.52 14.220000000000002 454808 1432 314 +9 maxbin2 maxbin2.004.fasta 17.07 1.62 13.83 1180579 2361 491 +4 maxbin2 maxbin2.022.fasta 28.6 7.98 12.64 804525 1593 497 +8 maxbin2 maxbin2.005.fasta 11.11 0.04 11.03 488546 17602 45 +15 maxbin2 maxbin2.015.fasta 10.27 0.56 9.149999999999999 379048 3202 126 +16 maxbin2 maxbin2.016.fasta 4.92 0.0 4.92 103037 3558 49 +12 maxbin2 maxbin2.017.fasta 93.2 48.33 -3.4599999999999937 4710071 2372 2074 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv new file mode 100644 index 0000000..fdc6bdd --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_3.metabat2.tsv @@ -0,0 +1,15 @@ +bin_id origin name completeness contamination score size N50 contig_count +36 metabat2 metabat2.14.fa 99.9 0.24 99.42 2799572 41151 99 +25 metabat2 metabat2.8.fa 93.17 0.22 92.73 2148097 12225 226 +33 metabat2 metabat2.12.fa 93.52 0.92 91.67999999999999 4266134 39217 157 +27 metabat2 metabat2.11.fa 84.4 1.53 81.34 1902761 11352 218 +37 metabat2 metabat2.1.fa 84.99 2.73 79.53 2980526 6876 502 +31 metabat2 metabat2.2.fa 83.21 3.16 76.88999999999999 1807028 7852 274 +35 metabat2 
metabat2.4.fa 76.53 0.11 76.31 3477636 82084 71 +29 metabat2 metabat2.7.fa 71.78 5.77 60.24 1384653 4937 292 +24 metabat2 metabat2.3.fa 51.75 2.99 45.769999999999996 1707078 4929 362 +30 metabat2 metabat2.13.fa 44.85 0.49 43.870000000000005 1724699 4259 415 +26 metabat2 metabat2.10.fa 44.15 1.11 41.93 982239 4743 219 +32 metabat2 metabat2.5.fa 25.31 0.03 25.25 1077467 91995 14 +28 metabat2 metabat2.9.fa 98.03 37.1 23.83 8543557 4347 1974 +34 metabat2 metabat2.6.fa 7.06 0.03 7.0 252404 64012 6 diff --git a/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv new file mode 100644 index 0000000..c3a150b --- /dev/null +++ b/docs/tutorial/binette_results/input_bins_quality_reports/input_bins_4.semibin2_output_bins.tsv @@ -0,0 +1,26 @@ +bin_id origin name completeness contamination score size N50 contig_count +44 semibin2/output_bins SemiBin_27.fa.gz 100.0 0.09 99.82 4681369 82084 94 +53 semibin2/output_bins SemiBin_33.fa.gz 99.92 0.28 99.36 2937678 37523 113 +50 semibin2/output_bins SemiBin_10.fa.gz 93.43 0.14 93.15 2129295 12519 216 +62 semibin2/output_bins SemiBin_24.fa.gz 92.13 0.03 92.07 4162911 40395 139 +38 semibin2/output_bins SemiBin_26.fa.gz 83.09 2.25 78.59 1674156 8389 245 +49 semibin2/output_bins SemiBin_32.fa.gz 81.87 1.66 78.55000000000001 1820073 11737 205 +60 semibin2/output_bins SemiBin_22.fa.gz 80.25 1.63 76.99 2790948 7117 450 +47 semibin2/output_bins SemiBin_11.fa.gz 72.57 2.45 67.66999999999999 1245031 5061 253 +61 semibin2/output_bins SemiBin_3.fa.gz 53.34 1.33 50.68000000000001 1728690 4913 367 +57 semibin2/output_bins SemiBin_12.fa.gz 51.92 1.31 49.300000000000004 2609451 5292 511 +56 semibin2/output_bins SemiBin_17.fa.gz 47.29 0.37 46.55 1934420 4160 470 +42 semibin2/output_bins SemiBin_14.fa.gz 47.28 0.73 45.82 990463 4692 222 +51 semibin2/output_bins SemiBin_13.fa.gz 36.67 6.12 24.43 1699695 4402 395 +54 
semibin2/output_bins SemiBin_18.fa.gz 17.07 0.69 15.690000000000001 1131272 3943 277 +59 semibin2/output_bins SemiBin_15.fa.gz 14.04 1.01 12.02 884790 4349 206 +45 semibin2/output_bins SemiBin_20.fa.gz 9.95 0.01 9.93 515894 8389 67 +43 semibin2/output_bins SemiBin_5.fa.gz 9.95 0.05 9.85 513202 3891 131 +39 semibin2/output_bins SemiBin_35.fa.gz 9.45 0.0 9.45 213606 3336 63 +58 semibin2/output_bins SemiBin_84.fa.gz 8.7 0.0 8.7 358311 64012 9 +55 semibin2/output_bins SemiBin_66.fa.gz 8.66 0.19 8.28 290297 6707 44 +48 semibin2/output_bins SemiBin_52.fa.gz 8.28 0.01 8.26 358822 3296 106 +41 semibin2/output_bins SemiBin_19.fa.gz 8.12 0.02 8.08 353499 3949 90 +52 semibin2/output_bins SemiBin_6.fa.gz 7.74 0.01 7.720000000000001 351540 4284 85 +46 semibin2/output_bins SemiBin_37.fa.gz 6.18 0.0 6.18 250833 3607 66 +40 semibin2/output_bins SemiBin_80.fa.gz 4.41 0.13 4.15 217541 3425 64 diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index 8624cf6..bace377 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,10 +1,50 @@ + +## Align the Reads to the Assembly + +Binning tools rely on coverage information, among other criteria, to evaluate each contig. + +To obtain this coverage data, we first need to map the reads back to the assembly. + +```{code-block} bash +# Create a directory for the alignments +mkdir -p alignments_bwa/ + +# Index the contigs file using BWA-MEM2 +bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa + +# Map reads back to the assembly, convert to BAM format, and sort +bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ +samtools view -@ 12 -bS - | \ +samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam + +# Index the BAM file +samtools index alignments_bwa/Kickstart.bam +``` + + +:::{admonition} βŒ› Expected Time +:class: note + +This process takes approximately 12 minutes to complete. 
+::: + +```{admonition} +:class: tip + +If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. However, keep in mind that this process can be resource-intensive, especially with many samples. + +If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. πŸš€ + +``` + + ## Run Binning Tools -In this section, we'll use different binning tools to group contigs of assembly. +Let's use different binning tools to group the contigs into bins, which we'll refine in the next section with Binette. ### MetaBAT2 -First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used for MaxBin2. +First, generate a depth file from the BAM file using the `jgi_summarize_bam_contig_depths` script from MetaBAT2. This depth file will also be used by MaxBin2. ```bash jgi_summarize_bam_contig_depths --outputDepth depth_Kickstart.txt alignments_bwa/Kickstart.bam diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index d21931c..4493b3e 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,59 +1,30 @@ # Tutorial -In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs. Finally, we'll use Binette to refine these bins and improve our results. +In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. 
Next, we'll use different binning tools to group the contigs into bins. Finally, we'll use Binette to refine these bins. ```{mermaid} ---- -title: "Tutorial Overview:" -align: center ---- - -%%{init: {'theme':'default'}}%% - -graph LR - - A[Download Metagenomics Reads] --> B - B[Assemble Reads into Contigs] --> c - subgraph Pangenome creation - a:::workflow - c:::workflow - g:::workflow - p:::workflow - a("annotate") --> c - c(cluster) --> g(graph) - g(graph) --> p(partition) - end - - - C[Bin Contigs with Binning Tools] --> D[Refine Bins with Binette] - - - classDef panrgp fill:#4066d4 - classDef panmodule fill:#d44066 - classDef workflow fill:#d4ae40 - - -``` - -```{mermaid} - ---- +--- title: "Tutorial Overview:" align: center + +config: + look: handDrawn + theme: neutral --- graph TD - i[Get Metagenomics Reads] --> B[Assembly & Reads alignment] + i[metagenomics reads] --> B[assembly] - B --> metabat2 --> r[Binette] + B --> metabat2 --> r[binette] B --> maxbin2 --> r B --> concoct --> r B --> semibin2 --> r + r --> f[final bins] subgraph Binning metabat2:::binning @@ -62,8 +33,8 @@ graph TD semibin2:::binning end - - classDef binning fill:#d4ae40 + + classDef binning fill:#d4ae40 ``` @@ -79,6 +50,7 @@ assembly binning binette analyse_binette_result.ipynb +analyse_binette_result.myst ``` From 25443d416cba4ef2d66f985c8de5793de46e8caa Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 13:57:37 +0200 Subject: [PATCH 23/36] improve doc --- docs/api/api_ref.md | 1 - docs/api/binette.md | 13 +--- docs/api/modules.md | 7 -- docs/conf.py | 4 +- docs/contributing.md | 2 +- docs/tests.md | 2 +- docs/tutorial/assembly.md | 31 ++++++--- docs/tutorial/binette.md | 2 +- docs/tutorial/binette_tutorial_env.yaml | 21 ++++++ docs/tutorial/binning.md | 51 ++------------ docs/tutorial/get_dataset.md | 88 +++++++++++++++++++++++++ docs/tutorial/set_environment.md | 12 ++-- docs/tutorial/tutorial_main.md | 21 +++--- docs/usage.md | 4 +- 14 files changed, 162 
insertions(+), 97 deletions(-) delete mode 100644 docs/api/modules.md create mode 100644 docs/tutorial/binette_tutorial_env.yaml create mode 100644 docs/tutorial/get_dataset.md diff --git a/docs/api/api_ref.md b/docs/api/api_ref.md index 3de18e1..58bea10 100644 --- a/docs/api/api_ref.md +++ b/docs/api/api_ref.md @@ -3,6 +3,5 @@ ```{toctree} :maxdepth: 2 binette -indice_and_table ``` diff --git a/docs/api/binette.md b/docs/api/binette.md index bc3c754..a4e2c9a 100644 --- a/docs/api/binette.md +++ b/docs/api/binette.md @@ -20,15 +20,6 @@ :show-inheritance: ``` -## binette.binette module - -```{eval-rst} -.. automodule:: binette.binette - :members: - :undoc-members: - :show-inheritance: -``` - ## binette.cds module ```{eval-rst} @@ -65,10 +56,10 @@ :show-inheritance: ``` -## Module contents +## binette.main module ```{eval-rst} -.. automodule:: binette +.. automodule:: binette.main :members: :undoc-members: :show-inheritance: diff --git a/docs/api/modules.md b/docs/api/modules.md deleted file mode 100644 index b83d27c..0000000 --- a/docs/api/modules.md +++ /dev/null @@ -1,7 +0,0 @@ -# binette - -```{toctree} -:maxdepth: 4 - -binette -``` diff --git a/docs/conf.py b/docs/conf.py index 245781e..0cce3b6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,13 +24,13 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", 'sphinx_search.extension', + 'sphinx_togglebutton', # "myst_nb", "myst_parser", 'nbsphinx', 'nbsphinx_link', # 'sphinx.ext.napoleon', # 'sphinx.ext.viewcode', - "myst_parser", 'sphinxcontrib.mermaid' ] myst_enable_extensions = [ @@ -61,7 +61,7 @@ # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "api", "jupyter_execute"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', 'build', "jupyter_execute"] diff --git a/docs/contributing.md b/docs/contributing.md index f400a68..e9b69ee 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -26,7 
+26,7 @@ For minor changes like fixing typos or making small edits, create a new Pull Req - Clone your forked repository to your local machine. 2. **Get an Environment:** - Create an environment with all Binette prerequisites installed by following the installation instructions [here](./installation.md#installing-from-source-code-within-a-conda-environnement). + Create an environment with all Binette prerequisites installed by following the installation instructions [here](./installation.md#from-the-source-code-within-a-conda-environnement). 3. **Install in Editable Mode:** To enable code editing and testing of new functionality, you can install Binette in editable mode using the following command: diff --git a/docs/tests.md b/docs/tests.md index b6c0e60..ed1cdd7 100644 --- a/docs/tests.md +++ b/docs/tests.md @@ -8,7 +8,7 @@ Tests have been implemented to ensure the correctness of Binette. Unit tests have been implmented in the tests directory using pytest. -To run the test suit you would need to have install Binette from the source code. For that, you can follow installation instructions [here](./installation.md#installing-from-source-code-within-a-conda-environnement). +To run the test suit you would need to have install Binette from the source code. For that, you can follow installation instructions [here](./installation.md#from-the-source-code-within-a-conda-environnement). To install pytest in you environement you can run : diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index d2084bb..f919f00 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -10,39 +10,40 @@ megahit -1 coal-metagenomics/Kickstart_1.fastq.gz \ :::{admonition} βŒ› Expected Time :class: note +:class: dropdown This process takes approximately 28 minutes to complete. ::: -```{admonition} Note -:class: note - -You can also use **SPAdes** for assembly. It generally performs better than MEGAHIT but takes longer and requires more memory. 
-``` - -```{admonition} Best Practices +```{admonition} Assembly tips :class: tip +:class: dropdown Here are some general tips that might help improve your assembly results, depending on your data: - **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. +- **Use SPAdes rather than MEGAHIT** **SPAdes** generally performs better than MEGAHIT but takes longer and requires more memory. + - **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. - **Assembly Filtering:** After assembling, it’s often a good idea to filter out small or low-coverage contigs. -These steps aren’t mandatory, and since this tutorial focuses on binning and using Binette, we’ll skip them for now. +These steps aren’t mandatory, and since this tutorial focuses on binning refinement with Binette, we’ll skip them. ``` -## Align the Reads Over the Assembly -To get coverage information, we first need to map the reads back to the assembly. +## Align the Reads to the Assembly + +Binning tools rely on coverage information, among other criteria, to evaluate each contig. + +To obtain this coverage data, we first need to map the reads back to the assembly. ```{code-block} bash # Create a directory for the alignments @@ -63,6 +64,16 @@ samtools index alignments_bwa/Kickstart.bam :::{admonition} βŒ› Expected Time :class: note +:class: dropdown This process takes approximately 12 minutes to complete. ::: + +```{admonition} Read alignment strategy +:class: tip + +If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. 
However, keep in mind that this process can be resource-intensive, especially with many samples. + +If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. πŸš€ + +``` \ No newline at end of file diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md index 926ce3b..a2d7796 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -13,7 +13,7 @@ binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ Once Binette completes, the `binette_results` directory should have the following structure: -```plaintext +``` binette_results/ β”œβ”€β”€ final_bins β”‚ β”œβ”€β”€ bin_13475.fa diff --git a/docs/tutorial/binette_tutorial_env.yaml b/docs/tutorial/binette_tutorial_env.yaml new file mode 100644 index 0000000..72655d3 --- /dev/null +++ b/docs/tutorial/binette_tutorial_env.yaml @@ -0,0 +1,21 @@ +name: binette_tutorial +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - uscdc-datasets-sars-cov-2 # Dataset downloader to get the tutorial initial data + - fastqc # Quality control for high-throughput sequencing data + - samtools=1 # Tools for manipulating sequencing data in SAM format + - bedtools=2 # Suite of tools for genome arithmetic + - bwa-mem2=2 # Align reads to a reference genome (using BWA-MEM2) + - megahit=1 # De novo assembler for large genomes + - maxbin2=2 # Binning tool for metagenomic datasets + - metabat2=2 # Binning tool for metagenomic datasets + - semibin=2 # Binning tool for metagenomic datasets + - concoct=1 # Binning tool for metagenomic datasets + - binette=1.0.1 # Binette for binning and genome analysis + - das_tool=1 # Bin refiner to compare with Binette + - jupyter # Jupyter notebook for interactive analysis + - pandas=1 # Data manipulation and analysis + - plotly=5 # Interactive graphing \ No newline at end of file diff --git a/docs/tutorial/binning.md 
b/docs/tutorial/binning.md index bace377..cb82166 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -1,43 +1,3 @@ - -## Align the Reads to the Assembly - -Binning tools rely on coverage information, among other criteria, to evaluate each contig. - -To obtain this coverage data, we first need to map the reads back to the assembly. - -```{code-block} bash -# Create a directory for the alignments -mkdir -p alignments_bwa/ - -# Index the contigs file using BWA-MEM2 -bwa-mem2 index Kickstart.megahit/R1.contigs.fa -p Kickstart.megahit/R1.contigs.fa - -# Map reads back to the assembly, convert to BAM format, and sort -bwa-mem2 mem -t 12 Kickstart.megahit/R1.contigs.fa coal-metagenomics/Kickstart_*.fastq.gz | \ -samtools view -@ 12 -bS - | \ -samtools sort -@ 12 - -o alignments_bwa/Kickstart.bam - -# Index the BAM file -samtools index alignments_bwa/Kickstart.bam -``` - - -:::{admonition} βŒ› Expected Time -:class: note - -This process takes approximately 12 minutes to complete. -::: - -```{admonition} -:class: tip - -If you have multiple samples and assemble them separately, cross-aligning the samples can significantly improve binning. Align each sample to all assemblies and use the resulting BAM files in binning. This approach gives the binning tools more coverage variation, which can be beneficial. However, keep in mind that this process can be resource-intensive, especially with many samples. - -If you did a cross-assembly with your samples, make sure to map the reads separately for each one, generating as many BAM files as you have samples, to help the binning tool. πŸš€ - -``` - - ## Run Binning Tools Let's use different binning tools to group the contigs into bins, which we'll refine in the next section with Binette. 
@@ -108,14 +68,15 @@ extract_fasta_bins.py Kickstart.megahit/R1.contigs.fa concoct/clustering_merge.c You can also run SemiBin2 with its `single_easy_bin` command: -```{admonition} ⏳ Time Note -:class: note - -This process can take some time, so it may be skipped. -``` ```bash SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ -b alignments_bwa/Kickstart.bam \ -o semibin2/ -p 12 ``` + +```{admonition} ⏳ Time Note +:class: note + +This process can take some time. +``` diff --git a/docs/tutorial/get_dataset.md b/docs/tutorial/get_dataset.md new file mode 100644 index 0000000..3967bc3 --- /dev/null +++ b/docs/tutorial/get_dataset.md @@ -0,0 +1,88 @@ +## Obtaining Metagenomic Data for the Tutorial + +### Using the ncezid-biome Datasets Tool + +For this tutorial, we’ll use the "Kickstart" metagenome dataset from the [ncezid-biome datasets GitHub repository](https://github.com/ncezid-biome/). This dataset corresponds to sample [SAMN05024035](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa) and SRA [SRR5058924](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRR5058924&o=acc_s%3Aa). + + +We'll download the "Kickstart" dataset using the ncezid-biome datasets tool. You can find the tool and instructions on how to use it in their [GitHub repository](https://github.com/ncezid-biome/datasets?tab=readme-ov-file#edlb). + +The tool called `uscdc-datasets-sars-cov-2` on bioconda is part of the Conda environment created in the [previous section](./set_environment.md). + + +#### Download the Kickstart Dataset + +Once the tool is installed, you can download the "Kickstart" dataset with the following steps: + +1. **Download the coal-metagenomics table** from the GitHub repository: + + ```{code-block} bash + wget https://raw.githubusercontent.com/ncezid-biome/datasets/master/datasets/coal-metagenomics.tsv + ``` + +2. 
**Select the relevant line** corresponding to the "Kickstart" dataset (SRR5058924) by extracting the header and the specific entry: + + ```{code-block} bash + # Select the header of the table + head -n7 coal-metagenomics.tsv > coal-metagenomics_Kickstart_only.tsv + + # Append the relevant line for the Kickstart dataset + grep SRR5058924 coal-metagenomics.tsv >> coal-metagenomics_Kickstart_only.tsv + ``` + +3. **Run the dataset download** using the `GenFSGopher.pl` script: + + ```{code-block} bash + GenFSGopher.pl --numcpus 12 --compressed --outdir coal-metagenomics coal-metagenomics_Kickstart_only.tsv + ``` + + +:::{admonition} βŒ› Expected Time +:class: note + +This process takes approximately 16 minutes to complete. +::: + +#### Directory Structure + +After downloading, your directory structure should look like this: + +```{code-block} text +β”œβ”€β”€ coal-metagenomics_Kickstart_only.tsv +└── data + β”œβ”€β”€ in.tsv + β”œβ”€β”€ Kickstart_1.fastq.gz + β”œβ”€β”€ Kickstart_1.fastq.sha256 + β”œβ”€β”€ Kickstart_2.fastq.gz + β”œβ”€β”€ Kickstart_2.fastq.sha256 + β”œβ”€β”€ Makefile + β”œβ”€β”€ prefetch.done + β”œβ”€β”€ sha256sum.log + β”œβ”€β”€ SRR5058924 + β”‚Β Β  └── SRR5058924.sra + └── tree.dnd +``` + +In the next section, we will assemble the two read files to obtain an assembly of the dataset: +- `data/Kickstart_1.fastq.gz` +- `data/Kickstart_2.fastq.gz` + + +:::{admonition} 🧹 Cleaning Tip +:class: tip + +You can remove the SRA file `data/SRR5058924/SRR5058924.sra` as it is no longer needed; we will use only the FASTQ files. To remove it, run: + +```{code-block} bash +rm data/SRR5058924/SRR5058924.sra +::: + +```{note} +Alternatively, you can download the data using the SRA Toolkit, which is what the ncezid-biome tool uses in the background. +Note that the ncezid-biome tool provides additional checksum verification to ensure data integrity. 
+You can retrieve the data with the following commands after installing the SRA Toolkit (e.g., via Conda: [sra-tools on Anaconda](https://anaconda.org/bioconda/sra-tools)): +```{code-block} bash +prefetch SRR5058924 +fastq-dump --defline-seq '@$ac_$sn/$ri' --defline-qual '+' --split-3 -O . SRR5058924.sra +``` + diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md index 52d6092..7f039fa 100644 --- a/docs/tutorial/set_environment.md +++ b/docs/tutorial/set_environment.md @@ -12,6 +12,13 @@ mamba env create -f binette_tutorial_env.yaml -n binette_tuto This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. +Below is the content of the `binette_tutorial_env.yaml` file: + +```{include} binette_tutorial_env.yaml +:code: yaml +``` + + ### Activate the Environment After the environment is created, activate it by running: @@ -20,8 +27,3 @@ After the environment is created, activate it by running: conda activate binette_tuto ``` -Below is the content of the `binette_tutorial_env.yaml` file: - -```{include} binette_tutorial_env.yaml -:code: yaml -``` diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 4493b3e..8cd8082 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -7,7 +7,7 @@ In this tutorial, we'll walk through a practical example of how to use Binette w --- title: "Tutorial Overview:" -align: center +align: right config: look: handDrawn @@ -17,20 +17,20 @@ config: graph TD - i[metagenomics reads] --> B[assembly] + i[Metagenomics reads] --> B[Assembly] - B --> metabat2 --> r[binette] - B --> maxbin2 --> r - B --> concoct --> r - B --> semibin2 --> r + B --> MetaBAT2 --> r[Binette] + B --> MaxBin2 --> r + B --> CONCOCT --> r + B --> SemiBin2 --> r r --> f[final bins] subgraph Binning - metabat2:::binning - maxbin2:::binning - concoct:::binning - semibin2:::binning + MetaBAT2:::binning + MaxBin2:::binning + 
CONCOCT:::binning + SemiBin2:::binning end @@ -50,7 +50,6 @@ assembly binning binette analyse_binette_result.ipynb -analyse_binette_result.myst ``` diff --git a/docs/usage.md b/docs/usage.md index 2108a50..063a51b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -11,7 +11,7 @@ For example, consider the following two `contig2bin_tables`: - `bin_set1.tsv`: - ```tsv + ``` contig_1 binA contig_8 binA contig_15 binB @@ -20,7 +20,7 @@ For example, consider the following two `contig2bin_tables`: - `bin_set2.tsv`: - ```tsv + ``` contig_1 bin.0 contig_8 bin.0 contig_15 bin.1 From 716e2e55cacb964f50f1fba19f54625b78ca3e83 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:19:11 +0200 Subject: [PATCH 24/36] update usage with input bin sets report new output --- docs/usage.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/usage.md b/docs/usage.md index 063a51b..9b37065 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -65,6 +65,7 @@ Binette results are stored in the `results` directory. You can specify a differe In this directory you will find: - `final_bins_quality_reports.tsv`: This is a TSV (tab-separated values) file containing quality information about the final selected bins. - `final_bins/`: This directory stores all the selected bins in fasta format. +- `input_bins_quality_reports/`: A directory storing quality reports for the input bin sets, with files following the same structure as `final_bins_quality_reports.tsv`. - `temporary_files/`: This directory contains intermediate files. If you choose to use the `--resume` option, Binette will utilize files in this directory to prevent the recomputation of time-consuming steps. 
From 3344a4548e41312eed143ed3d32d92bdf3c4e139 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:20:32 +0200 Subject: [PATCH 25/36] add missing sphinx_togglebutton extension --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8f12e86..3f99085 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ doc = [ "nbsphinx==0.9.5", "nbsphinx_link==1.3.0", "sphinx-book-theme==1.0.1", - "sphinxcontrib.mermaid" + "sphinxcontrib.mermaid", + "sphinx_togglebutton=0.3.2" ] dev = [ From eeee06e6e7678d91c7ef450863c209a4932317dc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 14:27:16 +0200 Subject: [PATCH 26/36] fix pip format for added ext --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3f99085..2769cd9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ doc = [ "nbsphinx_link==1.3.0", "sphinx-book-theme==1.0.1", "sphinxcontrib.mermaid", - "sphinx_togglebutton=0.3.2" + "sphinx_togglebutton==0.3.2" ] dev = [ From 05696e8c8faaaa211a9269f230afd854296bcbfe Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:04:50 +0200 Subject: [PATCH 27/36] improve doc --- docs/tutorial/assembly.md | 1 - docs/tutorial/binette.md | 8 ++++++++ docs/tutorial/binning.md | 5 +++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index f919f00..60d25fa 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -19,7 +19,6 @@ This process takes approximately 28 minutes to complete. 
```{admonition} Assembly tips :class: tip -:class: dropdown Here are some general tips that might help improve your assembly results, depend diff --git a/docs/tutorial/binette.md b/docs/tutorial/binette.md index a2d7796..5f83676 100644 --- a/docs/tutorial/binette.md +++ b/docs/tutorial/binette.md @@ -11,6 +11,14 @@ binette --bin_dirs maxbin2/ metabat2/ semibin2/output_bins/ concoct/bins/ \ --verbose -t 12 -o binette_results ``` +```{admonition} βŒ› Expected Time +:class: note +:class: dropdown + +This process should take around 9 minutes to complete. +``` + + Once Binette completes, the `binette_results` directory should have the following structure: ``` diff --git a/docs/tutorial/binning.md b/docs/tutorial/binning.md index cb82166..b495db1 100644 --- a/docs/tutorial/binning.md +++ b/docs/tutorial/binning.md @@ -75,8 +75,9 @@ SemiBin2 single_easy_bin -i Kickstart.megahit/R1.contigs.fa \ -o semibin2/ -p 12 ``` -```{admonition} ⏳ Time Note +```{admonition} βŒ› Expected Time :class: note +:class: dropdown -This process can take some time. +This process takes around 1 hour to complete. 
``` From 8a31ecb840ecc3bececc6d05ee3326759616a257 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:05:44 +0200 Subject: [PATCH 28/36] add environement.yml for binder --- environment.yml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..dd03bfa --- /dev/null +++ b/environment.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge + - defaults +dependencies: + - jupyter # Jupyter notebook for interactive analysis + - pandas=1 # Data manipulation and analysis + - plotly=5 # Interactive graphing + - nbgitpuller \ No newline at end of file From 46cd347cfd565d240aecd6731be33e8ac494da24 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:21:13 +0200 Subject: [PATCH 29/36] fix typo in assembly tuto doc --- docs/tutorial/assembly.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/assembly.md b/docs/tutorial/assembly.md index 60d25fa..fdd1003 100644 --- a/docs/tutorial/assembly.md +++ b/docs/tutorial/assembly.md @@ -24,7 +24,7 @@ Here are some general tips that might help improve your assembly results, depend - **Read Cleaning:** If your reads have low-quality bases or adapters, consider cleaning them with a tool like `sickle`. It can boost the overall quality of your assembly. -- **Use SPAdes rather than MEGAHIT** **SPAdes** generally performs better than MEGAHIT but takes longer and requires more memory. +- **Use SPAdes rather than MEGAHIT:** SPAdes generally performs better than MEGAHIT but takes longer and requires more memory. - **Quality Check:** Tools like `metaQUAST` are handy for checking your assembly’s quality. It’s a good way to ensure your results are solid before moving on. 
From 2e5b7b04de8c1011cdd6e5cd1fe7eb8e20175523 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 15:21:39 +0200 Subject: [PATCH 30/36] use proper name for env --- docs/tutorial/set_environment.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/set_environment.md b/docs/tutorial/set_environment.md index 7f039fa..5d003dc 100644 --- a/docs/tutorial/set_environment.md +++ b/docs/tutorial/set_environment.md @@ -7,7 +7,7 @@ To get started, we'll download the necessary tools and set them up in a dedicate First, let's create a new Conda environment specifically for this tutorial: ```{code-block} bash -mamba env create -f binette_tutorial_env.yaml -n binette_tuto +mamba env create -f binette_tutorial_env.yaml -n binette_tutorial ``` This command will create a Conda environment named `binette_tuto` using the environment file `binette_tutorial_env.yaml`. From 91c34528d14907d9d3ab4ec835b91ecf571fa5c0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 16:03:11 +0200 Subject: [PATCH 31/36] improve env file --- environment.yml => binder/environment.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename environment.yml => binder/environment.yml (61%) diff --git a/environment.yml b/binder/environment.yml similarity index 61% rename from environment.yml rename to binder/environment.yml index dd03bfa..4f9610f 100644 --- a/environment.yml +++ b/binder/environment.yml @@ -1,8 +1,8 @@ +name: binder_tutorial_env channels: - conda-forge - - defaults dependencies: - jupyter # Jupyter notebook for interactive analysis - - pandas=1 # Data manipulation and analysis + - pandas # Data manipulation and analysis - plotly=5 # Interactive graphing - - nbgitpuller \ No newline at end of file + # - nbgitpuller \ No newline at end of file From b5bd70b705f77d91705f5375a5dbb9eb7a1765db Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:04:49 +0200 Subject: [PATCH 32/36] try binder build from requirements.txt --- 
binder/environment.yml | 8 -------- binder/requirements.txt | 3 +++ 2 files changed, 3 insertions(+), 8 deletions(-) delete mode 100644 binder/environment.yml create mode 100644 binder/requirements.txt diff --git a/binder/environment.yml b/binder/environment.yml deleted file mode 100644 index 4f9610f..0000000 --- a/binder/environment.yml +++ /dev/null @@ -1,8 +0,0 @@ -name: binder_tutorial_env -channels: - - conda-forge -dependencies: - - jupyter # Jupyter notebook for interactive analysis - - pandas # Data manipulation and analysis - - plotly=5 # Interactive graphing - # - nbgitpuller \ No newline at end of file diff --git a/binder/requirements.txt b/binder/requirements.txt new file mode 100644 index 0000000..8b0c82c --- /dev/null +++ b/binder/requirements.txt @@ -0,0 +1,3 @@ +jupyter # Jupyter notebook for interactive analysis +pandas # Data manipulation and analysis +plotly # Interactive graphing From 8719f3c1b4bb621ee502ef6c0f8870a5b3526cbc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:17:18 +0200 Subject: [PATCH 33/36] improve tutorial --- docs/tutorial/get_dataset.md | 8 ++++---- docs/tutorial/tutorial_main.md | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/tutorial/get_dataset.md b/docs/tutorial/get_dataset.md index 3967bc3..f099b17 100644 --- a/docs/tutorial/get_dataset.md +++ b/docs/tutorial/get_dataset.md @@ -37,11 +37,11 @@ Once the tool is installed, you can download the "Kickstart" dataset with the fo ``` -:::{admonition} βŒ› Expected Time -:class: note + :::{admonition} βŒ› Expected Time + :class: note -This process takes approximately 16 minutes to complete. -::: + This process takes approximately 16 minutes to complete. 
+ ::: #### Directory Structure diff --git a/docs/tutorial/tutorial_main.md b/docs/tutorial/tutorial_main.md index 8cd8082..1fdc365 100644 --- a/docs/tutorial/tutorial_main.md +++ b/docs/tutorial/tutorial_main.md @@ -1,7 +1,11 @@ # Tutorial -In this tutorial, we'll walk through a practical example of how to use Binette with real data. We'll start by downloading metagenomics reads and then assemble these reads into contigs. Next, we'll use different binning tools to group the contigs into bins. Finally, we'll use Binette to refine these bins. +In this tutorial, we'll walk through a practical example of how to use Binette with real data. + + 1. We'll start by downloading metagenomics reads and then assemble these reads into contigs. + 2. Next, we'll use different binning tools to group the contigs into bins. + 3. Finally, we'll use Binette to refine these bins. ```{mermaid} From 8aec08230b9a1921b588ad7ce4d297cc146ecd1f Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:18:40 +0200 Subject: [PATCH 34/36] remvoe binder env as it does not work --- binder/requirements.txt | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 binder/requirements.txt diff --git a/binder/requirements.txt b/binder/requirements.txt deleted file mode 100644 index 8b0c82c..0000000 --- a/binder/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -jupyter # Jupyter notebook for interactive analysis -pandas # Data manipulation and analysis -plotly # Interactive graphing From 9099a553ec5faaab535c92355c3901f129b6d1a3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:41:52 +0200 Subject: [PATCH 35/36] bump to version 1.0.2 --- binette/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/binette/__init__.py b/binette/__init__.py index 6c4c011..34c1db3 100644 --- a/binette/__init__.py +++ b/binette/__init__.py @@ -1 +1 @@ -__version__ = '1.0.1' \ No newline at end of file +__version__ = '1.0.2' \ No newline at end of file From 
888923c38926ecac6a1c9d57a4fda31b295893dc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 4 Sep 2024 17:47:53 +0200 Subject: [PATCH 36/36] update tuto env file for binette 1.0.2 --- docs/tutorial/binette_tutorial_env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/binette_tutorial_env.yaml b/docs/tutorial/binette_tutorial_env.yaml index 72655d3..be03147 100644 --- a/docs/tutorial/binette_tutorial_env.yaml +++ b/docs/tutorial/binette_tutorial_env.yaml @@ -14,7 +14,7 @@ dependencies: - metabat2=2 # Binning tool for metagenomic datasets - semibin=2 # Binning tool for metagenomic datasets - concoct=1 # Binning tool for metagenomic datasets - - binette=1.0.1 # Binette for binning and genome analysis + - binette=1.0.2 # Binette for binning and genome analysis - das_tool=1 # Bin refiner to compare with Binette - jupyter # Jupyter notebook for interactive analysis - pandas=1 # Data manipulation and analysis
|   | Contamination ≤ 10 and Completeness | tool     | bin_count |
|---|-------------------------------------|----------|-----------|
| 0 | > 50% and ≤ 70%                     | binette  | 5         |
| 1 | > 50% and ≤ 70%                     | maxbin2  | 1         |
| 2 | > 50% and ≤ 70%                     | metabat2 | 1         |
| 3 | > 50% and ≤ 70%                     | semibin2 | 2         |
| 4 | > 70% and ≤ 90%                     | binette  | 3         |
| 5 | > 70% and ≤ 90%                     | concoct  | 2         |
6