diff --git a/README.md b/README.md index de3c209..c58b5e1 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,62 @@ singularity exec binette.sif binette -h # Usage +## Input Formats + +Binette supports two input formats for bin sets: + +1. **Contig2bin Tables:** You can provide bin sets using contig2bin tables, which establish the relationship between each contig and its corresponding bin. In this format, you need to specify the `--contig2bin_tables` argument. + +For example, consider the following two `contig2bin_tables`: + +- `bin_set1.tsv`: + + ```tsv + contig_1 binA + contig_8 binA + contig_15 binB + contig_9 binC + ``` + +- `bin_set2.tsv`: + + ```tsv + contig_1 bin.0 + contig_8 bin.0 + contig_15 bin.1 + contig_9 bin.2 + contig_10 bin.0 + ``` + + The `binette` command to process this input would be: + + ```bash + binette --contig2bin_tables bin_set1.tsv bin_set2.tsv --contigs assembly.fasta + ``` + +2. **Bin Directories:** Alternatively, you can use bin directories, where each bin is represented by a separate FASTA file. For this format, you need to provide the `--bin_dirs` argument. Here's an example of two bin directories: + + ``` + bin_set1/ + ├── binA.fa: contains sequences of contig_1, contig_8 + ├── binB.fa: contains sequences of contig_15 + └── binC.fa: contains sequences of contig_9 + ``` + + ``` + bin_set2/ + ├── binA.fa: contains sequences of contig_1, contig_8, contig_10 + ├── binB.fa: contains sequences of contig_15 + └── binC.fa: contains sequences of contig_9 + ``` + + The `binette` command to process this input would be: + + ```bash + binette --bin_dirs bin_set1 bin_set2 --contigs assembly.fasta + ``` + +In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. 
In these examples, the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`. # Bug reporting and feature requests @@ -109,3 +165,4 @@ Please submit bug reports and feature requests to the issue tracker: # Licence This program is released as an open source software under the terms of [MIT License](https://forgemia.inra.fr/jean.mainguy/binette/-/raw/main/LICENSE). + diff --git a/binette/bin_quality.py b/binette/bin_quality.py index 018e77c..ad7d7b7 100644 --- a/binette/bin_quality.py +++ b/binette/bin_quality.py @@ -154,12 +154,12 @@ def add_bin_metrics(bins, contig_info, contamination_weight, threads=1): contig_to_length = contig_info["contig_to_length"] logging.info("Assess bin length and N50") - bin_and_contigsize_args = ((bin, contig_to_length) for bin in bins) + # bin_and_contigsize_args = ((bin, contig_to_length) for bin in bins) - with Pool(processes=threads) as pool: - pool.starmap(get_bin_size_and_N50, bin_and_contigsize_args) + # with Pool(processes=threads) as pool: + # pool.starmap(get_bin_size_and_N50, bin_and_contigsize_args) - # add_bin_size_and_N50(bins, contig_to_length) + add_bin_size_and_N50(bins, contig_to_length) logging.info("Asses bin quality") assess_bins_quality_by_chunk( diff --git a/binette/cds.py b/binette/cds.py index 96993a3..6465d0d 100644 --- a/binette/cds.py +++ b/binette/cds.py @@ -62,10 +62,10 @@ def get_contig_cds_metadata_flat(contig_to_genes): contig_to_cds_count = {contig: len(genes) for contig, genes in contig_to_genes.items()} - contig_to_aa_counter = {contig: get_aa_composition(genes) for contig, genes in tqdm(contig_to_genes.items())} + contig_to_aa_counter = {contig: get_aa_composition(genes) for contig, genes in tqdm(contig_to_genes.items(), unit="contig")} logging.info("contig_to_aa_counter done. 
") - contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in contig_to_aa_counter.items()} + contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in tqdm(contig_to_aa_counter.items(), unit="contig")} logging.info("contig_to_aa_length done. ") return contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length @@ -81,12 +81,12 @@ def get_contig_cds_metadata(contig_to_genes, threads): for contig, genes in tqdm(contig_to_genes.items()): contig_to_future[contig] = tpe.submit(get_aa_composition, genes) - contig_to_aa_counter = {contig: future.result() for contig, future in tqdm(contig_to_future.items())} + contig_to_aa_counter = {contig: future.result() for contig, future in tqdm(contig_to_future.items(), unit="contig")} # contig_to_aa_counter = {contig:get_aa_composition(genes) for contig, genes in tqdm(contig_to_genes.items())} logging.info("contig_to_aa_counter done. ") - contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in contig_to_aa_counter.items()} + contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in tqdm(contig_to_aa_counter.items(), unit="contig")} logging.info("contig_to_aa_length done. ") return contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length diff --git a/setup.py b/setup.py index f6b3bed..a8ccc28 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name="binette", - version="0.1.4", + version="0.1.5", author="Jean Mainguy", packages=find_packages(), entry_points={"console_scripts": ["binette = binette.binette:main"]},