Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix wrong input proteins management #37

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 69 additions & 2 deletions binette/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
import multiprocessing.pool
import logging
from collections import Counter, defaultdict
from typing import Dict, List, Iterator, Tuple, Any, Union
from typing import Dict, List, Iterator, Tuple, Any, Union, Set

import pyfastx
import pyrodigal
from tqdm import tqdm
from pathlib import Path
import gzip


def get_contig_from_cds_name(cds_name: str) -> str:
Expand Down Expand Up @@ -70,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -

"""
logging.info("Writing predicted protein sequences.")
with open(outfaa, "w") as fl:
with gzip.open(outfaa, "wt") as fl:
for contig_id, genes in contig_to_genes:
genes.write_translations(fl, contig_id)

Expand Down Expand Up @@ -207,3 +209,68 @@ def get_contig_cds_metadata(
}

return contig_info


from typing import Set
from pathlib import Path
import gzip
import pyfastx
import logging

from typing import Set
from pathlib import Path
import gzip
import pyfastx
import logging


def filter_faa_file(
contigs_to_keep: Set[str],
input_faa_file: Path,
filtered_faa_file: Path,
):
"""
Filters a FASTA file containing protein sequences to only include sequences
from contigs present in the provided set of contigs (`contigs_to_keep`).

This function processes the input FASTA file, identifies protein sequences
originating from contigs listed in `contigs_to_keep`, and writes the filtered
sequences to a new FASTA file. The output file supports optional `.gz` compression.

:param contigs_to_keep: A set of contig names to retain in the output FASTA file.
:param input_faa_file: Path to the input FASTA file containing protein sequences.
:param filtered_faa_file: Path to the output FASTA file for filtered sequences.
If the filename ends with `.gz`, the output will be compressed.
"""
# Determine whether the output file should be compressed
proper_open = gzip.open if str(filtered_faa_file).endswith(".gz") else open

# Initialize tracking sets for metrics
contigs_with_genes = set()
contigs_parsed = set()

# Process the input FASTA file and filter sequences based on contigs_to_keep
with proper_open(filtered_faa_file, "wt") as fl:
for name, seq in pyfastx.Fastx(input_faa_file):
contig = get_contig_from_cds_name(name)
contigs_parsed.add(contig)
if contig in contigs_to_keep:
contigs_with_genes.add(contig)
fl.write(f">{name}\n{seq}\n")

# Calculate metrics
total_contigs = len(contigs_to_keep)
contigs_with_no_genes = total_contigs - len(contigs_with_genes)
contigs_not_in_keep_list = len(contigs_parsed - contigs_to_keep)

# Log the computed metrics
logging.info(f"Processing protein sequences from '{input_faa_file}'.")
logging.info(
f"Filtered {input_faa_file} to retain genes from {total_contigs} contigs that are included in the input bins."
)
logging.debug(
f"Found {contigs_with_no_genes}/{total_contigs} contigs ({contigs_with_no_genes / total_contigs:.2%}) with no genes."
)
logging.debug(
f"{contigs_not_in_keep_list} contigs from the input FASTA file are not in the keep list."
)
4 changes: 2 additions & 2 deletions binette/contig_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
from typing import Dict, Iterable, Tuple, Set, Any, Union


def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta:
def parse_fasta_file(fasta_file: str, index_file: str) -> pyfastx.Fasta:
"""
Parse a FASTA file and return a pyfastx.Fasta object.

:param fasta_file: The path to the FASTA file.

:return: A pyfastx.Fasta object representing the parsed FASTA file.
"""
fa = pyfastx.Fasta(fasta_file, build_index=True)
fa = pyfastx.Fasta(fasta_file, build_index=True, index_file=index_file)
return fa


Expand Down
1 change: 1 addition & 0 deletions binette/diamond.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def run(
f"-o {output} "
f"--threads {threads} "
f"--db {db} "
f"--compress 1 "
f"--query-cover {query_cover} "
f"--subject-cover {subject_cover} "
f"--id {percent_id} "
Expand Down
42 changes: 30 additions & 12 deletions binette/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ def parse_input_files(
bin_dirs: List[Path],
contig2bin_tables: List[Path],
contigs_fasta: Path,
temporary_dir: Path,
fasta_extensions: Set[str] = {".fasta", ".fna", ".fa"},
) -> Tuple[
Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int]
Expand Down Expand Up @@ -254,7 +255,10 @@ def parse_input_files(
original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values())

logging.info(f"Parsing contig fasta file: {contigs_fasta}")
contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix())
index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
contigs_object = contig_manager.parse_fasta_file(
contigs_fasta.as_posix(), index_file=index_file.as_posix()
)

unexpected_contigs = {
contig for contig in contigs_in_bins if contig not in contigs_object
Expand All @@ -276,6 +280,7 @@ def parse_input_files(
def manage_protein_alignement(
faa_file: Path,
contigs_fasta: Path,
temporary_dir: Path,
contig_to_length: Dict[str, int],
contigs_in_bins: Set[str],
diamond_result_file: Path,
Expand Down Expand Up @@ -314,10 +319,13 @@ def manage_protein_alignement(
)

else:
index_file = temporary_dir / f"{contigs_fasta.name}.fxi"
contigs_iterator = (
s
for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
if s.name in contigs_in_bins
seq
for seq in contig_manager.parse_fasta_file(
contigs_fasta.as_posix(), index_file=index_file.as_posix()
)
if seq.name in contigs_in_bins
)
contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)

Expand All @@ -330,7 +338,10 @@ def manage_protein_alignement(
else:
raise FileNotFoundError(checkm2_db)

diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log"
diamond_log = (
diamond_result_file.parents[0]
/ f"{diamond_result_file.stem.split('.')[0]}.log"
)

diamond.run(
faa_file.as_posix(),
Expand Down Expand Up @@ -482,14 +493,9 @@ def main():

use_existing_protein_file = False

if args.proteins:
logging.info(f"Using the provided protein sequences file: {args.proteins}")
faa_file = args.proteins
use_existing_protein_file = True
else:
faa_file = out_tmp_dir / "assembly_proteins.faa"
faa_file = out_tmp_dir / "assembly_proteins.faa.gz"

diamond_result_file = out_tmp_dir / "diamond_result.tsv"
diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"

# Output files #
final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv"
Expand All @@ -504,11 +510,23 @@ def main():
args.contig2bin_tables,
args.contigs,
fasta_extensions=set(args.fasta_extensions),
temporary_dir=out_tmp_dir,
)

if args.proteins and not args.resume:
logging.info(f"Using the provided protein sequences file: {args.proteins}")
use_existing_protein_file = True

cds.filter_faa_file(
contigs_in_bins,
input_faa_file=args.proteins,
filtered_faa_file=faa_file,
)

contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(
faa_file=faa_file,
contigs_fasta=args.contigs,
temporary_dir=out_tmp_dir,
contig_to_length=contig_to_length,
contigs_in_bins=contigs_in_bins,
diamond_result_file=diamond_result_file,
Expand Down
68 changes: 66 additions & 2 deletions tests/cds_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from pathlib import Path
from unittest.mock import mock_open, patch

import gzip


class MockContig:
def __init__(self, name, seq):
Expand Down Expand Up @@ -116,14 +118,14 @@ def test_write_faa(contig1, orf_finder):

predicted_genes = orf_finder.find_genes(contig1.seq)
contig_name = "contig"
output_file = "tests/tmp_file.faa"
output_file = "tests/tmp_file.faa.gz"

cds.write_faa(output_file, [(contig_name, predicted_genes)])

# Check if the file was created and first seq starts
# with the contig name as expected
assert Path(output_file).exists()
with open(output_file, "r") as f:
with gzip.open(output_file, "rt") as f:
assert f.read().startswith(f">{contig_name}")


Expand Down Expand Up @@ -226,3 +228,65 @@ def test_is_nucleic_acid():

# Amino acid sequence
assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False # Numbers are invalid


def test_filter_faa_file_basic(tmp_path):
"""Test basic functionality of filtering a FASTA file."""
# Create input data
input_faa = tmp_path / "input.faa"
filtered_faa = tmp_path / "filtered.faa"

input_faa.write_text(
">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
)

# Contigs to keep
contigs_to_keep = {"contig1", "contig3"}

# Run the function
cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)

# Check the filtered output
expected_output = ">contig1_gene1\nATGCGT\n>contig3_gene1\nATGAAA\n"
assert filtered_faa.read_text() == expected_output


def test_filter_faa_file_gz_output(tmp_path):
"""Test filtering with gzipped output."""
input_faa = tmp_path / "input.faa"
filtered_faa = tmp_path / "filtered.faa.gz"

input_faa.write_text(
">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
)

# Contigs to keep
contigs_to_keep = {"contig2"}

# Run the function
cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)

# Check the filtered output
with gzip.open(filtered_faa, "rt") as f:
filtered_content = f.read()
expected_output = ">contig2_gene1\nATGCCG\n"
assert filtered_content == expected_output


def test_filter_faa_file_no_matching_contigs(tmp_path):
"""Test filtering when no contigs match the input list."""
input_faa = tmp_path / "input.faa"
filtered_faa = tmp_path / "filtered_no_match.faa"

input_faa.write_text(
">contig1_gene1\nATGCGT\n" ">contig2_gene1\nATGCCG\n" ">contig3_gene1\nATGAAA\n"
)

# Contigs to keep
contigs_to_keep = {"contig4", "contig5"}

# Run the function
cds.filter_faa_file(contigs_to_keep, input_faa, filtered_faa)

# Check the output file is empty
assert filtered_faa.read_text() == ""
9 changes: 5 additions & 4 deletions tests/contig_manager_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@


# Parses a valid FASTA file and returns a pyfastx.Fasta object.
def test_valid_fasta_file():
def test_valid_fasta_file(tmp_path):
# Arrange
fasta_file = "tests/contigs.fasta"

# Act
result = contig_manager.parse_fasta_file(fasta_file)
index_file = "tests/contigs.fasta.fxi"

result = contig_manager.parse_fasta_file(fasta_file, str(index_file))

# Assert
assert isinstance(result, pyfastx.Fasta)
Expand All @@ -22,7 +23,7 @@ def test_invalid_fasta_file():

# Act and Assert
with pytest.raises(RuntimeError):
contig_manager.parse_fasta_file(fasta_file)
contig_manager.parse_fasta_file(fasta_file, "./index.fxi")


# The function returns a tuple containing two dictionaries.
Expand Down
2 changes: 1 addition & 1 deletion tests/diamonds_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def __init__(self, returncode):
# Simulating successful run of diamond command
if (
args[0]
== "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
== "diamond blastp --outfmt 6 --max-target-seqs 1 --query test.faa -o output.txt --threads 1 --db db --compress 1 --query-cover 80 --subject-cover 80 --id 30 --evalue 1e-05 --block-size 2 2> log.txt"
):
return CompletedProcess(0)

Expand Down
Loading
Loading