Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Provide Precomputed Protein Sequences #31

Merged
merged 12 commits into from
Nov 27, 2024
13 changes: 13 additions & 0 deletions .github/workflows/binette_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,16 @@ jobs:
python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs/final_bins_quality_reports.tsv


- name: Run simple test case from bin dirs and with proteins input
run: |
cd test_data
binette -d binning_results/A/ binning_results/B/ binning_results/C/ \
--contigs all_contigs.fna --checkm2_db checkm2_tiny_db/checkm2_tiny_db.dmnd -v -o test_results_from_dirs_and_prot_input --proteins proteins.faa

- name: Compare results from bin dirs with expectation
run: |
cd test_data
head expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv
python scripts/compare_results.py expected_results/final_bins_quality_reports.tsv test_results_from_dirs_and_prot_input/final_bins_quality_reports.tsv


40 changes: 40 additions & 0 deletions binette/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,19 +71,59 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
for contig_id, genes in contig_to_genes:
genes.write_translations(fl, contig_id)


def is_nucleic_acid(sequence: str) -> bool:
"""
Determines whether the given sequence is a DNA or RNA sequence.

:param sequence: The sequence to check.
:return: True if the sequence is a DNA or RNA sequence, False otherwise.
"""
# Define nucleotidic bases (DNA and RNA)
nucleotidic_bases = set('ATCGNUatcgnu')

# Check if all characters in the sequence are valid nucleotidic bases (DNA or RNA)
if all(base in nucleotidic_bases for base in sequence):
return True

# If any character is invalid, return False
return False



def parse_faa_file(faa_file: str) -> Dict[str, List[str]]:
"""
Parse a FASTA file containing protein sequences and organize them by contig.

:param faa_file: Path to the input FASTA file.
:return: A dictionary mapping contig names to lists of protein sequences.
:raises ValueError: If the file contains nucleotidic sequences instead of protein sequences.
"""
contig_to_genes = defaultdict(list)
checked_sequences = []

# Iterate through the FASTA file and parse sequences
for name, seq in pyfastx.Fastx(faa_file):
contig = get_contig_from_cds_name(name)
contig_to_genes[contig].append(seq)

# Concatenate up to the first 20 sequences for validation
if len(checked_sequences) < 20:
checked_sequences.append(seq)

# Concatenate all checked sequences for a more reliable nucleic acid check
concatenated_seq = "".join(checked_sequences)

# Check if the concatenated sequence appears to be nucleic acid
if is_nucleic_acid(concatenated_seq):
raise ValueError(
f"The file '{faa_file}' appears to contain nucleotide sequences. "
"Ensure that the file contains valid protein sequences in FASTA format."
)

return dict(contig_to_genes)



def get_aa_composition(genes: List[str]) -> Counter:
"""
Expand Down
4 changes: 2 additions & 2 deletions binette/io_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def check_contig_consistency(contigs_from_assembly: Iterable[str],

issue_countigs = len(set(contigs_from_elsewhere) - set(contigs_from_assembly))

message = f"{issue_countigs} contigs found in file {elsewhere_file} \
were not found in assembly_file ({assembly_file})."
message = (f"{issue_countigs} contigs found in file '{elsewhere_file}' "
f"were not found in assembly_file '{assembly_file}'")
assert are_contigs_consistent, message


Expand Down
56 changes: 48 additions & 8 deletions binette/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ def __call__(
setattr(namespace, self.dest, values)


def is_valid_file(parser: ArgumentParser, arg: str) -> Path:
"""
Validates that the provided input file exists.

:param parser: The ArgumentParser instance handling command-line arguments.
:param arg: The path to the file provided as an argument.
:return: A Path object representing the valid file.
"""
path_arg = Path(arg)

# Check if the file exists at the provided path
if not path_arg.exists():
parser.error(f"Error: The specified file '{arg}' does not exist.")

return path_arg

def parse_arguments(args):
"""Parse script arguments."""
Expand All @@ -85,7 +100,7 @@ def parse_arguments(args):
"-d",
"--bin_dirs",
nargs="+",
type=Path,
type=lambda x: is_valid_file(parser, x),
action=UniqueStore,
help="List of bin folders containing each bin in a fasta file.",
)
Expand All @@ -95,12 +110,22 @@ def parse_arguments(args):
"--contig2bin_tables",
nargs="+",
action=UniqueStore,
type=Path,
type=lambda x: is_valid_file(parser, x),
help="List of contig2bin table with two columns separated\
with a tabulation: contig, bin",
)

input_group.add_argument("-c", "--contigs", required=True, type=Path, help="Contigs in fasta format.")
input_group.add_argument("-c", "--contigs", required=True,
type=lambda x: is_valid_file(parser, x),
help="Contigs in fasta format.")

input_group.add_argument(
"-p", "--proteins",
type=lambda x: is_valid_file(parser, x),
help="FASTA file of predicted proteins in Prodigal format (>contigID_geneID). "
"Skips the gene prediction step if provided."
)


# Other parameters category
other_group = parser.add_argument_group('Other Arguments')
Expand Down Expand Up @@ -211,7 +236,9 @@ def parse_input_files(bin_dirs: List[Path],

def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_length: Dict[str, int],
contigs_in_bins: Set[str], diamond_result_file: Path,
checkm2_db: Optional[Path], threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
checkm2_db: Optional[Path], threads: int, use_existing_protein_file: bool,
resume_diamond:bool,
low_mem: bool) -> Tuple[Dict[str, int], Dict[str, List[str]]]:
"""
Predicts or reuses proteins prediction and runs diamond on them.

Expand All @@ -222,14 +249,15 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
:param diamond_result_file: The path to the diamond result file.
:param checkm2_db: The path to the CheckM2 database.
:param threads: Number of threads for parallel processing.
:param resume: Boolean indicating whether to resume the process.
:param use_existing_protein_file: Boolean indicating whether to use an existing protein file.
:param resume_diamond: Boolean indicating whether to resume diamond alignement.
:param low_mem: Boolean indicating whether to use low memory mode.

:return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
"""

# Predict or reuse proteins prediction and run diamond on them
if resume:
if use_existing_protein_file:
logging.info(f"Parsing faa file: {faa_file}.")
contig_to_genes = cds.parse_faa_file(faa_file.as_posix())
io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta.as_posix(), faa_file.as_posix())
Expand All @@ -238,6 +266,7 @@ def manage_protein_alignement(faa_file: Path, contigs_fasta: Path, contig_to_len
contigs_iterator = (s for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) if s.name in contigs_in_bins)
contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)

if not resume_diamond:
if checkm2_db is None:
# get checkm2 db stored in checkm2 install
diamond_db_path = diamond.get_checkm2_db()
Expand Down Expand Up @@ -360,8 +389,17 @@ def main():
# Temporary files #
out_tmp_dir:Path = args.outdir / "temporary_files"
os.makedirs(out_tmp_dir, exist_ok=True)

use_existing_protein_file = False

if args.proteins:
logging.info(f"Using the provided protein sequences file: {args.proteins}")
faa_file = args.proteins
use_existing_protein_file = True
else:
faa_file = out_tmp_dir / "assembly_proteins.faa"


faa_file = out_tmp_dir / "assembly_proteins.faa"
diamond_result_file = out_tmp_dir / "diamond_result.tsv"

# Output files #
Expand All @@ -370,13 +408,15 @@ def main():

if args.resume:
io.check_resume_file(faa_file, diamond_result_file)
use_existing_protein_file = True

bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions))

contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
contigs_in_bins=contigs_in_bins,
diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
threads=args.threads, resume=args.resume, low_mem=args.low_mem)
threads=args.threads, use_existing_protein_file=use_existing_protein_file,
resume_diamond=args.resume, low_mem=args.low_mem)

# Use contig index instead of contig name to save memory
contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)
Expand Down
14 changes: 14 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,20 @@ For example, consider the following two `contig2bin_tables`:

In both formats, the `--contigs` argument should specify a FASTA file containing all the contigs found in the bins. Typically, this file would be the assembly FASTA file used to generate the bins. In these exemple the `assembly.fasta` file should contain at least the five contigs mentioned in the `contig2bin_tables` files or in the bin fasta files: `contig_1`, `contig_8`, `contig_15`, `contig_9`, and `contig_10`.


## Providing Precomputed Protein Sequences

You can provide protein sequences in FASTA format to Binette using the `--proteins` argument. The sequence identifiers must follow the Prodigal convention: `<contigID>_<GeneID>`. This naming format ensures proper mapping of each gene to its contig.

By using this option, the gene prediction step is skipped.

### Example
If your contig is named `contig_A`, the gene identifiers should follow this pattern:
- `contig_A_1`
- `contig_A_2`
- `contig_A_3`


## Outputs

Binette results are stored in the `results` directory. You can specify a different directory using the `--outdir` option.
Expand Down
53 changes: 46 additions & 7 deletions tests/cds_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ def test_extract_contig_name_from_cds_name():
assert result == "contig1"


# Import the functions write_faa and parse_faa_file here


def test_write_faa(contig1, orf_finder):

predicted_genes = orf_finder.find_genes(contig1.seq)
Expand All @@ -122,10 +119,11 @@ def test_write_faa(contig1, orf_finder):


def test_parse_faa_file(tmp_path):
# Mock a FASTA file
# Mock a FASTA file of protein sequences
# at least one protein sequence to not triger the error
fasta_content = (
">contig1_gene1\n"
"AAAAAAAAAAA\n"
"MPPPAOSKNSKSS\n"
">contig1_gene2\n"
"CCCCCCCCCCC\n"
">contig2_gene1\n"
Expand All @@ -139,11 +137,32 @@ def test_parse_faa_file(tmp_path):

# Check if the output matches the expected dictionary
expected_result = {
'contig1': ['AAAAAAAAAAA', 'CCCCCCCCCCC'],
'contig1': ['MPPPAOSKNSKSS', 'CCCCCCCCCCC'],
'contig2': ['TTTTTTTTTTTT']
}
assert result == expected_result


def test_parse_faa_file_raises_error_for_dna(tmp_path):
# Mock a DNA FASTA file
fasta_content = (
">contig1_gene1\n"
"AAAAAAAAAAA\n"
">contig1_gene2\n"
"CCCCCCCCCCC\n"
">contig2_gene1\n"
"TTTTTTTTTTTT\n"
)
fna_file = tmp_path / "mock_file.fna"
fna_file.write_text(fasta_content)


# Check that ValueError is raised when DNA sequences are encountered
with pytest.raises(ValueError):
cds.parse_faa_file(fna_file)



def test_get_aa_composition():

genes = ['AAAA',
Expand Down Expand Up @@ -175,4 +194,24 @@ def test_get_contig_cds_metadata():

assert contig_metadata['contig_to_cds_count'] == {"c1":3, "c2":2}
assert contig_metadata['contig_to_aa_counter'] == {"c1": {'A': 4, 'G': 4, "C":4} , "c2":{'C': 4, 'T': 4}}
assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}
assert contig_metadata['contig_to_aa_length'] == {"c1":12, "c2":8}



# Test function
def test_is_nucleic_acid():
# Valid DNA sequence
assert cds.is_nucleic_acid("ATCG") is True
assert cds.is_nucleic_acid("ATCNNNNNG") is True # N can be found in DNA seq
# Valid RNA sequence
assert cds.is_nucleic_acid("AUGCAUGC") is True

# Mixed case
assert cds.is_nucleic_acid("AtCg") is True

# Invalid sequence (contains characters not part of DNA or RNA)
assert cds.is_nucleic_acid("ATCX") is False # 'X' is not a valid base
assert cds.is_nucleic_acid("AUG#C") is False # '#' is not a valid base

# Amino acid sequence
assert cds.is_nucleic_acid("MSIRGVGGNGNSR") is False # Numbers are invalid
Loading
Loading