From 788f168d5fa24b1896315132f03f7a9cc3b76ef2 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Dec 2024 19:09:21 +0100 Subject: [PATCH] pyfastx index stored in temporary file dir --- binette/contig_manager.py | 4 ++-- binette/main.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/binette/contig_manager.py b/binette/contig_manager.py index fc783f5..91cdf6e 100644 --- a/binette/contig_manager.py +++ b/binette/contig_manager.py @@ -2,7 +2,7 @@ from typing import Dict, Iterable, Tuple, Set, Any, Union -def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: +def parse_fasta_file(fasta_file: str, index_file: str) -> pyfastx.Fasta: """ Parse a FASTA file and return a pyfastx.Fasta object. @@ -10,7 +10,7 @@ def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta: :return: A pyfastx.Fasta object representing the parsed FASTA file. """ - fa = pyfastx.Fasta(fasta_file, build_index=True) + fa = pyfastx.Fasta(fasta_file, build_index=True, index_file=index_file) return fa diff --git a/binette/main.py b/binette/main.py index 5f95cb4..8533f4e 100755 --- a/binette/main.py +++ b/binette/main.py @@ -213,6 +213,7 @@ def parse_input_files( bin_dirs: List[Path], contig2bin_tables: List[Path], contigs_fasta: Path, + temporary_dir: Path, fasta_extensions: Set[str] = {".fasta", ".fna", ".fa"}, ) -> Tuple[ Dict[str, Set[bin_manager.Bin]], Set[bin_manager.Bin], Set[str], Dict[str, int] @@ -254,7 +255,10 @@ def parse_input_files( original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values()) logging.info(f"Parsing contig fasta file: {contigs_fasta}") - contigs_object = contig_manager.parse_fasta_file(contigs_fasta.as_posix()) + index_file = temporary_dir / f"{contigs_fasta.name}.fxi" + contigs_object = contig_manager.parse_fasta_file( + contigs_fasta.as_posix(), index_file=index_file.as_posix() + ) unexpected_contigs = { contig for contig in contigs_in_bins if contig not in contigs_object @@ -276,6 +280,7 @@ def parse_input_files( def manage_protein_alignement( faa_file: Path, contigs_fasta: Path, + temporary_dir: Path, contig_to_length: Dict[str, int], contigs_in_bins: Set[str], diamond_result_file: Path, @@ -314,9 +319,12 @@ def manage_protein_alignement( ) else: + index_file = temporary_dir / f"{contigs_fasta.name}.fxi" contigs_iterator = ( seq - for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix()) + for seq in contig_manager.parse_fasta_file( + contigs_fasta.as_posix(), index_file=index_file.as_posix() + ) if seq.name in contigs_in_bins ) contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads) @@ -502,6 +510,7 @@ def main(): args.contig2bin_tables, args.contigs, fasta_extensions=set(args.fasta_extensions), + temporary_dir=out_tmp_dir, ) if args.proteins and not args.resume: @@ -517,6 +526,7 @@ def main(): contig_to_kegg_counter, contig_to_genes = manage_protein_alignement( faa_file=faa_file, contigs_fasta=args.contigs, + temporary_dir=out_tmp_dir, contig_to_length=contig_to_length, contigs_in_bins=contigs_in_bins, diamond_result_file=diamond_result_file,