Skip to content

Commit

Permalink
gzip temporary files
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Dec 18, 2024
1 parent f3780e8 commit 5649c64
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 13 deletions.
8 changes: 1 addition & 7 deletions binette/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def write_faa(outfaa: str, contig_to_genes: List[Tuple[str, pyrodigal.Genes]]) -
"""
logging.info("Writing predicted protein sequences.")
with open(outfaa, "w") as fl:
with gzip.open(outfaa, "wt") as fl:
for contig_id, genes in contig_to_genes:
genes.write_translations(fl, contig_id)

Expand Down Expand Up @@ -237,12 +237,6 @@ def filter_faa_file(
originating from contigs listed in `contigs_to_keep`, and writes the filtered
sequences to a new FASTA file. The output file supports optional `.gz` compression.
Metrics computed and logged:
- Total number of contigs in `contigs_to_keep`.
- Number and proportion of contigs with at least one protein-coding gene.
- Number and proportion of contigs without any protein-coding genes.
- Number of contigs from the input FASTA file that are not in `contigs_to_keep`.
:param contigs_to_keep: A set of contig names to retain in the output FASTA file.
:param input_faa_file: Path to the input FASTA file containing protein sequences.
:param filtered_faa_file: Path to the output FASTA file for filtered sequences.
Expand Down
1 change: 1 addition & 0 deletions binette/diamond.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def run(
f"-o {output} "
f"--threads {threads} "
f"--db {db} "
f"--compress 1 "
f"--query-cover {query_cover} "
f"--subject-cover {subject_cover} "
f"--id {percent_id} "
Expand Down
15 changes: 9 additions & 6 deletions binette/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,9 @@ def manage_protein_alignement(

else:
contigs_iterator = (
s
for s in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
if s.name in contigs_in_bins
seq
for seq in contig_manager.parse_fasta_file(contigs_fasta.as_posix())
if seq.name in contigs_in_bins
)
contig_to_genes = cds.predict(contigs_iterator, faa_file.as_posix(), threads)

Expand All @@ -330,7 +330,10 @@ def manage_protein_alignement(
else:
raise FileNotFoundError(checkm2_db)

diamond_log = diamond_result_file.parents[0] / f"{diamond_result_file.stem}.log"
diamond_log = (
diamond_result_file.parents[0]
/ f"{diamond_result_file.stem.split('.')[0]}.log"
)

diamond.run(
faa_file.as_posix(),
Expand Down Expand Up @@ -482,9 +485,9 @@ def main():

use_existing_protein_file = False

faa_file = out_tmp_dir / "assembly_proteins.faa"
faa_file = out_tmp_dir / "assembly_proteins.faa.gz"

diamond_result_file = out_tmp_dir / "diamond_result.tsv"
diamond_result_file = out_tmp_dir / "diamond_result.tsv.gz"

# Output files #
final_bin_report: Path = args.outdir / "final_bins_quality_reports.tsv"
Expand Down

0 comments on commit 5649c64

Please sign in to comment.