From c1a961baa480bbfd0bfdc56e3df27e1d32500e7e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Nov 2024 14:45:36 +0100 Subject: [PATCH] improve parsing fasta fct --- ppanggolin/annotate/annotate.py | 17 ++--- ppanggolin/annotate/synta.py | 98 +++++++++++++++++------------ ppanggolin/projection/projection.py | 6 +- 3 files changed, 65 insertions(+), 56 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 9f84594e..ec868264 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -22,7 +22,7 @@ # local libraries from ppanggolin.annotate.synta import ( annotate_organism, - read_fasta, + get_contigs_from_fasta_file, get_dna_sequence, init_contig_counter, contig_counter, @@ -1182,9 +1182,6 @@ def check_chevrons_in_start_and_stop( rna_counter += 1 contig.add_rna(rna) - # Correct coordinates of genes that overlap the edge of circulars contig - correct_putative_overlaps(org.contigs) - # Fix partial genes coordinates for contig in org.contigs: for gene in contig.genes: @@ -1201,14 +1198,10 @@ def check_chevrons_in_start_and_stop( has_fasta = False if has_fasta: - contig_sequences = read_fasta( - org, fasta_string.split("\n") - ) # _ is total contig length + contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n")) + + correct_putative_overlaps(org.contigs) for contig in org.contigs: - if contig.length != len(contig_sequences[contig.name]): - raise ValueError( - "The contig length defined is different than the sequence length" - ) for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -1616,7 +1609,7 @@ def get_gene_sequences_from_fastas( f"your fasta file are different." ) with read_compressed_or_not(Path(elements[1])) as currFastaFile: - fasta_dict[org] = read_fasta(org, currFastaFile) + fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): missing = pangenome.number_of_organisms - len( diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 2b65ea40..69043050 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -9,7 +9,7 @@ from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Generator, Tuple from pathlib import Path # install libraries @@ -245,49 +245,65 @@ def launch_infernal( return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: - """Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. +def parse_fasta( + fna_file: Union[TextIOWrapper, list] +) -> Generator[Tuple[str, str], None, None]: + """Yields each header and sequence from a FASTA file or stream as a tuple. - :param org: Organism corresponding to fasta file - :param fna_file: Input fasta file with sequences or list of each line as sequence + :param fna_file: Input FASTA file or list of lines as sequences. + :yield: Tuple with contig header (without '>') and sequence. + """ + header = None + sequence = [] + + for line in fna_file: + line = line.strip() + if line.startswith(">"): # New header + if header: # Yield previous header and sequence + yield (header, "".join(sequence)) + header = line[1:] # Strip '>' + sequence = [] + else: + sequence.append(line) + + if header: # Yield the final contig + yield (header, "".join(sequence)) - :return: Dictionary with contig_name as keys and contig sequence in values + +def get_contigs_from_fasta_file( + org: Organism, fna_file: Union[TextIOWrapper, list] +) -> Dict[str, str]: + """Processes contigs from a parsed FASTA generator and stores in a dictionary. + + :param org: Organism instance to update with contig info. + :param fna_file: Input FASTA file or list of lines as sequences. + :return: Dictionary with contig names as keys and sequences as values. """ + global contig_counter - try: - contigs = {} - contig_seq = "" - contig = None - for line in fna_file: - if line.startswith(">"): - if len(contig_seq) >= 1: # contig filter = 1 - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - contig_seq = "" - try: - contig = org.get(line.split()[0][1:]) - except KeyError: - with contig_counter.get_lock(): - contig = Contig(contig_counter.value, line.split()[0][1:]) - contig_counter.value += 1 - org.add(contig) - else: - contig_seq += line.strip() - if len(contig_seq) >= 1: # processing the last contig - contigs[contig.name] = contig_seq.upper() - contig.length = len(contig_seq) - - except AttributeError as e: - raise AttributeError( - f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " - f"One possibility for this error is that the file did not start with a '>' " - f"as it would be expected from a fna file." - ) - except Exception as err: # To manage other exception which can occur - raise Exception( - f"{err}: Please check your input file and if everything looks fine, " - "please post an issue on our github" - ) + contigs = {} + + for header, sequence in parse_fasta(fna_file): + contig_name = header.split()[0] + + # Retrieve or create the contig + try: + contig = org.get(contig_name) + except KeyError: + with contig_counter.get_lock(): + contig = Contig(contig_counter.value, contig_name) + contig_counter.value += 1 + org.add(contig) + + # Update contig information + if contig.length is not None and contig.length != len(sequence): + raise ValueError( + f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence." + ) + + contig.length = len(sequence) + contigs[contig_name] = sequence.upper() + return contigs @@ -464,7 +480,7 @@ def annotate_organism( fasta_file = read_compressed_or_not(file_name) - contig_sequences = read_fasta(org, fasta_file) + contig_sequences = get_contigs_from_fasta_file(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index d577e840..b0a36c5f 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -20,7 +20,7 @@ import yaml # # local libraries -from ppanggolin.annotate.synta import read_fasta, get_dna_sequence +from ppanggolin.annotate.synta import get_contigs_from_fasta_file, get_dna_sequence from ppanggolin.annotate.annotate import ( init_contig_counter, read_anno_file, @@ -679,7 +679,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path): org_fasta_file = genome_name_to_annot_path[org.name]["path"] with read_compressed_or_not(org_fasta_file) as currFastaFile: - org_contig_to_seq = read_fasta(org, currFastaFile) + org_contig_to_seq = get_contigs_from_fasta_file(org, currFastaFile) for contig in org.contigs: try: @@ -997,7 +997,7 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file): """ with read_compressed_or_not(fasta_file) as currFastaFile: - contig_id2seq = read_fasta(input_organism, currFastaFile) + contig_id2seq = get_contigs_from_fasta_file(input_organism, currFastaFile) for contig in input_organism.contigs: try: