Skip to content

Commit

Permalink
improve parsing fasta fct
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Nov 18, 2024
1 parent c76c860 commit c1a961b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 56 deletions.
17 changes: 5 additions & 12 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# local libraries
from ppanggolin.annotate.synta import (
annotate_organism,
read_fasta,
get_contigs_from_fasta_file,
get_dna_sequence,
init_contig_counter,
contig_counter,
Expand Down Expand Up @@ -1182,9 +1182,6 @@ def check_chevrons_in_start_and_stop(
rna_counter += 1
contig.add_rna(rna)

# Correct coordinates of genes that overlap the edge of circulars contig
correct_putative_overlaps(org.contigs)

# Fix partial genes coordinates
for contig in org.contigs:
for gene in contig.genes:
Expand All @@ -1201,14 +1198,10 @@ def check_chevrons_in_start_and_stop(
has_fasta = False

if has_fasta:
contig_sequences = read_fasta(
org, fasta_string.split("\n")
) # _ is total contig length
contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n"))

correct_putative_overlaps(org.contigs)
for contig in org.contigs:
if contig.length != len(contig_sequences[contig.name]):
raise ValueError(
"The contig length defined is different than the sequence length"
)

for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
Expand Down Expand Up @@ -1616,7 +1609,7 @@ def get_gene_sequences_from_fastas(
f"your fasta file are different."
)
with read_compressed_or_not(Path(elements[1])) as currFastaFile:
fasta_dict[org] = read_fasta(org, currFastaFile)
fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile)

if set(pangenome.organisms) > set(fasta_dict.keys()):
missing = pangenome.number_of_organisms - len(
Expand Down
98 changes: 57 additions & 41 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from subprocess import Popen, PIPE
import ast
from collections import defaultdict
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Union, Generator, Tuple
from pathlib import Path

# install libraries
Expand Down Expand Up @@ -245,49 +245,65 @@ def launch_infernal(
return gene_objs


def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
"""Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
def parse_fasta(
fna_file: Union[TextIOWrapper, list]
) -> Generator[Tuple[str, str], None, None]:
"""Yields each header and sequence from a FASTA file or stream as a tuple.
:param org: Organism corresponding to fasta file
:param fna_file: Input fasta file with sequences or list of each line as sequence
:param fna_file: Input FASTA file or list of lines as sequences.
:yield: Tuple with contig header (without '>') and sequence.
"""
header = None
sequence = []

for line in fna_file:
line = line.strip()
if line.startswith(">"): # New header
if header: # Yield previous header and sequence
yield (header, "".join(sequence))
header = line[1:] # Strip '>'
sequence = []
else:
sequence.append(line)

if header: # Yield the final contig
yield (header, "".join(sequence))

:return: Dictionary with contig_name as keys and contig sequence in values

def get_contigs_from_fasta_file(
org: Organism, fna_file: Union[TextIOWrapper, list]
) -> Dict[str, str]:
"""Processes contigs from a parsed FASTA generator and stores in a dictionary.
:param org: Organism instance to update with contig info.
:param fna_file: Input FASTA file or list of lines as sequences.
:return: Dictionary with contig names as keys and sequences as values.
"""

global contig_counter
try:
contigs = {}
contig_seq = ""
contig = None
for line in fna_file:
if line.startswith(">"):
if len(contig_seq) >= 1: # contig filter = 1
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)
contig_seq = ""
try:
contig = org.get(line.split()[0][1:])
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, line.split()[0][1:])
contig_counter.value += 1
org.add(contig)
else:
contig_seq += line.strip()
if len(contig_seq) >= 1: # processing the last contig
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)

except AttributeError as e:
raise AttributeError(
f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
f"One possibility for this error is that the file did not start with a '>' "
f"as it would be expected from a fna file."
)
except Exception as err: # To manage other exception which can occur
raise Exception(
f"{err}: Please check your input file and if everything looks fine, "
"please post an issue on our github"
)
contigs = {}

for header, sequence in parse_fasta(fna_file):
contig_name = header.split()[0]

# Retrieve or create the contig
try:
contig = org.get(contig_name)
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, contig_name)
contig_counter.value += 1
org.add(contig)

# Update contig information
if contig.length is not None and contig.length != len(sequence):
raise ValueError(
f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence."
)

contig.length = len(sequence)
contigs[contig_name] = sequence.upper()

return contigs


Expand Down Expand Up @@ -464,7 +480,7 @@ def annotate_organism(

fasta_file = read_compressed_or_not(file_name)

contig_sequences = read_fasta(org, fasta_file)
contig_sequences = get_contigs_from_fasta_file(org, fasta_file)
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
if procedure is None: # prodigal procedure is not force by user
Expand Down
6 changes: 3 additions & 3 deletions ppanggolin/projection/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import yaml

# # local libraries
from ppanggolin.annotate.synta import read_fasta, get_dna_sequence
from ppanggolin.annotate.synta import get_contigs_from_fasta_file, get_dna_sequence
from ppanggolin.annotate.annotate import (
init_contig_counter,
read_anno_file,
Expand Down Expand Up @@ -679,7 +679,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path):
org_fasta_file = genome_name_to_annot_path[org.name]["path"]

with read_compressed_or_not(org_fasta_file) as currFastaFile:
org_contig_to_seq = read_fasta(org, currFastaFile)
org_contig_to_seq = get_contigs_from_fasta_file(org, currFastaFile)

for contig in org.contigs:
try:
Expand Down Expand Up @@ -997,7 +997,7 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file):
"""

with read_compressed_or_not(fasta_file) as currFastaFile:
contig_id2seq = read_fasta(input_organism, currFastaFile)
contig_id2seq = get_contigs_from_fasta_file(input_organism, currFastaFile)

for contig in input_organism.contigs:
try:
Expand Down

0 comments on commit c1a961b

Please sign in to comment.