Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Contig Length Detection for GFF Parsing #301

Merged
merged 8 commits into from
Nov 19, 2024
38 changes: 19 additions & 19 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# local libraries
from ppanggolin.annotate.synta import (
annotate_organism,
read_fasta,
get_contigs_from_fasta_file,
get_dna_sequence,
init_contig_counter,
contig_counter,
Expand Down Expand Up @@ -1034,12 +1034,10 @@ def check_chevrons_in_start_and_stop(
dbxref_metadata
)

if fields_gff[gff_seqname] in circular_contigs or (
if contig is not None and (
axbazin marked this conversation as resolved.
Show resolved Hide resolved
"IS_CIRCULAR" in attributes
and attributes["IS_CIRCULAR"] == "true"
):
# WARNING: In case we have prodigal gff with is_circular attributes.
# This would fail as contig is not defined. However, is_circular should not be found in prodigal gff
logging.getLogger("PPanGGOLiN").debug(
f"Contig {contig.name} is circular."
)
Expand Down Expand Up @@ -1120,9 +1118,9 @@ def check_chevrons_in_start_and_stop(
gene_frame = int(fields_gff[frame])

if (
gene_id in id_attr_to_gene_id
id_attribute in id_attr_to_gene_id
): # the ID has already been seen at least once in this genome
existing_gene = id_attr_to_gene_id[gene_id]
existing_gene = id_attr_to_gene_id[id_attribute]
new_gene_info = {
"strand": fields_gff[gff_strand],
"type": fields_gff[gff_type],
Expand All @@ -1141,7 +1139,7 @@ def check_chevrons_in_start_and_stop(

gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4))

id_attr_to_gene_id[gene_id] = gene
id_attr_to_gene_id[id_attribute] = gene

# here contig is filled in order, so position is the number of genes already stored in the contig.
gene.fill_annotations(
Expand Down Expand Up @@ -1182,9 +1180,6 @@ def check_chevrons_in_start_and_stop(
rna_counter += 1
contig.add_rna(rna)

# Correct coordinates of genes that overlap the edge of circulars contig
correct_putative_overlaps(org.contigs)

# Fix partial genes coordinates
for contig in org.contigs:
for gene in contig.genes:
Expand All @@ -1201,14 +1196,11 @@ def check_chevrons_in_start_and_stop(
has_fasta = False

if has_fasta:
contig_sequences = read_fasta(
org, fasta_string.split("\n")
) # _ is total contig length
contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n"))

correct_putative_overlaps(org.contigs)

for contig in org.contigs:
if contig.length != len(contig_sequences[contig.name]):
raise ValueError(
"The contig length defined is different than the sequence length"
)

for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
Expand All @@ -1220,7 +1212,7 @@ def check_chevrons_in_start_and_stop(
add_metadata_from_gff_file(contig_name_to_region_info, org, gff_file_path)

if used_transl_table_arg:
logging.getLogger("PPanGGOLiN").info(
logging.getLogger("PPanGGOLiN").debug(
f"transl_table tag was not found for {used_transl_table_arg} CDS "
f"in {gff_file_path}. Provided translation_table argument value was used instead: {translation_table}."
)
Expand Down Expand Up @@ -1616,7 +1608,15 @@ def get_gene_sequences_from_fastas(
f"your fasta file are different."
)
with read_compressed_or_not(Path(elements[1])) as currFastaFile:
fasta_dict[org] = read_fasta(org, currFastaFile)
fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile)

# When dealing with GFF files, some genes may have coordinates extending beyond the actual
# length of contigs, especially when they overlap the edges. This usually needs to be split
# into two parts to handle the circular genome wrapping.
# If the GFF file lacks associated FASTA sequences and it was not possible to determine the
# contig length from the GFF file, we must apply this correction while parsing the external FASTA file.

correct_putative_overlaps(org.contigs)

if set(pangenome.organisms) > set(fasta_dict.keys()):
missing = pangenome.number_of_organisms - len(
Expand Down
98 changes: 57 additions & 41 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from subprocess import Popen, PIPE
import ast
from collections import defaultdict
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Union, Generator, Tuple
from pathlib import Path

# install libraries
Expand Down Expand Up @@ -245,49 +245,65 @@ def launch_infernal(
return gene_objs


def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
"""Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
def parse_fasta(
fna_file: Union[TextIOWrapper, list]
) -> Generator[Tuple[str, str], None, None]:
"""Yields each header and sequence from a FASTA file or stream as a tuple.

:param org: Organism corresponding to fasta file
:param fna_file: Input fasta file with sequences or list of each line as sequence
:param fna_file: Input FASTA file or list of lines as sequences.
:yield: Tuple with contig header (without '>') and sequence.
"""
header = None
sequence = []

for line in fna_file:
line = line.strip()
if line.startswith(">"): # New header
if header: # Yield previous header and sequence
yield (header, "".join(sequence))
header = line[1:] # Strip '>'
sequence = []
else:
sequence.append(line)

if header: # Yield the final contig
yield (header, "".join(sequence))

:return: Dictionary with contig_name as keys and contig sequence in values

def get_contigs_from_fasta_file(
org: Organism, fna_file: Union[TextIOWrapper, list]
) -> Dict[str, str]:
"""Processes contigs from a parsed FASTA generator and stores in a dictionary.

:param org: Organism instance to update with contig info.
:param fna_file: Input FASTA file or list of lines as sequences.
:return: Dictionary with contig names as keys and sequences as values.
"""

global contig_counter
try:
contigs = {}
contig_seq = ""
contig = None
for line in fna_file:
if line.startswith(">"):
if len(contig_seq) >= 1: # contig filter = 1
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)
contig_seq = ""
try:
contig = org.get(line.split()[0][1:])
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, line.split()[0][1:])
contig_counter.value += 1
org.add(contig)
else:
contig_seq += line.strip()
if len(contig_seq) >= 1: # processing the last contig
contigs[contig.name] = contig_seq.upper()
contig.length = len(contig_seq)

except AttributeError as e:
raise AttributeError(
JeanMainguy marked this conversation as resolved.
Show resolved Hide resolved
f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
f"One possibility for this error is that the file did not start with a '>' "
f"as it would be expected from a fna file."
)
except Exception as err: # To manage other exception which can occur
raise Exception(
f"{err}: Please check your input file and if everything looks fine, "
"please post an issue on our github"
)
contigs = {}

for header, sequence in parse_fasta(fna_file):
contig_name = header.split()[0]

# Retrieve or create the contig
try:
contig = org.get(contig_name)
except KeyError:
with contig_counter.get_lock():
contig = Contig(contig_counter.value, contig_name)
contig_counter.value += 1
org.add(contig)

# Update contig information
if contig.length is not None and contig.length != len(sequence):
raise ValueError(
f"Length mismatch for contig {contig_name}: expected {contig.length}, found {len(sequence)} from the fasta sequence."
)

contig.length = len(sequence)
contigs[contig_name] = sequence.upper()

return contigs


Expand Down Expand Up @@ -464,7 +480,7 @@ def annotate_organism(

fasta_file = read_compressed_or_not(file_name)

contig_sequences = read_fasta(org, fasta_file)
contig_sequences = get_contigs_from_fasta_file(org, fasta_file)
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
if procedure is None: # prodigal procedure is not force by user
Expand Down
2 changes: 0 additions & 2 deletions ppanggolin/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,6 @@ def __setitem__(self, coordinate: Tuple[int, int, str], gene: Gene):
@property
def length(self) -> Union[int, None]:
"""Get the length of the contig"""
if self._length is None:
logging.getLogger("PPanGGOLiN").warning("Contig length is unknown")
return self._length

@length.setter
Expand Down
6 changes: 3 additions & 3 deletions ppanggolin/projection/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import yaml

# # local libraries
from ppanggolin.annotate.synta import read_fasta, get_dna_sequence
from ppanggolin.annotate.synta import get_contigs_from_fasta_file, get_dna_sequence
from ppanggolin.annotate.annotate import (
init_contig_counter,
read_anno_file,
Expand Down Expand Up @@ -679,7 +679,7 @@ def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path):
org_fasta_file = genome_name_to_annot_path[org.name]["path"]

with read_compressed_or_not(org_fasta_file) as currFastaFile:
org_contig_to_seq = read_fasta(org, currFastaFile)
org_contig_to_seq = get_contigs_from_fasta_file(org, currFastaFile)

for contig in org.contigs:
try:
Expand Down Expand Up @@ -997,7 +997,7 @@ def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file):
"""

with read_compressed_or_not(fasta_file) as currFastaFile:
contig_id2seq = read_fasta(input_organism, currFastaFile)
contig_id2seq = get_contigs_from_fasta_file(input_organism, currFastaFile)

for contig in input_organism.contigs:
try:
Expand Down
Loading