From 7ee9bd41f44527ce67fa5afcfca5c736f7474498 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Nov 2024 15:22:42 +0100 Subject: [PATCH] add overlap correction when parsing external fasta along GFF files --- ppanggolin/annotate/annotate.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index ec868264..c478a4f6 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -1034,12 +1034,10 @@ def check_chevrons_in_start_and_stop( dbxref_metadata ) - if fields_gff[gff_seqname] in circular_contigs or ( + if contig is not None and ( "IS_CIRCULAR" in attributes and attributes["IS_CIRCULAR"] == "true" ): - # WARNING: In case we have prodigal gff with is_circular attributes. - # This would fail as contig is not defined. However, is_circular should not be found in prodigal gff logging.getLogger("PPanGGOLiN").debug( f"Contig {contig.name} is circular." ) @@ -1201,6 +1199,7 @@ def check_chevrons_in_start_and_stop( contig_sequences = get_contigs_from_fasta_file(org, fasta_string.split("\n")) correct_putative_overlaps(org.contigs) + for contig in org.contigs: for gene in contig.genes: @@ -1611,6 +1610,14 @@ def get_gene_sequences_from_fastas( with read_compressed_or_not(Path(elements[1])) as currFastaFile: fasta_dict[org] = get_contigs_from_fasta_file(org, currFastaFile) + # When dealing with GFF files, some genes may have coordinates extending beyond the actual + # length of contigs, especially when they overlap the edges. Such genes usually need to be split + # into two parts to handle the circular genome wrapping. + # If the GFF file lacks associated FASTA sequences and it was not possible to determine the + # contig length from the GFF file, we must apply this correction while parsing the external FASTA file. 
+ + correct_putative_overlaps(org.contigs) + if set(pangenome.organisms) > set(fasta_dict.keys()): missing = pangenome.number_of_organisms - len( set(pangenome.organisms) & set(fasta_dict.keys())