Skip to content

Commit

Permalink
manage chevron in GFF start and stop
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Jun 12, 2024
1 parent 6a2be8b commit 05ac0bb
Showing 1 changed file with 32 additions and 4 deletions.
36 changes: 32 additions & 4 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,29 @@ def get_id_attribute(attributes_dict: dict) -> str:
raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. "
f"Not the case for file: {gff_file_path}")
return element_id


def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, bool]:
    """
    Convert GFF start and stop fields to integers, tolerating chevrons ('<' or '>').

    Every '<' and '>' character found in either string is removed before the
    conversion to int. The returned flag reports whether at least one chevron
    was found in either of the two strings.

    :param start: The start string which may contain chevrons.
    :param stop: The stop string which may contain chevrons.

    :return: A tuple containing the integer values of start and stop, and a boolean
             indicating if chevrons were present in either string.

    :raises ValueError: If a string is not a valid integer once chevrons are removed.
    """
    chevrons = '<>'
    chevrons_present = any(symbol in coordinate for coordinate in (start, stop) for symbol in chevrons)

    # Deleting chevrons is a no-op on clean coordinates, so a single code path
    # covers both cases. Distinct names keep the str parameters from being
    # rebound to int.
    chevron_remover = str.maketrans('', '', chevrons)
    start_position = int(start.translate(chevron_remover))
    stop_position = int(stop.translate(chevron_remover))

    return start_position, stop_position, chevrons_present


contig = None # initialize contig
has_fasta = False
Expand Down Expand Up @@ -646,8 +669,13 @@ def get_id_attribute(attributes_dict: dict) -> str:
else:
fields_gff = [el.strip() for el in line.split('\t')]
attributes = get_gff_attributes(fields_gff)

pseudogene = False

start, stop, has_chevron = check_chevrons_in_start_and_stop(start=fields_gff[gff_start], stop=fields_gff[gff_end])
if has_chevron:
pseudogene = True

if fields_gff[gff_type] == 'region':
# keep region attributes to add them as metadata of genome and contigs
# excluding some info as they are already contained in contig object.
Expand Down Expand Up @@ -714,8 +742,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
"position":contig.number_of_genes,
"product":product,
"local_identifier":gene_id,
"start": int(fields_gff[gff_start]),
"stop": int(fields_gff[gff_end]),
"start": start,
"stop": stop,
"ID": id_attribute}

check_and_add_extra_gene_part(existing_gene, new_gene_info)
Expand All @@ -728,7 +756,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
id_attr_to_gene_id[id_attribute] = gene

# here contig is filled in order, so position is the number of genes already stored in the contig.
gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
gene.fill_annotations(start=start, stop=stop,
strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
position=contig.number_of_genes, product=product,
local_identifier=gene_id,
Expand All @@ -742,7 +770,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
rna_type = fields_gff[gff_type]
rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4))

rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
rna.fill_annotations(start=start, stop=stop,
strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
product=product, local_identifier=gene_id)
rna.fill_parents(org, contig)
Expand Down

0 comments on commit 05ac0bb

Please sign in to comment.