Skip to content

Commit

Permalink
Merge branch 'dev' into issue195
Browse files Browse the repository at this point in the history
  • Loading branch information
jpjarnoux authored Jun 13, 2024
2 parents 45655be + 8517fe6 commit 5ecadc7
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 13 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ jobs:
--genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
--gff --table --cpu $NUM_CPUS
# projection of a plasmid with chevrons that have been added manually to test chevron handling in GFF
ppanggolin projection --pangenome myannopang/pangenome.h5 --anno GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron
- name: testing write_genome_cmds
shell: bash -l {0}
run: |
Expand Down
53 changes: 40 additions & 13 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,29 @@ def get_id_attribute(attributes_dict: dict) -> str:
raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. "
f"Not the case for file: {gff_file_path}")
return element_id


def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, bool]:
    """
    Remove chevrons ('<' or '>') from the start and stop strings and convert what remains to integers.

    Every chevron character is removed, wherever it occurs in the string, before the conversion.

    :param start: The start string which may contain chevrons.
    :param stop: The stop string which may contain chevrons.

    :return: A tuple containing the integer values of start and stop, and a boolean indicating
             if chevrons were present in either string.

    :raises ValueError: If start or stop is not a valid integer once chevrons are removed.
    """
    chevrons = "<>"
    chevrons_present = any(char in chevrons for char in start + stop)

    # Deleting chevrons from a chevron-free string is a no-op, so a single code path
    # covers both cases and the str parameters are never rebound to int.
    strip_chevrons = str.maketrans("", "", chevrons)
    start_position = int(start.translate(strip_chevrons))
    stop_position = int(stop.translate(strip_chevrons))

    return start_position, stop_position, chevrons_present


contig = None # initialize contig
has_fasta = False
Expand Down Expand Up @@ -667,8 +690,13 @@ def get_id_attribute(attributes_dict: dict) -> str:
else:
fields_gff = [el.strip() for el in line.split('\t')]
attributes = get_gff_attributes(fields_gff)

pseudogene = False

start, stop, has_chevron = check_chevrons_in_start_and_stop(start=fields_gff[gff_start], stop=fields_gff[gff_end])
if has_chevron:
pseudogene = True

if fields_gff[gff_type] == 'region':
# keep region attributes to add them as metadata of genome and contigs
# excluding some info as they are already contained in contig object.
Expand Down Expand Up @@ -731,17 +759,16 @@ def get_id_attribute(attributes_dict: dict) -> str:
if id_attribute in id_attr_to_gene_id: # the ID has already been seen at least once in this genome

existing_gene = id_attr_to_gene_id[id_attribute]

new_gene_info = {"strand": fields_gff[gff_strand],
"type": fields_gff[gff_type],
"name": name,
"position": contig.number_of_genes,
"product": product,
"local_identifier": gene_id,
"start": int(fields_gff[gff_start]),
"stop": int(fields_gff[gff_end]),
"ID": id_attribute}

new_gene_info = {"strand":fields_gff[gff_strand],
"type":fields_gff[gff_type],
"name":name,
"position":contig.number_of_genes,
"product":product,
"local_identifier":gene_id,
"start": start,
"stop": stop,
"ID": id_attribute}

check_and_add_extra_gene_part(existing_gene, new_gene_info)

continue
Expand All @@ -751,7 +778,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
id_attr_to_gene_id[id_attribute] = gene

# here contig is filled in order, so position is the number of genes already stored in the contig.
gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
gene.fill_annotations(start=start, stop=stop,
strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
position=contig.number_of_genes, product=product,
local_identifier=gene_id,
Expand All @@ -765,7 +792,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
rna_type = fields_gff[gff_type]
rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4))

rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
rna.fill_annotations(start=start, stop=stop,
strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
product=product, local_identifier=gene_id)
rna.fill_parents(org, contig)
Expand Down
Binary file not shown.

0 comments on commit 5ecadc7

Please sign in to comment.