Merge branch 'dev' into issue195

labgem · Jun 13, 2024 · 5ecadc7 · 5ecadc7
2 parents 45655be + 8517fe6
commit 5ecadc7
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 13 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -191,6 +191,9 @@ jobs:
                               --genome_name chlam_annotated_with_prodigal --anno GBFF/GCF_003788785.1_ct114V1_genomic_prodigal_annotation.gff.gz \
                                --gff  --table --cpu $NUM_CPUS
 
+        # Projection of a plasmid with manually added chevrons, to test chevron handling in GFF
+        ppanggolin projection --pangenome myannopang/pangenome.h5 --anno GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz --cpu $NUM_CPUS -o projection_plasmid_with_chevron
+
     - name: testing write_genome_cmds
       shell: bash -l {0}
       run: |

diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -622,6 +622,29 @@ def get_id_attribute(attributes_dict: dict) -> str:
             raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. "
                             f"Not the case for file: {gff_file_path}")
         return element_id
+
+
+    def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, bool]:
+        """
+        Convert the start and stop coordinate strings to integers, tolerating chevrons ('<' or '>').
+        Every chevron found in either string is removed before the integer conversion.
+
+        :param start: The start string, which may contain chevrons (e.g. '<1').
+        :param stop: The stop string, which may contain chevrons (e.g. '>1500').
+        :raises ValueError: If a string is not a valid integer once its chevrons have been removed.
+        :return: A tuple (start, stop, chevrons_present); chevrons_present is True if either string held '<' or '>'.
+        """
+        chevrons_present = '>' in start or '<' in start or '>' in stop or '<' in stop
+        # Strip chevrons only when one was detected; plain coordinates are converted directly.
+        if chevrons_present:
+            start = int(start.replace('<', '').replace('>', ''))
+            stop = int(stop.replace('<', '').replace('>', ''))
+        else:
+            start = int(start)
+            stop = int(stop)
+
+        return start, stop, chevrons_present
+
 
     contig = None  # initialize contig
     has_fasta = False
@@ -667,8 +690,13 @@ def get_id_attribute(attributes_dict: dict) -> str:
             else:
                 fields_gff = [el.strip() for el in line.split('\t')]
                 attributes = get_gff_attributes(fields_gff)
+
                 pseudogene = False
 
+                start, stop, has_chevron = check_chevrons_in_start_and_stop(start=fields_gff[gff_start], stop=fields_gff[gff_end])
+                if has_chevron:
+                    pseudogene = True
+
                 if fields_gff[gff_type] == 'region':
                     # keep region attributes to add them as metadata of genome and contigs
                     # excluding some info as they are alredy contained in contig object.
@@ -731,17 +759,16 @@ def get_id_attribute(attributes_dict: dict) -> str:
                         if id_attribute in id_attr_to_gene_id:  # the ID has already been seen at least once in this genome
 
                             existing_gene = id_attr_to_gene_id[id_attribute]
-
-                            new_gene_info = {"strand": fields_gff[gff_strand],
-                                             "type": fields_gff[gff_type],
-                                             "name": name,
-                                             "position": contig.number_of_genes,
-                                             "product": product,
-                                             "local_identifier": gene_id,
-                                             "start": int(fields_gff[gff_start]),
-                                             "stop": int(fields_gff[gff_end]),
-                                             "ID": id_attribute}
-
+                            new_gene_info = {"strand":fields_gff[gff_strand], 
+                                            "type":fields_gff[gff_type],
+                                            "name":name,
+                                            "position":contig.number_of_genes,
+                                            "product":product,
+                                            "local_identifier":gene_id,
+                                            "start": start,
+                                            "stop": stop,
+                                            "ID": id_attribute}
+
                             check_and_add_extra_gene_part(existing_gene, new_gene_info)
 
                             continue
@@ -751,7 +778,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
                         id_attr_to_gene_id[id_attribute] = gene
 
                         # here contig is filled in order, so position is the number of genes already stored in the contig.
-                        gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
+                        gene.fill_annotations(start=start, stop=stop,
                                               strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
                                               position=contig.number_of_genes, product=product,
                                               local_identifier=gene_id,
@@ -765,7 +792,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
                         rna_type = fields_gff[gff_type]
                         rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4))
 
-                        rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
+                        rna.fill_annotations(start=start, stop=stop,
                                              strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
                                              product=product, local_identifier=gene_id)
                         rna.fill_parents(org, contig)

diff --git a/testingDataset/GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz b/testingDataset/GBFF/plasmid_NZ_CP007132_with_manually_added_chevrons.gff.gz