add assertion and comment to prevent unlikely but possible bugs

labgem · Nov 30, 2023 · 897de65 · 897de65
1 parent c289a7c
commit 897de65
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 4 deletions.
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -87,7 +87,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
                                   genetic_code=genetic_code)
         contig.add(new_gene)
     else:  # if not CDS, it is RNA
-        new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4))
+        new_gene = RNA(org.name + f"_{gene_type}_" + str(rna_counter).zfill(4))
         new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name,
                                   product=product)
         contig.add_rna(new_gene)
@@ -257,6 +257,8 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str]
 
     :return: Organism object and if there are sequences associated or not
     """
+    # TODO: This function would need some refactoring. 
+
     global ctg_counter
 
     (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9)
@@ -301,11 +303,13 @@ def get_id_attribute(attributes_dict: dict) -> str:
     gene_counter = 0
     rna_counter = 0
     attr_prodigal = None
+
     with read_compressed_or_not(gff_file_path) as gff_file:
         for line in gff_file:
             if has_fasta:
                 fasta_string += line
                 continue
+
             elif line.startswith('##', 0, 2):
                 if line.startswith('FASTA', 2, 7):
                     has_fasta = True
@@ -319,34 +323,47 @@ def get_id_attribute(attributes_dict: dict) -> str:
                     contig.length = int(fields[-1]) - int(fields[2]) + 1
                 else:
                     continue
+
             elif line.startswith('#'):
                 if line.startswith('Sequence Data', 2, 15):  # GFF from prodigal
                     fields_prodigal = [el.strip() for el in line.split(': ')[1].split(";")]
                     attr_prodigal = {field.split("=")[0]: field.split("=")[1] for field in fields_prodigal}
                 else:  # comment lines to be ignores by parsers
                     continue
+
             elif line == "":  # empty lines are not expected, but they do not carry information, so we'll ignore them
                 continue
+
             else:
                 fields_gff = [el.strip() for el in line.split('\t')]
                 attributes = get_gff_attributes(fields_gff)
                 pseudogene = False
+
                 if fields_gff[gff_type] == 'region':
                     if fields_gff[gff_seqname] in circular_contigs or ('Is_circular' in attributes and
                                                                        attributes['Is_circular']):
+                        # WARNING: In case we have prodigal gff with is_circular attributes. 
+                        # This would fail as contig is not defined. However is_circular should not be found in prodigal gff
                         contig.is_circular = True
+                        assert contig.name == fields_gff[gff_seqname]
+
                 elif fields_gff[gff_type] == 'CDS' or "RNA" in fields_gff[gff_type]:
                     gene_id = attributes.get("PROTEIN_ID")
                     # if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.
+
                     if gene_id is None:
                         # if it's not found, we get the one under the 'ID' field which must exist
                         # (otherwise not a gff3 compliant file)
                         gene_id = get_id_attribute(attributes)
+
                     name = attributes.pop('NAME', attributes.pop('GENE', ""))
+
                     if "pseudo" in attributes or "pseudogene" in attributes:
                         pseudogene = True
+
                     product = attributes.pop('PRODUCT', "")
                     genetic_code = int(attributes.pop("TRANSL_TABLE", 11))
+
                     if contig is None or contig.name != fields_gff[gff_seqname]:
                         # get the current contig
                         try:
@@ -372,7 +389,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
                         contig.add(gene)
 
                     elif "RNA" in fields_gff[gff_type]:
-                        rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4))
+                        rna_type = fields_gff[gff_type]
+                        rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4))
                         rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
                                              strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
                                              product=product, local_identifier=gene_id)

diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py
@@ -318,14 +318,14 @@ def detect_filetype(filename: Path) -> str:
         first_line = f.readline()
     if first_line.startswith("LOCUS       "):  # then this is probably a gbff/gbk file
         return "gbff"
-    elif first_line.startswith("##gff-version 3"):
+    elif first_line.startswith("##gff-version 3") or first_line.startswith("##gff-version  3"): # prodigal gff header has two spaces betwene gff-version and 3... 
         return 'gff'
     elif first_line.startswith(">"):
         return 'fasta'
     elif "\t" in first_line:
         return "tsv"
     else:
-        raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') "
+        raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') "
                         "nor gbff/gbk (file starts with 'LOCUS       '). "
                         "Only those two file formats are supported (for now).")