Skip to content

Commit

Permalink
add assertion and comment to prevent unlikely but possible bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Nov 30, 2023
1 parent c289a7c commit 897de65
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 4 deletions.
22 changes: 20 additions & 2 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
genetic_code=genetic_code)
contig.add(new_gene)
else: # if not CDS, it is RNA
new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4))
new_gene = RNA(org.name + f"_{gene_type}_" + str(rna_counter).zfill(4))
new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name,
product=product)
contig.add_rna(new_gene)
Expand Down Expand Up @@ -257,6 +257,8 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str]
:return: Organism object and if there are sequences associated or not
"""
# TODO: This function would need some refactoring.

global ctg_counter

(gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9)
Expand Down Expand Up @@ -301,11 +303,13 @@ def get_id_attribute(attributes_dict: dict) -> str:
gene_counter = 0
rna_counter = 0
attr_prodigal = None

with read_compressed_or_not(gff_file_path) as gff_file:
for line in gff_file:
if has_fasta:
fasta_string += line
continue

elif line.startswith('##', 0, 2):
if line.startswith('FASTA', 2, 7):
has_fasta = True
Expand All @@ -319,34 +323,47 @@ def get_id_attribute(attributes_dict: dict) -> str:
contig.length = int(fields[-1]) - int(fields[2]) + 1
else:
continue

elif line.startswith('#'):
if line.startswith('Sequence Data', 2, 15): # GFF from prodigal
fields_prodigal = [el.strip() for el in line.split(': ')[1].split(";")]
attr_prodigal = {field.split("=")[0]: field.split("=")[1] for field in fields_prodigal}
else: # comment lines to be ignores by parsers
continue

elif line == "": # empty lines are not expected, but they do not carry information, so we'll ignore them
continue

else:
fields_gff = [el.strip() for el in line.split('\t')]
attributes = get_gff_attributes(fields_gff)
pseudogene = False

if fields_gff[gff_type] == 'region':
if fields_gff[gff_seqname] in circular_contigs or ('Is_circular' in attributes and
attributes['Is_circular']):
# WARNING: In case we have prodigal gff with is_circular attributes.
# This would fail as contig is not defined. However is_circular should not be found in prodigal gff
contig.is_circular = True
assert contig.name == fields_gff[gff_seqname]

elif fields_gff[gff_type] == 'CDS' or "RNA" in fields_gff[gff_type]:
gene_id = attributes.get("PROTEIN_ID")
# if there is a 'PROTEIN_ID' attribute, it's where the ncbi stores the actual gene ids, so we use that.

if gene_id is None:
# if it's not found, we get the one under the 'ID' field which must exist
# (otherwise not a gff3 compliant file)
gene_id = get_id_attribute(attributes)

name = attributes.pop('NAME', attributes.pop('GENE', ""))

if "pseudo" in attributes or "pseudogene" in attributes:
pseudogene = True

product = attributes.pop('PRODUCT', "")
genetic_code = int(attributes.pop("TRANSL_TABLE", 11))

if contig is None or contig.name != fields_gff[gff_seqname]:
# get the current contig
try:
Expand All @@ -372,7 +389,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
contig.add(gene)

elif "RNA" in fields_gff[gff_type]:
rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4))
rna_type = fields_gff[gff_type]
rna = RNA(org.name + f"_{rna_type}_" + str(rna_counter).zfill(4))
rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name,
product=product, local_identifier=gene_id)
Expand Down
4 changes: 2 additions & 2 deletions ppanggolin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,14 +318,14 @@ def detect_filetype(filename: Path) -> str:
first_line = f.readline()
if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file
return "gbff"
elif first_line.startswith("##gff-version 3"):
elif first_line.startswith("##gff-version 3") or first_line.startswith("##gff-version 3"): # prodigal gff header has two spaces betwene gff-version and 3...
return 'gff'
elif first_line.startswith(">"):
return 'fasta'
elif "\t" in first_line:
return "tsv"
else:
raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') "
raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') "
"nor gbff/gbk (file starts with 'LOCUS '). "
"Only those two file formats are supported (for now).")

Expand Down

0 comments on commit 897de65

Please sign in to comment.