Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignoring weird tRNA from Aragorn #245

Merged
merged 5 commits into from
Jul 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict:
elif len(line) > 0: # if the line isn't empty, there's data to get.
line_data = line.split()
start, stop = map(int, ast.literal_eval(line_data[2].replace("c", "")))
if start < 1 or stop < 1:
# In some case aragorn gives negative coordinates. This case is just ignored.
logging.warning(f'Aragorn gives non valide coordiates for a RNA gene: {line_data} This RNA is ignored.')
continue
c += 1
gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4))
gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+",
Expand Down
7 changes: 7 additions & 0 deletions ppanggolin/formats/readBinaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,13 @@ def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int
for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar):
rna = RNA(row["ID"].decode())
genedata = genedata_dict[row["genedata_id"]]
if genedata.start > genedata.stop:
logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) should not be greater than stop ({genedata.stop}). This gene is ignored.")
continue
if genedata.start < 1 or genedata.stop < 1:
logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) and stop ({genedata.stop}) should be greater than 0. This gene is ignored.")
continue

rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
gene_type=genedata.gene_type, name=genedata.name,
product=genedata.product)
Expand Down
33 changes: 17 additions & 16 deletions ppanggolin/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,8 @@ def contig(self, contig: Contig):
raise TypeError(f'Expected type Contig, got {type(contig)}')
self._contig = contig

def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "",
product: str = "", local_identifier: str = "", coordinates:List[Tuple[int]] = None):
def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "",
product: str = "", local_identifier: str = "", coordinates: List[Tuple[int]] = None):
"""
Fill general annotation for child classes

Expand All @@ -162,32 +162,33 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str =
coordinates = [(start, stop)]

if not isinstance(start, int):
raise TypeError("Start should be int")
raise TypeError(f"Start should be int. Got {type(start)} instead in {self} from {self.organism}.")
if not isinstance(stop, int):
raise TypeError("Stop should be int")
raise TypeError(f"Stop should be int. Got {type(stop)} instead in {self} from {self.organism}.")
if not isinstance(strand, str):
raise TypeError("Strand should be str")
raise TypeError(f"Strand should be str. Got {type(strand)} instead in {self} from {self.organism}.")
if not isinstance(gene_type, str):
raise TypeError("Gene type should be str")
raise TypeError(f"Gene type should be str. Got {type(gene_type)} instead in {self} from {self.organism}.")
if not isinstance(name, str):
raise TypeError("Name should be str")
raise TypeError(f"Name should be str. Got {type(name)} instead in {self} from {self.organism}.")
if not isinstance(product, str):
raise TypeError("Product should be str")
raise TypeError(f"Product should be str. Got {type(product)} instead in {self} from {self.organism}.")
if not isinstance(local_identifier, str):
raise TypeError("Local identifier should be str")
raise TypeError(f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}.")
if strand not in ["+", "-"]:
raise ValueError("Strand should be + or -")
raise ValueError(f"Strand should be '+' or '-'. Got {strand} instead in {self} from {self.organism}.")
if not isinstance(coordinates, list):
raise TypeError(f"coordinates should be of type list. Type {type(coordinates)} was given instead")
raise TypeError(f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}.")

for start_i, stop_i in coordinates:
if not isinstance(start_i, int):
raise TypeError("Start should be int")
raise TypeError(f"Start should be int. Got {type(start_i)} instead in {self} from {self.organism}.")
if not isinstance(stop_i, int):
raise TypeError("Stop should be int")
raise TypeError(f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}.")
if stop_i < start_i:
raise ValueError(f"Wrong coordinates: {coordinates}. start ({start_i}) should not be greater than stop ({stop_i}).")

raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.")
if start_i < 1 or stop_i < 1:
raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}.")

self.start = start
self.stop = stop
Expand Down
2 changes: 1 addition & 1 deletion tests/context/test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def simple_contig():
organism = Organism('organism_A')
for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')):
family = GeneFamily(i, family_name)
gene.fill_annotations(start=i, stop=i+1, strand="+", position=i)
gene.fill_annotations(start=i+1, stop=i+2, strand="+", position=i)

gene.fill_parents(organism, contig)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_pangenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ def add_element_to_pangenome(self, pangenome):
ctg = Contig(0, "Ctg")
org.add(ctg)
gene = Gene("Gene")
gene.fill_annotations(start=0, stop=100, position=0, strand='+')
gene.fill_annotations(start=1, stop=100, position=0, strand='+')
gene.add_metadata(metadata=metadata)
ctg.add(gene)
pangenome.add_organism(org)
Expand Down
30 changes: 15 additions & 15 deletions tests/test_region.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def genes(contig) -> Generator[Set[Gene], None, None]:
@pytest.fixture
def gene(contig) -> Gene:
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
contig = Contig(0, 'contig_name')
contig.length = 10
gene.contig = contig
Expand Down Expand Up @@ -145,7 +145,7 @@ def test_add_genes_at_position_already_taken(self, region, contig):
"""Test that adding genes with same position return a ValueError
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
gene.contig = contig

region.add(gene)
Expand All @@ -159,7 +159,7 @@ def test_add_genes_from_different_contigs(self, region):
"""Test that adding genes from different contigs return an Exception
"""
gene1, gene2 = Gene('gene_1'), Gene('gene_2')
gene1.fill_annotations(start=0, stop=10, strand='+', position=0)
gene1.fill_annotations(start=1, stop=10, strand='+', position=0)
gene2.fill_annotations(start=11, stop=20, strand='+', position=1)
gene1.fill_parents(None, Contig(1, 'contig_1'))
region.add(gene1)
Expand All @@ -171,7 +171,7 @@ def test_add_genes_from_different_organisms(self, region):
"""Test that adding genes from different organisms return an Exception
"""
gene1, gene2 = Gene('gene_1'), Gene('gene_2')
gene1.fill_annotations(start=0, stop=10, strand='+', position=0)
gene1.fill_annotations(start=1, stop=10, strand='+', position=0)
gene2.fill_annotations(start=11, stop=20, strand='+', position=1)
gene1.fill_parents(Organism("org_1"))
region.add(gene1)
Expand All @@ -183,7 +183,7 @@ def test_get_genes(self, region):
"""Tests that genes can be retrieved from the region
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
region.add(gene)
assert region.get(0) == gene

Expand All @@ -203,7 +203,7 @@ def test_del_gene(self, region):
"""Tests that genes can be deleted from the region
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
region.add(gene)
assert region.get(0) == gene
region.remove(0)
Expand Down Expand Up @@ -232,7 +232,7 @@ def test_get_organism(self, region, contig):
"""Tests that the organism linked to the region can be retrieved
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
gene.fill_parents(Organism("org"), contig)
region.add(gene)
assert region.organism.name == 'org'
Expand All @@ -241,7 +241,7 @@ def test_get_contig(self, region):
"""Tests that the contig linked to the region can be retrieved
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
gene.fill_parents(contig=Contig(0, "contig"))
region.add(gene)
assert region.contig.name == 'contig'
Expand All @@ -250,7 +250,7 @@ def test_is_whole_contig_true(self, region):
"""Tests that the property is_whole_contig return True if the region has the same length as contig
"""
starter, stopper = Gene('starter'), Gene('stopper')
starter.fill_annotations(start=0, stop=10, strand='+', position=0)
starter.fill_annotations(start=1, stop=10, strand='+', position=0)
stopper.fill_annotations(start=11, stop=20, strand='+', position=1)
contig = Contig(0, "contig")
contig[starter.start], contig[stopper.start] = starter, stopper
Expand All @@ -262,7 +262,7 @@ def test_is_whole_contig_false(self, region):
"""Tests that the property is_whole_contig return False if the region has not the same length as contig
"""
before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after')
before.fill_annotations(start=0, stop=10, strand='+', position=0)
before.fill_annotations(start=1, stop=10, strand='+', position=0)
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
Expand All @@ -278,7 +278,7 @@ def test_is_contig_border_true(self, region):
"""Test that property is_contig_border return true if the region is bordering the contig
"""
before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after')
before.fill_annotations(start=0, stop=10, strand='+', position=0)
before.fill_annotations(start=1, stop=10, strand='+', position=0)
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
Expand All @@ -299,7 +299,7 @@ def test_is_contig_border_false(self, region):
"""Tests that the property is_contig_border return False if the region is not bordering the contig
"""
before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after')
before.fill_annotations(start=0, stop=10, strand='+', position=0)
before.fill_annotations(start=1, stop=10, strand='+', position=0)
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
Expand Down Expand Up @@ -679,8 +679,8 @@ def test_add_different_region_with_same_name(self, spot):
"""
region_1, region_2 = Region("RGP"), Region("RGP")
gene_1, gene_2 = Gene("gene_1"), Gene("gene_2")
gene_1.fill_annotations(start=0, stop=10, strand='+', position=0)
gene_2.fill_annotations(start=0, stop=10, strand='+', position=0)
gene_1.fill_annotations(start=1, stop=10, strand='+', position=0)
gene_2.fill_annotations(start=1, stop=10, strand='+', position=0)
gene_1.family, gene_2.family = GeneFamily(0, "Fam_0"), GeneFamily(1, "Fam_1")
region_1[0], region_2[0] = gene_1, gene_2
spot[region_1.name] = region_1
Expand All @@ -691,7 +691,7 @@ def test_add_two_time_the_same_region(self, spot, region):
"""Test that adding a two time the same region is working as expected
"""
gene = Gene("gene")
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
gene.fill_annotations(start=1, stop=10, strand='+', position=0)
gene.family = GeneFamily(0, "Fam")
region[0] = gene
spot[region.name] = region
Expand Down