From 881d2da0d64eaa7472f79aa4ea952bd4abcf45a8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 1 Jul 2024 18:39:43 +0200 Subject: [PATCH 1/5] improve error message in fill_annotation method --- ppanggolin/genome.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 87e2ef46..57d2d04a 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -141,8 +141,8 @@ def contig(self, contig: Contig): raise TypeError(f'Expected type Contig, got {type(contig)}') self._contig = contig - def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", - product: str = "", local_identifier: str = "", coordinates:List[Tuple[int]] = None): + def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", + product: str = "", local_identifier: str = "", coordinates: List[Tuple[int]] = None): """ Fill general annotation for child classes @@ -162,31 +162,31 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = coordinates = [(start, stop)] if not isinstance(start, int): - raise TypeError("Start should be int") + raise TypeError(f"Start should be int. Got {type(start)} instead in {self} from {self.organism}.") if not isinstance(stop, int): - raise TypeError("Stop should be int") + raise TypeError(f"Stop should be int. Got {type(stop)} instead in {self} from {self.organism}.") if not isinstance(strand, str): - raise TypeError("Strand should be str") + raise TypeError(f"Strand should be str. Got {type(strand)} instead in {self} from {self.organism}.") if not isinstance(gene_type, str): - raise TypeError("Gene type should be str") + raise TypeError(f"Gene type should be str. Got {type(gene_type)} instead in {self} from {self.organism}.") if not isinstance(name, str): - raise TypeError("Name should be str") + raise TypeError(f"Name should be str. Got {type(name)} instead in {self} from {self.organism}.") if not isinstance(product, str): - raise TypeError("Product should be str") + raise TypeError(f"Product should be str. Got {type(product)} instead in {self} from {self.organism}.") if not isinstance(local_identifier, str): - raise TypeError("Local identifier should be str") + raise TypeError(f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}.") if strand not in ["+", "-"]: - raise ValueError("Strand should be + or -") + raise ValueError(f"Strand should be '+' or '-'. Got {strand} instead in {self} from {self.organism}.") if not isinstance(coordinates, list): - raise TypeError(f"coordinates should be of type list. Type {type(coordinates)} was given instead") - + raise TypeError(f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}.") + for start_i, stop_i in coordinates: if not isinstance(start_i, int): - raise TypeError("Start should be int") + raise TypeError(f"Start should be int. Got {type(start_i)} instead in {self} from {self.organism}.") if not isinstance(stop_i, int): - raise TypeError("Stop should be int") + raise TypeError(f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}.") if stop_i < start_i: - raise ValueError(f"Wrong coordinates: {coordinates}. start ({start_i}) should not be greater than stop ({stop_i}).") + raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.") self.start = start From 94fc39ab05a66c46e0e6d270156b557cc820dc82 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 1 Jul 2024 19:01:26 +0200 Subject: [PATCH 2/5] add check on negative coordinates --- ppanggolin/genome.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 57d2d04a..19359a42 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -187,7 +187,8 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = raise TypeError(f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}.") if stop_i < start_i: raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.") - + if start_i < 1 or stop_i < 1: + raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}.") self.start = start self.stop = stop From 55ef46d05eb9d36d704cd32e10c47df1b8c03c9d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 1 Jul 2024 19:17:29 +0200 Subject: [PATCH 3/5] ignoring weird RNA prediction from aragorn --- ppanggolin/annotate/synta.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 866c98aa..5d5447fc 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -73,6 +73,10 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: elif len(line) > 0: # if the line isn't empty, there's data to get. line_data = line.split() start, stop = map(int, ast.literal_eval(line_data[2].replace("c", ""))) + if start < 1 or stop < 1: + # In some case aragorn gives negative coordinates. This case is just ignored. + logging.warning(f'Aragorn gives non valide coordiates for a RNA gene: {line_data} This RNA is ignored.') + continue c += 1 gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4)) gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+", From e1723b316bba1f4693f2045a63cda3daafa41bd8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 1 Jul 2024 19:36:16 +0200 Subject: [PATCH 4/5] ignore weird RNA genes when reading pangneome file --- ppanggolin/formats/readBinaries.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 85c1e7ac..fbfa00aa 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -547,6 +547,13 @@ def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): rna = RNA(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] + if genedata.start > genedata.stop: + logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) should not be greater than stop ({genedata.stop}). This gene is ignored.") + continue + if genedata.start < 1 or genedata.stop < 1: + logging.warning(f"Wrong coordinates in RNA gene {genedata.name}: Start ({genedata.start}) and stop ({genedata.stop}) should be greater than 0. This gene is ignored.") + continue + rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, gene_type=genedata.gene_type, name=genedata.name, product=genedata.product) From 71e0be2e4bf2f23e3bbf067b4996c2f278225bf8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 1 Jul 2024 21:59:23 +0200 Subject: [PATCH 5/5] fix start position of test gene from 0 to 1 --- tests/context/test_context.py | 2 +- tests/test_pangenome.py | 2 +- tests/test_region.py | 30 +++++++++++++++--------------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 6a610d07..cda9e77c 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -71,7 +71,7 @@ def simple_contig(): organism = Organism('organism_A') for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')): family = GeneFamily(i, family_name) - gene.fill_annotations(start=i, stop=i+1, strand="+", position=i) + gene.fill_annotations(start=i+1, stop=i+2, strand="+", position=i) gene.fill_parents(organism, contig) diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 90483079..8fd13e5d 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -811,7 +811,7 @@ def add_element_to_pangenome(self, pangenome): ctg = Contig(0, "Ctg") org.add(ctg) gene = Gene("Gene") - gene.fill_annotations(start=0, stop=100, position=0, strand='+') + gene.fill_annotations(start=1, stop=100, position=0, strand='+') gene.add_metadata(metadata=metadata) ctg.add(gene) pangenome.add_organism(org) diff --git a/tests/test_region.py b/tests/test_region.py index 294000c4..9a41d024 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -30,7 +30,7 @@ def genes(contig) -> Generator[Set[Gene], None, None]: @pytest.fixture def gene(contig) -> Gene: gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) contig = Contig(0, 'contig_name') contig.length = 10 gene.contig = contig @@ -145,7 +145,7 @@ def test_add_genes_at_position_already_taken(self, region, contig): """Test that adding genes with same position return a ValueError """ gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) gene.contig = contig region.add(gene) @@ -159,7 +159,7 @@ def test_add_genes_from_different_contigs(self, region): """Test that adding genes from different contigs return an Exception """ gene1, gene2 = Gene('gene_1'), Gene('gene_2') - gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene1.fill_annotations(start=1, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) gene1.fill_parents(None, Contig(1, 'contig_1')) region.add(gene1) @@ -171,7 +171,7 @@ def test_add_genes_from_different_organisms(self, region): """Test that adding genes from different organisms return an Exception """ gene1, gene2 = Gene('gene_1'), Gene('gene_2') - gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene1.fill_annotations(start=1, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) gene1.fill_parents(Organism("org_1")) region.add(gene1) @@ -183,7 +183,7 @@ def test_get_genes(self, region): """Tests that genes can be retrieved from the region """ gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) region.add(gene) assert region.get(0) == gene @@ -203,7 +203,7 @@ def test_del_gene(self, region): """Tests that genes can be deleted from the region """ gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) region.add(gene) assert region.get(0) == gene region.remove(0) @@ -232,7 +232,7 @@ def test_get_organism(self, region, contig): """Tests that the organism linked to the region can be retrieved """ gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) gene.fill_parents(Organism("org"), contig) region.add(gene) assert region.organism.name == 'org' @@ -241,7 +241,7 @@ def test_get_contig(self, region): """Tests that the contig linked to the region can be retrieved """ gene = Gene('gene') - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) gene.fill_parents(contig=Contig(0, "contig")) region.add(gene) assert region.contig.name == 'contig' @@ -250,7 +250,7 @@ def test_is_whole_contig_true(self, region): """Tests that the property is_whole_contig return True if the region has the same length as contig """ starter, stopper = Gene('starter'), Gene('stopper') - starter.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=1, stop=10, strand='+', position=0) stopper.fill_annotations(start=11, stop=20, strand='+', position=1) contig = Contig(0, "contig") contig[starter.start], contig[stopper.start] = starter, stopper @@ -262,7 +262,7 @@ def test_is_whole_contig_false(self, region): """Tests that the property is_whole_contig return False if the region has not the same length as contig """ before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=0, stop=10, strand='+', position=0) + before.fill_annotations(start=1, stop=10, strand='+', position=0) starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) @@ -278,7 +278,7 @@ def test_is_contig_border_true(self, region): """Test that property is_contig_border return true if the region is bordering the contig """ before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=0, stop=10, strand='+', position=0) + before.fill_annotations(start=1, stop=10, strand='+', position=0) starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) @@ -299,7 +299,7 @@ def test_is_contig_border_false(self, region): """Tests that the property is_contig_border return False if the region is not bordering the contig """ before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') - before.fill_annotations(start=0, stop=10, strand='+', position=0) + before.fill_annotations(start=1, stop=10, strand='+', position=0) starter.fill_annotations(start=11, stop=20, strand='+', position=1) stopper.fill_annotations(start=21, stop=30, strand='+', position=2) after.fill_annotations(start=31, stop=40, strand='+', position=3) @@ -679,8 +679,8 @@ def test_add_different_region_with_same_name(self, spot): """ region_1, region_2 = Region("RGP"), Region("RGP") gene_1, gene_2 = Gene("gene_1"), Gene("gene_2") - gene_1.fill_annotations(start=0, stop=10, strand='+', position=0) - gene_2.fill_annotations(start=0, stop=10, strand='+', position=0) + gene_1.fill_annotations(start=1, stop=10, strand='+', position=0) + gene_2.fill_annotations(start=1, stop=10, strand='+', position=0) gene_1.family, gene_2.family = GeneFamily(0, "Fam_0"), GeneFamily(1, "Fam_1") region_1[0], region_2[0] = gene_1, gene_2 spot[region_1.name] = region_1 @@ -691,7 +691,7 @@ def test_add_two_time_the_same_region(self, spot, region): """Test that adding a two time the same region is working as expected """ gene = Gene("gene") - gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_annotations(start=1, stop=10, strand='+', position=0) gene.family = GeneFamily(0, "Fam") region[0] = gene spot[region.name] = region