From dd9c726f2a4b05a46daa4c62fdf35996646ebefc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 5 Sep 2023 14:44:40 +0200 Subject: [PATCH 01/15] commit before merge with unitTest branch --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/formats/writeBinaries.py | 6 +++++- ppanggolin/formats/writeFlat.py | 4 ++-- ppanggolin/formats/writeMSA.py | 2 +- ppanggolin/formats/writeSequences.py | 4 ++-- ppanggolin/metrics/fluidity.py | 2 +- ppanggolin/pangenome.py | 3 ++- 8 files changed, 15 insertions(+), 10 deletions(-) diff --git a/VERSION b/VERSION index e829013f..e144fab6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.132 +1.2.133 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index c03ba66f..5bc8d935 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -251,7 +251,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger().info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar): pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme)) logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP") diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index b01a3a35..6a5a8a5d 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -145,6 +145,10 @@ def get_genedata(gene:Feature) -> Tuple[int, str, str, int, str, str, int]: gene.product, genetic_code) +def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False): + organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)), + expectedrows=pangenome.number_of_organisms) + def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Function writing all the pangenome annotations @@ -163,7 +167,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool genedata_counter = 0 gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): for contig in org.contigs: for gene in contig.genes + list(contig.RNAs): gene_row["organism"] = org.name diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index aa0078bc..7a3bde02 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -487,9 +487,9 @@ def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: - if len(fam.organisms) >= pan.number_of_organisms() * soft_core: + if len(fam.organisms) >= pan.number_of_organisms * soft_core: soft.add(fam) - if len(fam.organisms) == pan.number_of_organisms(): + if len(fam.organisms) == pan.number_of_organisms: core.add(fam) with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile: diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index f8af1135..72c2f4bf 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -51,7 +51,7 @@ def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0 :return: set of families unique to one partition """ fams = set() - nb_org = pangenome.number_of_organisms() + nb_org = pangenome.number_of_organisms if partition_filter == "all": return set(pangenome.gene_families) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 725b1e20..29b5ce86 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -106,14 +106,14 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c elif partition == "softcore": logging.getLogger().info( f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") - threshold = pangenome.number_of_organisms() * soft_core + threshold = pangenome.number_of_organisms * soft_core for fam in pangenome.gene_families: if len(fam.organisms) >= threshold: genefams.add(fam) elif partition == "core": logging.getLogger().info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: - if len(fam.organisms) == pangenome.number_of_organisms(): + if len(fam.organisms) == pangenome.number_of_organisms: genefams.add(fam) elif "module_" in partition: logging.getLogger().info(f"Writing the representation {type_name} of {partition} gene families...") diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index f7fd1282..2d1a2d97 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -40,7 +40,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 if tot_fam > 0 and common_fam > 0: g_sum += (tot_fam - 2 * common_fam) / tot_fam - fluidity_dict[subset] = (2 / (pangenome.number_of_organisms() * (pangenome.number_of_organisms() - 1))) * g_sum + fluidity_dict[subset] = (2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1))) * g_sum return fluidity_dict diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 92836995..93848cda 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -94,7 +94,7 @@ def _yield_genes(self) -> Iterator[Gene]: :return: an iterator of Gene """ - if self.number_of_organisms() > 0: # if we have organisms, they're supposed to have genes + if self.number_of_organisms > 0: # if we have organisms, they're supposed to have genes for org in self.organisms: for contig in org.contigs: for gene in contig.genes: @@ -245,6 +245,7 @@ def organisms(self) -> List[Organism]: """ return list(self._orgGetter.values()) + @property def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome From 5eb33bcafbc3b4d2c64cc0966ecb8a849db1fa36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 5 Sep 2023 17:48:10 +0200 Subject: [PATCH 02/15] Change HDF5 to write contig and organism table --- VERSION | 2 +- ppanggolin/formats/writeAnnotations.py | 356 +++++++++++++++++++++++++ ppanggolin/formats/writeBinaries.py | 282 +------------------- ppanggolin/genome.py | 14 +- ppanggolin/pangenome.py | 14 +- 5 files changed, 379 insertions(+), 289 deletions(-) create mode 100644 ppanggolin/formats/writeAnnotations.py diff --git a/VERSION b/VERSION index c262507c..8464960b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.174 +1.2.175 diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py new file mode 100644 index 00000000..907034de --- /dev/null +++ b/ppanggolin/formats/writeAnnotations.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import logging +from typing import Dict, Tuple, Union + +# installed libraries +from tqdm import tqdm +import tables + +# local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.genome import Feature, Gene +from ppanggolin.formats.readBinaries import Genedata + + +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]: + """ + Get the maximum size of each annotation information to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1 + for org in pangenome.organisms: + if len(org.name) > max_org_len: + max_org_len = len(org.name) + for contig in org.contigs: + if len(contig.name) > max_contig_len: + max_contig_len = len(contig.name) + for gene in contig.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.local_identifier) > max_gene_local_id: + max_gene_local_id = len(gene.local_identifier) + for rna in contig.RNAs: + if len(rna.ID) > max_rna_id_len: + max_rna_id_len = len(gene.ID) + if len(rna.local_identifier) > max_rna_local_id: + max_rna_local_id = len(gene.local_identifier) + + return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id + + +def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: + """ + Create a table to save organism-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=org_len), + "contig": tables.StringCol(itemsize=contig_len)} + + +def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + organism_desc: Dict[str, tables.StringCol], disable_bar=False): + organism_table = h5f.create_table(annotation, "genome", organism_desc, + expectedrows=pangenome.number_of_organisms) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") + organism_row = organism_table.row + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for contig in org.contigs: + organism_row["name"] = org.name + organism_row["contig"] = contig.name + organism_row.append() + organism_table.flush() + + +def contig_desc(contig_len: int, max_gene_id_len: int, + max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]: + """ + Create a table to save organism-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=contig_len), + "is_circular": tables.BoolCol(dflt=False), + "gene": tables.StringCol(itemsize=max_gene_id_len), + "rna": tables.StringCol(itemsize=max_rna_id_len)} + + +def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], + disable_bar=False): + contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") + contig_row = contig_table.row + for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + rna_list = list(contig.RNAs) + for index, gene in enumerate(contig.genes): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["gene"] = gene.ID + if index < len(rna_list): + contig_row["rna"] = rna_list[index].ID + contig_row.append() + contig_table.flush() + + +def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: + """ + Create a table to save gene-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + :param id_len: Maximum size of gene ID + + :param max_local_id: Maximum size of gene local identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col(), + 'local': tables.StringCol(itemsize=max_local_id), + 'is_fragment': tables.BoolCol(dflt=False)} + + +def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], + disable_bar=False) -> Dict[Genedata, int]: + genedata2gene = {} + genedata_counter = 0 + gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + gene_row = gene_table.row + for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar): + gene_row["ID"] = gene.ID + gene_row["is_fragment"] = gene.is_fragment + if gene.type == "CDS": + gene_row["local"] = gene.local_identifier + genedata = get_genedata(gene) + genedata_id = genedata2gene.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2gene[genedata] = genedata_id + genedata_counter += 1 + gene_row["genedata_id"] = genedata_id + gene_row.append() + gene_table.flush() + return genedata2gene + + +def genedata_desc(type_len, name_len, product_len): + """ + Creates a table for gene-related data + + :param type_len: Maximum size of gene Type + :param name_len: Maximum size of gene name + :param product_len: Maximum size of gene product + :return: Formatted table for gene metadata + """ + return { + 'genedata_id': tables.UInt32Col(), + 'start': tables.UInt32Col(), + 'stop': tables.UInt32Col(), + 'strand': tables.StringCol(itemsize=1), + 'gene_type': tables.StringCol(itemsize=type_len), + 'position': tables.UInt32Col(), + 'name': tables.StringCol(itemsize=name_len), + 'product': tables.StringCol(itemsize=product_len), + 'genetic_code': tables.UInt32Col(dflt=11), + } + + +def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: + """ + Get the maximum size of each gene data information to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_name_len = 1 + max_product_len = 1 + max_type_len = 1 + for org in pangenome.organisms: + for contig in org.contigs: + for gene in contig.genes: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = len(gene.type) + for gene in contig.RNAs: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = len(gene.type) + + return max_type_len, max_name_len, max_product_len + + +def get_genedata(gene: Feature) -> Genedata: + """ + Gets the genedata type of Feature + + :param gene: a Feature + :return: Tuple with a Feature associated data + """ + position = None + genetic_code = 11 + if gene.type == "CDS": + gene: Gene + position = gene.position + genetic_code = gene.genetic_code + return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, + gene.product, genetic_code) + + +def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + genedata2gene: Dict[Genedata, int], disable_bar=False): + genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene)) + logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " + "(can be lower than the number of genes)") + genedata_row = genedata_table.row + for genedata, genedata_id in genedata2gene.items(): + genedata_row["genedata_id"] = genedata_id + genedata_row["start"] = genedata.start + genedata_row["stop"] = genedata.stop + genedata_row["strand"] = genedata.strand + genedata_row["gene_type"] = genedata.gene_type + if genedata.gene_type == "CDS": + genedata_row["position"] = genedata.position + genedata_row["genetic_code"] = genedata.genetic_code + genedata_row["name"] = genedata.name + genedata_row["product"] = genedata.product + genedata_row.append() + genedata_table.flush() + + +def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, + rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False): + """ + Function writing all the pangenome annotations + + :param pangenome: Annotated pangenome + :param h5f: Pangenome HDF5 file + :param disable_bar: Alow to disable progress bar + """ + annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") + + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations( + pangenome) + + if rec_organisms: + desc = organism_desc(max_org_len, max_contig_len) + write_organisms(pangenome, h5f, annotation, desc, disable_bar) + if rec_contigs: + desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len) + write_contigs(pangenome, h5f, annotation, desc, disable_bar) + if rec_genes: + desc = gene_desc(max_gene_id_len, max_gene_local_id) + genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) + + +def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: + """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_gene_id_len = 1 + max_gene_type = 1 + for gene in pangenome.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.type) > max_gene_type: + max_gene_type = len(gene.type) + return max_gene_id_len, max_gene_type + + +def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: + """ + Create table to save gene sequences + :param gene_id_len: Maximum size of gene sequence identifier + :param gene_type_len: Maximum size of gene type + :return: Formated table + """ + return { + "gene": tables.StringCol(itemsize=gene_id_len), + "seqid": tables.UInt32Col(), + "type": tables.StringCol(itemsize=gene_type_len) + } + + +def get_sequence_len(pangenome: Pangenome) -> int: + """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_seq_len = 1 + for gene in pangenome.genes: + if len(gene.dna) > max_seq_len: + max_seq_len = len(gene.dna) + return max_seq_len + + +def sequence_desc(max_seq_len: int) -> dict: + """ + Table description to save sequences + :param max_seq_len: Maximum size of gene type + :return: Formated table + """ + return { + "seqid": tables.UInt32Col(), + "dna": tables.StringCol(itemsize=max_seq_len) + } + + +def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Function writing all the pangenome gene sequences + :param pangenome: Pangenome with gene sequences + :param h5f: Pangenome HDF5 file without sequences + :param disable_bar: Disable progress bar + """ + gene_seq = h5f.create_table("/annotations", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), + expectedrows=pangenome.number_of_genes) + # process sequences to save them only once + seq2seqid = {} + id_counter = 0 + gene_row = gene_seq.row + for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", + disable=disable_bar): + curr_seq_id = seq2seqid.get(gene.dna) + if curr_seq_id is None: + curr_seq_id = id_counter + seq2seqid[gene.dna] = id_counter + id_counter += 1 + gene_row["gene"] = gene.ID + gene_row["seqid"] = curr_seq_id + gene_row["type"] = gene.type + gene_row.append() + gene_seq.flush() + + seq_table = h5f.create_table("/annotations", "sequences", sequence_desc(get_sequence_len(pangenome)), + expectedrows=len(seq2seqid)) + + seq_row = seq_table.row + for seq, seqid in seq2seqid.items(): + seq_row["dna"] = seq + seq_row["seqid"] = seqid + seq_row.append() + seq_table.flush() diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 3a9df929..1a3fcc56 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -5,7 +5,7 @@ import logging from collections import Counter, defaultdict import statistics -from typing import Tuple +from typing import Tuple, Union import pkg_resources # installed libraries @@ -15,290 +15,12 @@ # local libraries from ppanggolin.pangenome import Pangenome +from ppanggolin.formats.writeAnnotations import write_annotations, write_gene_sequences from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata -def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: - """ - Create a table to save gene-related information - - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs - :param id_len: Maximum size of gene ID - - :param max_local_id: Maximum size of gene local identifier - - :return: Formatted table - """ - return { - 'organism': tables.StringCol(itemsize=org_len), - "contig": { - 'name': tables.StringCol(itemsize=contig_len), - "is_circular": tables.BoolCol(dflt=False) - }, - "gene": { - 'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col(), - 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False) - } - } - - -def genedata_desc(type_len, name_len, product_len): - """ - Creates a table for gene-related data - - :param type_len: Maximum size of gene Type - :param name_len: Maximum size of gene name - :param product_len: Maximum size of gene product - :return: Formatted table for gene metadata - """ - return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'strand': tables.StringCol(itemsize=1), - 'gene_type': tables.StringCol(itemsize=type_len), - 'position': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=name_len), - 'product': tables.StringCol(itemsize=product_len), - 'genetic_code': tables.UInt32Col(dflt=11), - } - - -def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int]: - """ - Get the maximum size of each annotation information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_org_len = 1 - max_contig_len = 1 - max_gene_id_len = 1 - max_local_id = 1 - for org in pangenome.organisms: - if len(org.name) > max_org_len: - max_org_len = len(org.name) - for contig in org.contigs: - if len(contig.name) > max_contig_len: - max_contig_len = len(contig.name) - for gene in contig.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - for gene in contig.RNAs: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - - return max_org_len, max_contig_len, max_gene_id_len, max_local_id - - -def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: - """ - Get the maximum size of each gene data information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_name_len = 1 - max_product_len = 1 - max_type_len = 1 - for org in pangenome.organisms: - for contig in org.contigs: - for gene in contig.genes: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - for gene in contig.RNAs: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - - return max_type_len, max_name_len, max_product_len - - -def get_genedata(gene: Feature) -> Genedata: - """ - Gets the genedata type of Feature - - :param gene: a Feature - :return: Tuple with a Feature associated data - """ - position = None - genetic_code = 11 - if gene.type == "CDS": - gene: Gene - position = gene.position - genetic_code = gene.genetic_code - return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, - gene.product, genetic_code) - - -def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False): - organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=pangenome.number_of_organisms) - -def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome annotations - - :param pangenome: Annotated pangenome - :param h5f: Pangenome HDF5 file - :param disable_bar: Alow to disable progress bar - """ - annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") - gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=pangenome.number_of_genes) - - logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") - - genedata2gene = {} - genedata_counter = 0 - - gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - for contig in org.contigs: - for gene in list(contig.genes) + list(contig.RNAs): - gene_row["organism"] = org.name - gene_row["contig/name"] = contig.name - gene_row["contig/is_circular"] = contig.is_circular - gene_row["gene/ID"] = gene.ID - gene_row["gene/is_fragment"] = gene.is_fragment - if gene.type == "CDS": - gene_row["gene/local"] = gene.local_identifier - genedata = get_genedata(gene) - genedata_id = genedata2gene.get(genedata) - if genedata_id is None: - genedata_id = genedata_counter - genedata2gene[genedata] = genedata_id - genedata_counter += 1 - gene_row["gene/genedata_id"] = genedata_id - gene_row.append() - gene_table.flush() - - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) - logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") - genedata_row = genedata_table.row - for genedata, genedata_id in genedata2gene.items(): - genedata_row["genedata_id"] = genedata_id - genedata_row["start"] = genedata.start - genedata_row["stop"] = genedata.stop - genedata_row["strand"] = genedata.strand - genedata_row["gene_type"] = genedata.gene_type - if genedata.gene_type == "CDS": - genedata_row["position"] = genedata.position - genedata_row["genetic_code"] = genedata.genetic_code - genedata_row["name"] = genedata.name - genedata_row["product"] = genedata.product - genedata_row.append() - genedata_table.flush() - - -def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_gene_id_len = 1 - max_gene_type = 1 - for gene in pangenome.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.type) > max_gene_type: - max_gene_type = len(gene.type) - return max_gene_id_len, max_gene_type - - -def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: - """ - Create table to save gene sequences - :param gene_id_len: Maximum size of gene sequence identifier - :param gene_type_len: Maximum size of gene type - :return: Formated table - """ - return { - "gene": tables.StringCol(itemsize=gene_id_len), - "seqid": tables.UInt32Col(), - "type": tables.StringCol(itemsize=gene_type_len) - } - - -def get_sequence_len(pangenome: Pangenome) -> int: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_seq_len = 1 - for gene in pangenome.genes: - if len(gene.dna) > max_seq_len: - max_seq_len = len(gene.dna) - return max_seq_len - - -def sequence_desc(max_seq_len: int) -> dict: - """ - Table description to save sequences - :param max_seq_len: Maximum size of gene type - :return: Formated table - """ - return { - "seqid": tables.UInt32Col(), - "dna": tables.StringCol(itemsize=max_seq_len) - } - - -def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome gene sequences - :param pangenome: Pangenome with gene sequences - :param h5f: Pangenome HDF5 file without sequences - :param disable_bar: Disable progress bar - """ - gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=pangenome.number_of_genes) - # process sequences to save them only once - seq2seqid = {} - id_counter = 0 - gene_row = gene_seq.row - for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", disable=disable_bar): - curr_seq_id = seq2seqid.get(gene.dna) - if curr_seq_id is None: - curr_seq_id = id_counter - seq2seqid[gene.dna] = id_counter - id_counter += 1 - gene_row["gene"] = gene.ID - gene_row["seqid"] = curr_seq_id - gene_row["type"] = gene.type - gene_row.append() - gene_seq.flush() - - seq_table = h5f.create_table("/", "sequences", sequence_desc(get_sequence_len(pangenome)), - expectedrows=len(seq2seqid)) - - seq_row = seq_table.row - for seq, seqid in seq2seqid.items(): - seq_row["dna"] = seq - seq_row["seqid"] = seqid - seq_row.append() - seq_table.flush() - - def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: """ Create a formated table for gene families description diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index d63818b6..84462d77 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -581,6 +581,13 @@ def __delitem__(self, name): except KeyError: raise KeyError("Position of the gene in the contig does not exist") + def __len__(self): + """ Get number of contigs in organism + + :return: Number of contigs in organism + """ + return len(self._contigs_getter) + @property def families(self): """Return the gene families present in the organism @@ -625,13 +632,6 @@ def contigs(self) -> Generator[Contig, None, None]: """ yield from self._contigs_getter.values() - def number_of_contigs(self) -> int: - """ Get number of contigs in organism - - :return: Number of contigs in organism - """ - return len(self._contigs_getter) - def add(self, contig: Contig): """Add a contig to organism diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ba35e4c1..c63bc3ec 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -8,7 +8,7 @@ from pathlib import Path # local libraries -from ppanggolin.genome import Organism, Gene +from ppanggolin.genome import Organism, Contig, Gene from ppanggolin.region import Region, Spot, Module from ppanggolin.geneFamily import GeneFamily from ppanggolin.edge import Edge @@ -281,6 +281,18 @@ def number_of_organisms(self) -> int: """ return len(self._orgGetter) + @property + def contigs(self) -> Generator[Contig, None, None]: + for organism in self.organisms: + yield from organism.contigs + @property + def number_of_contigs(self) -> int: + """Returns the number of contigs present in the pangenome + + :return: The number of contigs + """ + return sum(len(org) for org in self.organisms) + def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. From 084ce284d42fe9082536cecec15d6a417ee1925f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 6 Sep 2023 16:09:59 +0200 Subject: [PATCH 03/15] Read new version of annotation Tables --- VERSION | 2 +- ppanggolin/cluster/cluster.py | 2 +- ppanggolin/formats/readBinaries.py | 210 ++++++++++++++----------- ppanggolin/formats/writeAnnotations.py | 210 ++++++++++++++++++------- ppanggolin/formats/writeBinaries.py | 4 +- ppanggolin/genome.py | 6 + ppanggolin/pangenome.py | 20 +++ tests/test_genome.py | 3 +- 8 files changed, 304 insertions(+), 153 deletions(-) diff --git a/VERSION b/VERSION index 8464960b..bfdb369d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.175 +1.2.176 diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 8d2763cb..77637015 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = """ Main function to cluster pangenome gene sequences into families - :param pangenome: Annoatated Pangenome + :param pangenome: Annotated Pangenome :param tmpdir: Path to temporary directory :param cpu: number of CPU cores to use :param defrag: Allow to remove fragment diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 900a8c4e..215ad7a3 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -5,10 +5,9 @@ import logging import sys from pathlib import Path -# installed libraries -from typing import TextIO -from typing import List +from typing import TextIO, List, Dict, Tuple +# installed libraries from tables import Table from tqdm import tqdm import tables @@ -27,10 +26,10 @@ class Genedata: This is a general class storing unique gene-related data to be written in a specific genedata table - :param start: Start position of a gene - :param stop: Stop position of a gene - :param strand: associated strand - :param gene_type: Type of gene + :param start: Gene start position + :param stop: Gene stop position + :param strand: Associated strand + :param gene_type: Gene type :param position: Position of the gene on its contig :param name: Name of the feature :param product: Associated product @@ -164,7 +163,7 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000): yield row -def read_genedata(h5f: tables.File) -> dict: +def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: """ Reads the genedata table and returns a genedata_id2genedata dictionnary :param h5f: the hdf5 file handler @@ -192,7 +191,7 @@ def read_sequences(h5f: tables.File) -> dict: :param h5f: the hdf5 file handler :return: dictionnary linking sequences to the seq identifier """ - table = h5f.root.sequences + table = h5f.root.annotations.sequences seqid2seq = {} for row in read_chunks(table, chunk=20000): seqid2seq[row["seqid"]] = row['dna'].decode() @@ -204,6 +203,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. + :param filename: Name of the pangenome file :param file_obj: Name of the output file :param list_cds: An iterable object of CDS @@ -212,7 +212,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences list_cds = set(list_cds) if list_cds is not None else None seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -225,60 +225,6 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f.close() -def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict, - link: bool = False): - """ - Read information from pangenome to assign to organism object - - :param pangenome: Input pangenome - :param org_name: Name of the organism - :param contig_dict: Dictionary with all contig and associate genes - :param circular_contigs: Dictionary of contigs - :param genedata_dict: dictionnary linking genedata to the genedata identifier - :param link: get the gene object if the genes are clustered - """ - org = Organism(org_name) - gene, gene_type = (None, None) - for contig_name, gene_list in contig_dict.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) - org.add(contig) - for row in gene_list: - if link: # if the gene families are already computed/loaded the gene exists. - gene = pangenome.get_gene(row["ID"].decode()) - else: # else creating the gene. - curr_genedata = genedata_dict[row["genedata_id"]] - gene_type = curr_genedata.gene_type - if gene_type == "CDS": - gene = Gene(row["ID"].decode()) - elif "RNA" in gene_type: - gene = RNA(row["ID"].decode()) - try: - local = row["local"].decode() - except ValueError: - local = "" - if isinstance(gene, Gene): - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, position=curr_genedata.position, - genetic_code=curr_genedata.genetic_code, product=curr_genedata.product, - local_identifier=local) - else: - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, - product=curr_genedata.product, local_identifier=local) - gene.is_fragment = row["is_fragment"] - gene.fill_parents(org, contig) - if gene_type == "CDS": - contig[gene.start] = gene - elif "RNA" in gene_type: - contig.add_rna(gene) - else: - raise Exception(f"A strange type '{gene_type}', which we do not know what to do with, was met.") - pangenome.add_organism(org) - - def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read information about graph in pangenome hdf5 file to add in pangenome object @@ -350,6 +296,7 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read gene sequences in pangenome hdf5 file to add in pangenome object + :param pangenome: Pangenome object without gene sequence associate to gene :param h5f: Pangenome HDF5 file with gene sequence associate to gene :param disable_bar: Disable the progress bar @@ -357,7 +304,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the pangenome gene dna sequences " "if the annotations have not been loaded.") - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -437,8 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" - -def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000, + disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]: + table = annotations.genomes + contig2organism = {} + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): + try: + organism = pangenome.get_organism(row["name"].decode()) + except: + organism = Organism(row["name"].decode()) + pangenome.add_organism(organism) + contig = Contig(name=row["contig"].decode()) + organism.add(contig) + contig2organism[contig.name] = organism.name + table = annotations.contigs + contig_name = None + genes_dict = {} + rna_dict = {} + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): + if contig_name != row["name"].decode(): + contig_name = row["name"].decode() + organism = pangenome.get_organism(contig2organism[contig_name]) + contig = organism.get(contig_name) + contig.is_circular = row["is_circular"] + try: + gene = Gene(row["gene"].decode()) + except ValueError: + pass + else: + gene.fill_parents(organism, contig) + if row["gene"].decode() in genes_dict: + logging.getLogger().warning("A gene with the same ID already pass. " + "It could be a problem in the number of genes") + genes_dict[gene.ID] = gene + try: + rna = RNA(row["rna"].decode()) + except ValueError: + pass + else: + rna_dict[rna.ID] = rna + rna.fill_parents(organism, contig) + return genes_dict, rna_dict + +def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False): + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + gene = gene_dict[row["ID"].decode()] + genedata = genedata_dict[row["genedata_id"]] + try: + local = row["local"].decode() + except ValueError: + local = "" + gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, + genetic_code=genedata.genetic_code, product=genedata.product, + local_identifier=local) + gene.is_fragment = row["is_fragment"] + if gene.contig is not None: + gene.contig.add(gene) + + +def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False): + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + rna = rna_dict[row["ID"].decode()] + genedata = genedata_dict[row["genedata_id"]] + rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, + product=genedata.product) + if rna.contig is not None: + rna.contig.add_rna(rna) + + +def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True, + load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -448,35 +467,33 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = """ annotations = h5f.root.annotations - table = annotations.genes - pangenome_dict = {} - circular_contigs = {} + if load_organisms: + if load_genes: + genedata_dict = read_genedata(h5f) + if load_rnas: + gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - genedata_dict = read_genedata(h5f) - - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - decode_org = row["organism"].decode() - try: - # new gene, seen contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()].append(row["gene"]) - except KeyError: - try: - # new contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]] - circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"] - except KeyError: - # new org - pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]} - circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]} - - link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False - - for org_name, contig_dict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), - unit="organism", disable=disable_bar): - read_organism(pangenome, org_name, contig_dict, circular_contigs[org_name], genedata_dict, link) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) + else: + gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + else: + if load_rnas: + genedata_dict = read_genedata(h5f) + _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) + else: + if load_genes: + if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]: + raise Exception("Genes must be linked to gene families or organisms, but none are laoded") + gene_dict = {gene.ID: gene for gene in pangenome.genes} # Dictionary with genes in families + gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" + def read_info(h5f: tables.File): """ Read the pangenome content @@ -630,12 +647,14 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa fix_partitioned(pangenome.file) h5f = tables.open_file(filename, "r") - if annotation: + + if annotation: # I place annotation here, to link gene to gene families if organism are not loaded if h5f.root.status._v_attrs.genomesAnnotated: logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") + if gene_sequences: if h5f.root.status._v_attrs.geneSequences: logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...") @@ -652,6 +671,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception( f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") + if graph: if h5f.root.status._v_attrs.NeighborsGraph: logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...") @@ -659,6 +679,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have graph information, " f"or has been improperly filled") + if rgp: if h5f.root.status._v_attrs.predictedRGP: logging.getLogger("PPanGGOLiN").info("Reading the RGP...") @@ -666,6 +687,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " f"or has been improperly filled") + if spots: if h5f.root.status._v_attrs.spots: logging.getLogger("PPanGGOLiN").info("Reading the spots...") @@ -673,6 +695,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled") + if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") @@ -680,6 +703,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have modules information, " f"or has been improperly filled") + if metadata: assert metatype is not None if sources is None: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index 907034de..fc4e2e70 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -11,18 +11,22 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.genome import Feature, Gene +from ppanggolin.genome import Gene, RNA from ppanggolin.formats.readBinaries import Genedata -def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]: +genedata_counter = 0 + + +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int]: """ Get the maximum size of each annotation information to optimize disk space :param pangenome: Annotated pangenome - :return: maximum size of each annotation + + :return: Maximum size of each annotation """ - max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1 + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = 1, 1, 1, 1, 1 for org in pangenome.organisms: if len(org.name) > max_org_len: max_org_len = len(org.name) @@ -36,19 +40,17 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i max_gene_local_id = len(gene.local_identifier) for rna in contig.RNAs: if len(rna.ID) > max_rna_id_len: - max_rna_id_len = len(gene.ID) - if len(rna.local_identifier) > max_rna_local_id: - max_rna_local_id = len(gene.local_identifier) + max_rna_id_len = len(rna.ID) - return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id + return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: """ - Create a table to save organism-related information + Table description to save organism-related information - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs + :param org_len: Maximum size of organism name. + :param contig_len: Maximum size of contigs name :return: Formatted table """ @@ -58,7 +60,15 @@ def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, organism_desc: Dict[str, tables.StringCol], disable_bar=False): - organism_table = h5f.create_table(annotation, "genome", organism_desc, + """Write organisms information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param organism_desc: Organisms table description. + :param disable_bar: Allow disabling progress bar + """ + organism_table = h5f.create_table(annotation, "genomes", organism_desc, expectedrows=pangenome.number_of_organisms) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") organism_row = organism_table.row @@ -72,11 +82,11 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def contig_desc(contig_len: int, max_gene_id_len: int, max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]: - """ - Create a table to save organism-related information + """Table description to save contig-related information - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs + :param contig_len: Maximum size of contig name + :param max_gene_id_len: Maximum size of gene name + :param max_rna_id_len: Maximum size of rna name :return: Formatted table """ @@ -89,29 +99,44 @@ def contig_desc(contig_len: int, max_gene_id_len: int, def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], disable_bar=False): - contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs) + """Write contigs information in the pangenome file + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param contig_desc: Contigs table description + :param disable_bar: Allow disabling progress bar + """ + contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - rna_list = list(contig.RNAs) - for index, gene in enumerate(contig.genes): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["gene"] = gene.ID - if index < len(rna_list): - contig_row["rna"] = rna_list[index].ID - contig_row.append() + if len(contig) >= contig.number_of_rnas: + rna_list = list(contig.RNAs) + for index, gene in enumerate(contig.genes): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["gene"] = gene.ID + if index < len(rna_list): + if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005': + print("pika") + contig_row["rna"] = rna_list[index].ID + contig_row.append() + else: + gene_list = list(contig.genes) + for index, rna in enumerate(contig.RNAs): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["rna"] = rna.ID + if index < len(gene_list): + contig_row["gene"] = gene_list[index].ID + contig_row.append() contig_table.flush() def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: - """ - Create a table to save gene-related information - - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs - :param id_len: Maximum size of gene ID + """Table description to save gene-related information + :param id_len: Maximum size of gene name :param max_local_id: Maximum size of gene local identifier :return: Formatted table @@ -125,16 +150,26 @@ def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables. def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], disable_bar=False) -> Dict[Genedata, int]: + """Write genes information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param gene_desc: Genes table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to gene identifier + """ + global genedata_counter + print(genedata_counter) genedata2gene = {} - genedata_counter = 0 gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") gene_row = gene_table.row - for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar): + for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes, unit="gene", disable=disable_bar): gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment - if gene.type == "CDS": - gene_row["local"] = gene.local_identifier + gene_row["local"] = gene.local_identifier genedata = get_genedata(gene) genedata_id = genedata2gene.get(genedata) if genedata_id is None: @@ -144,14 +179,60 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["genedata_id"] = genedata_id gene_row.append() gene_table.flush() + print(genedata_counter) return genedata2gene +def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: + """Table description to save rna-related information + + :param id_len: Maximum size of RNA identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col()} + + +def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]], + disable_bar=False) -> Dict[Genedata, int]: + """Write RNAs information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param rna_desc: RNAs table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to RNA identifier + """ + global genedata_counter + print(genedata_counter) + genedata2rna = {} + rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + rna_row = rna_table.row + for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): + rna_row["ID"] = rna.ID + genedata = get_genedata(rna) + genedata_id = genedata2rna.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2rna[genedata] = genedata_id + genedata_counter += 1 + rna_row["genedata_id"] = genedata_id + rna_row.append() + rna_table.flush() + print(genedata_counter) + return genedata2rna + + def genedata_desc(type_len, name_len, product_len): """ Creates a table for gene-related data - :param type_len: Maximum size of gene Type + :param type_len: Maximum size of gene Type. :param name_len: Maximum size of gene name :param product_len: Maximum size of gene product :return: Formatted table for gene metadata @@ -199,31 +280,42 @@ def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: return max_type_len, max_name_len, max_product_len -def get_genedata(gene: Feature) -> Genedata: +def get_genedata(feature: Union[Gene, RNA]) -> Genedata: """ Gets the genedata type of Feature - :param gene: a Feature + :param feature: Gene or RNA object + :return: Tuple with a Feature associated data """ position = None genetic_code = 11 - if gene.type == "CDS": - gene: Gene - position = gene.position - genetic_code = gene.genetic_code - return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, - gene.product, genetic_code) + if isinstance(feature, Gene): + position = feature.position + genetic_code = feature.genetic_code + return Genedata(feature.start, feature.stop, feature.strand, feature.type, position, feature.name, + feature.product, genetic_code) def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, genedata2gene: Dict[Genedata, int], disable_bar=False): - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) + """Writting genedata information in pangenome file + + :param pangenome: Pangenome object filled with annotation. + :param h5f: Pangenome file + :param annotation: Annotation group in Table + :param genedata2gene: Dictionnary linking genedata to gene identifier. + :param disable_bar: Allow disabling progress bar + """ + try: + genedata_table = annotation.genedata + except tables.exceptions.NoSuchNodeError: + genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene)) logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " "(can be lower than the number of genes)") genedata_row = genedata_table.row - for genedata, genedata_id in genedata2gene.items(): + for genedata, genedata_id in tqdm(genedata2gene.items(), unit="genedata", disable=disable_bar): genedata_row["genedata_id"] = genedata_id genedata_row["start"] = genedata.start genedata_row["stop"] = genedata.stop @@ -239,29 +331,37 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, - rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False): - """ - Function writing all the pangenome annotations + rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): + """Function writing all the pangenome annotations :param pangenome: Annotated pangenome :param h5f: Pangenome HDF5 file + :param rec_organisms: Allow writing organisms in pangenomes + :param rec_contigs: Allow writing contigs in pangenomes + :param rec_genes: Allow writing genes in pangenomes + :param rec_rnas: Allow writing RNAs in pangenomes :param disable_bar: Alow to disable progress bar """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") - max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations( - pangenome) + org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = get_max_len_annotations(pangenome) + + # I add these boolean in case we would one day only load organism, contig or genes, without the other. if rec_organisms: - desc = organism_desc(max_org_len, max_contig_len) + desc = organism_desc(org_len, contig_len) write_organisms(pangenome, h5f, annotation, desc, disable_bar) if rec_contigs: - desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len) + desc = contig_desc(contig_len, gene_id_len, rna_id_len) write_contigs(pangenome, h5f, annotation, desc, disable_bar) if rec_genes: - desc = gene_desc(max_gene_id_len, max_gene_local_id) + desc = gene_desc(gene_id_len, gene_local_id) genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) + if rec_rnas: + desc = rna_desc(rna_id_len) + genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 1a3fcc56..dd0123ca 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -606,9 +606,9 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b table = h5f.root.annotations.genes for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar): - genedata_id = row['gene/genedata_id'] + genedata_id = row['genedata_id'] if genedataid2genedata[genedata_id].gene_type == 'CDS': - row['gene/is_fragment'] = pangenome.get_gene(row['gene/ID'].decode()).is_fragment + row['is_fragment'] = pangenome.get_gene(row['ID'].decode()).is_fragment row.update() table.flush() diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 84462d77..48b846b0 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -488,6 +488,12 @@ def RNAs(self) -> Generator[RNA, None, None]: """ yield from self._rna_getter + @property + def number_of_rnas(self) -> int: + """Get the number of RNA in the contig + """ + return len(self._rna_getter) + class Organism(MetaFeatures): """ diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index c63bc3ec..28ab459b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -146,6 +146,26 @@ def number_of_genes(self) -> int: self._mk_gene_getter() # make it return len(self._geneGetter) + """RNAs methods""" + + @property + def RNAs(self) -> Generator[Gene, None, None]: + """Generator of genes in the pangenome. + + :return: gene generator + """ + for org in self.organisms: + for contig in org.contigs: + yield from contig.RNAs + + @property + def number_of_rnas(self) -> int: + """Returns the number of gene present in the pangenome + + :return: The number of genes + """ + return sum(ctg.number_of_rnas for ctg in self.contigs) + """Gene families methods""" @property def max_fam_id(self): diff --git a/tests/test_genome.py b/tests/test_genome.py index c58b296c..b19eda01 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -523,7 +523,8 @@ def test_number_of_contigs(self, organism): """ organism.add(Contig('contig1')) organism.add(Contig('contig2')) - assert organism.number_of_contigs() == 2 + assert isinstance(len(organism), int) + assert len(organism) == 2 def test_get_families(self, organism, contig, gene): """Tests that gene families in an organism can be retrieved From 6f7229ed060ce933044d19a61bd114d41bec9137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 6 Sep 2023 16:20:42 +0200 Subject: [PATCH 04/15] Fix compatibility problem with merge --- VERSION | 2 +- ppanggolin/formats/writeAnnotations.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index bfdb369d..27a69d0e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.176 +1.2.177 diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index fc4e2e70..e1dd1640 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -110,7 +110,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - if len(contig) >= contig.number_of_rnas: + if contig.number_of_genes >= contig.number_of_rnas: rna_list = list(contig.RNAs) for index, gene in enumerate(contig.genes): contig_row["name"] = contig.name From 9499d2abde7ba3cec1efeb0ba9498052396afb8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 7 Sep 2023 14:28:26 +0200 Subject: [PATCH 05/15] Get length of contig for GFF and GBFF files --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 123 ++++++++++--------------- ppanggolin/formats/writeAnnotations.py | 8 +- ppanggolin/genome.py | 20 +++- ppanggolin/pangenome.py | 1 - testingDataset/organisms.gbff.list | 3 +- 6 files changed, 71 insertions(+), 86 deletions(-) diff --git a/VERSION b/VERSION index 27a69d0e..8c12c1db 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.177 +1.2.178 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index ad0dd348..e4e9053d 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -32,13 +32,13 @@ def check_annotate_args(args): raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " "or a file with the --gff option to load annotations from.") - if hasattr(args, "fasta") and args.fasta is not None: check_input_files(args.fasta, True) if hasattr(args, "anno") and args.anno is not None: check_input_files(args.anno, True) + def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = ""): @@ -91,20 +91,19 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( +def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( Organism, bool): """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file - :param organism: Organism name - :param gbff_file_path: Path to corresponding GFF file + :param organism_name: Organism name + :param gbff_file_path: Path to corresponding GBFF file :param circular_contigs: list of contigs :param pseudo: Allow to read pseudogène :return: Organism complete and true for sequence in file """ - org = Organism(organism) - + organism = Organism(organism_name) logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] @@ -114,35 +113,29 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # beginning of contig contig = None - set_contig = False is_circ = False - contig_locus_id = None + contig_id = None if line.startswith('LOCUS'): if "CIRCULAR" in line.upper(): # this line contains linear/circular word telling if the dna sequence is circularized or not is_circ = True - contig_locus_id = line.split()[1] + # TODO maybe it could be a good thing to add a elif for linear + # and if circular or linear are not found raise a warning + + contig_id = line.split()[1] # If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS while not line.startswith('FEATURES'): - if line.startswith('VERSION'): + if line.startswith('VERSION') and line[12:].strip() != "": contig_id = line[12:].strip() - if contig_id != "": - try: - contig = org.get(contig_id) - except KeyError: - contig = Contig(contig_id, True if contig_id in circular_contigs else False) - org.add(contig) - set_contig = True line = lines.pop() - if not set_contig: - # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. - # Should be unique in a dataset, but if there's an update the contig ID - # might still be the same even though it should not(?) - try: - contig = org.get(contig_locus_id) - except KeyError: - contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) - org.add(contig) + # If no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. + # Should be unique in a dataset, but if there's an update + # the contig ID might still be the same even though it should not(?) + try: + contig = organism.get(contig_id) + except KeyError: + contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False) + organism.add(contig) # start of the feature object. dbxref = set() gene_name = "" @@ -160,8 +153,8 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p curr_type = line[5:21].strip() if curr_type != "": if useful_info: - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - contig.number_of_genes, gene_name, product, genetic_code, protein_id) + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, + obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -192,6 +185,9 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p pass # don't know what to do with that, ignoring for now. # there is a protein with a frameshift mecanism. + elif curr_type == 'source': # Get Contig length + start, end = map(int, map(str.strip, line[21:].split('..'))) + contig.length = end - start + 1 elif useful_info: # current info goes to current objtype, if it's useful. if line[21:].startswith("/db_xref"): dbxref.add(line.split("=")[1].replace('"', '').strip()) @@ -220,7 +216,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # end of contig if useful_info: # saving the last element... - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 @@ -237,7 +233,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # get each gene's sequence. for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) - return org, True + return organism, True def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): @@ -245,22 +241,22 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: b Read annotation from GFF file :param organism: Organism name - :param gff_file_path: Path to corresponding GFF file - :param circular_contigs: list of contigs + :param gff_file_path: Path corresponding to GFF file + :param circular_contigs: List of circular contigs :param pseudo: Allow to read pseudogène - :return: Organism object and if there are sequences associate or not + :return: Organism object and if there are sequences associated or not """ (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9) - # missing values : source, score, frame. They are unused. + # Missing values: source, score, frame. They are unused. def get_gff_attributes(gff_fields: list) -> dict: - """ - Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff. + """Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :return: attributes get + :param gff_fields: A gff line stored as a list. Each element of the list is a column of the gff. + + :return: Attributes get """ attributes_field = [f for f in gff_fields[gff_attribute].strip().split(';') if len(f) > 0] attributes_get = {} @@ -276,7 +272,8 @@ def get_id_attribute(attributes_dict: dict) -> str: """ Gets the ID of the element from which the provided attributes_get were extracted. Raises an error if no ID is found. - :param attributes_dict: attributes from one gff line + + :param attributes_dict: Attributes from one gff line :return: CDS identifier """ @@ -302,11 +299,9 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - try: - contig = org.get(fields[1]) - except KeyError: - contig = Contig(fields[1], True if fields[1] in circular_contigs else False) - org.add(contig) + contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + org.add(contig) + contig.length = int(fields[-1]) - int(fields[3]) + 1 continue elif line.startswith('#'): # comment lines to be ignores by parsers @@ -326,32 +321,14 @@ def get_id_attribute(attributes_dict: dict) -> str: # if it's not found, we get the one under the 'ID' field which must exist # (otherwise not a gff3 compliant file) gene_id = get_id_attribute(attributes) - try: - name = attributes.pop('NAME') - except KeyError: - try: - name = attributes.pop('GENE') - except KeyError: - name = "" + name = attributes.pop('NAME', attributes.pop('GENE', "")) if "pseudo" in attributes or "pseudogene" in attributes: pseudogene = True - try: - product = attributes.pop('PRODUCT') - except KeyError: - product = "" - - try: - genetic_code = int(attributes.pop("TRANSL_TABLE")) - except KeyError: - genetic_code = 11 + product = attributes.pop('PRODUCT', "") + genetic_code = int(attributes.pop("TRANSL_TABLE", 11)) if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig - try: - contig = org.get(fields_gff[gff_seqname]) - except KeyError: - contig = Contig(fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) - org.add(contig) + contig = org.get(fields_gff[gff_seqname]) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) @@ -391,8 +368,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, + pseudo: bool = False) -> (Organism, bool): """ Read a GBFF file for one organism @@ -543,8 +520,8 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, - disable_bar: bool = False): + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, + procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome @@ -570,14 +547,13 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) - + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, norna, kingdom, allow_overlap, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", @@ -622,7 +598,6 @@ def launch(args: argparse.Namespace): logging.getLogger("PPanGGOLiN").warning( "You will be able to proceed with your analysis ONLY if you provide " "the clustering results in the next step.") - write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) @@ -684,8 +659,6 @@ def parser_annot(parser: argparse.ArgumentParser): if __name__ == '__main__': """To test local change and allow using debugger""" - import tempfile - from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index e1dd1640..d74a97a3 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -109,7 +109,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row - for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): if contig.number_of_genes >= contig.number_of_rnas: rna_list = list(contig.RNAs) for index, gene in enumerate(contig.genes): @@ -117,8 +117,6 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_row["is_circular"] = contig.is_circular contig_row["gene"] = gene.ID if index < len(rna_list): - if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005': - print("pika") contig_row["rna"] = rna_list[index].ID contig_row.append() else: @@ -161,7 +159,6 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou :returns: Dictionnary linking genedata to gene identifier """ global genedata_counter - print(genedata_counter) genedata2gene = {} gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") @@ -179,7 +176,6 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["genedata_id"] = genedata_id gene_row.append() gene_table.flush() - print(genedata_counter) return genedata2gene @@ -208,7 +204,6 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group :returns: Dictionnary linking genedata to RNA identifier """ global genedata_counter - print(genedata_counter) genedata2rna = {} rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") @@ -224,7 +219,6 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group rna_row["genedata_id"] = genedata_id rna_row.append() rna_table.flush() - print(genedata_counter) return genedata2rna diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 499abd37..205b08dc 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -311,6 +311,7 @@ def __init__(self, name: str, is_circular: bool = False): self._genes_getter = {} self._genes_position = [] self._organism = None + self._length = None def __str__(self) -> str: return self.name @@ -342,6 +343,23 @@ def __setitem__(self, start: int, gene: Gene): # TODO define eq function + @property + def length(self): + if self._length is None: + logging.getLogger("PPanGGOLiN").warning("Contig length is unknown") + return self._length + + @length.setter + def length(self, contig_len: int): + if not isinstance(contig_len, int): + raise TypeError("Contig length is expected to be an integer") + if contig_len < 0: + raise ValueError("Contig length must be positive") + self._length = contig_len + + def __len__(self): + return self.length + # retrieve gene by start position def __getitem__(self, position: int) -> Gene: """Get the gene for the given position @@ -593,7 +611,7 @@ def __len__(self): :return: Number of contigs in organism """ - return len(self._contigs_getter) + return len(self._contigs_getter.keys()) @property def families(self): diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 28ab459b..ef37bb4b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -147,7 +147,6 @@ def number_of_genes(self) -> int: return len(self._geneGetter) """RNAs methods""" - @property def RNAs(self) -> Generator[Gene, None, None]: """Generator of genes in the pangenome. diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list index 1537925a..093d4beb 100644 --- a/testingDataset/organisms.gbff.list +++ b/testingDataset/organisms.gbff.list @@ -21,6 +21,7 @@ GCF_000472205.1 GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz GCF_000590575.1 GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz GCF_000590635.1 GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz GCF_000590695.1 GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz +GCF_001183765.1 GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz GCF_001183805.1 GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz GCF_001183825.1 GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz GCF_001183845.1 GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz @@ -50,4 +51,4 @@ GCF_006508265.1 GBFF/GCF_006508265.1_ASM650826v1_genomic.gbff.gz GCF_000068585.1 GBFF/NC_010287.1.gff.gz GCF_000008725.1 GBFF/NC_000117.1.gbk.gz GCF_001183765.1_gff GBFF/PROKKA_12132021.gff.gz -GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz +GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz \ No newline at end of file From 1452b239587b719fa0716eebb0e7d97f7e10e0db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 7 Sep 2023 15:16:40 +0200 Subject: [PATCH 06/15] Get contig length for fasta files --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 54 ++++++++++++++++++--------------- ppanggolin/annotate/synta.py | 16 ++++++++-- 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/VERSION b/VERSION index 8c12c1db..e9a3656e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.178 +1.2.179 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index e4e9053d..baad0c20 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -9,6 +9,7 @@ from pathlib import Path import tempfile import time +from typing import List, Set, Tuple # installed libraries from tqdm import tqdm @@ -21,7 +22,7 @@ from ppanggolin.formats import write_pangenome -def check_annotate_args(args): +def check_annotate_args(args: argparse.Namespace): """Check That the given arguments are usable :param args: All arguments provide by user @@ -39,7 +40,7 @@ def check_annotate_args(args): check_input_files(args.anno, True) -def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, +def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: Set[str], start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = ""): """ @@ -82,7 +83,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, position=position, product=product, local_identifier=gene_id, genetic_code=genetic_code) - contig[new_gene.start] = new_gene + contig.add(new_gene) else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, @@ -91,8 +92,8 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file @@ -236,7 +237,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: li return organism, True -def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): +def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read annotation from GFF file @@ -358,7 +360,7 @@ def get_id_attribute(attributes_dict: dict) -> str: return org, has_fasta -def launch_read_anno(args: tuple) -> (Organism, bool): +def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]: """ Allow to launch in multiprocessing the read of genome annotation :param args: Pack of argument for annotate_organism function @@ -368,8 +370,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, - pseudo: bool = False) -> (Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file for one organism @@ -396,7 +398,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, "You may be able to use --fasta instead.") -def chose_gene_identifiers(pangenome) -> bool: +def chose_gene_identifiers(pangenome: Pangenome) -> bool: """ Parses the pangenome genes to decide whether to use local_identifiers or ppanggolin generated gene identifiers. If the local identifiers are unique within the pangenome they are picked, otherwise ppanggolin ones are used. @@ -467,15 +469,15 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.parameters["annotation"]["read_annotations_from_file"] = True -def get_gene_sequences_from_fastas(pangenome, fasta_file): +def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]): """ Get gene sequences from fastas - :param pangenome: input pangenome - :param fasta_file: list of fasta file + :param pangenome: Input pangenome + :param fasta_files: list of fasta file """ fasta_dict = {} - for line in read_compressed_or_not(fasta_file): + for line in read_compressed_or_not(fasta_files): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file") @@ -483,15 +485,15 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): try: org = pangenome.get_organism(elements[0]) except KeyError: - raise KeyError(f"One of the genome in your '{fasta_file}' was not found in the pan." + raise KeyError(f"One of the genome in your '{fasta_files}' was not found in the pan." f" This might mean that the genome names between your annotation file and " f"your fasta file are different.") with read_compressed_or_not(elements[1]) as currFastaFile: fasta_dict[org], _ = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys())) + missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " - f"{missing} are missing (out of {pangenome.number_of_organisms()}).") + f"{missing} are missing (out of {pangenome.number_of_organisms}).") for org in pangenome.organisms: for contig in org.contigs: @@ -509,7 +511,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): pangenome.status["geneSequences"] = "Computed" -def launch_annotate_organism(pack: tuple) -> Organism: +def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism: """ Allow to launch in multiprocessing the genome annotation :param pack: Pack of argument for annotate_organism function @@ -592,12 +594,16 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.getLogger("PPanGGOLiN").warning( - "You provided gff files without sequences, and you did not provide " - "fasta sequences. Thus it was not possible to get the gene sequences.") - logging.getLogger("PPanGGOLiN").warning( - "You will be able to proceed with your analysis ONLY if you provide " - "the clustering results in the next step.") + logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, " + "and you did not provide fasta sequences. " + "Thus it was not possible to get the gene sequences.") + logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis " + "ONLY if you provide the clustering results in the next step.") + else: + if args.fasta: + logging.getLogger("PPanGGOLiN").warning("You provided fasta sequences " + "but your gff files were already with sequences." + "PPanGGOLiN will use sequences in GFF and not from your fasta.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index afad1cca..6220db85 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,6 +6,7 @@ import os import tempfile from io import TextIOWrapper +from multiprocessing import Manager from subprocess import Popen, PIPE import ast from collections import defaultdict @@ -17,6 +18,10 @@ from ppanggolin.utils import is_compressed, read_compressed_or_not +manager = Manager() +contig_len = manager.dict() + + def reverse_complement(seq: str): """reverse complement the given dna sequence @@ -79,6 +84,8 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str :return: Annotated genes in a list of gene objects """ + global contig_len + locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}") @@ -89,9 +96,13 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str header = "" for line in p.communicate()[0].decode().split("\n"): if line.startswith("# Sequence Data: "): + length = None for data in line.split(";"): - if data.startswith("seqhdr"): + if data.startswith("seqlen"): + length = int(data.split("=")[1]) + elif data.startswith("seqhdr"): header = data.split("=")[1].replace('"', "").split()[0] + contig_len[header] = length elif line.startswith(">"): c += 1 @@ -330,11 +341,12 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: except KeyError: contig = Contig(contig_name, True if contig_name in circular_contigs else False) org.add(contig) + contig.length = contig_len[contig.name] for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): - contig[gene.start] = gene + contig.add(gene) elif isinstance(gene, RNA): contig.add_rna(gene) return org From 3e4f6a0f6a4c89d5594cd02898a6529dd17cd11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 09:53:16 +0200 Subject: [PATCH 07/15] Write and read contig length --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 1 + ppanggolin/formats/writeAnnotations.py | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index e9a3656e..ea25a270 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.179 +1.2.180 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 215ad7a3..1b323599 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -407,6 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: organism = pangenome.get_organism(contig2organism[contig_name]) contig = organism.get(contig_name) contig.is_circular = row["is_circular"] + contig.length = row["length"] try: gene = Gene(row["gene"].decode()) except ValueError: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index d74a97a3..a8c0c6f1 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -81,7 +81,7 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def contig_desc(contig_len: int, max_gene_id_len: int, - max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]: + max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: """Table description to save contig-related information :param contig_len: Maximum size of contig name @@ -92,12 +92,13 @@ def contig_desc(contig_len: int, max_gene_id_len: int, """ return {'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), + 'length': tables.UInt32Col(), "gene": tables.StringCol(itemsize=max_gene_id_len), "rna": tables.StringCol(itemsize=max_rna_id_len)} def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]], disable_bar=False): """Write contigs information in the pangenome file :param pangenome: Annotated pangenome object @@ -115,6 +116,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro for index, gene in enumerate(contig.genes): contig_row["name"] = contig.name contig_row["is_circular"] = contig.is_circular + contig_row["length"] = len(contig) contig_row["gene"] = gene.ID if index < len(rna_list): contig_row["rna"] = rna_list[index].ID From aa4e308754bca3594af647d1c5c79f72cd622467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 10:54:03 +0200 Subject: [PATCH 08/15] Remove manager dict to save contig length --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 5 ++-- ppanggolin/annotate/synta.py | 44 +++++++++++---------------------- 3 files changed, 17 insertions(+), 34 deletions(-) diff --git a/VERSION b/VERSION index ea25a270..177b0d1a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.180 +1.2.181 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index baad0c20..6c5ca044 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -113,7 +113,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li while len(lines) != 0: line = lines.pop() # beginning of contig - contig = None is_circ = False contig_id = None if line.startswith('LOCUS'): @@ -351,7 +350,7 @@ def get_id_attribute(attributes_dict: dict) -> str: # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": - contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length + contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -489,7 +488,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path] f" This might mean that the genome names between your annotation file and " f"your fasta file are different.") with read_compressed_or_not(elements[1]) as currFastaFile: - fasta_dict[org], _ = read_fasta(org, currFastaFile) + fasta_dict[org] = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 6220db85..677da87d 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,11 +6,10 @@ import os import tempfile from io import TextIOWrapper -from multiprocessing import Manager from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Union +from typing import Dict, List, Union from pathlib import Path # local libraries @@ -18,10 +17,6 @@ from ppanggolin.utils import is_compressed, read_compressed_or_not -manager = Manager() -contig_len = manager.dict() - - def reverse_complement(seq: str): """reverse complement the given dna sequence @@ -84,7 +79,6 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str :return: Annotated genes in a list of gene objects """ - global contig_len locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) @@ -96,13 +90,9 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str header = "" for line in p.communicate()[0].decode().split("\n"): if line.startswith("# Sequence Data: "): - length = None for data in line.split(";"): - if data.startswith("seqlen"): - length = int(data.split("=")[1]) - elif data.startswith("seqhdr"): + if data.startswith("seqhdr"): header = data.split("=")[1].replace('"', "").split()[0] - contig_len[header] = length elif line.startswith(">"): c += 1 @@ -162,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int): +def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. :param org: Organism corresponding to fasta file @@ -173,24 +163,21 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in try: contigs = {} contig_seq = "" - all_contig_len = 0 contig = None for line in fna_file: if line.startswith('>'): if len(contig_seq) >= 1: # contig filter = 1 contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) contig_seq = "" - try: - contig = org.get(line.split()[0][1:]) - except KeyError: - contig = Contig(line.split()[0][1:]) - org.add(contig) + contig = Contig(line.split()[0][1:]) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) + except AttributeError as e: raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " @@ -198,7 +185,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in except Exception: # To manage other exception which can occur raise Exception("Unexpected error. Please check your input file and if everything looks fine, " "please post an issue on our github") - return contigs, all_contig_len + return contigs def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper: @@ -301,7 +288,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: return reverse_complement(contig_seq[gene.start - 1:gene.stop]) -def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, +def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", allow_overlap: bool = False, procedure: str = None) -> Organism: """ @@ -323,10 +310,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: fasta_file = read_compressed_or_not(file_name) - contig_sequences, all_contig_len = read_fasta(org, fasta_file) + contig_sequences = read_fasta(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user + all_contig_len = sum(len(contig) for contig in org.contigs) logging.getLogger("PPanGGOLiN").debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" @@ -336,12 +324,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, True if contig_name in circular_contigs else False) - org.add(contig) - contig.length = contig_len[contig.name] + contig = org.get(contig_name) + contig.is_circular = True if contig.name in circular_contigs else False for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) From cb8fecaaf55970a2b7ec51b8ebfdad524489ca19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 10:58:02 +0200 Subject: [PATCH 09/15] Fix bug to read annotation --- VERSION | 2 +- ppanggolin/annotate/synta.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index 177b0d1a..12b3f290 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.181 +1.2.182 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 677da87d..c21dbdfd 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -170,8 +170,11 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, contigs[contig.name] = contig_seq.upper() contig.length = len(contig_seq) contig_seq = "" - contig = Contig(line.split()[0][1:]) - org.add(contig) + try: + contig = org.get(line.split()[0][1:]) + except KeyError: + contig = Contig(line.split()[0][1:]) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig From 40c3687db48333d7edb155a10fea621939f9e435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 11:32:30 +0200 Subject: [PATCH 10/15] Fix bug to read contig len --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 12b3f290..2870b30c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.182 +1.2.183 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 1b323599..46cf1e1f 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -407,7 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: organism = pangenome.get_organism(contig2organism[contig_name]) contig = organism.get(contig_name) contig.is_circular = row["is_circular"] - contig.length = row["length"] + contig.length = int(row["length"]) try: gene = Gene(row["gene"].decode()) except ValueError: From 1a04c77ae61f3dd7d9e5aebb21bf84cb4e67773d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 15:32:03 +0200 Subject: [PATCH 11/15] Remove GCF_001183765 organism in GBFF list to prevent gene overlap in contig --- VERSION | 2 +- testingDataset/organisms.gbff.list | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 2870b30c..9df9b3c2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.183 +1.2.184 diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list index 093d4beb..d7fb3bf5 100644 --- a/testingDataset/organisms.gbff.list +++ b/testingDataset/organisms.gbff.list @@ -21,7 +21,6 @@ GCF_000472205.1 GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz GCF_000590575.1 GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz GCF_000590635.1 GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz GCF_000590695.1 GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz -GCF_001183765.1 GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz GCF_001183805.1 GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz GCF_001183825.1 GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz GCF_001183845.1 GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz From b4768ae4fbb415d1b767844dd707437d8b0f0858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 14 Sep 2023 17:08:33 +0200 Subject: [PATCH 12/15] Change format of table more relational DB --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 133 ++++++++++++------------- ppanggolin/formats/writeAnnotations.py | 67 +++++-------- ppanggolin/pangenome.py | 34 ++++++- 4 files changed, 121 insertions(+), 115 deletions(-) diff --git a/VERSION b/VERSION index 9df9b3c2..7844733d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.184 +1.2.185 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 46cf1e1f..976499c1 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -384,81 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" -def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000, - disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]: - table = annotations.genomes +def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): contig2organism = {} for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): - try: - organism = pangenome.get_organism(row["name"].decode()) - except: - organism = Organism(row["name"].decode()) - pangenome.add_organism(organism) - contig = Contig(name=row["contig"].decode()) - organism.add(contig) - contig2organism[contig.name] = organism.name - table = annotations.contigs - contig_name = None - genes_dict = {} - rna_dict = {} + organism = Organism(row["name"].decode()) + pangenome.add_organism(organism) + + +def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): - if contig_name != row["name"].decode(): - contig_name = row["name"].decode() - organism = pangenome.get_organism(contig2organism[contig_name]) - contig = organism.get(contig_name) - contig.is_circular = row["is_circular"] - contig.length = int(row["length"]) + contig = Contig(name=row["name"].decode()) + contig.is_circular = row["is_circular"] + contig.length = int(row["length"]) try: - gene = Gene(row["gene"].decode()) - except ValueError: - pass - else: - gene.fill_parents(organism, contig) - if row["gene"].decode() in genes_dict: - logging.getLogger().warning("A gene with the same ID already pass. " - "It could be a problem in the number of genes") - genes_dict[gene.ID] = gene - try: - rna = RNA(row["rna"].decode()) - except ValueError: + organism = pangenome.get_organism(row["organism"].decode()) + except KeyError: pass else: - rna_dict[rna.ID] = rna - rna.fill_parents(organism, contig) - return genes_dict, rna_dict + organism.add(contig) def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False): + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read genes in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Genes table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): - gene = gene_dict[row["ID"].decode()] + gene = Gene(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] try: local = row["local"].decode() except ValueError: local = "" gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, - gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, - genetic_code=genedata.genetic_code, product=genedata.product, - local_identifier=local) + gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, + genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local) gene.is_fragment = row["is_fragment"] - if gene.contig is not None: - gene.contig.add(gene) + if link: + contig = pangenome.get_contig(row["contig"].decode()) + gene.fill_parents(contig.organism, contig) + contig.add(gene) def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False): + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read RNAs in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: RNAs table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): - rna = rna_dict[row["ID"].decode()] + rna = RNA(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, gene_type=genedata.gene_type, name=genedata.name, product=genedata.product) - if rna.contig is not None: - rna.contig.add_rna(rna) + if link: + contig = pangenome.get_contig(row["contig"].decode()) + rna.fill_parents(contig.organism, contig) + contig.add_rna(rna) -def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True, - load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): +def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True, + load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -467,30 +466,20 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool :param disable_bar: Disable the progress bar """ annotations = h5f.root.annotations - + genedata_dict = None if load_organisms: - if load_genes: - genedata_dict = read_genedata(h5f) - if load_rnas: - gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) - read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) - else: - gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) - else: - if load_rnas: - genedata_dict = read_genedata(h5f) - _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) - else: - if load_genes: - if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]: - raise Exception("Genes must be linked to gene families or organisms, but none are laoded") - gene_dict = {gene.ID: gene for gene in pangenome.genes} # Dictionary with genes in families - gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar) + + if load_contigs: + read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar) + + if load_genes: + genedata_dict = read_genedata(h5f) + read_genes(pangenome, annotations.genes, genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) + if load_rnas: + read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index a8c0c6f1..821fd513 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -45,17 +45,15 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id -def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: +def organism_desc(org_len: int) -> Dict[str, tables.StringCol]: """ Table description to save organism-related information :param org_len: Maximum size of organism name. - :param contig_len: Maximum size of contigs name :return: Formatted table """ - return {'name': tables.StringCol(itemsize=org_len), - "contig": tables.StringCol(itemsize=contig_len)} + return {'name': tables.StringCol(itemsize=org_len)} def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -73,28 +71,23 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") organism_row = organism_table.row for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - for contig in org.contigs: - organism_row["name"] = org.name - organism_row["contig"] = contig.name - organism_row.append() + organism_row["name"] = org.name + organism_row.append() organism_table.flush() -def contig_desc(contig_len: int, max_gene_id_len: int, - max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: +def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: """Table description to save contig-related information :param contig_len: Maximum size of contig name - :param max_gene_id_len: Maximum size of gene name - :param max_rna_id_len: Maximum size of rna name + :param org_len: Maximum size of organism name. :return: Formatted table """ return {'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), 'length': tables.UInt32Col(), - "gene": tables.StringCol(itemsize=max_gene_id_len), - "rna": tables.StringCol(itemsize=max_rna_id_len)} + "organism": tables.StringCol(itemsize=org_len)} def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -111,40 +104,28 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): - if contig.number_of_genes >= contig.number_of_rnas: - rna_list = list(contig.RNAs) - for index, gene in enumerate(contig.genes): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["length"] = len(contig) - contig_row["gene"] = gene.ID - if index < len(rna_list): - contig_row["rna"] = rna_list[index].ID - contig_row.append() - else: - gene_list = list(contig.genes) - for index, rna in enumerate(contig.RNAs): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["rna"] = rna.ID - if index < len(gene_list): - contig_row["gene"] = gene_list[index].ID - contig_row.append() + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["length"] = len(contig) + contig_row["organism"] = contig.organism.name + contig_row.append() contig_table.flush() -def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: +def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: """Table description to save gene-related information :param id_len: Maximum size of gene name :param max_local_id: Maximum size of gene local identifier + :param max_contig_len: Maximum size of contig identifier :return: Formatted table """ return {'ID': tables.StringCol(itemsize=id_len), 'genedata_id': tables.UInt32Col(), 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False)} + 'is_fragment': tables.BoolCol(dflt=False), + 'contig': tables.StringCol(itemsize=max_contig_len)} def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -169,6 +150,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment gene_row["local"] = gene.local_identifier + gene_row["contig"] = gene.contig.name genedata = get_genedata(gene) genedata_id = genedata2gene.get(genedata) if genedata_id is None: @@ -181,15 +163,17 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou return genedata2gene -def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: +def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: """Table description to save rna-related information :param id_len: Maximum size of RNA identifier + :param max_contig_len: Maximum size of contig identifier :return: Formatted table """ return {'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col()} + 'genedata_id': tables.UInt32Col(), + 'contig': tables.StringCol(itemsize=max_contig_len)} def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -212,6 +196,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group rna_row = rna_table.row for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): rna_row["ID"] = rna.ID + rna_row["contig"] = rna.contig.name genedata = get_genedata(rna) genedata_id = genedata2rna.get(genedata) if genedata_id is None: @@ -345,17 +330,17 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo # I add these boolean in case we would one day only load organism, contig or genes, without the other. if rec_organisms: - desc = organism_desc(org_len, contig_len) + desc = organism_desc(org_len) write_organisms(pangenome, h5f, annotation, desc, disable_bar) if rec_contigs: - desc = contig_desc(contig_len, gene_id_len, rna_id_len) + desc = contig_desc(contig_len, org_len) write_contigs(pangenome, h5f, annotation, desc, disable_bar) if rec_genes: - desc = gene_desc(gene_id_len, gene_local_id) + desc = gene_desc(gene_id_len, gene_local_id, contig_len) genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) if rec_rnas: - desc = rna_desc(rna_id_len) + desc = rna_desc(rna_id_len, contig_len) genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ef37bb4b..b5bf7c88 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -38,7 +38,6 @@ def __init__(self): self._regionGetter = {} self._spotGetter = {} self._moduleGetter = {} - self.status = { 'genomesAnnotated': "No", 'geneSequences': "No", @@ -312,6 +311,39 @@ def number_of_contigs(self) -> int: """ return sum(len(org) for org in self.organisms) + def _mk_contig_getter(self): + """ + Builds the attribute _contig_getter of the pangenome + + Since the genes are never explicitly 'added' to a pangenome (but rather to an organism), + the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. + If at some point we want to extract contig from a pangenome we'll create a contig_getter. + The assumption behind this is that the pangenome has been filled and no more contig will be added. + """ + self._contig_getter = {} + for contig in self.contigs: + self._contig_getter[contig.name] = contig + + def get_contig(self, name: str) -> Contig: + """Returns the contig that has the given name + + :param name: The ,ame of the contig to look for + + :return: Returns the wanted contig + + :raises AssertionError: If the `gene_id` is not an integer + :raises KeyError: If the `gene_id` is not in the pangenome + """ + assert isinstance(name, str), "Contig name should be a string" + + try: + return self._contig_getter[name] + except AttributeError: + # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. + self._mk_contig_getter() # make it + return self.get_contig(name) # Return what was expected. If geneID does not exist it will raise an error. + except KeyError: + raise KeyError(f"Contig: {name}, does not exist in the pangenome.") def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. From f4e6648a20e01edbd847adbc7d5f6f74e8e82d75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 15 Sep 2023 10:37:56 +0200 Subject: [PATCH 13/15] Add documentation --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 49 ++++++++++++++++++++------ ppanggolin/formats/writeAnnotations.py | 12 ++++--- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/VERSION b/VERSION index 7844733d..aaf4dca1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.185 +1.2.186 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 976499c1..dda50046 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -25,19 +25,21 @@ class Genedata: """ This is a general class storing unique gene-related data to be written in a specific genedata table - - :param start: Gene start position - :param stop: Gene stop position - :param strand: Associated strand - :param gene_type: Gene type - :param position: Position of the gene on its contig - :param name: Name of the feature - :param product: Associated product - :param genetic_code: associated genetic code, if any """ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, genetic_code: int): + """Constructor method + + :param start: Gene start position + :param stop: Gene stop position + :param strand: Associated strand + :param gene_type: Gene type + :param position: Position of the gene on its contig + :param name: Name of the feature + :param product: Associated product + :param genetic_code: associated genetic code, if any + """ self.start = start self.stop = stop self.strand = strand @@ -47,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: self.product = product self.genetic_code = genetic_code - def __eq__(self, other): + def __eq__(self, other: Genedata): return self.start == other.start \ and self.stop == other.stop \ and self.strand == other.strand \ @@ -166,7 +168,9 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000): def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: """ Reads the genedata table and returns a genedata_id2genedata dictionnary + :param h5f: the hdf5 file handler + :return: dictionnary linking genedata to the genedata identifier """ table = h5f.root.annotations.genedata @@ -386,6 +390,13 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, disable_bar: bool = False): + """Read organism table in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Organism table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ contig2organism = {} for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): organism = Organism(row["name"].decode()) @@ -394,6 +405,13 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, disable_bar: bool = False): + """Read contig table in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Contig table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): contig = Contig(name=row["name"].decode()) contig.is_circular = row["is_circular"] @@ -563,7 +581,16 @@ def read_modules_info(h5f: tables.File): f"\t\t\t- mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}") -def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, sources: List[str] = None, disable_bar: bool = False): +def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, + sources: List[str] = None, disable_bar: bool = False): + """Read metadata to add them to the pangenome object + + :param pangenome: Pangenome object + :param h5f: Pangenome file + :param metatype: Object type to associate metadata + :param sources: Source name of metadata + :param disable_bar: Disable progress bar + """ metadata_group = h5f.root.metadata._f_get_child(metatype) for source in sources: source_table = metadata_group._f_get_child(source) diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index 821fd513..b09e4166 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -209,7 +209,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group return genedata2rna -def genedata_desc(type_len, name_len, product_len): +def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Creates a table for gene-related data @@ -311,8 +311,8 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G genedata_table.flush() -def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, - rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): +def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, rec_contigs: bool = True, + rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): """Function writing all the pangenome annotations :param pangenome: Annotated pangenome @@ -361,11 +361,13 @@ def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: return max_gene_id_len, max_gene_type -def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: +def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Create table to save gene sequences + :param gene_id_len: Maximum size of gene sequence identifier :param gene_type_len: Maximum size of gene type + :return: Formated table """ return { @@ -388,7 +390,7 @@ def get_sequence_len(pangenome: Pangenome) -> int: return max_seq_len -def sequence_desc(max_seq_len: int) -> dict: +def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Table description to save sequences :param max_seq_len: Maximum size of gene type From e992a17308e1a4e9d2353a8cdf62109ce1fe9a24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 15 Sep 2023 10:57:36 +0200 Subject: [PATCH 14/15] Remove Genedata in type for __eq__ --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index aaf4dca1..0da01cf8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.186 +1.2.187 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index dda50046..968c1f2c 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -49,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: self.product = product self.genetic_code = genetic_code - def __eq__(self, other: Genedata): + def __eq__(self, other): return self.start == other.start \ and self.stop == other.stop \ and self.strand == other.strand \ From c2e8ef747b40a2cbd85d787f1fdc757e6c8fb11b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <39793176+jpjarnoux@users.noreply.github.com> Date: Tue, 19 Sep 2023 11:00:18 +0200 Subject: [PATCH 15/15] Update writeFlat.py Fix completness computation --- ppanggolin/formats/writeFlat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index c9a12ef3..4a9926c6 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -530,7 +530,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core += 1 completeness = "NA" if len(single_copy_markers) > 0: - completeness = round(((org.number_of_families() + len(single_copy_markers)) / + completeness = round((len(set(org.families) & single_copy_markers) / len(single_copy_markers)) * 100, 2) outfile.write("\t".join(map(str, [org.name, org.number_of_families(),