From dd9c726f2a4b05a46daa4c62fdf35996646ebefc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 5 Sep 2023 14:44:40 +0200
Subject: [PATCH 01/15] commit before merge with unitTest branch

---
 VERSION                              | 2 +-
 ppanggolin/RGP/genomicIsland.py      | 2 +-
 ppanggolin/formats/writeBinaries.py  | 6 +++++-
 ppanggolin/formats/writeFlat.py      | 4 ++--
 ppanggolin/formats/writeMSA.py       | 2 +-
 ppanggolin/formats/writeSequences.py | 4 ++--
 ppanggolin/metrics/fluidity.py       | 2 +-
 ppanggolin/pangenome.py              | 3 ++-
 8 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/VERSION b/VERSION
index e829013f..e144fab6 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.132
+1.2.133
diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py
index c03ba66f..5bc8d935 100644
--- a/ppanggolin/RGP/genomicIsland.py
+++ b/ppanggolin/RGP/genomicIsland.py
@@ -251,7 +251,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain
     multigenics = pangenome.get_multigenics(dup_margin)
     logging.getLogger().info("Compute Regions of Genomic Plasticity ...")
     name_scheme = naming_scheme(pangenome)
-    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar):
+    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar):
         pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
                                               min_score, naming=name_scheme))
     logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP")
diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py
index b01a3a35..6a5a8a5d 100644
--- a/ppanggolin/formats/writeBinaries.py
+++ b/ppanggolin/formats/writeBinaries.py
@@ -145,6 +145,10 @@ def get_genedata(gene:Feature) -> Tuple[int, str, str, int, str, str, int]:
                     gene.product, genetic_code)
 
 
+def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False):
+    organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)),
+                                      expectedrows=pangenome.number_of_organisms)
+
 def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
     """
     Function writing all the pangenome annotations
@@ -163,7 +167,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool
     genedata_counter = 0
 
     gene_row = gene_table.row
-    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar):
+    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
         for contig in org.contigs:
             for gene in contig.genes + list(contig.RNAs):
                 gene_row["organism"] = org.name
diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py
index aa0078bc..7a3bde02 100644
--- a/ppanggolin/formats/writeFlat.py
+++ b/ppanggolin/formats/writeFlat.py
@@ -487,9 +487,9 @@ def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05,
     soft = set()  # could use bitarrays if speed is needed
     core = set()
     for fam in pan.gene_families:
-        if len(fam.organisms) >= pan.number_of_organisms() * soft_core:
+        if len(fam.organisms) >= pan.number_of_organisms * soft_core:
             soft.add(fam)
-        if len(fam.organisms) == pan.number_of_organisms():
+        if len(fam.organisms) == pan.number_of_organisms:
             core.add(fam)
 
     with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile:
diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
index f8af1135..72c2f4bf 100644
--- a/ppanggolin/formats/writeMSA.py
+++ b/ppanggolin/formats/writeMSA.py
@@ -51,7 +51,7 @@ def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0
     :return: set of families unique to one partition
     """
     fams = set()
-    nb_org = pangenome.number_of_organisms()
+    nb_org = pangenome.number_of_organisms
 
     if partition_filter == "all":
         return set(pangenome.gene_families)
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 725b1e20..29b5ce86 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -106,14 +106,14 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c
     elif partition == "softcore":
         logging.getLogger().info(
             f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes")
-        threshold = pangenome.number_of_organisms() * soft_core
+        threshold = pangenome.number_of_organisms * soft_core
         for fam in pangenome.gene_families:
             if len(fam.organisms) >= threshold:
                 genefams.add(fam)
     elif partition == "core":
         logging.getLogger().info(f"Writing the representative {type_name} of the {partition} gene families...")
         for fam in pangenome.gene_families:
-            if len(fam.organisms) == pangenome.number_of_organisms():
+            if len(fam.organisms) == pangenome.number_of_organisms:
                 genefams.add(fam)
     elif "module_" in partition:
         logging.getLogger().info(f"Writing the representation {type_name} of {partition} gene families...")
diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py
index f7fd1282..2d1a2d97 100644
--- a/ppanggolin/metrics/fluidity.py
+++ b/ppanggolin/metrics/fluidity.py
@@ -40,7 +40,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict:
             common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1
             if tot_fam > 0 and common_fam > 0:
                 g_sum += (tot_fam - 2 * common_fam) / tot_fam
-        fluidity_dict[subset] = (2 / (pangenome.number_of_organisms() * (pangenome.number_of_organisms() - 1))) * g_sum
+        fluidity_dict[subset] = (2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1))) * g_sum
     return fluidity_dict
 
 
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index 92836995..93848cda 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -94,7 +94,7 @@ def _yield_genes(self) -> Iterator[Gene]:
 
         :return: an iterator of Gene
         """
-        if self.number_of_organisms() > 0:  # if we have organisms, they're supposed to have genes
+        if self.number_of_organisms > 0:  # if we have organisms, they're supposed to have genes
             for org in self.organisms:
                 for contig in org.contigs:
                     for gene in contig.genes:
@@ -245,6 +245,7 @@ def organisms(self) -> List[Organism]:
         """
         return list(self._orgGetter.values())
 
+    @property
     def number_of_organisms(self) -> int:
         """Returns the number of organisms present in the pangenome
         

From 5eb33bcafbc3b4d2c64cc0966ecb8a849db1fa36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Tue, 5 Sep 2023 17:48:10 +0200
Subject: [PATCH 02/15] Change HDF5 to write contig and organism table

---
 VERSION                                |   2 +-
 ppanggolin/formats/writeAnnotations.py | 356 +++++++++++++++++++++++++
 ppanggolin/formats/writeBinaries.py    | 282 +-------------------
 ppanggolin/genome.py                   |  14 +-
 ppanggolin/pangenome.py                |  14 +-
 5 files changed, 379 insertions(+), 289 deletions(-)
 create mode 100644 ppanggolin/formats/writeAnnotations.py

diff --git a/VERSION b/VERSION
index c262507c..8464960b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.174
+1.2.175
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
new file mode 100644
index 00000000..907034de
--- /dev/null
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -0,0 +1,356 @@
+#!/usr/bin/env python3
+# coding:utf-8
+
+# default libraries
+import logging
+from typing import Dict, Tuple, Union
+
+# installed libraries
+from tqdm import tqdm
+import tables
+
+# local libraries
+from ppanggolin.pangenome import Pangenome
+from ppanggolin.genome import Feature, Gene
+from ppanggolin.formats.readBinaries import Genedata
+
+
+def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]:
+    """
+    Get the maximum size of each annotation information to optimize disk space
+
+    :param pangenome: Annotated pangenome
+    :return: maximum size of each annotation
+    """
+    max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1
+    for org in pangenome.organisms:
+        if len(org.name) > max_org_len:
+            max_org_len = len(org.name)
+        for contig in org.contigs:
+            if len(contig.name) > max_contig_len:
+                max_contig_len = len(contig.name)
+            for gene in contig.genes:
+                if len(gene.ID) > max_gene_id_len:
+                    max_gene_id_len = len(gene.ID)
+                if len(gene.local_identifier) > max_gene_local_id:
+                    max_gene_local_id = len(gene.local_identifier)
+            for rna in contig.RNAs:
+                if len(rna.ID) > max_rna_id_len:
+                    max_rna_id_len = len(gene.ID)
+                if len(rna.local_identifier) > max_rna_local_id:
+                    max_rna_local_id = len(gene.local_identifier)
+
+    return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id
+
+
+def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]:
+    """
+    Create a table to save organism-related information
+
+    :param org_len: Maximum size of organism
+    :param contig_len: Maximum size of contigs
+
+    :return: Formatted table
+    """
+    return {'name': tables.StringCol(itemsize=org_len),
+            "contig": tables.StringCol(itemsize=contig_len)}
+
+
+def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
+                    organism_desc: Dict[str, tables.StringCol], disable_bar=False):
+    organism_table = h5f.create_table(annotation, "genome", organism_desc,
+                                      expectedrows=pangenome.number_of_organisms)
+    logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes")
+    organism_row = organism_table.row
+    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
+        for contig in org.contigs:
+            organism_row["name"] = org.name
+            organism_row["contig"] = contig.name
+            organism_row.append()
+    organism_table.flush()
+
+
+def contig_desc(contig_len: int, max_gene_id_len: int,
+                max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]:
+    """
+    Create a table to save organism-related information
+
+    :param org_len: Maximum size of organism
+    :param contig_len: Maximum size of contigs
+
+    :return: Formatted table
+    """
+    return {'name': tables.StringCol(itemsize=contig_len),
+            "is_circular": tables.BoolCol(dflt=False),
+            "gene": tables.StringCol(itemsize=max_gene_id_len),
+            "rna": tables.StringCol(itemsize=max_rna_id_len)}
+
+
+def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
+                  contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]],
+                  disable_bar=False):
+    contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs)
+    logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
+    contig_row = contig_table.row
+    for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
+        rna_list = list(contig.RNAs)
+        for index, gene in enumerate(contig.genes):
+            contig_row["name"] = contig.name
+            contig_row["is_circular"] = contig.is_circular
+            contig_row["gene"] = gene.ID
+            if index < len(rna_list):
+                contig_row["rna"] = rna_list[index].ID
+            contig_row.append()
+    contig_table.flush()
+
+
+def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
+    """
+    Create a table to save gene-related information
+
+    :param org_len: Maximum size of organism
+    :param contig_len: Maximum size of contigs
+    :param id_len: Maximum size of gene ID
+
+    :param max_local_id: Maximum size of gene local identifier
+
+    :return: Formatted table
+    """
+    return {'ID': tables.StringCol(itemsize=id_len),
+            'genedata_id': tables.UInt32Col(),
+            'local': tables.StringCol(itemsize=max_local_id),
+            'is_fragment': tables.BoolCol(dflt=False)}
+
+
+def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
+                gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]],
+                disable_bar=False) -> Dict[Genedata, int]:
+    genedata2gene = {}
+    genedata_counter = 0
+    gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes)
+    logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
+    gene_row = gene_table.row
+    for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar):
+        gene_row["ID"] = gene.ID
+        gene_row["is_fragment"] = gene.is_fragment
+        if gene.type == "CDS":
+            gene_row["local"] = gene.local_identifier
+        genedata = get_genedata(gene)
+        genedata_id = genedata2gene.get(genedata)
+        if genedata_id is None:
+            genedata_id = genedata_counter
+            genedata2gene[genedata] = genedata_id
+            genedata_counter += 1
+        gene_row["genedata_id"] = genedata_id
+        gene_row.append()
+    gene_table.flush()
+    return genedata2gene
+
+
+def genedata_desc(type_len, name_len, product_len):
+    """
+    Creates a table for gene-related data
+
+    :param type_len: Maximum size of gene Type
+    :param name_len: Maximum size of gene name
+    :param product_len: Maximum size of gene product
+    :return: Formatted table for gene metadata
+    """
+    return {
+        'genedata_id': tables.UInt32Col(),
+        'start': tables.UInt32Col(),
+        'stop': tables.UInt32Col(),
+        'strand': tables.StringCol(itemsize=1),
+        'gene_type': tables.StringCol(itemsize=type_len),
+        'position': tables.UInt32Col(),
+        'name': tables.StringCol(itemsize=name_len),
+        'product': tables.StringCol(itemsize=product_len),
+        'genetic_code': tables.UInt32Col(dflt=11),
+    }
+
+
+def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]:
+    """
+    Get the maximum size of each gene data information to optimize disk space
+
+    :param pangenome: Annotated pangenome
+    :return: maximum size of each annotation
+    """
+    max_name_len = 1
+    max_product_len = 1
+    max_type_len = 1
+    for org in pangenome.organisms:
+        for contig in org.contigs:
+            for gene in contig.genes:
+                if len(gene.name) > max_name_len:
+                    max_name_len = len(gene.name)
+                if len(gene.product) > max_product_len:
+                    max_product_len = len(gene.product)
+                if len(gene.type) > max_type_len:
+                    max_type_len = len(gene.type)
+            for gene in contig.RNAs:
+                if len(gene.name) > max_name_len:
+                    max_name_len = len(gene.name)
+                if len(gene.product) > max_product_len:
+                    max_product_len = len(gene.product)
+                if len(gene.type) > max_type_len:
+                    max_type_len = len(gene.type)
+
+    return max_type_len, max_name_len, max_product_len
+
+
+def get_genedata(gene: Feature) -> Genedata:
+    """
+    Gets the genedata type of Feature
+
+    :param gene: a Feature
+    :return: Tuple with a Feature associated data
+    """
+    position = None
+    genetic_code = 11
+    if gene.type == "CDS":
+        gene: Gene
+        position = gene.position
+        genetic_code = gene.genetic_code
+    return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name,
+                    gene.product, genetic_code)
+
+
+def write_genedata(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
+                   genedata2gene: Dict[Genedata, int], disable_bar=False):
+    genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)),
+                                      expectedrows=len(genedata2gene))
+    logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data "
+                                          "(can be lower than the number of genes)")
+    genedata_row = genedata_table.row
+    for genedata, genedata_id in genedata2gene.items():
+        genedata_row["genedata_id"] = genedata_id
+        genedata_row["start"] = genedata.start
+        genedata_row["stop"] = genedata.stop
+        genedata_row["strand"] = genedata.strand
+        genedata_row["gene_type"] = genedata.gene_type
+        if genedata.gene_type == "CDS":
+            genedata_row["position"] = genedata.position
+            genedata_row["genetic_code"] = genedata.genetic_code
+        genedata_row["name"] = genedata.name
+        genedata_row["product"] = genedata.product
+        genedata_row.append()
+    genedata_table.flush()
+
+
+def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True,
+                      rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False):
+    """
+    Function writing all the pangenome annotations
+
+    :param pangenome: Annotated pangenome
+    :param h5f: Pangenome HDF5 file
+    :param disable_bar: Alow to disable progress bar
+    """
+    annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms")
+
+    max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations(
+        pangenome)
+
+    if rec_organisms:
+        desc = organism_desc(max_org_len, max_contig_len)
+        write_organisms(pangenome, h5f, annotation, desc, disable_bar)
+    if rec_contigs:
+        desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len)
+        write_contigs(pangenome, h5f, annotation, desc, disable_bar)
+    if rec_genes:
+        desc = gene_desc(max_gene_id_len, max_gene_local_id)
+        genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar)
+        write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar)
+
+
+def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]:
+    """
+    Get the maximum size of gene sequences to optimize disk space
+    :param pangenome: Annotated pangenome
+    :return: maximum size of each annotation
+    """
+    max_gene_id_len = 1
+    max_gene_type = 1
+    for gene in pangenome.genes:
+        if len(gene.ID) > max_gene_id_len:
+            max_gene_id_len = len(gene.ID)
+        if len(gene.type) > max_gene_type:
+            max_gene_type = len(gene.type)
+    return max_gene_id_len, max_gene_type
+
+
+def gene_sequences_desc(gene_id_len, gene_type_len) -> dict:
+    """
+    Create table to save gene sequences
+    :param gene_id_len: Maximum size of gene sequence identifier
+    :param gene_type_len: Maximum size of gene type
+    :return: Formated table
+    """
+    return {
+        "gene": tables.StringCol(itemsize=gene_id_len),
+        "seqid": tables.UInt32Col(),
+        "type": tables.StringCol(itemsize=gene_type_len)
+    }
+
+
+def get_sequence_len(pangenome: Pangenome) -> int:
+    """
+    Get the maximum size of gene sequences to optimize disk space
+    :param pangenome: Annotated pangenome
+    :return: maximum size of each annotation
+    """
+    max_seq_len = 1
+    for gene in pangenome.genes:
+        if len(gene.dna) > max_seq_len:
+            max_seq_len = len(gene.dna)
+    return max_seq_len
+
+
+def sequence_desc(max_seq_len: int) -> dict:
+    """
+    Table description to save sequences
+    :param max_seq_len: Maximum size of gene type
+    :return: Formated table
+    """
+    return {
+        "seqid": tables.UInt32Col(),
+        "dna": tables.StringCol(itemsize=max_seq_len)
+    }
+
+
+def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
+    """
+    Function writing all the pangenome gene sequences
+    :param pangenome: Pangenome with gene sequences
+    :param h5f: Pangenome HDF5 file without sequences
+    :param disable_bar: Disable progress bar
+    """
+    gene_seq = h5f.create_table("/annotations", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)),
+                                expectedrows=pangenome.number_of_genes)
+    # process sequences to save them only once
+    seq2seqid = {}
+    id_counter = 0
+    gene_row = gene_seq.row
+    for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene",
+                     disable=disable_bar):
+        curr_seq_id = seq2seqid.get(gene.dna)
+        if curr_seq_id is None:
+            curr_seq_id = id_counter
+            seq2seqid[gene.dna] = id_counter
+            id_counter += 1
+        gene_row["gene"] = gene.ID
+        gene_row["seqid"] = curr_seq_id
+        gene_row["type"] = gene.type
+        gene_row.append()
+    gene_seq.flush()
+
+    seq_table = h5f.create_table("/annotations", "sequences", sequence_desc(get_sequence_len(pangenome)),
+                                 expectedrows=len(seq2seqid))
+
+    seq_row = seq_table.row
+    for seq, seqid in seq2seqid.items():
+        seq_row["dna"] = seq
+        seq_row["seqid"] = seqid
+        seq_row.append()
+    seq_table.flush()
diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py
index 3a9df929..1a3fcc56 100644
--- a/ppanggolin/formats/writeBinaries.py
+++ b/ppanggolin/formats/writeBinaries.py
@@ -5,7 +5,7 @@
 import logging
 from collections import Counter, defaultdict
 import statistics
-from typing import Tuple
+from typing import Tuple, Union
 import pkg_resources
 
 # installed libraries
@@ -15,290 +15,12 @@
 
 # local libraries
 from ppanggolin.pangenome import Pangenome
+from ppanggolin.formats.writeAnnotations import write_annotations, write_gene_sequences
 from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status
 from ppanggolin.genome import Feature, Gene
 from ppanggolin.formats.readBinaries import read_genedata, Genedata
 
 
-def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict:
-    """
-    Create a table to save gene-related information
-
-    :param org_len: Maximum size of organism
-    :param contig_len: Maximum size of contigs
-    :param id_len: Maximum size of gene ID
-
-    :param max_local_id: Maximum size of gene local identifier
-
-    :return: Formatted table
-    """
-    return {
-        'organism': tables.StringCol(itemsize=org_len),
-        "contig": {
-            'name': tables.StringCol(itemsize=contig_len),
-            "is_circular": tables.BoolCol(dflt=False)
-        },
-        "gene": {
-            'ID': tables.StringCol(itemsize=id_len),
-            'genedata_id': tables.UInt32Col(),
-            'local': tables.StringCol(itemsize=max_local_id),
-            'is_fragment': tables.BoolCol(dflt=False)
-        }
-    }
-
-
-def genedata_desc(type_len, name_len, product_len):
-    """
-    Creates a table for gene-related data
-
-    :param type_len: Maximum size of gene Type
-    :param name_len: Maximum size of gene name
-    :param product_len: Maximum size of gene product
-    :return: Formatted table for gene metadata
-    """
-    return {
-        'genedata_id': tables.UInt32Col(),
-        'start': tables.UInt32Col(),
-        'stop': tables.UInt32Col(),
-        'strand': tables.StringCol(itemsize=1),
-        'gene_type': tables.StringCol(itemsize=type_len),
-        'position': tables.UInt32Col(),
-        'name': tables.StringCol(itemsize=name_len),
-        'product': tables.StringCol(itemsize=product_len),
-        'genetic_code': tables.UInt32Col(dflt=11),
-    }
-
-
-def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int]:
-    """
-    Get the maximum size of each annotation information to optimize disk space
-
-    :param pangenome: Annotated pangenome
-    :return: maximum size of each annotation
-    """
-    max_org_len = 1
-    max_contig_len = 1
-    max_gene_id_len = 1
-    max_local_id = 1
-    for org in pangenome.organisms:
-        if len(org.name) > max_org_len:
-            max_org_len = len(org.name)
-        for contig in org.contigs:
-            if len(contig.name) > max_contig_len:
-                max_contig_len = len(contig.name)
-            for gene in contig.genes:
-                if len(gene.ID) > max_gene_id_len:
-                    max_gene_id_len = len(gene.ID)
-                if len(gene.local_identifier) > max_local_id:
-                    max_local_id = len(gene.local_identifier)
-            for gene in contig.RNAs:
-                if len(gene.ID) > max_gene_id_len:
-                    max_gene_id_len = len(gene.ID)
-                if len(gene.local_identifier) > max_local_id:
-                    max_local_id = len(gene.local_identifier)
-
-    return max_org_len, max_contig_len, max_gene_id_len, max_local_id
-
-
-def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]:
-    """
-    Get the maximum size of each gene data information to optimize disk space
-
-    :param pangenome: Annotated pangenome
-    :return: maximum size of each annotation
-    """
-    max_name_len = 1
-    max_product_len = 1
-    max_type_len = 1
-    for org in pangenome.organisms:
-        for contig in org.contigs:
-            for gene in contig.genes:
-                if len(gene.name) > max_name_len:
-                    max_name_len = len(gene.name)
-                if len(gene.product) > max_product_len:
-                    max_product_len = len(gene.product)
-                if len(gene.type) > max_type_len:
-                    max_type_len = len(gene.type)
-            for gene in contig.RNAs:
-                if len(gene.name) > max_name_len:
-                    max_name_len = len(gene.name)
-                if len(gene.product) > max_product_len:
-                    max_product_len = len(gene.product)
-                if len(gene.type) > max_type_len:
-                    max_type_len = len(gene.type)
-
-    return max_type_len, max_name_len, max_product_len
-
-
-def get_genedata(gene: Feature) -> Genedata:
-    """
-    Gets the genedata type of Feature
-
-    :param gene: a Feature
-    :return: Tuple with a Feature associated data
-    """
-    position = None
-    genetic_code = 11
-    if gene.type == "CDS":
-        gene: Gene
-        position = gene.position
-        genetic_code = gene.genetic_code
-    return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name,
-                    gene.product, genetic_code)
-
-
-def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False):
-    organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)),
-                                      expectedrows=pangenome.number_of_organisms)
-
-def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
-    """
-    Function writing all the pangenome annotations
-
-    :param pangenome: Annotated pangenome
-    :param h5f: Pangenome HDF5 file
-    :param disable_bar: Alow to disable progress bar
-    """
-    annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms")
-    gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)),
-                                  expectedrows=pangenome.number_of_genes)
-
-    logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
-
-    genedata2gene = {}
-    genedata_counter = 0
-
-    gene_row = gene_table.row
-    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
-        for contig in org.contigs:
-            for gene in list(contig.genes) + list(contig.RNAs):
-                gene_row["organism"] = org.name
-                gene_row["contig/name"] = contig.name
-                gene_row["contig/is_circular"] = contig.is_circular
-                gene_row["gene/ID"] = gene.ID
-                gene_row["gene/is_fragment"] = gene.is_fragment
-                if gene.type == "CDS":
-                    gene_row["gene/local"] = gene.local_identifier
-                genedata = get_genedata(gene)
-                genedata_id = genedata2gene.get(genedata)
-                if genedata_id is None:
-                    genedata_id = genedata_counter
-                    genedata2gene[genedata] = genedata_id
-                    genedata_counter += 1
-                gene_row["gene/genedata_id"] = genedata_id
-                gene_row.append()
-    gene_table.flush()
-
-    genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)),
-                                      expectedrows=len(genedata2gene))
-    logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)")
-    genedata_row = genedata_table.row
-    for genedata, genedata_id in genedata2gene.items():
-        genedata_row["genedata_id"] = genedata_id
-        genedata_row["start"] = genedata.start
-        genedata_row["stop"] = genedata.stop
-        genedata_row["strand"] = genedata.strand
-        genedata_row["gene_type"] = genedata.gene_type
-        if genedata.gene_type == "CDS":
-            genedata_row["position"] = genedata.position
-            genedata_row["genetic_code"] = genedata.genetic_code
-        genedata_row["name"] = genedata.name
-        genedata_row["product"] = genedata.product
-        genedata_row.append()
-    genedata_table.flush()
-
-
-def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]:
-    """
-    Get the maximum size of gene sequences to optimize disk space
-    :param pangenome: Annotated pangenome
-    :return: maximum size of each annotation
-    """
-    max_gene_id_len = 1
-    max_gene_type = 1
-    for gene in pangenome.genes:
-        if len(gene.ID) > max_gene_id_len:
-            max_gene_id_len = len(gene.ID)
-        if len(gene.type) > max_gene_type:
-            max_gene_type = len(gene.type)
-    return max_gene_id_len, max_gene_type
-
-
-def gene_sequences_desc(gene_id_len, gene_type_len) -> dict:
-    """
-    Create table to save gene sequences
-    :param gene_id_len: Maximum size of gene sequence identifier
-    :param gene_type_len: Maximum size of gene type
-    :return: Formated table
-    """
-    return {
-        "gene": tables.StringCol(itemsize=gene_id_len),
-        "seqid": tables.UInt32Col(),
-        "type": tables.StringCol(itemsize=gene_type_len)
-    }
-
-
-def get_sequence_len(pangenome: Pangenome) -> int:
-    """
-    Get the maximum size of gene sequences to optimize disk space
-    :param pangenome: Annotated pangenome
-    :return: maximum size of each annotation
-    """
-    max_seq_len = 1
-    for gene in pangenome.genes:
-        if len(gene.dna) > max_seq_len:
-            max_seq_len = len(gene.dna)
-    return max_seq_len
-
-
-def sequence_desc(max_seq_len: int) -> dict:
-    """
-    Table description to save sequences
-    :param max_seq_len: Maximum size of gene type
-    :return: Formated table
-    """
-    return {
-        "seqid": tables.UInt32Col(),
-        "dna": tables.StringCol(itemsize=max_seq_len)
-    }
-
-
-def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
-    """
-    Function writing all the pangenome gene sequences
-    :param pangenome: Pangenome with gene sequences
-    :param h5f: Pangenome HDF5 file without sequences
-    :param disable_bar: Disable progress bar
-    """
-    gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)),
-                                expectedrows=pangenome.number_of_genes)
-    # process sequences to save them only once
-    seq2seqid = {}
-    id_counter = 0
-    gene_row = gene_seq.row
-    for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", disable=disable_bar):
-        curr_seq_id = seq2seqid.get(gene.dna)
-        if curr_seq_id is None:
-            curr_seq_id = id_counter
-            seq2seqid[gene.dna] = id_counter
-            id_counter += 1
-        gene_row["gene"] = gene.ID
-        gene_row["seqid"] = curr_seq_id
-        gene_row["type"] = gene.type
-        gene_row.append()
-    gene_seq.flush()
-
-    seq_table = h5f.create_table("/", "sequences", sequence_desc(get_sequence_len(pangenome)),
-                                 expectedrows=len(seq2seqid))
-
-    seq_row = seq_table.row
-    for seq, seqid in seq2seqid.items():
-        seq_row["dna"] = seq
-        seq_row["seqid"] = seqid
-        seq_row.append()
-    seq_table.flush()
-
-
 def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict:
     """
     Create a formated table for gene families description
diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
index d63818b6..84462d77 100644
--- a/ppanggolin/genome.py
+++ b/ppanggolin/genome.py
@@ -581,6 +581,13 @@ def __delitem__(self, name):
         except KeyError:
             raise KeyError("Position of the gene in the contig does not exist")
 
+    def __len__(self):
+        """ Get number of contigs in organism
+
+        :return: Number of contigs in organism
+        """
+        return len(self._contigs_getter)
+
     @property
     def families(self):
         """Return the gene families present in the organism
@@ -625,13 +632,6 @@ def contigs(self) -> Generator[Contig, None, None]:
         """
         yield from self._contigs_getter.values()
 
-    def number_of_contigs(self) -> int:
-        """ Get number of contigs in organism
-
-        :return: Number of contigs in organism
-        """
-        return len(self._contigs_getter)
-
     def add(self, contig: Contig):
         """Add a contig to organism
 
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index ba35e4c1..c63bc3ec 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 
 # local libraries
-from ppanggolin.genome import Organism, Gene
+from ppanggolin.genome import Organism, Contig, Gene
 from ppanggolin.region import Region, Spot, Module
 from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.edge import Edge
@@ -281,6 +281,18 @@ def number_of_organisms(self) -> int:
         """
         return len(self._orgGetter)
 
+    @property
+    def contigs(self) -> Generator[Contig, None, None]:
+        for organism in self.organisms:
+            yield from organism.contigs
+    @property
+    def number_of_contigs(self) -> int:
+        """Returns the number of contigs present in the pangenome
+
+        :return: The number of contigs
+        """
+        return sum(len(org) for org in self.organisms)
+
     def get_organism(self, name: str) -> Organism:
         """
         Get an organism that is expected to be in the pangenome using its name, which is supposedly unique.

From 084ce284d42fe9082536cecec15d6a417ee1925f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Wed, 6 Sep 2023 16:09:59 +0200
Subject: [PATCH 03/15] Read new version of annotation Tables

---
 VERSION                                |   2 +-
 ppanggolin/cluster/cluster.py          |   2 +-
 ppanggolin/formats/readBinaries.py     | 210 ++++++++++++++-----------
 ppanggolin/formats/writeAnnotations.py | 210 ++++++++++++++++++-------
 ppanggolin/formats/writeBinaries.py    |   4 +-
 ppanggolin/genome.py                   |   6 +
 ppanggolin/pangenome.py                |  20 +++
 tests/test_genome.py                   |   3 +-
 8 files changed, 304 insertions(+), 153 deletions(-)

diff --git a/VERSION b/VERSION
index 8464960b..bfdb369d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.175
+1.2.176
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 8d2763cb..77637015 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     """
     Main function to cluster pangenome gene sequences into families
 
-    :param pangenome: Annoatated Pangenome
+    :param pangenome: Annotated Pangenome
     :param tmpdir: Path to temporary directory
     :param cpu: number of CPU cores to use
     :param defrag: Allow to remove fragment
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 900a8c4e..215ad7a3 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -5,10 +5,9 @@
 import logging
 import sys
 from pathlib import Path
-# installed libraries
-from typing import TextIO
-from typing import List
+from typing import TextIO, List, Dict, Tuple
 
+# installed libraries
 from tables import Table
 from tqdm import tqdm
 import tables
@@ -27,10 +26,10 @@ class Genedata:
     This is a general class storing unique gene-related data to be written in a specific
     genedata table
 
-    :param start: Start position of a gene
-    :param stop: Stop position of a gene
-    :param strand: associated strand
-    :param gene_type: Type of gene
+    :param start: Gene start position
+    :param stop: Gene stop position
+    :param strand: Associated strand
+    :param gene_type: Gene type
     :param position: Position of the gene on its contig
     :param name: Name of the feature
     :param product: Associated product
@@ -164,7 +163,7 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000):
             yield row
 
 
-def read_genedata(h5f: tables.File) -> dict:
+def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
     """
     Reads the genedata table and returns a genedata_id2genedata dictionnary
     :param h5f: the hdf5 file handler
@@ -192,7 +191,7 @@ def read_sequences(h5f: tables.File) -> dict:
     :param h5f: the hdf5 file handler
     :return: dictionnary linking sequences to the seq identifier
     """
-    table = h5f.root.sequences
+    table = h5f.root.annotations.sequences
     seqid2seq = {}
     for row in read_chunks(table, chunk=20000):
         seqid2seq[row["seqid"]] = row['dna'].decode()
@@ -204,6 +203,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter
     """
     Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS,
     and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file.
+
     :param filename: Name of the pangenome file
     :param file_obj: Name of the output file
     :param list_cds: An iterable object of CDS
@@ -212,7 +212,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter
     """
     logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...")
     h5f = tables.open_file(filename, "r", driver_core_backing_store=0)
-    table = h5f.root.geneSequences
+    table = h5f.root.annotations.geneSequences
     list_cds = set(list_cds) if list_cds is not None else None
     seqid2seq = read_sequences(h5f)
     for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
@@ -225,60 +225,6 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter
     h5f.close()
 
 
-def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict,
-                  link: bool = False):
-    """
-    Read information from pangenome to assign to organism object
-
-    :param pangenome: Input     pangenome
-    :param org_name: Name of the organism
-    :param contig_dict: Dictionary with all contig and associate genes
-    :param circular_contigs: Dictionary of contigs
-    :param genedata_dict: dictionnary linking genedata to the genedata identifier
-    :param link: get the gene object if the genes are clustered
-    """
-    org = Organism(org_name)
-    gene, gene_type = (None, None)
-    for contig_name, gene_list in contig_dict.items():
-        try:
-            contig = org.get(contig_name)
-        except KeyError:
-            contig = Contig(contig_name, is_circular=circular_contigs[contig_name])
-            org.add(contig)
-        for row in gene_list:
-            if link:  # if the gene families are already computed/loaded the gene exists.
-                gene = pangenome.get_gene(row["ID"].decode())
-            else:  # else creating the gene.
-                curr_genedata = genedata_dict[row["genedata_id"]]
-                gene_type = curr_genedata.gene_type
-                if gene_type == "CDS":
-                    gene = Gene(row["ID"].decode())
-                elif "RNA" in gene_type:
-                    gene = RNA(row["ID"].decode())
-            try:
-                local = row["local"].decode()
-            except ValueError:
-                local = ""
-            if isinstance(gene, Gene):
-                gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand,
-                                      gene_type=gene_type, name=curr_genedata.name, position=curr_genedata.position,
-                                      genetic_code=curr_genedata.genetic_code, product=curr_genedata.product,
-                                      local_identifier=local)
-            else:
-                gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand,
-                                      gene_type=gene_type, name=curr_genedata.name,
-                                      product=curr_genedata.product, local_identifier=local)
-            gene.is_fragment = row["is_fragment"]
-            gene.fill_parents(org, contig)
-            if gene_type == "CDS":
-                contig[gene.start] = gene
-            elif "RNA" in gene_type:
-                contig.add_rna(gene)
-            else:
-                raise Exception(f"A strange type '{gene_type}', which we do not know what to do with, was met.")
-    pangenome.add_organism(org)
-
-
 def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
     """
     Read information about graph in pangenome hdf5 file to add in pangenome object
@@ -350,6 +296,7 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar:
 def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
     """
     Read gene sequences in pangenome hdf5 file to add in pangenome object
+
     :param pangenome: Pangenome object without gene sequence associate to gene
     :param h5f: Pangenome HDF5 file with gene sequence associate to gene
     :param disable_bar: Disable the progress bar
@@ -357,7 +304,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo
     if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]:
         raise Exception("It's not possible to read the pangenome gene dna sequences "
                         "if the annotations have not been loaded.")
-    table = h5f.root.geneSequences
+    table = h5f.root.annotations.geneSequences
 
     seqid2seq = read_sequences(h5f)
     for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
@@ -437,8 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal
 	    pangenome.add_module(module)
     pangenome.status["modules"] = "Loaded"
 
-
-def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
+def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000,
+                   disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]:
+    table = annotations.genomes
+    contig2organism = {}
+    for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar):
+        try:
+            organism = pangenome.get_organism(row["name"].decode())
+        except:
+            organism = Organism(row["name"].decode())
+            pangenome.add_organism(organism)
+        contig = Contig(name=row["contig"].decode())
+        organism.add(contig)
+        contig2organism[contig.name] = organism.name
+    table = annotations.contigs
+    contig_name = None
+    genes_dict = {}
+    rna_dict = {}
+    for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar):
+        if contig_name != row["name"].decode():
+            contig_name = row["name"].decode()
+            organism = pangenome.get_organism(contig2organism[contig_name])
+            contig = organism.get(contig_name)
+            contig.is_circular = row["is_circular"]
+        try:
+            gene = Gene(row["gene"].decode())
+        except ValueError:
+            pass
+        else:
+            gene.fill_parents(organism, contig)
+            if row["gene"].decode() in genes_dict:
+                logging.getLogger().warning("A gene with the same ID already pass. "
+                                            "It could be a problem in the number of genes")
+            genes_dict[gene.ID] = gene
+        try:
+            rna = RNA(row["rna"].decode())
+        except ValueError:
+            pass
+        else:
+            rna_dict[rna.ID] = rna
+            rna.fill_parents(organism, contig)
+    return genes_dict, rna_dict
+
+def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata],
+               gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False):
+    for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar):
+        gene = gene_dict[row["ID"].decode()]
+        genedata = genedata_dict[row["genedata_id"]]
+        try:
+            local = row["local"].decode()
+        except ValueError:
+            local = ""
+        gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
+                                  gene_type=genedata.gene_type, name=genedata.name, position=genedata.position,
+                                  genetic_code=genedata.genetic_code, product=genedata.product,
+                                  local_identifier=local)
+        gene.is_fragment = row["is_fragment"]
+        if gene.contig is not None:
+            gene.contig.add(gene)
+
+
+def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata],
+               rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False):
+    for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar):
+        rna = rna_dict[row["ID"].decode()]
+        genedata = genedata_dict[row["genedata_id"]]
+        rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
+                             gene_type=genedata.gene_type, name=genedata.name,
+                             product=genedata.product)
+        if rna.contig is not None:
+            rna.contig.add_rna(rna)
+
+
+def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True,
+                    load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
     """
     Read annotation in pangenome hdf5 file to add in pangenome object
 
@@ -448,35 +467,33 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool =
     """
     annotations = h5f.root.annotations
 
-    table = annotations.genes
-    pangenome_dict = {}
-    circular_contigs = {}
+    if load_organisms:
+        if load_genes:
+            genedata_dict = read_genedata(h5f)
+            if load_rnas:
+                gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar)
 
-    genedata_dict = read_genedata(h5f)
-
-    for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
-        decode_org = row["organism"].decode()
-        try:
-            # new gene, seen contig, seen org
-            pangenome_dict[decode_org][row["contig"]["name"].decode()].append(row["gene"])
-        except KeyError:
-            try:
-                # new contig, seen org
-                pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]]
-                circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"]
-            except KeyError:
-                # new org
-                pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]}
-                circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]}
-
-    link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False
-
-    for org_name, contig_dict in tqdm(pangenome_dict.items(), total=len(pangenome_dict),
-                                      unit="organism", disable=disable_bar):
-        read_organism(pangenome, org_name, contig_dict, circular_contigs[org_name], genedata_dict, link)
+                read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
+                read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar)
+            else:
+                gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar)
+                read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
+        else:
+            if load_rnas:
+                genedata_dict = read_genedata(h5f)
+                _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar)
+                read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar)
+    else:
+        if load_genes:
+            if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]:
+                raise Exception("Genes must be linked to gene families or organisms, but none are laoded")
+            gene_dict = {gene.ID: gene for gene in pangenome.genes}  # Dictionary with genes in families
+            gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar)
+            read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
     pangenome.status["genomesAnnotated"] = "Loaded"
 
 
+
 def read_info(h5f: tables.File):
     """
     Read the pangenome content
@@ -630,12 +647,14 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
     fix_partitioned(pangenome.file)
 
     h5f = tables.open_file(filename, "r")
-    if annotation:
+
+    if annotation:  # I place annotation here, to link gene to gene families if organism are not loaded
         if h5f.root.status._v_attrs.genomesAnnotated:
             logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...")
             read_annotation(pangenome, h5f, disable_bar=disable_bar)
         else:
             raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled")
+
     if gene_sequences:
         if h5f.root.status._v_attrs.geneSequences:
             logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...")
@@ -652,6 +671,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
         else:
             raise Exception(
                 f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled")
+
     if graph:
         if h5f.root.status._v_attrs.NeighborsGraph:
             logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...")
@@ -659,6 +679,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
         else:
             raise Exception(f"The pangenome in file '{filename}' does not have graph information, "
                             f"or has been improperly filled")
+
     if rgp:
         if h5f.root.status._v_attrs.predictedRGP:
             logging.getLogger("PPanGGOLiN").info("Reading the RGP...")
@@ -666,6 +687,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
         else:
             raise Exception(f"The pangenome in file '{filename}' does not have RGP information, "
                             f"or has been improperly filled")
+
     if spots:
         if h5f.root.status._v_attrs.spots:
             logging.getLogger("PPanGGOLiN").info("Reading the spots...")
@@ -673,6 +695,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
         else:
             raise Exception(f"The pangenome in file '{filename}' does not have spots information, "
                             f"or has been improperly filled")
+
     if modules:
         if h5f.root.status._v_attrs.modules:
             logging.getLogger("PPanGGOLiN").info("Reading the modules...")
@@ -680,6 +703,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
         else:
             raise Exception(f"The pangenome in file '{filename}' does not have modules information, "
                             f"or has been improperly filled")
+
     if metadata:
         assert metatype is not None
         if sources is None:
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index 907034de..fc4e2e70 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -11,18 +11,22 @@
 
 # local libraries
 from ppanggolin.pangenome import Pangenome
-from ppanggolin.genome import Feature, Gene
+from ppanggolin.genome import Gene, RNA
 from ppanggolin.formats.readBinaries import Genedata
 
 
-def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]:
+genedata_counter = 0
+
+
+def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int]:
     """
     Get the maximum size of each annotation information to optimize disk space
 
     :param pangenome: Annotated pangenome
-    :return: maximum size of each annotation
+
+    :return: Maximum size of each annotation
     """
-    max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1
+    max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = 1, 1, 1, 1, 1
     for org in pangenome.organisms:
         if len(org.name) > max_org_len:
             max_org_len = len(org.name)
@@ -36,19 +40,17 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i
                     max_gene_local_id = len(gene.local_identifier)
             for rna in contig.RNAs:
                 if len(rna.ID) > max_rna_id_len:
-                    max_rna_id_len = len(gene.ID)
-                if len(rna.local_identifier) > max_rna_local_id:
-                    max_rna_local_id = len(gene.local_identifier)
+                    max_rna_id_len = len(rna.ID)
 
-    return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id
+    return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id
 
 
 def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]:
     """
-    Create a table to save organism-related information
+    Table description to save organism-related information
 
-    :param org_len: Maximum size of organism
-    :param contig_len: Maximum size of contigs
+    :param org_len: Maximum size of organism name.
+    :param contig_len: Maximum size of contigs name
 
     :return: Formatted table
     """
@@ -58,7 +60,15 @@ def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]:
 
 def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
                     organism_desc: Dict[str, tables.StringCol], disable_bar=False):
-    organism_table = h5f.create_table(annotation, "genome", organism_desc,
+    """Write organisms information in the pangenome file
+
+    :param pangenome: Annotated pangenome object
+    :param h5f: Pangenome file
+    :param annotation: Annotation table group
+    :param organism_desc: Organisms table description.
+    :param disable_bar: Allow disabling progress bar
+    """
+    organism_table = h5f.create_table(annotation, "genomes", organism_desc,
                                       expectedrows=pangenome.number_of_organisms)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes")
     organism_row = organism_table.row
@@ -72,11 +82,11 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G
 
 def contig_desc(contig_len: int, max_gene_id_len: int,
                 max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]:
-    """
-    Create a table to save organism-related information
+    """Table description to save contig-related information
 
-    :param org_len: Maximum size of organism
-    :param contig_len: Maximum size of contigs
+    :param contig_len: Maximum size of contig name
+    :param max_gene_id_len: Maximum size of gene name
+    :param max_rna_id_len: Maximum size of rna name
 
     :return: Formatted table
     """
@@ -89,29 +99,44 @@ def contig_desc(contig_len: int, max_gene_id_len: int,
 def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
                   contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]],
                   disable_bar=False):
-    contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs)
+    """Write contigs information in the pangenome file
+    :param pangenome: Annotated pangenome object
+    :param h5f: Pangenome file
+    :param annotation: Annotation table group
+    :param contig_desc: Contigs table description
+    :param disable_bar: Allow disabling progress bar
+    """
+    contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
     contig_row = contig_table.row
     for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
-        rna_list = list(contig.RNAs)
-        for index, gene in enumerate(contig.genes):
-            contig_row["name"] = contig.name
-            contig_row["is_circular"] = contig.is_circular
-            contig_row["gene"] = gene.ID
-            if index < len(rna_list):
-                contig_row["rna"] = rna_list[index].ID
-            contig_row.append()
+        if len(contig) >= contig.number_of_rnas:
+            rna_list = list(contig.RNAs)
+            for index, gene in enumerate(contig.genes):
+                contig_row["name"] = contig.name
+                contig_row["is_circular"] = contig.is_circular
+                contig_row["gene"] = gene.ID
+                if index < len(rna_list):
+                    if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005':
+                        print("pika")
+                    contig_row["rna"] = rna_list[index].ID
+                contig_row.append()
+        else:
+            gene_list = list(contig.genes)
+            for index, rna in enumerate(contig.RNAs):
+                contig_row["name"] = contig.name
+                contig_row["is_circular"] = contig.is_circular
+                contig_row["rna"] = rna.ID
+                if index < len(gene_list):
+                    contig_row["gene"] = gene_list[index].ID
+                contig_row.append()
     contig_table.flush()
 
 
 def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
-    """
-    Create a table to save gene-related information
-
-    :param org_len: Maximum size of organism
-    :param contig_len: Maximum size of contigs
-    :param id_len: Maximum size of gene ID
+    """Table description to save gene-related information
 
+    :param id_len: Maximum size of gene name
     :param max_local_id: Maximum size of gene local identifier
 
     :return: Formatted table
@@ -125,16 +150,26 @@ def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.
 def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
                 gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]],
                 disable_bar=False) -> Dict[Genedata, int]:
+    """Write genes information in the pangenome file
+
+    :param pangenome: Annotated pangenome object
+    :param h5f: Pangenome file
+    :param annotation: Annotation table group
+    :param gene_desc: Genes table description
+    :param disable_bar: Allow to disable progress bar
+
+    :returns: Dictionnary linking genedata to gene identifier
+    """
+    global genedata_counter
+    print(genedata_counter)
     genedata2gene = {}
-    genedata_counter = 0
     gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
     gene_row = gene_table.row
-    for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar):
+    for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes, unit="gene", disable=disable_bar):
         gene_row["ID"] = gene.ID
         gene_row["is_fragment"] = gene.is_fragment
-        if gene.type == "CDS":
-            gene_row["local"] = gene.local_identifier
+        gene_row["local"] = gene.local_identifier
         genedata = get_genedata(gene)
         genedata_id = genedata2gene.get(genedata)
         if genedata_id is None:
@@ -144,14 +179,60 @@ def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Grou
         gene_row["genedata_id"] = genedata_id
         gene_row.append()
     gene_table.flush()
+    print(genedata_counter)
     return genedata2gene
 
 
+def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]:
+    """Table description to save rna-related information
+
+    :param id_len: Maximum size of RNA identifier
+
+    :return: Formatted table
+    """
+    return {'ID': tables.StringCol(itemsize=id_len),
+            'genedata_id': tables.UInt32Col()}
+
+
+def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
+               rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]],
+               disable_bar=False) -> Dict[Genedata, int]:
+    """Write RNAs information in the pangenome file
+
+    :param pangenome: Annotated pangenome object
+    :param h5f: Pangenome file
+    :param annotation: Annotation table group
+    :param rna_desc: RNAs table description
+    :param disable_bar: Allow to disable progress bar
+
+    :returns: Dictionnary linking genedata to RNA identifier
+    """
+    global genedata_counter
+    print(genedata_counter)
+    genedata2rna = {}
+    rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes)
+    logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
+    rna_row = rna_table.row
+    for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar):
+        rna_row["ID"] = rna.ID
+        genedata = get_genedata(rna)
+        genedata_id = genedata2rna.get(genedata)
+        if genedata_id is None:
+            genedata_id = genedata_counter
+            genedata2rna[genedata] = genedata_id
+            genedata_counter += 1
+        rna_row["genedata_id"] = genedata_id
+        rna_row.append()
+    rna_table.flush()
+    print(genedata_counter)
+    return genedata2rna
+
+
 def genedata_desc(type_len, name_len, product_len):
     """
     Creates a table for gene-related data
 
-    :param type_len: Maximum size of gene Type
+    :param type_len: Maximum size of gene Type.
     :param name_len: Maximum size of gene name
     :param product_len: Maximum size of gene product
     :return: Formatted table for gene metadata
@@ -199,31 +280,42 @@ def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]:
     return max_type_len, max_name_len, max_product_len
 
 
-def get_genedata(gene: Feature) -> Genedata:
+def get_genedata(feature: Union[Gene, RNA]) -> Genedata:
     """
     Gets the genedata type of Feature
 
-    :param gene: a Feature
+    :param feature: Gene or RNA object
+
     :return: Tuple with a Feature associated data
     """
     position = None
     genetic_code = 11
-    if gene.type == "CDS":
-        gene: Gene
-        position = gene.position
-        genetic_code = gene.genetic_code
-    return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name,
-                    gene.product, genetic_code)
+    if isinstance(feature, Gene):
+        position = feature.position
+        genetic_code = feature.genetic_code
+    return Genedata(feature.start, feature.stop, feature.strand, feature.type, position, feature.name,
+                    feature.product, genetic_code)
 
 
 def write_genedata(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
                    genedata2gene: Dict[Genedata, int], disable_bar=False):
-    genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)),
-                                      expectedrows=len(genedata2gene))
+    """Writting genedata information in pangenome file
+
+    :param pangenome: Pangenome object filled with annotation.
+    :param h5f: Pangenome file
+    :param annotation: Annotation group in Table
+    :param genedata2gene: Dictionnary linking genedata to gene identifier.
+    :param disable_bar: Allow disabling progress bar
+    """
+    try:
+        genedata_table = annotation.genedata
+    except tables.exceptions.NoSuchNodeError:
+        genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)),
+                                          expectedrows=len(genedata2gene))
     logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data "
                                           "(can be lower than the number of genes)")
     genedata_row = genedata_table.row
-    for genedata, genedata_id in genedata2gene.items():
+    for genedata, genedata_id in tqdm(genedata2gene.items(), unit="genedata", disable=disable_bar):
         genedata_row["genedata_id"] = genedata_id
         genedata_row["start"] = genedata.start
         genedata_row["stop"] = genedata.stop
@@ -239,29 +331,37 @@ def write_genedata(pangenome: Pangenome,  h5f: tables.File, annotation: tables.G
 
 
 def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True,
-                      rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False):
-    """
-    Function writing all the pangenome annotations
+                      rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False):
+    """Function writing all the pangenome annotations
 
     :param pangenome: Annotated pangenome
     :param h5f: Pangenome HDF5 file
+    :param rec_organisms: Allow writing organisms in pangenomes
+    :param rec_contigs: Allow writing contigs in pangenomes
+    :param rec_genes: Allow writing genes in pangenomes
+    :param rec_rnas: Allow writing RNAs in pangenomes
     :param disable_bar: Alow to disable progress bar
     """
     annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms")
 
-    max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations(
-        pangenome)
+    org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = get_max_len_annotations(pangenome)
+
+    # I add these boolean in case we would one day only load organism, contig or genes, without the other.
 
     if rec_organisms:
-        desc = organism_desc(max_org_len, max_contig_len)
+        desc = organism_desc(org_len, contig_len)
         write_organisms(pangenome, h5f, annotation, desc, disable_bar)
     if rec_contigs:
-        desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len)
+        desc = contig_desc(contig_len, gene_id_len, rna_id_len)
         write_contigs(pangenome, h5f, annotation, desc, disable_bar)
     if rec_genes:
-        desc = gene_desc(max_gene_id_len, max_gene_local_id)
+        desc = gene_desc(gene_id_len, gene_local_id)
         genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar)
         write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar)
+    if rec_rnas:
+        desc = rna_desc(rna_id_len)
+        genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar)
+        write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar)
 
 
 def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]:
diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py
index 1a3fcc56..dd0123ca 100644
--- a/ppanggolin/formats/writeBinaries.py
+++ b/ppanggolin/formats/writeBinaries.py
@@ -606,9 +606,9 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b
 
     table = h5f.root.annotations.genes
     for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar):
-        genedata_id = row['gene/genedata_id']
+        genedata_id = row['genedata_id']
         if genedataid2genedata[genedata_id].gene_type == 'CDS':
-            row['gene/is_fragment'] = pangenome.get_gene(row['gene/ID'].decode()).is_fragment
+            row['is_fragment'] = pangenome.get_gene(row['ID'].decode()).is_fragment
             row.update()
     table.flush()
 
diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
index 84462d77..48b846b0 100644
--- a/ppanggolin/genome.py
+++ b/ppanggolin/genome.py
@@ -488,6 +488,12 @@ def RNAs(self) -> Generator[RNA, None, None]:
         """
         yield from self._rna_getter
 
+    @property
+    def number_of_rnas(self) -> int:
+        """Get the number of RNA in the contig
+        """
+        return len(self._rna_getter)
+
 
 class Organism(MetaFeatures):
     """
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index c63bc3ec..28ab459b 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -146,6 +146,26 @@ def number_of_genes(self) -> int:
             self._mk_gene_getter()  # make it
             return len(self._geneGetter)
 
+    """RNAs methods"""
+
+    @property
+    def RNAs(self) -> Generator[Gene, None, None]:
+        """Generator of genes in the pangenome.
+
+        :return: gene generator
+        """
+        for org in self.organisms:
+            for contig in org.contigs:
+                yield from contig.RNAs
+
+    @property
+    def number_of_rnas(self) -> int:
+        """Returns the number of gene present in the pangenome
+
+        :return: The number of genes
+        """
+        return sum(ctg.number_of_rnas for ctg in self.contigs)
+
     """Gene families methods"""
     @property
     def max_fam_id(self):
diff --git a/tests/test_genome.py b/tests/test_genome.py
index c58b296c..b19eda01 100644
--- a/tests/test_genome.py
+++ b/tests/test_genome.py
@@ -523,7 +523,8 @@ def test_number_of_contigs(self, organism):
 		"""
 		organism.add(Contig('contig1'))
 		organism.add(Contig('contig2'))
-		assert organism.number_of_contigs() == 2
+		assert isinstance(len(organism), int)
+		assert len(organism) == 2
 
 	def test_get_families(self, organism, contig, gene):
 		"""Tests that gene families in an organism can be retrieved

From 6f7229ed060ce933044d19a61bd114d41bec9137 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Wed, 6 Sep 2023 16:20:42 +0200
Subject: [PATCH 04/15] Fix compatibility problem with merge

---
 VERSION                                | 2 +-
 ppanggolin/formats/writeAnnotations.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index bfdb369d..27a69d0e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.176
+1.2.177
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index fc4e2e70..e1dd1640 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -110,7 +110,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
     contig_row = contig_table.row
     for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
-        if len(contig) >= contig.number_of_rnas:
+        if contig.number_of_genes >= contig.number_of_rnas:
             rna_list = list(contig.RNAs)
             for index, gene in enumerate(contig.genes):
                 contig_row["name"] = contig.name

From 9499d2abde7ba3cec1efeb0ba9498052396afb8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 7 Sep 2023 14:28:26 +0200
Subject: [PATCH 05/15] Get length of contig for GFF and GBFF files

---
 VERSION                                |   2 +-
 ppanggolin/annotate/annotate.py        | 123 ++++++++++---------------
 ppanggolin/formats/writeAnnotations.py |   8 +-
 ppanggolin/genome.py                   |  20 +++-
 ppanggolin/pangenome.py                |   1 -
 testingDataset/organisms.gbff.list     |   3 +-
 6 files changed, 71 insertions(+), 86 deletions(-)

diff --git a/VERSION b/VERSION
index 27a69d0e..8c12c1db 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.177
+1.2.178
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
index ad0dd348..e4e9053d 100644
--- a/ppanggolin/annotate/annotate.py
+++ b/ppanggolin/annotate/annotate.py
@@ -32,13 +32,13 @@ def check_annotate_args(args):
         raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, "
                         "or a file with the --gff option to load annotations from.")
 
-    
     if hasattr(args, "fasta") and args.fasta is not None:
         check_input_files(args.fasta, True)
 
     if hasattr(args, "anno") and args.anno is not None:
         check_input_files(args.anno, True)
 
+
 def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set,
                 start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "",
                 product: str = "", genetic_code: int = 11, protein_id: str = ""):
@@ -91,20 +91,19 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
     new_gene.fill_parents(org, contig)
 
 
-def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> (
+def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> (
         Organism, bool):
     """
     Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file
 
-    :param organism: Organism name
-    :param gbff_file_path: Path to corresponding GFF file
+    :param organism_name: Organism name
+    :param gbff_file_path: Path to corresponding GBFF file
     :param circular_contigs: list of contigs
     :param pseudo: Allow to read pseudogène
 
     :return: Organism complete and true for sequence in file
     """
-    org = Organism(organism)
-
+    organism = Organism(organism_name)
     logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}")
     # revert the order of the file, to read the first line first.
     lines = read_compressed_or_not(gbff_file_path).readlines()[::-1]
@@ -114,35 +113,29 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p
         line = lines.pop()
         # beginning of contig
         contig = None
-        set_contig = False
         is_circ = False
-        contig_locus_id = None
+        contig_id = None
         if line.startswith('LOCUS'):
             if "CIRCULAR" in line.upper():
                 # this line contains linear/circular word telling if the dna sequence is circularized or not
                 is_circ = True
-            contig_locus_id = line.split()[1]
+            # TODO maybe it could be a good thing to add a elif for linear
+            #  and if circular or linear are not found raise a warning
+
+            contig_id = line.split()[1]
             # If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS
             while not line.startswith('FEATURES'):
-                if line.startswith('VERSION'):
+                if line.startswith('VERSION') and line[12:].strip() != "":
                     contig_id = line[12:].strip()
-                    if contig_id != "":
-                        try:
-                            contig = org.get(contig_id)
-                        except KeyError:
-                            contig = Contig(contig_id, True if contig_id in circular_contigs else False)
-                            org.add(contig)
-                        set_contig = True
                 line = lines.pop()
-        if not set_contig:
-            # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID.
-            # Should be unique in a dataset, but if there's an update the contig ID
-            # might still be the same even though it should not(?)
-            try:
-                contig = org.get(contig_locus_id)
-            except KeyError:
-                contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False)
-                org.add(contig)
+            # If no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID.
+            # Should be unique in a dataset, but if there's an update
+            # the contig ID might still be the same even though it should not(?)
+        try:
+            contig = organism.get(contig_id)
+        except KeyError:
+            contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False)
+            organism.add(contig)
         # start of the feature object.
         dbxref = set()
         gene_name = ""
@@ -160,8 +153,8 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p
             curr_type = line[5:21].strip()
             if curr_type != "":
                 if useful_info:
-                    create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type,
-                                contig.number_of_genes, gene_name, product, genetic_code, protein_id)
+                    create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand,
+                                obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id)
                     if obj_type == "CDS":
                         gene_counter += 1
                     else:
@@ -192,6 +185,9 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p
                         pass
                         # don't know what to do with that, ignoring for now.
                         # there is a protein with a frameshift mecanism.
+                elif curr_type == 'source':  # Get Contig length
+                    start, end = map(int, map(str.strip, line[21:].split('..')))
+                    contig.length = end - start + 1
             elif useful_info:  # current info goes to current objtype, if it's useful.
                 if line[21:].startswith("/db_xref"):
                     dbxref.add(line.split("=")[1].replace('"', '').strip())
@@ -220,7 +216,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p
             line = lines.pop()
             # end of contig
         if useful_info:  # saving the last element...
-            create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type,
+            create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type,
                         contig.number_of_genes, gene_name, product, genetic_code, protein_id)
             if obj_type == "CDS":
                 gene_counter += 1
@@ -237,7 +233,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p
         # get each gene's sequence.
         for gene in contig.genes:
             gene.add_sequence(get_dna_sequence(sequence, gene))
-    return org, True
+    return organism, True
 
 
 def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool):
@@ -245,22 +241,22 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: b
     Read annotation from GFF file
 
     :param organism: Organism name
-    :param gff_file_path: Path to corresponding GFF file
-    :param circular_contigs: list of contigs
+    :param gff_file_path: Path corresponding to GFF file
+    :param circular_contigs: List of circular contigs
     :param pseudo: Allow to read pseudogène
 
-    :return: Organism object and if there are sequences associate or not
+    :return: Organism object and if there are sequences associated or not
     """
     (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9)
 
-    # missing values : source, score, frame. They are unused.
+    # Missing values: source, score, frame. They are unused.
 
     def get_gff_attributes(gff_fields: list) -> dict:
-        """
-        Parses the gff attribute's line and outputs the attributes_get in a dict structure.
-        :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff.
+        """Parses the gff attribute's line and outputs the attributes_get in a dict structure.
 
-        :return: attributes get
+        :param gff_fields: A gff line stored as a list. Each element of the list is a column of the gff.
+
+        :return: Attributes get
         """
         attributes_field = [f for f in gff_fields[gff_attribute].strip().split(';') if len(f) > 0]
         attributes_get = {}
@@ -276,7 +272,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
         """
         Gets the ID of the element from which the provided attributes_get were extracted.
         Raises an error if no ID is found.
-        :param attributes_dict: attributes from one gff line
+
+        :param attributes_dict: Attributes from one gff line
 
         :return: CDS identifier
         """
@@ -302,11 +299,9 @@ def get_id_attribute(attributes_dict: dict) -> str:
                     has_fasta = True
                 elif line.startswith('sequence-region', 2, 17):
                     fields = [el.strip() for el in line.split()]
-                    try:
-                        contig = org.get(fields[1])
-                    except KeyError:
-                        contig = Contig(fields[1], True if fields[1] in circular_contigs else False)
-                        org.add(contig)
+                    contig = Contig(fields[1], True if fields[1] in circular_contigs else False)
+                    org.add(contig)
+                    contig.length = int(fields[-1]) - int(fields[3]) + 1
 
                 continue
             elif line.startswith('#'):  # comment lines to be ignores by parsers
@@ -326,32 +321,14 @@ def get_id_attribute(attributes_dict: dict) -> str:
                     # if it's not found, we get the one under the 'ID' field which must exist
                     # (otherwise not a gff3 compliant file)
                     gene_id = get_id_attribute(attributes)
-                try:
-                    name = attributes.pop('NAME')
-                except KeyError:
-                    try:
-                        name = attributes.pop('GENE')
-                    except KeyError:
-                        name = ""
+                name = attributes.pop('NAME', attributes.pop('GENE', ""))
                 if "pseudo" in attributes or "pseudogene" in attributes:
                     pseudogene = True
-                try:
-                    product = attributes.pop('PRODUCT')
-                except KeyError:
-                    product = ""
-
-                try:
-                    genetic_code = int(attributes.pop("TRANSL_TABLE"))
-                except KeyError:
-                    genetic_code = 11
+                product = attributes.pop('PRODUCT', "")
+                genetic_code = int(attributes.pop("TRANSL_TABLE", 11))
                 if contig is None or contig.name != fields_gff[gff_seqname]:
                     # get the current contig
-                    try:
-                        contig = org.get(fields_gff[gff_seqname])
-                    except KeyError:
-                        contig = Contig(fields_gff[gff_seqname],
-                                        True if fields_gff[gff_seqname] in circular_contigs else False)
-                        org.add(contig)
+                    contig = org.get(fields_gff[gff_seqname])
 
                 if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)):
                     gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4))
@@ -391,8 +368,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool):
     return read_anno_file(*args)
 
 
-def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> (
-        Organism, bool):
+def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
+                   pseudo: bool = False) -> (Organism, bool):
     """
     Read a GBFF file for one organism
 
@@ -543,8 +520,8 @@ def launch_annotate_organism(pack: tuple) -> Organism:
 
 
 def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11,
-                       kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None,
-                       disable_bar: bool = False):
+                       kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False,
+                       procedure: str = None, disable_bar: bool = False):
     """
     Main function to annotate a pangenome
 
@@ -570,14 +547,13 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu:
 
         if not org_path.exists():  # Check tsv sanity test if it's not one it's the other
             org_path = fasta_list.parent.joinpath(org_path)
-            
+
         arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table,
                           norna, kingdom, allow_overlap, procedure))
 
     if len(arguments) == 0:
         raise Exception("There are no genomes in the provided file")
 
-
     logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...")
     with get_context('fork').Pool(processes=cpu) as p:
         for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome",
@@ -622,7 +598,6 @@ def launch(args: argparse.Namespace):
                 logging.getLogger("PPanGGOLiN").warning(
                     "You will be able to proceed with your analysis ONLY if you provide "
                     "the clustering results in the next step.")
-
     write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
 
 
@@ -684,8 +659,6 @@ def parser_annot(parser: argparse.ArgumentParser):
 
 if __name__ == '__main__':
     """To test local change and allow using debugger"""
-    import tempfile
-
     from ppanggolin.utils import set_verbosity_level, add_common_arguments
 
     main_parser = argparse.ArgumentParser(
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index e1dd1640..d74a97a3 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -109,7 +109,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
     contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
     contig_row = contig_table.row
-    for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
+    for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar):
         if contig.number_of_genes >= contig.number_of_rnas:
             rna_list = list(contig.RNAs)
             for index, gene in enumerate(contig.genes):
@@ -117,8 +117,6 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
                 contig_row["is_circular"] = contig.is_circular
                 contig_row["gene"] = gene.ID
                 if index < len(rna_list):
-                    if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005':
-                        print("pika")
                     contig_row["rna"] = rna_list[index].ID
                 contig_row.append()
         else:
@@ -161,7 +159,6 @@ def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Grou
     :returns: Dictionnary linking genedata to gene identifier
     """
     global genedata_counter
-    print(genedata_counter)
     genedata2gene = {}
     gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
@@ -179,7 +176,6 @@ def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Grou
         gene_row["genedata_id"] = genedata_id
         gene_row.append()
     gene_table.flush()
-    print(genedata_counter)
     return genedata2gene
 
 
@@ -208,7 +204,6 @@ def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group
     :returns: Dictionnary linking genedata to RNA identifier
     """
     global genedata_counter
-    print(genedata_counter)
     genedata2rna = {}
     rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes)
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes")
@@ -224,7 +219,6 @@ def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group
         rna_row["genedata_id"] = genedata_id
         rna_row.append()
     rna_table.flush()
-    print(genedata_counter)
     return genedata2rna
 
 
diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
index 499abd37..205b08dc 100644
--- a/ppanggolin/genome.py
+++ b/ppanggolin/genome.py
@@ -311,6 +311,7 @@ def __init__(self, name: str, is_circular: bool = False):
         self._genes_getter = {}
         self._genes_position = []
         self._organism = None
+        self._length = None
 
     def __str__(self) -> str:
         return self.name
@@ -342,6 +343,23 @@ def __setitem__(self, start: int, gene: Gene):
 
     # TODO define eq function
 
+    @property
+    def length(self):
+        if self._length is None:
+            logging.getLogger("PPanGGOLiN").warning("Contig length is unknown")
+        return self._length
+
+    @length.setter
+    def length(self, contig_len: int):
+        if not isinstance(contig_len, int):
+            raise TypeError("Contig length is expected to be an integer")
+        if contig_len < 0:
+            raise ValueError("Contig length must be positive")
+        self._length = contig_len
+
+    def __len__(self):
+        return self.length
+
     # retrieve gene by start position
     def __getitem__(self, position: int) -> Gene:
         """Get the gene for the given position
@@ -593,7 +611,7 @@ def __len__(self):
 
         :return: Number of contigs in organism
         """
-        return len(self._contigs_getter)
+        return len(self._contigs_getter.keys())
 
     @property
     def families(self):
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index 28ab459b..ef37bb4b 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -147,7 +147,6 @@ def number_of_genes(self) -> int:
             return len(self._geneGetter)
 
     """RNAs methods"""
-
     @property
     def RNAs(self) -> Generator[Gene, None, None]:
         """Generator of genes in the pangenome.
diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list
index 1537925a..093d4beb 100644
--- a/testingDataset/organisms.gbff.list
+++ b/testingDataset/organisms.gbff.list
@@ -21,6 +21,7 @@ GCF_000472205.1	GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz
 GCF_000590575.1	GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz
 GCF_000590635.1	GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz
 GCF_000590695.1	GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz
+GCF_001183765.1	GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz
 GCF_001183805.1	GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz
 GCF_001183825.1	GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz
 GCF_001183845.1	GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz
@@ -50,4 +51,4 @@ GCF_006508265.1	GBFF/GCF_006508265.1_ASM650826v1_genomic.gbff.gz
 GCF_000068585.1	GBFF/NC_010287.1.gff.gz
 GCF_000008725.1	GBFF/NC_000117.1.gbk.gz
 GCF_001183765.1_gff	GBFF/PROKKA_12132021.gff.gz
-GCF_001183765.1_gbk	GBFF/PROKKA_12132021.gbk.gz
+GCF_001183765.1_gbk	GBFF/PROKKA_12132021.gbk.gz
\ No newline at end of file

From 1452b239587b719fa0716eebb0e7d97f7e10e0db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 7 Sep 2023 15:16:40 +0200
Subject: [PATCH 06/15] Get contig length for fasta files

---
 VERSION                         |  2 +-
 ppanggolin/annotate/annotate.py | 54 ++++++++++++++++++---------------
 ppanggolin/annotate/synta.py    | 16 ++++++++--
 3 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/VERSION b/VERSION
index 8c12c1db..e9a3656e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.178
+1.2.179
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
index e4e9053d..baad0c20 100644
--- a/ppanggolin/annotate/annotate.py
+++ b/ppanggolin/annotate/annotate.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 import tempfile
 import time
+from typing import List, Set, Tuple
 
 # installed libraries
 from tqdm import tqdm
@@ -21,7 +22,7 @@
 from ppanggolin.formats import write_pangenome
 
 
-def check_annotate_args(args):
+def check_annotate_args(args: argparse.Namespace):
     """Check That the given arguments are usable
 
     :param args: All arguments provide by user
@@ -39,7 +40,7 @@ def check_annotate_args(args):
         check_input_files(args.anno, True)
 
 
-def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set,
+def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: Set[str],
                 start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "",
                 product: str = "", genetic_code: int = 11, protein_id: str = ""):
     """
@@ -82,7 +83,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
         new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name,
                                   position=position, product=product, local_identifier=gene_id,
                                   genetic_code=genetic_code)
-        contig[new_gene.start] = new_gene
+        contig.add(new_gene)
     else:  # if not CDS, it is RNA
         new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4))
         new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name,
@@ -91,8 +92,8 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
     new_gene.fill_parents(org, contig)
 
 
-def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> (
-        Organism, bool):
+def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: List[str],
+                  pseudo: bool = False) -> Tuple[Organism, bool]:
     """
     Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file
 
@@ -236,7 +237,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: li
     return organism, True
 
 
-def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool):
+def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str],
+                 pseudo: bool = False) -> Tuple[Organism, bool]:
     """
     Read annotation from GFF file
 
@@ -358,7 +360,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
     return org, has_fasta
 
 
-def launch_read_anno(args: tuple) -> (Organism, bool):
+def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]:
     """ Allow to launch in multiprocessing the read of genome annotation
 
     :param args: Pack of argument for annotate_organism function
@@ -368,8 +370,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool):
     return read_anno_file(*args)
 
 
-def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
-                   pseudo: bool = False) -> (Organism, bool):
+def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str],
+                   pseudo: bool = False) -> Tuple[Organism, bool]:
     """
     Read a GBFF file for one organism
 
@@ -396,7 +398,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
                         "You may be able to use --fasta instead.")
 
 
-def chose_gene_identifiers(pangenome) -> bool:
+def chose_gene_identifiers(pangenome: Pangenome) -> bool:
     """
     Parses the pangenome genes to decide whether to use local_identifiers or ppanggolin generated gene identifiers.
     If the local identifiers are unique within the pangenome they are picked, otherwise ppanggolin ones are used.
@@ -467,15 +469,15 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
     pangenome.parameters["annotation"]["read_annotations_from_file"] = True
 
 
-def get_gene_sequences_from_fastas(pangenome, fasta_file):
+def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]):
     """
     Get gene sequences from fastas
 
-    :param pangenome: input pangenome
-    :param fasta_file: list of fasta file
+    :param pangenome: Input pangenome
+    :param fasta_files: list of fasta file
     """
     fasta_dict = {}
-    for line in read_compressed_or_not(fasta_file):
+    for line in read_compressed_or_not(fasta_files):
         elements = [el.strip() for el in line.split("\t")]
         if len(elements) <= 1:
             logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file")
@@ -483,15 +485,15 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file):
         try:
             org = pangenome.get_organism(elements[0])
         except KeyError:
-            raise KeyError(f"One of the genome in your '{fasta_file}' was not found in the pan."
+            raise KeyError(f"One of the genome in your '{fasta_files}' was not found in the pan."
                            f" This might mean that the genome names between your annotation file and "
                            f"your fasta file are different.")
         with read_compressed_or_not(elements[1]) as currFastaFile:
             fasta_dict[org], _ = read_fasta(org, currFastaFile)
     if set(pangenome.organisms) > set(fasta_dict.keys()):
-        missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys()))
+        missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys()))
         raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. "
-                        f"{missing} are missing (out of {pangenome.number_of_organisms()}).")
+                        f"{missing} are missing (out of {pangenome.number_of_organisms}).")
 
     for org in pangenome.organisms:
         for contig in org.contigs:
@@ -509,7 +511,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file):
     pangenome.status["geneSequences"] = "Computed"
 
 
-def launch_annotate_organism(pack: tuple) -> Organism:
+def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism:
     """ Allow to launch in multiprocessing the genome annotation
 
     :param pack: Pack of argument for annotate_organism function
@@ -592,12 +594,16 @@ def launch(args: argparse.Namespace):
             if args.fasta:
                 get_gene_sequences_from_fastas(pangenome, args.fasta)
             else:
-                logging.getLogger("PPanGGOLiN").warning(
-                    "You provided gff files without sequences, and you did not provide "
-                    "fasta sequences. Thus it was not possible to get the gene sequences.")
-                logging.getLogger("PPanGGOLiN").warning(
-                    "You will be able to proceed with your analysis ONLY if you provide "
-                    "the clustering results in the next step.")
+                logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, "
+                                                        "and you did not provide fasta sequences. "
+                                                        "Thus it was not possible to get the gene sequences.")
+                logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis "
+                                                        "ONLY if you provide the clustering results in the next step.")
+        else:
+            if args.fasta:
+                logging.getLogger("PPanGGOLiN").warning("You provided fasta sequences "
+                                                        "but your gff files were already with sequences."
+                                                        "PPanGGOLiN will use sequences in GFF and not from your fasta.")
     write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
 
 
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
index afad1cca..6220db85 100644
--- a/ppanggolin/annotate/synta.py
+++ b/ppanggolin/annotate/synta.py
@@ -6,6 +6,7 @@
 import os
 import tempfile
 from io import TextIOWrapper
+from multiprocessing import Manager
 from subprocess import Popen, PIPE
 import ast
 from collections import defaultdict
@@ -17,6 +18,10 @@
 from ppanggolin.utils import is_compressed, read_compressed_or_not
 
 
+manager = Manager()
+contig_len = manager.dict()
+
+
 def reverse_complement(seq: str):
     """reverse complement the given dna sequence
 
@@ -79,6 +84,8 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
 
     :return: Annotated genes in a list of gene objects
     """
+    global contig_len
+
     locustag = org.name
     cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"]))
     logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}")
@@ -89,9 +96,13 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
     header = ""
     for line in p.communicate()[0].decode().split("\n"):
         if line.startswith("# Sequence Data: "):
+            length = None
             for data in line.split(";"):
-                if data.startswith("seqhdr"):
+                if data.startswith("seqlen"):
+                    length = int(data.split("=")[1])
+                elif data.startswith("seqhdr"):
                     header = data.split("=")[1].replace('"', "").split()[0]
+            contig_len[header] = length
 
         elif line.startswith(">"):
             c += 1
@@ -330,11 +341,12 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
         except KeyError:
             contig = Contig(contig_name, True if contig_name in circular_contigs else False)
             org.add(contig)
+        contig.length = contig_len[contig.name]
         for gene in genes:
             gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
             gene.fill_parents(org, contig)
             if isinstance(gene, Gene):
-                contig[gene.start] = gene
+                contig.add(gene)
             elif isinstance(gene, RNA):
                 contig.add_rna(gene)
     return org

From 3e4f6a0f6a4c89d5594cd02898a6529dd17cd11c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 8 Sep 2023 09:53:16 +0200
Subject: [PATCH 07/15] Write and read contig length

---
 VERSION                                | 2 +-
 ppanggolin/formats/readBinaries.py     | 1 +
 ppanggolin/formats/writeAnnotations.py | 6 ++++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/VERSION b/VERSION
index e9a3656e..ea25a270 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.179
+1.2.180
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 215ad7a3..1b323599 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -407,6 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size:
             organism = pangenome.get_organism(contig2organism[contig_name])
             contig = organism.get(contig_name)
             contig.is_circular = row["is_circular"]
+            contig.length = row["length"]
         try:
             gene = Gene(row["gene"].decode())
         except ValueError:
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index d74a97a3..a8c0c6f1 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -81,7 +81,7 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G
 
 
 def contig_desc(contig_len: int, max_gene_id_len: int,
-                max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]:
+                max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]:
     """Table description to save contig-related information
 
     :param contig_len: Maximum size of contig name
@@ -92,12 +92,13 @@ def contig_desc(contig_len: int, max_gene_id_len: int,
     """
     return {'name': tables.StringCol(itemsize=contig_len),
             "is_circular": tables.BoolCol(dflt=False),
+            'length': tables.UInt32Col(),
             "gene": tables.StringCol(itemsize=max_gene_id_len),
             "rna": tables.StringCol(itemsize=max_rna_id_len)}
 
 
 def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
-                  contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]],
+                  contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]],
                   disable_bar=False):
     """Write contigs information in the pangenome file
     :param pangenome: Annotated pangenome object
@@ -115,6 +116,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
             for index, gene in enumerate(contig.genes):
                 contig_row["name"] = contig.name
                 contig_row["is_circular"] = contig.is_circular
+                contig_row["length"] = len(contig)
                 contig_row["gene"] = gene.ID
                 if index < len(rna_list):
                     contig_row["rna"] = rna_list[index].ID

From aa4e308754bca3594af647d1c5c79f72cd622467 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 8 Sep 2023 10:54:03 +0200
Subject: [PATCH 08/15] Remove manager dict to save contig length

---
 VERSION                         |  2 +-
 ppanggolin/annotate/annotate.py |  5 ++--
 ppanggolin/annotate/synta.py    | 44 +++++++++++----------------------
 3 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/VERSION b/VERSION
index ea25a270..177b0d1a 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.180
+1.2.181
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
index baad0c20..6c5ca044 100644
--- a/ppanggolin/annotate/annotate.py
+++ b/ppanggolin/annotate/annotate.py
@@ -113,7 +113,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
     while len(lines) != 0:
         line = lines.pop()
         # beginning of contig
-        contig = None
         is_circ = False
         contig_id = None
         if line.startswith('LOCUS'):
@@ -351,7 +350,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
 
     # GET THE FASTA SEQUENCES OF THE GENES
     if has_fasta and fasta_string != "":
-        contig_sequences, _ = read_fasta(org, fasta_string.split('\n'))  # _ is total contig length
+        contig_sequences = read_fasta(org, fasta_string.split('\n'))  # _ is total contig length
         for contig in org.contigs:
             for gene in contig.genes:
                 gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
@@ -489,7 +488,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]
                            f" This might mean that the genome names between your annotation file and "
                            f"your fasta file are different.")
         with read_compressed_or_not(elements[1]) as currFastaFile:
-            fasta_dict[org], _ = read_fasta(org, currFastaFile)
+            fasta_dict[org] = read_fasta(org, currFastaFile)
     if set(pangenome.organisms) > set(fasta_dict.keys()):
         missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys()))
         raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. "
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
index 6220db85..677da87d 100644
--- a/ppanggolin/annotate/synta.py
+++ b/ppanggolin/annotate/synta.py
@@ -6,11 +6,10 @@
 import os
 import tempfile
 from io import TextIOWrapper
-from multiprocessing import Manager
 from subprocess import Popen, PIPE
 import ast
 from collections import defaultdict
-from typing import Union
+from typing import Dict, List, Union
 from pathlib import Path
 
 # local libraries
@@ -18,10 +17,6 @@
 from ppanggolin.utils import is_compressed, read_compressed_or_not
 
 
-manager = Manager()
-contig_len = manager.dict()
-
-
 def reverse_complement(seq: str):
     """reverse complement the given dna sequence
 
@@ -84,7 +79,6 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
 
     :return: Annotated genes in a list of gene objects
     """
-    global contig_len
 
     locustag = org.name
     cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"]))
@@ -96,13 +90,9 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
     header = ""
     for line in p.communicate()[0].decode().split("\n"):
         if line.startswith("# Sequence Data: "):
-            length = None
             for data in line.split(";"):
-                if data.startswith("seqlen"):
-                    length = int(data.split("=")[1])
-                elif data.startswith("seqhdr"):
+                if data.startswith("seqhdr"):
                     header = data.split("=")[1].replace('"', "").split()[0]
-            contig_len[header] = length
 
         elif line.startswith(">"):
             c += 1
@@ -162,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str,  kingdom: str = "
     return gene_objs
 
 
-def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int):
+def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
     """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
 
     :param org: Organism corresponding to fasta file
@@ -173,24 +163,21 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
     try:
         contigs = {}
         contig_seq = ""
-        all_contig_len = 0
         contig = None
         for line in fna_file:
             if line.startswith('>'):
                 if len(contig_seq) >= 1:  # contig filter = 1
                     contigs[contig.name] = contig_seq.upper()
-                    all_contig_len += len(contig_seq)
+                    contig.length = len(contig_seq)
                 contig_seq = ""
-                try:
-                    contig = org.get(line.split()[0][1:])
-                except KeyError:
-                    contig = Contig(line.split()[0][1:])
-                    org.add(contig)
+                contig = Contig(line.split()[0][1:])
+                org.add(contig)
             else:
                 contig_seq += line.strip()
         if len(contig_seq) >= 1:  # processing the last contig
             contigs[contig.name] = contig_seq.upper()
-            all_contig_len += len(contig_seq)
+            contig.length = len(contig_seq)
+
     except AttributeError as e:
         raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
                              f"One possibility for this error is that the file did not start with a '>' "
@@ -198,7 +185,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
     except Exception:  # To manage other exception which can occur
         raise Exception("Unexpected error. Please check your input file and if everything looks fine, "
                         "please post an issue on our github")
-    return contigs, all_contig_len
+    return contigs
 
 
 def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper:
@@ -301,7 +288,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str:
         return reverse_complement(contig_seq[gene.start - 1:gene.stop])
 
 
-def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str,
+def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str,
                       code: int = 11, norna: bool = False, kingdom: str = "bacteria",
                       allow_overlap: bool = False, procedure: str = None) -> Organism:
     """
@@ -323,10 +310,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
 
     fasta_file = read_compressed_or_not(file_name)
 
-    contig_sequences, all_contig_len = read_fasta(org, fasta_file)
+    contig_sequences = read_fasta(org, fasta_file)
     if is_compressed(file_name):  # TODO simply copy file with shutil.copyfileobj
         fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
     if procedure is None:  # prodigal procedure is not force by user
+        all_contig_len = sum(len(contig) for contig in org.contigs)
         logging.getLogger("PPanGGOLiN").debug(all_contig_len)
         if all_contig_len < 20000:  # case of short sequence
             procedure = "meta"
@@ -336,12 +324,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
     genes = overlap_filter(genes, allow_overlap=allow_overlap)
 
     for contig_name, genes in genes.items():
-        try:
-            contig = org.get(contig_name)
-        except KeyError:
-            contig = Contig(contig_name, True if contig_name in circular_contigs else False)
-            org.add(contig)
-        contig.length = contig_len[contig.name]
+        contig = org.get(contig_name)
+        contig.is_circular = True if contig.name in circular_contigs else False
         for gene in genes:
             gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
             gene.fill_parents(org, contig)

From cb8fecaaf55970a2b7ec51b8ebfdad524489ca19 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 8 Sep 2023 10:58:02 +0200
Subject: [PATCH 09/15] Fix bug to read annotation

---
 VERSION                      | 2 +-
 ppanggolin/annotate/synta.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/VERSION b/VERSION
index 177b0d1a..12b3f290 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.181
+1.2.182
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
index 677da87d..c21dbdfd 100644
--- a/ppanggolin/annotate/synta.py
+++ b/ppanggolin/annotate/synta.py
@@ -170,8 +170,11 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
                     contigs[contig.name] = contig_seq.upper()
                     contig.length = len(contig_seq)
                 contig_seq = ""
-                contig = Contig(line.split()[0][1:])
-                org.add(contig)
+                try:
+                    contig = org.get(line.split()[0][1:])
+                except KeyError:
+                    contig = Contig(line.split()[0][1:])
+                    org.add(contig)
             else:
                 contig_seq += line.strip()
         if len(contig_seq) >= 1:  # processing the last contig

From 40c3687db48333d7edb155a10fea621939f9e435 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 8 Sep 2023 11:32:30 +0200
Subject: [PATCH 10/15] Fix bug to read contig len

---
 VERSION                            | 2 +-
 ppanggolin/formats/readBinaries.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index 12b3f290..2870b30c 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.182
+1.2.183
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 1b323599..46cf1e1f 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -407,7 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size:
             organism = pangenome.get_organism(contig2organism[contig_name])
             contig = organism.get(contig_name)
             contig.is_circular = row["is_circular"]
-            contig.length = row["length"]
+            contig.length = int(row["length"])
         try:
             gene = Gene(row["gene"].decode())
         except ValueError:

From 1a04c77ae61f3dd7d9e5aebb21bf84cb4e67773d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 8 Sep 2023 15:32:03 +0200
Subject: [PATCH 11/15] Remove GCF_001183765 organism in GBFF list to prevent
 gene overlap in contig

---
 VERSION                            | 2 +-
 testingDataset/organisms.gbff.list | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index 2870b30c..9df9b3c2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.183
+1.2.184
diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list
index 093d4beb..d7fb3bf5 100644
--- a/testingDataset/organisms.gbff.list
+++ b/testingDataset/organisms.gbff.list
@@ -21,7 +21,6 @@ GCF_000472205.1	GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz
 GCF_000590575.1	GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz
 GCF_000590635.1	GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz
 GCF_000590695.1	GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz
-GCF_001183765.1	GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz
 GCF_001183805.1	GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz
 GCF_001183825.1	GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz
 GCF_001183845.1	GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz

From b4768ae4fbb415d1b767844dd707437d8b0f0858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Thu, 14 Sep 2023 17:08:33 +0200
Subject: [PATCH 12/15] Change format of table more relational DB

---
 VERSION                                |   2 +-
 ppanggolin/formats/readBinaries.py     | 133 ++++++++++++-------------
 ppanggolin/formats/writeAnnotations.py |  67 +++++--------
 ppanggolin/pangenome.py                |  34 ++++++-
 4 files changed, 121 insertions(+), 115 deletions(-)

diff --git a/VERSION b/VERSION
index 9df9b3c2..7844733d 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.184
+1.2.185
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 46cf1e1f..976499c1 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -384,81 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal
 	    pangenome.add_module(module)
     pangenome.status["modules"] = "Loaded"
 
-def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000,
-                   disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]:
-    table = annotations.genomes
+def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
+                   disable_bar: bool = False):
     contig2organism = {}
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar):
-        try:
-            organism = pangenome.get_organism(row["name"].decode())
-        except:
-            organism = Organism(row["name"].decode())
-            pangenome.add_organism(organism)
-        contig = Contig(name=row["contig"].decode())
-        organism.add(contig)
-        contig2organism[contig.name] = organism.name
-    table = annotations.contigs
-    contig_name = None
-    genes_dict = {}
-    rna_dict = {}
+        organism = Organism(row["name"].decode())
+        pangenome.add_organism(organism)
+
+
+def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
+                   disable_bar: bool = False):
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar):
-        if contig_name != row["name"].decode():
-            contig_name = row["name"].decode()
-            organism = pangenome.get_organism(contig2organism[contig_name])
-            contig = organism.get(contig_name)
-            contig.is_circular = row["is_circular"]
-            contig.length = int(row["length"])
+        contig = Contig(name=row["name"].decode())
+        contig.is_circular = row["is_circular"]
+        contig.length = int(row["length"])
         try:
-            gene = Gene(row["gene"].decode())
-        except ValueError:
-            pass
-        else:
-            gene.fill_parents(organism, contig)
-            if row["gene"].decode() in genes_dict:
-                logging.getLogger().warning("A gene with the same ID already pass. "
-                                            "It could be a problem in the number of genes")
-            genes_dict[gene.ID] = gene
-        try:
-            rna = RNA(row["rna"].decode())
-        except ValueError:
+            organism = pangenome.get_organism(row["organism"].decode())
+        except KeyError:
             pass
         else:
-            rna_dict[rna.ID] = rna
-            rna.fill_parents(organism, contig)
-    return genes_dict, rna_dict
+            organism.add(contig)
 
 def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata],
-               gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False):
+               link: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
+    """Read genes in pangenome file to add them to the pangenome object
+
+    :param pangenome: Pangenome object
+    :param table: Genes table
+    :param genedata_dict: Dictionary to link genedata with gene
+    :param link: Allow to link gene to organism and contig
+    :param chunk_size: Size of the chunck reading
+    :param disable_bar: Disable progress bar
+    """
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar):
-        gene = gene_dict[row["ID"].decode()]
+        gene = Gene(row["ID"].decode())
         genedata = genedata_dict[row["genedata_id"]]
         try:
             local = row["local"].decode()
         except ValueError:
             local = ""
         gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
-                                  gene_type=genedata.gene_type, name=genedata.name, position=genedata.position,
-                                  genetic_code=genedata.genetic_code, product=genedata.product,
-                                  local_identifier=local)
+                              gene_type=genedata.gene_type, name=genedata.name, position=genedata.position,
+                              genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local)
         gene.is_fragment = row["is_fragment"]
-        if gene.contig is not None:
-            gene.contig.add(gene)
+        if link:
+            contig = pangenome.get_contig(row["contig"].decode())
+            gene.fill_parents(contig.organism, contig)
+            contig.add(gene)
 
 
 def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata],
-               rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False):
+              link: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
+    """Read RNAs in pangenome file to add them to the pangenome object
+
+    :param pangenome: Pangenome object
+    :param table: RNAs table
+    :param genedata_dict: Dictionary to link genedata with gene
+    :param link: Allow to link gene to organism and contig
+    :param chunk_size: Size of the chunck reading
+    :param disable_bar: Disable progress bar
+    """
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar):
-        rna = rna_dict[row["ID"].decode()]
+        rna = RNA(row["ID"].decode())
         genedata = genedata_dict[row["genedata_id"]]
         rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand,
                              gene_type=genedata.gene_type, name=genedata.name,
                              product=genedata.product)
-        if rna.contig is not None:
-            rna.contig.add_rna(rna)
+        if link:
+            contig = pangenome.get_contig(row["contig"].decode())
+            rna.fill_parents(contig.organism, contig)
+            contig.add_rna(rna)
 
 
-def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True,
-                    load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
+def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True,
+                    load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
     """
     Read annotation in pangenome hdf5 file to add in pangenome object
 
@@ -467,30 +466,20 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool
     :param disable_bar: Disable the progress bar
     """
     annotations = h5f.root.annotations
-
+    genedata_dict = None
     if load_organisms:
-        if load_genes:
-            genedata_dict = read_genedata(h5f)
-            if load_rnas:
-                gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar)
-
-                read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
-                read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar)
-            else:
-                gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar)
-                read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
-        else:
-            if load_rnas:
-                genedata_dict = read_genedata(h5f)
-                _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar)
-                read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar)
-    else:
-        if load_genes:
-            if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]:
-                raise Exception("Genes must be linked to gene families or organisms, but none are laoded")
-            gene_dict = {gene.ID: gene for gene in pangenome.genes}  # Dictionary with genes in families
-            gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar)
-            read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar)
+        read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar)
+
+    if load_contigs:
+        read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar)
+
+    if load_genes:
+        genedata_dict = read_genedata(h5f)
+        read_genes(pangenome, annotations.genes, genedata_dict,
+                   all([load_organisms, load_contigs]), disable_bar=disable_bar)
+    if load_rnas:
+        read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict,
+                   all([load_organisms, load_contigs]), disable_bar=disable_bar)
     pangenome.status["genomesAnnotated"] = "Loaded"
 
 
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index a8c0c6f1..821fd513 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -45,17 +45,15 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i
     return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id
 
 
-def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]:
+def organism_desc(org_len: int) -> Dict[str, tables.StringCol]:
     """
     Table description to save organism-related information
 
     :param org_len: Maximum size of organism name.
-    :param contig_len: Maximum size of contigs name
 
     :return: Formatted table
     """
-    return {'name': tables.StringCol(itemsize=org_len),
-            "contig": tables.StringCol(itemsize=contig_len)}
+    return {'name': tables.StringCol(itemsize=org_len)}
 
 
 def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
@@ -73,28 +71,23 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes")
     organism_row = organism_table.row
     for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar):
-        for contig in org.contigs:
-            organism_row["name"] = org.name
-            organism_row["contig"] = contig.name
-            organism_row.append()
+        organism_row["name"] = org.name
+        organism_row.append()
     organism_table.flush()
 
 
-def contig_desc(contig_len: int, max_gene_id_len: int,
-                max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]:
+def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]:
     """Table description to save contig-related information
 
     :param contig_len: Maximum size of contig name
-    :param max_gene_id_len: Maximum size of gene name
-    :param max_rna_id_len: Maximum size of rna name
+    :param org_len: Maximum size of organism name.
 
     :return: Formatted table
     """
     return {'name': tables.StringCol(itemsize=contig_len),
             "is_circular": tables.BoolCol(dflt=False),
             'length': tables.UInt32Col(),
-            "gene": tables.StringCol(itemsize=max_gene_id_len),
-            "rna": tables.StringCol(itemsize=max_rna_id_len)}
+            "organism": tables.StringCol(itemsize=org_len)}
 
 
 def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
@@ -111,40 +104,28 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
     logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
     contig_row = contig_table.row
     for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar):
-        if contig.number_of_genes >= contig.number_of_rnas:
-            rna_list = list(contig.RNAs)
-            for index, gene in enumerate(contig.genes):
-                contig_row["name"] = contig.name
-                contig_row["is_circular"] = contig.is_circular
-                contig_row["length"] = len(contig)
-                contig_row["gene"] = gene.ID
-                if index < len(rna_list):
-                    contig_row["rna"] = rna_list[index].ID
-                contig_row.append()
-        else:
-            gene_list = list(contig.genes)
-            for index, rna in enumerate(contig.RNAs):
-                contig_row["name"] = contig.name
-                contig_row["is_circular"] = contig.is_circular
-                contig_row["rna"] = rna.ID
-                if index < len(gene_list):
-                    contig_row["gene"] = gene_list[index].ID
-                contig_row.append()
+        contig_row["name"] = contig.name
+        contig_row["is_circular"] = contig.is_circular
+        contig_row["length"] = len(contig)
+        contig_row["organism"] = contig.organism.name
+        contig_row.append()
     contig_table.flush()
 
 
-def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
+def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
     """Table description to save gene-related information
 
     :param id_len: Maximum size of gene name
     :param max_local_id: Maximum size of gene local identifier
+    :param max_contig_len: Maximum size of contig identifier
 
     :return: Formatted table
     """
     return {'ID': tables.StringCol(itemsize=id_len),
             'genedata_id': tables.UInt32Col(),
             'local': tables.StringCol(itemsize=max_local_id),
-            'is_fragment': tables.BoolCol(dflt=False)}
+            'is_fragment': tables.BoolCol(dflt=False),
+            'contig': tables.StringCol(itemsize=max_contig_len)}
 
 
 def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
@@ -169,6 +150,7 @@ def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Grou
         gene_row["ID"] = gene.ID
         gene_row["is_fragment"] = gene.is_fragment
         gene_row["local"] = gene.local_identifier
+        gene_row["contig"] = gene.contig.name
         genedata = get_genedata(gene)
         genedata_id = genedata2gene.get(genedata)
         if genedata_id is None:
@@ -181,15 +163,17 @@ def write_genes(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Grou
     return genedata2gene
 
 
-def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]:
+def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]:
     """Table description to save rna-related information
 
     :param id_len: Maximum size of RNA identifier
+    :param max_contig_len: Maximum size of contig identifier
 
     :return: Formatted table
     """
     return {'ID': tables.StringCol(itemsize=id_len),
-            'genedata_id': tables.UInt32Col()}
+            'genedata_id': tables.UInt32Col(),
+            'contig': tables.StringCol(itemsize=max_contig_len)}
 
 
 def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group,
@@ -212,6 +196,7 @@ def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group
     rna_row = rna_table.row
     for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar):
         rna_row["ID"] = rna.ID
+        rna_row["contig"] = rna.contig.name
         genedata = get_genedata(rna)
         genedata_id = genedata2rna.get(genedata)
         if genedata_id is None:
@@ -345,17 +330,17 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo
     # I add these boolean in case we would one day only load organism, contig or genes, without the other.
 
     if rec_organisms:
-        desc = organism_desc(org_len, contig_len)
+        desc = organism_desc(org_len)
         write_organisms(pangenome, h5f, annotation, desc, disable_bar)
     if rec_contigs:
-        desc = contig_desc(contig_len, gene_id_len, rna_id_len)
+        desc = contig_desc(contig_len, org_len)
         write_contigs(pangenome, h5f, annotation, desc, disable_bar)
     if rec_genes:
-        desc = gene_desc(gene_id_len, gene_local_id)
+        desc = gene_desc(gene_id_len, gene_local_id, contig_len)
         genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar)
         write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar)
     if rec_rnas:
-        desc = rna_desc(rna_id_len)
+        desc = rna_desc(rna_id_len, contig_len)
         genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar)
         write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar)
 
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index ef37bb4b..b5bf7c88 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -38,7 +38,6 @@ def __init__(self):
         self._regionGetter = {}
         self._spotGetter = {}
         self._moduleGetter = {}
-
         self.status = {
             'genomesAnnotated': "No",
             'geneSequences': "No",
@@ -312,6 +311,39 @@ def number_of_contigs(self) -> int:
         """
         return sum(len(org) for org in self.organisms)
 
+    def _mk_contig_getter(self):
+        """
+        Builds the attribute _contig_getter of the pangenome
+
+        Since the genes are never explicitly 'added' to a pangenome (but rather to an organism),
+        the pangenome cannot directly extract a gene from a geneID since it does not 'know' them.
+        If at some point we want to extract contig from a pangenome we'll create a contig_getter.
+        The assumption behind this is that the pangenome has been filled and no more contig will be added.
+        """
+        self._contig_getter = {}
+        for contig in self.contigs:
+            self._contig_getter[contig.name] = contig
+
+    def get_contig(self, name: str) -> Contig:
+        """Returns the contig that has the given name
+
+        :param name: The ,ame of the contig to look for
+
+        :return: Returns the wanted contig
+
+        :raises AssertionError: If the `gene_id` is not an integer
+        :raises KeyError: If the `gene_id` is not in the pangenome
+        """
+        assert isinstance(name, str), "Contig name should be a string"
+
+        try:
+            return self._contig_getter[name]
+        except AttributeError:
+            # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome.
+            self._mk_contig_getter()  # make it
+            return self.get_contig(name)  # Return what was expected. If geneID does not exist it will raise an error.
+        except KeyError:
+            raise KeyError(f"Contig: {name}, does not exist in the pangenome.")
     def get_organism(self, name: str) -> Organism:
         """
         Get an organism that is expected to be in the pangenome using its name, which is supposedly unique.

From f4e6648a20e01edbd847adbc7d5f6f74e8e82d75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 15 Sep 2023 10:37:56 +0200
Subject: [PATCH 13/15] Add documentation

---
 VERSION                                |  2 +-
 ppanggolin/formats/readBinaries.py     | 49 ++++++++++++++++++++------
 ppanggolin/formats/writeAnnotations.py | 12 ++++---
 3 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/VERSION b/VERSION
index 7844733d..aaf4dca1 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.185
+1.2.186
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 976499c1..dda50046 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -25,19 +25,21 @@ class Genedata:
     """
     This is a general class storing unique gene-related data to be written in a specific
     genedata table
-
-    :param start: Gene start position
-    :param stop: Gene stop position
-    :param strand: Associated strand
-    :param gene_type: Gene type
-    :param position: Position of the gene on its contig
-    :param name: Name of the feature
-    :param product: Associated product
-    :param genetic_code: associated genetic code, if any
     """
 
     def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str,
                  genetic_code: int):
+        """Constructor method
+
+        :param start: Gene start position
+        :param stop: Gene stop position
+        :param strand: Associated strand
+        :param gene_type: Gene type
+        :param position: Position of the gene on its contig
+        :param name: Name of the feature
+        :param product: Associated product
+        :param genetic_code: associated genetic code, if any
+        """
         self.start = start
         self.stop = stop
         self.strand = strand
@@ -47,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position:
         self.product = product
         self.genetic_code = genetic_code
 
-    def __eq__(self, other):
+    def __eq__(self, other: Genedata):
         return self.start == other.start \
             and self.stop == other.stop \
             and self.strand == other.strand \
@@ -166,7 +168,9 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000):
 def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
     """
     Reads the genedata table and returns a genedata_id2genedata dictionnary
+
     :param h5f: the hdf5 file handler
+
     :return: dictionnary linking genedata to the genedata identifier
     """
     table = h5f.root.annotations.genedata
@@ -386,6 +390,13 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal
 
 def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
                    disable_bar: bool = False):
+    """Read organism table in pangenome file to add them to the pangenome object
+
+    :param pangenome: Pangenome object
+    :param table: Organism table
+    :param chunk_size: Size of the chunck reading
+    :param disable_bar: Disable progress bar
+    """
     contig2organism = {}
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar):
         organism = Organism(row["name"].decode())
@@ -394,6 +405,13 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int =
 
 def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
                    disable_bar: bool = False):
+    """Read contig table in pangenome file to add them to the pangenome object
+
+    :param pangenome: Pangenome object
+    :param table: Contig table
+    :param chunk_size: Size of the chunck reading
+    :param disable_bar: Disable progress bar
+    """
     for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar):
         contig = Contig(name=row["name"].decode())
         contig.is_circular = row["is_circular"]
@@ -563,7 +581,16 @@ def read_modules_info(h5f: tables.File):
                   f"\t\t\t- mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}")
 
 
-def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, sources: List[str] = None, disable_bar: bool = False):
+def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
+                  sources: List[str] = None, disable_bar: bool = False):
+    """Read metadata to add them to the pangenome object
+
+    :param pangenome: Pangenome object
+    :param h5f: Pangenome file
+    :param metatype: Object type to associate metadata
+    :param sources: Source name of metadata
+    :param disable_bar: Disable progress bar
+    """
     metadata_group = h5f.root.metadata._f_get_child(metatype)
     for source in sources:
         source_table = metadata_group._f_get_child(source)
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index 821fd513..b09e4166 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -209,7 +209,7 @@ def write_rnas(pangenome: Pangenome,  h5f: tables.File, annotation: tables.Group
     return genedata2rna
 
 
-def genedata_desc(type_len, name_len, product_len):
+def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]:
     """
     Creates a table for gene-related data
 
@@ -311,8 +311,8 @@ def write_genedata(pangenome: Pangenome,  h5f: tables.File, annotation: tables.G
     genedata_table.flush()
 
 
-def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True,
-                      rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False):
+def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, rec_contigs: bool = True,
+                      rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False):
     """Function writing all the pangenome annotations
 
     :param pangenome: Annotated pangenome
@@ -361,11 +361,13 @@ def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]:
     return max_gene_id_len, max_gene_type
 
 
-def gene_sequences_desc(gene_id_len, gene_type_len) -> dict:
+def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]:
     """
     Create table to save gene sequences
+
     :param gene_id_len: Maximum size of gene sequence identifier
     :param gene_type_len: Maximum size of gene type
+
     :return: Formated table
     """
     return {
@@ -388,7 +390,7 @@ def get_sequence_len(pangenome: Pangenome) -> int:
     return max_seq_len
 
 
-def sequence_desc(max_seq_len: int) -> dict:
+def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]:
     """
     Table description to save sequences
     :param max_seq_len: Maximum size of gene type

From e992a17308e1a4e9d2353a8cdf62109ce1fe9a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <arnoux.jeromepj@gmail.com>
Date: Fri, 15 Sep 2023 10:57:36 +0200
Subject: [PATCH 14/15] Remove Genedata in type for __eq__

---
 VERSION                            | 2 +-
 ppanggolin/formats/readBinaries.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index aaf4dca1..0da01cf8 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.186
+1.2.187
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index dda50046..968c1f2c 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -49,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position:
         self.product = product
         self.genetic_code = genetic_code
 
-    def __eq__(self, other: Genedata):
+    def __eq__(self, other):
         return self.start == other.start \
             and self.stop == other.stop \
             and self.strand == other.strand \

From c2e8ef747b40a2cbd85d787f1fdc757e6c8fb11b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?=
 <39793176+jpjarnoux@users.noreply.github.com>
Date: Tue, 19 Sep 2023 11:00:18 +0200
Subject: [PATCH 15/15] Update writeFlat.py

Fix completness computation
---
 ppanggolin/formats/writeFlat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py
index c9a12ef3..4a9926c6 100644
--- a/ppanggolin/formats/writeFlat.py
+++ b/ppanggolin/formats/writeFlat.py
@@ -530,7 +530,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05,
                         nb_gene_core += 1
             completeness = "NA"
             if len(single_copy_markers) > 0:
-                completeness = round(((org.number_of_families() + len(single_copy_markers)) /
+                completeness = round((len(set(org.families) & single_copy_markers) /
                                       len(single_copy_markers)) * 100, 2)
             outfile.write("\t".join(map(str, [org.name,
                                               org.number_of_families(),