From 96ec8651ec9e896101c6e07aec441930a0777d00 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Mon, 16 Sep 2024 17:27:56 +0200
Subject: [PATCH 01/14] write fasta by reading directly from the pangenome file

---
 ppanggolin/formats/readBinaries.py   | 83 +++++++++++++++++++++++++++-
 ppanggolin/formats/writeSequences.py | 32 +++++++++--
 2 files changed, 108 insertions(+), 7 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 6c911fe7..588a4078 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -4,7 +4,7 @@
 import logging
 from optparse import Option
 from pathlib import Path
-from typing import Dict, Any, Set, List, Tuple, Optional
+from typing import Dict, Any, Iterator, Set, List, Tuple, Optional
 from collections import defaultdict
 
 # installed libraries
@@ -279,7 +279,7 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output:
                 file_obj.write(f'{row["dna"].decode()}\n')
 
 
-def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: iter = None,
+def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: Optional[Iterator] = None,
                                              add: str = '', compress: bool = False, disable_bar: bool = False):
     """
     Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS,
@@ -309,6 +309,85 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa
                                           f"{output.absolute()}{'.gz' if compress else ''}")
 
 
+def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str,
+                         compress: bool = False, disable_bar=False):
+    """
+    Write representative nucleotide sequences of gene families
+
+    :param pangenome: Pangenome object with gene families sequences
+    :param output: Path to output directory
+    :param gene_families: Selected partition of gene families
+    :param soft_core: Soft core threshold to use
+    :param compress: Compress the file in .gz
+    :param disable_bar: Disable progress bar
+    """
+
+    outpath = output / f"{partition}_nucleotide_families.fasta"
+
+    parition_first_letter = partition[0].upper()
+
+    with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
+        gene_fam_info_table = h5f.root.geneFamiliesInfo
+        family_to_write = set()
+        seq_id_to_seqs = defaultdict(list)
+        for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
+            
+            if partition == "all" or row['partition'].decode().startswith(parition_first_letter):
+                family_to_write.add(row['name'])
+
+
+        gene_seq_table = h5f.root.annotations.geneSequences
+        for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
+            
+            if row['gene'] in family_to_write:
+                seq_id_to_seqs[row['seqid']].append(row['gene'].decode())
+
+        with write_compressed_or_not(outpath, compress) as file_obj:
+            seq_table = h5f.root.annotations.sequences
+            for row in read_chunks(seq_table, chunk=20000):
+                if row["seqid"] in seq_id_to_seqs:
+
+                    for seq_name in  seq_id_to_seqs[row["seqid"]]:
+                        
+                        file_obj.write(f">{seq_name}\n")
+                        file_obj.write(row["dna"].decode() + "\n")
+
+    logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
+                                         f"of the gene families : '{outpath}{'.gz' if compress else ''}")
+
+def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str,
+                         compress: bool = False, disable_bar=False):
+    """
+    Write representative amino acid sequences of gene families.
+
+    :param pangenome: Pangenome object with gene families sequences
+    :param output: Path to output directory
+    :param prot_families: Selected partition of protein families
+    :param soft_core: Soft core threshold to use
+    :param compress: Compress the file in .gz
+    :param disable_bar: Disable progress bar
+    """
+
+    outpath = output / f"{partition}_protein_families.faa"
+
+    parition_first_letter = partition[0].upper()
+    
+    with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f, write_compressed_or_not(outpath, compress) as fasta:
+
+        gene_fam_info_table = h5f.root.geneFamiliesInfo
+
+        for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
+    
+            if partition == "all" or row['partition'].decode().startswith(parition_first_letter):
+                
+                fasta.write(f">{row['name'].decode()}\n")
+                fasta.write(row['protein'].decode() + "\n")
+
+    logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:"
+                                         f"'{outpath}{'.gz' if compress else ''}'")
+    
+
+
 def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
     """
     Read information about graph in pangenome hdf5 file to add in pangenome object
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index a7023812..a602b35e 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -19,7 +19,7 @@
 
 from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not,
                               restricted_float, detect_filetype, run_subprocess)
-from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file
+from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file
 
 module_regex = re.compile(r'^module_\d+')  # \d == [0-9]
 poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]
@@ -577,10 +577,32 @@ def launch(args: argparse.Namespace):
     mk_outdir(args.output, args.force)
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
-    write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
-                         regions=args.regions, genes=args.genes, proteins=args.proteins,
-                         gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
-                         disable_bar=args.disable_prog_bar, **translate_kwgs)
+
+    # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]):
+    if args.gene_families in ['all', 'persistent', 'shell', 'cloud']:
+
+        logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
+                                            "of the gene families by reading the pangenome file directly.")
+
+        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.gene_families,
+                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
+        args.gene_families = None
+        
+    if args.prot_families in ['all', 'persistent', 'shell', 'cloud']:
+
+
+        logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
+                                            "of the gene families by reading the pangenome file directly.")
+        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families,
+                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
+
+        args.prot_families = None
+
+    if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]):
+        write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
+                                regions=args.regions, genes=args.genes, proteins=args.proteins,
+                                gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
+                                disable_bar=args.disable_prog_bar, **translate_kwgs)
 
 
 def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:

From 1b3b35f22931bb50cd4d4e074cfd29dd882f1deb Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Tue, 17 Sep 2024 18:44:32 +0200
Subject: [PATCH 02/14] add module filter in gene_families writing

---
 ppanggolin/formats/readBinaries.py   | 49 +++++++++++++++++++++-------
 ppanggolin/formats/writeSequences.py |  4 +--
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 588a4078..d1295d0c 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -308,8 +308,21 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa
     logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to "
                                           f"{output.absolute()}{'.gz' if compress else ''}")
 
-
-def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str,
+def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str):
+    """
+    
+    """
+    family_to_write = set()
+    module_id = int(module_name[len("module_"):])
+    print("MODULE", module_id)
+    module_table = h5f.root.modules
+    for row in read_chunks(module_table, chunk=20000):
+        if row['module'] == module_id:
+            family_to_write.add(row['geneFam'])
+    print("family_to_write", family_to_write)
+    return family_to_write
+
+def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
                          compress: bool = False, disable_bar=False):
     """
     Write representative nucleotide sequences of gene families
@@ -322,19 +335,31 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
     :param disable_bar: Disable progress bar
     """
 
-    outpath = output / f"{partition}_nucleotide_families.fasta"
+    outpath = output / f"{family_filter}_nucleotide_families.fasta"
 
-    parition_first_letter = partition[0].upper()
+    
 
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
-        gene_fam_info_table = h5f.root.geneFamiliesInfo
-        family_to_write = set()
+
+        
         seq_id_to_seqs = defaultdict(list)
-        for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
-            
-            if partition == "all" or row['partition'].decode().startswith(parition_first_letter):
-                family_to_write.add(row['name'])
 
+        if family_filter in ["all", 'persistent', 'shell', 'cloud']:
+            family_to_write = set()
+            gene_fam_info_table = h5f.root.geneFamiliesInfo
+            parition_first_letter = family_filter[0].upper()
+            for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
+                
+                if family_filter == "all" or row['partition'].decode().startswith(parition_first_letter):
+                    family_to_write.add(row['name'])
+
+
+        if family_filter.startswith("module_"):
+            family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter)
+
+        if len(family_to_write) == 0:
+            logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
+            return
 
         gene_seq_table = h5f.root.annotations.geneSequences
         for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
@@ -342,9 +367,9 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             if row['gene'] in family_to_write:
                 seq_id_to_seqs[row['seqid']].append(row['gene'].decode())
 
-        with write_compressed_or_not(outpath, compress) as file_obj:
+        with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
             seq_table = h5f.root.annotations.sequences
-            for row in read_chunks(seq_table, chunk=20000):
+            for row in read_chunks(table=seq_table, chunk=20000):
                 if row["seqid"] in seq_id_to_seqs:
 
                     for seq_name in  seq_id_to_seqs[row["seqid"]]:
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index a602b35e..4d903c9d 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -579,12 +579,12 @@ def launch(args: argparse.Namespace):
     pangenome.add_file(args.pangenome)
 
     # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]):
-    if args.gene_families in ['all', 'persistent', 'shell', 'cloud']:
+    if args.gene_families and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families):
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
                                             "of the gene families by reading the pangenome file directly.")
 
-        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.gene_families,
+        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome,  family_filter= args.gene_families,
                                                 output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
         args.gene_families = None
         

From c14eac408bf455e80c335b400602b3babc218d77 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Tue, 17 Sep 2024 18:51:50 +0200
Subject: [PATCH 03/14] fix none manageemnt

---
 ppanggolin/formats/writeSequences.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 4d903c9d..e63d978b 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -579,7 +579,7 @@ def launch(args: argparse.Namespace):
     pangenome.add_file(args.pangenome)
 
     # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]):
-    if args.gene_families and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families):
+    if args.gene_families is not None and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families):
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
                                             "of the gene families by reading the pangenome file directly.")

From 1ac566c12b658e14229eca5938ab36d8bcd3ec10 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Sep 2024 16:04:58 +0200
Subject: [PATCH 04/14] write module and rgp family genes

---
 ppanggolin/formats/readBinaries.py   | 36 +++++++++++++++++++++++-----
 ppanggolin/formats/writeSequences.py |  4 +---
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index d1295d0c..dddac342 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -308,18 +308,37 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa
     logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to "
                                           f"{output.absolute()}{'.gz' if compress else ''}")
 
+
+def read_rgp__genes_from_pangenome_file(h5f: tables.File) -> List[str]:
+    """
+    """
+    rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)]
+
+    return rgp_genes
+
+
+def get_families_from_genes(h5f: tables.File , genes:Set[str]):
+    """
+    """
+    families = set()
+    for row in read_chunks(h5f.root.geneFamilies, chunk=20000):
+        if row['gene'] in genes:
+            families.add(row["geneFam"])
+
+    return families
+
 def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str):
     """
     
     """
     family_to_write = set()
     module_id = int(module_name[len("module_"):])
-    print("MODULE", module_id)
     module_table = h5f.root.modules
+
     for row in read_chunks(module_table, chunk=20000):
         if row['module'] == module_id:
             family_to_write.add(row['geneFam'])
-    print("family_to_write", family_to_write)
+
     return family_to_write
 
 def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
@@ -337,11 +356,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
 
     outpath = output / f"{family_filter}_nucleotide_families.fasta"
 
-    
-
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
 
-        
         seq_id_to_seqs = defaultdict(list)
 
         if family_filter in ["all", 'persistent', 'shell', 'cloud']:
@@ -354,18 +370,26 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
                     family_to_write.add(row['name'])
 
 
-        if family_filter.startswith("module_"):
+        elif family_filter.startswith("module_"):
             family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter)
+        
+        elif family_filter == "rgp":
+            rgp_genes = read_rgp__genes_from_pangenome_file(h5f)
+            family_to_write = get_families_from_genes(h5f, rgp_genes)
 
         if len(family_to_write) == 0:
             logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
             return
 
         gene_seq_table = h5f.root.annotations.geneSequences
+        match_count = 0
         for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
             
             if row['gene'] in family_to_write:
                 seq_id_to_seqs[row['seqid']].append(row['gene'].decode())
+                match_count += 1
+        
+        assert match_count == len(family_to_write), f"Number of sequences found ({match_count}) does not match the number of expected family {len(family_to_write)}."
 
         with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
             seq_table = h5f.root.annotations.sequences
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index e63d978b..fe8aada6 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -578,8 +578,7 @@ def launch(args: argparse.Namespace):
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
 
-    # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]):
-    if args.gene_families is not None and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families):
+    if args.gene_families is not None and (args.gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.gene_families)):
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
                                             "of the gene families by reading the pangenome file directly.")
@@ -590,7 +589,6 @@ def launch(args: argparse.Namespace):
         
     if args.prot_families in ['all', 'persistent', 'shell', 'cloud']:
 
-
         logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
                                             "of the gene families by reading the pangenome file directly.")
         write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families,

From 25189429cc4cb4c6f96eb05c42bd293e047fa0e0 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Sep 2024 16:23:24 +0200
Subject: [PATCH 05/14] add possibility to write module and rgp in prot fasta

---
 ppanggolin/formats/readBinaries.py   | 27 +++++++++++++++++++++------
 ppanggolin/formats/writeSequences.py |  4 ++--
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index dddac342..4b3ebb4e 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -404,7 +404,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
     logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
                                          f"of the gene families : '{outpath}{'.gz' if compress else ''}")
 
-def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str,
+def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
                          compress: bool = False, disable_bar=False):
     """
     Write representative amino acid sequences of gene families.
@@ -417,17 +417,32 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
     :param disable_bar: Disable progress bar
     """
 
-    outpath = output / f"{partition}_protein_families.faa"
+    outpath = output / f"{family_filter}_protein_families.faa"
+
+    partition_filter = False
+    family_to_write = []
 
-    parition_first_letter = partition[0].upper()
-    
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f, write_compressed_or_not(outpath, compress) as fasta:
 
+        if family_filter in ["all", 'persistent', 'shell', 'cloud']:
+            partition_filter = True
+            parition_first_letter = family_filter[0].upper()
+
+        elif family_filter == "rgp":
+            rgp_genes = read_rgp__genes_from_pangenome_file(h5f)
+            family_to_write = get_families_from_genes(h5f, rgp_genes)
+        
+        elif family_filter.startswith("module_"):
+            family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter)
+        
         gene_fam_info_table = h5f.root.geneFamiliesInfo
 
         for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
-    
-            if partition == "all" or row['partition'].decode().startswith(parition_first_letter):
+            
+            partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter))
+            family_match = row['name'] in family_to_write
+
+            if partition_match or family_match:
                 
                 fasta.write(f">{row['name'].decode()}\n")
                 fasta.write(row['protein'].decode() + "\n")
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index fe8aada6..eaca363b 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -587,11 +587,11 @@ def launch(args: argparse.Namespace):
                                                 output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
         args.gene_families = None
         
-    if args.prot_families in ['all', 'persistent', 'shell', 'cloud']:
+    if args.prot_families is not None and (args.prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(args.prot_families)):
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
                                             "of the gene families by reading the pangenome file directly.")
-        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families,
+        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter = args.prot_families,
                                                 output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
 
         args.prot_families = None

From 1157761987d9cec351588e78b6656321a1189272 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Sep 2024 17:27:35 +0200
Subject: [PATCH 06/14] Add possibility to write genes by reading pangenome
 file directly

---
 ppanggolin/formats/readBinaries.py   | 135 +++++++++++++++++++++------
 ppanggolin/formats/writeSequences.py |  12 ++-
 2 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 4b3ebb4e..bb1a868b 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -1,11 +1,13 @@
 #!/usr/bin/env python3
 
 # default libraries
+import dis
 import logging
 from optparse import Option
 from pathlib import Path
 from typing import Dict, Any, Iterator, Set, List, Tuple, Optional
 from collections import defaultdict
+import numpy
 
 # installed libraries
 from tqdm import tqdm
@@ -341,6 +343,67 @@ def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str):
 
     return family_to_write
 
+def get_families_matching_partition(h5f: tables.File, partition:str):
+    """
+    """
+    family_to_write = set()
+    
+    gene_fam_info_table = h5f.root.geneFamiliesInfo
+    parition_first_letter = partition[0].upper()
+
+    for row in read_chunks(gene_fam_info_table, chunk=20000):
+        
+        if partition == "all" or row['partition'].decode().startswith(parition_first_letter):
+            family_to_write.add(row['name'])
+
+    return family_to_write
+
+def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]):
+    """
+    """
+    matching_genes = set()
+    
+    gene_fam_table = h5f.root.geneFamilies
+
+    for row in read_chunks(gene_fam_table, chunk=20000):
+        if row["geneFam"] in families:
+            matching_genes.add(row['gene'])
+
+    return matching_genes
+
+def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False):
+    """
+    """
+
+    seq_id_to_genes = defaultdict(list)
+    gene_seq_table = h5f.root.annotations.geneSequences
+    match_count = 0
+    for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
+        
+        if get_all_genes or row['gene'] in genes:
+            seq_id_to_genes[row['seqid']].append(row['gene'].decode())
+            match_count += 1
+
+    assert match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}."
+
+    return seq_id_to_genes
+
+def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes):
+    """
+    """
+    
+    with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
+        seq_table = h5f.root.annotations.sequences
+        #TODO add tqm
+        for row in read_chunks(table=seq_table, chunk=20000):
+            if row["seqid"] in seq_id_to_genes:
+
+                for seq_name in  seq_id_to_genes[row["seqid"]]:
+                    
+                    file_obj.write(f">{seq_name}\n")
+                    file_obj.write(row["dna"].decode() + "\n")
+
+
 def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
                          compress: bool = False, disable_bar=False):
     """
@@ -358,16 +421,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
 
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
 
-        seq_id_to_seqs = defaultdict(list)
-
         if family_filter in ["all", 'persistent', 'shell', 'cloud']:
-            family_to_write = set()
-            gene_fam_info_table = h5f.root.geneFamiliesInfo
-            parition_first_letter = family_filter[0].upper()
-            for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
-                
-                if family_filter == "all" or row['partition'].decode().startswith(parition_first_letter):
-                    family_to_write.add(row['name'])
+            family_to_write = get_families_matching_partition(h5f, family_filter)
 
 
         elif family_filter.startswith("module_"):
@@ -381,25 +436,9 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
             return
 
-        gene_seq_table = h5f.root.annotations.geneSequences
-        match_count = 0
-        for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
-            
-            if row['gene'] in family_to_write:
-                seq_id_to_seqs[row['seqid']].append(row['gene'].decode())
-                match_count += 1
-        
-        assert match_count == len(family_to_write), f"Number of sequences found ({match_count}) does not match the number of expected family {len(family_to_write)}."
+        seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write)
 
-        with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
-            seq_table = h5f.root.annotations.sequences
-            for row in read_chunks(table=seq_table, chunk=20000):
-                if row["seqid"] in seq_id_to_seqs:
-
-                    for seq_name in  seq_id_to_seqs[row["seqid"]]:
-                        
-                        file_obj.write(f">{seq_name}\n")
-                        file_obj.write(row["dna"].decode() + "\n")
+        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
 
     logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
                                          f"of the gene families : '{outpath}{'.gz' if compress else ''}")
@@ -451,6 +490,48 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
                                          f"'{outpath}{'.gz' if compress else ''}'")
     
 
+def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str,
+                         compress: bool = False, disable_bar=False):
+    """
+    Write representative nucleotide sequences of gene families
+
+    :param pangenome: Pangenome object with gene families sequences
+    :param output: Path to output directory
+    :param gene_families: Selected partition of gene families
+    :param soft_core: Soft core threshold to use
+    :param compress: Compress the file in .gz
+    :param disable_bar: Disable progress bar
+    """
+
+    outpath = output / f"{gene_filter}_genes.fna"
+    get_all_genes = False
+
+    with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
+
+        if gene_filter in ['persistent', 'shell', 'cloud'] or gene_filter.startswith("module_"):
+
+            if gene_filter.startswith("module_"):
+                families = read_module_families_from_pangenome_file(h5f, module_name=gene_filter)
+                
+            else:
+                families = get_families_matching_partition(h5f, gene_filter)
+
+            genes_to_write = get_genes_from_families(h5f, families)
+
+        elif gene_filter == "rgp":
+            genes_to_write = read_rgp__genes_from_pangenome_file(h5f)
+
+        elif gene_filter == "all":
+            genes_to_write = []
+            get_all_genes = True
+
+        seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar)
+
+        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
+        
+    logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
+                                         f"of the gene families : '{outpath}{'.gz' if compress else ''}")
+
 
 def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False):
     """
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index eaca363b..108015c8 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -19,7 +19,7 @@
 
 from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not,
                               restricted_float, detect_filetype, run_subprocess)
-from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file
+from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_genes_from_pangenome_file,  write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file
 
 module_regex = re.compile(r'^module_\d+')  # \d == [0-9]
 poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]
@@ -596,6 +596,16 @@ def launch(args: argparse.Namespace):
 
         args.prot_families = None
 
+
+    if args.genes is not None and (args.genes in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(args.genes)):
+
+        logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
+        write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes,
+                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
+
+        args.prot_families = None
+
+
     if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]):
         write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
                                 regions=args.regions, genes=args.genes, proteins=args.proteins,

From 3a6840c9f4fa3f034b1a96bbc83ab0061e7c3846 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Wed, 18 Sep 2024 17:30:45 +0200
Subject: [PATCH 07/14] reset genes var if it has been handle by writting fct

---
 ppanggolin/formats/writeSequences.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 108015c8..fe1ed13e 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -603,7 +603,7 @@ def launch(args: argparse.Namespace):
         write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes,
                                                 output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
 
-        args.prot_families = None
+        args.genes = None
 
 
     if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]):

From 825e1e76a5c2da03df1bd8d0faff69bff6d71543 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Sep 2024 14:22:25 +0200
Subject: [PATCH 08/14] add softcore and core filter in gene_families output

---
 ppanggolin/formats/readBinaries.py   |  49 ++++++++++++-
 ppanggolin/formats/writeSequences.py | 101 +++++++++++++++------------
 2 files changed, 103 insertions(+), 47 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index bb1a868b..82bb3791 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -394,7 +394,8 @@ def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
     
     with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
         seq_table = h5f.root.annotations.sequences
-        #TODO add tqm
+        #TODO add tqdm
+
         for row in read_chunks(table=seq_table, chunk=20000):
             if row["seqid"] in seq_id_to_genes:
 
@@ -404,8 +405,37 @@ def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
                     file_obj.write(row["dna"].decode() + "\n")
 
 
-def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
-                         compress: bool = False, disable_bar=False):
+def get_gene_to_genome(h5f):
+    """
+    """
+    
+    contig_id_to_genome = {row["ID"]:row['genome']  for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) }
+
+    gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) }
+
+    return gene_to_genome
+
+
+def get_family_to_genome_count(h5f) -> Dict[bytes, int]:
+    """
+    """
+    
+    contig_id_to_genome = {row["ID"]:row['genome']  for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) }
+
+    gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) }
+
+    family_to_genomes = defaultdict(set)
+    for row in read_chunks( h5f.root.geneFamilies, chunk=20000):
+        family_to_genomes[row['geneFam']].add(gene_to_genome[row["gene"]])
+
+    family_to_genome_count = {fam: len(genomes) for fam, genomes in family_to_genomes.items()}
+
+    return family_to_genome_count
+
+
+
+def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float = 0.95,
+                                            compress: bool = False, disable_bar=False):
     """
     Write representative nucleotide sequences of gene families
 
@@ -432,6 +462,19 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             rgp_genes = read_rgp__genes_from_pangenome_file(h5f)
             family_to_write = get_families_from_genes(h5f, rgp_genes)
 
+        elif family_filter in ["softcore", "core"]:
+            family_to_genome_count = get_family_to_genome_count(h5f)
+            pangenome_info = read_info(h5f)
+            genome_count = pangenome_info["Content"]["Genomes"]
+
+            if family_filter == "core":
+                family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if genome_count == fam_genome_count}
+
+            elif family_filter == "softcore":
+                genome_count_threshold = genome_count * soft_core
+                family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold}
+                 
+
         if len(family_to_write) == 0:
             logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
             return
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index fe1ed13e..947b17f1 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -548,21 +548,62 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
     :param disable_bar: Disable progress bar
     """
 
-    check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar)
 
-    if prot_families is not None:
-        write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar)
-    if gene_families is not None:
-        write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar)
-    if genes is not None:
-        write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
-    if proteins is not None:
-        write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
-                                     **translate_kwgs)
-    if regions is not None:
-        write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
+    if gene_families is not None: #and (gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(gene_families)):
+
+        logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
+                                            "of the gene families by reading the pangenome file directly.")
+
+        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file,  family_filter= gene_families,
+                                                output=output, compress=compress, disable_bar=disable_bar)
+        gene_families = None
+        
+    if prot_families is not None and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(prot_families)):
+
+        logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
+                                            "of the gene families by reading the pangenome file directly.")
+        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families,
+                                                output=output, compress=compress, disable_bar=disable_bar)
+
+        prot_families = None
 
 
+    if genes is not None and (genes in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(genes)):
+
+        logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
+        write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes,
+                                                output=output, compress=compress, disable_bar=disable_bar)
+
+        genes = None
+
+    if proteins is not None and (proteins in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(proteins)):
+        pass
+
+        # logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
+        # write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
+        #                     **translate_kwgs)
+
+        # write_genes_from_pangenome_file(pangenome_filename=pangenome, gene_filter = genes,
+        #                                         output=output, compress=compress, disable_bar=disable_bar)
+
+        # proteins = None
+
+    if any(x for x in [regions, genes, proteins, gene_families, prot_families]):
+        check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar)
+
+        if prot_families is not None:
+            write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar)
+        if gene_families is not None:
+            write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar)
+        if genes is not None:
+            write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
+        if proteins is not None:
+            write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
+                                        **translate_kwgs)
+        if regions is not None:
+            write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
+ 
+
 def launch(args: argparse.Namespace):
     """
     Command launcher
@@ -578,39 +619,11 @@ def launch(args: argparse.Namespace):
     pangenome = Pangenome()
     pangenome.add_file(args.pangenome)
 
-    if args.gene_families is not None and (args.gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.gene_families)):
-
-        logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
-                                            "of the gene families by reading the pangenome file directly.")
-
-        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome,  family_filter= args.gene_families,
-                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
-        args.gene_families = None
-        
-    if args.prot_families is not None and (args.prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(args.prot_families)):
-
-        logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
-                                            "of the gene families by reading the pangenome file directly.")
-        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter = args.prot_families,
-                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
-
-        args.prot_families = None
-
-
-    if args.genes is not None and (args.genes in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(args.genes)):
-
-        logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
-        write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes,
-                                                output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar)
-
-        args.genes = None
-
 
-    if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]):
-        write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
-                                regions=args.regions, genes=args.genes, proteins=args.proteins,
-                                gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
-                                disable_bar=args.disable_prog_bar, **translate_kwgs)
+    write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core,
+                            regions=args.regions, genes=args.genes, proteins=args.proteins,
+                            gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress,
+                            disable_bar=args.disable_prog_bar, **translate_kwgs)
 
 
 def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:

From 25c61bc88cf6a2d641040b7bf1c6ed1039caa7ac Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Sep 2024 14:41:58 +0200
Subject: [PATCH 09/14] add softcore and core filter in prot_families output

---
 ppanggolin/formats/readBinaries.py   | 32 ++++++++++++++++++----------
 ppanggolin/formats/writeSequences.py |  4 ++--
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 82bb3791..90ce9e90 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -432,7 +432,16 @@ def get_family_to_genome_count(h5f) -> Dict[bytes, int]:
 
     return family_to_genome_count
 
+def get_soft_core_families(h5f, soft_core:float):
+    """
+    """
+    family_to_genome_count = get_family_to_genome_count(h5f)
+    pangenome_info = read_info(h5f)
+    genome_count = pangenome_info["Content"]["Genomes"]
 
+    genome_count_threshold = genome_count * soft_core
+    soft_core_families = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold}
+    return soft_core_families
 
 def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float = 0.95,
                                             compress: bool = False, disable_bar=False):
@@ -463,17 +472,10 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             family_to_write = get_families_from_genes(h5f, rgp_genes)
 
         elif family_filter in ["softcore", "core"]:
-            family_to_genome_count = get_family_to_genome_count(h5f)
-            pangenome_info = read_info(h5f)
-            genome_count = pangenome_info["Content"]["Genomes"]
-
             if family_filter == "core":
-                family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if genome_count == fam_genome_count}
-
-            elif family_filter == "softcore":
-                genome_count_threshold = genome_count * soft_core
-                family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold}
-                 
+                soft_core = 1.0
+        
+            family_to_write = get_soft_core_families(h5f, soft_core)
 
         if len(family_to_write) == 0:
             logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
@@ -486,7 +488,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
     logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
                                          f"of the gene families : '{outpath}{'.gz' if compress else ''}")
 
-def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str,
+
+def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float=0.95,
                          compress: bool = False, disable_bar=False):
     """
     Write representative amino acid sequences of gene families.
@@ -517,8 +520,15 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
         elif family_filter.startswith("module_"):
             family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter)
         
+        elif family_filter in ["softcore", "core"]:
+            
+            soft_core_to_apply = 1.0 if family_filter == "core" else soft_core
+
+            family_to_write = get_soft_core_families(h5f, soft_core=soft_core_to_apply)
+
         gene_fam_info_table = h5f.root.geneFamiliesInfo
 
+        # TODO improve tqdm with total being the number of family to write
         for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
             
             partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter))
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 947b17f1..2ea531f6 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -554,11 +554,11 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
                                             "of the gene families by reading the pangenome file directly.")
 
-        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file,  family_filter= gene_families,
+        write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file,  family_filter= gene_families, soft_core=soft_core,
                                                 output=output, compress=compress, disable_bar=disable_bar)
         gene_families = None
         
-    if prot_families is not None and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(prot_families)):
+    if prot_families is not None: #and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(prot_families)):
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
                                             "of the gene families by reading the pangenome file directly.")

From baa9a0f90d9ed36ae86daf60608ca5233080b576 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Sep 2024 16:08:14 +0200
Subject: [PATCH 10/14] handle proteins in faster way

---
 ppanggolin/formats/readBinaries.py   |  13 ++-
 ppanggolin/formats/writeSequences.py | 134 ++++++---------------------
 2 files changed, 37 insertions(+), 110 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 90ce9e90..4aec2c7a 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -384,11 +384,11 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F
             seq_id_to_genes[row['seqid']].append(row['gene'].decode())
             match_count += 1
 
-    assert match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}."
+    assert get_all_genes or match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}."
 
     return seq_id_to_genes
 
-def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes):
+def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath, compress, seq_id_to_genes):
     """
     """
     
@@ -543,7 +543,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
                                          f"'{outpath}{'.gz' if compress else ''}'")
     
 
-def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str,
+def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str, soft_core:float=0.95,
                          compress: bool = False, disable_bar=False):
     """
     Write representative nucleotide sequences of gene families
@@ -561,11 +561,16 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_
 
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
 
-        if gene_filter in ['persistent', 'shell', 'cloud'] or gene_filter.startswith("module_"):
+        if gene_filter in ['persistent', 'shell', 'cloud', "softcore", "core"] or gene_filter.startswith("module_"):
 
             if gene_filter.startswith("module_"):
                 families = read_module_families_from_pangenome_file(h5f, module_name=gene_filter)
+
+            elif gene_filter in ["softcore", "core"]:
                 
+                soft_core_to_apply = 1.0 if gene_filter == "core" else soft_core
+                families = get_soft_core_families(h5f, soft_core=soft_core_to_apply)
+                    
             else:
                 families = get_families_matching_partition(h5f, gene_filter)
 
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 2ea531f6..83562ae6 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -39,81 +39,6 @@ def check_write_sequences_args(args: argparse.Namespace) -> None:
                                              "(You need to provide the same file used to compute the pangenome)")
 
 
-def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None, genes: str = None,
-                                       genes_prot: str = None, gene_families: str = None,
-                                       prot_families: str = None, disable_bar: bool = False):
-    """Check and load required information from pangenome
-
-    :param pangenome: Empty pangenome
-    :param regions: Check and load the RGP
-    :param genes: Check and load the genes
-    :param genes_prot: Write amino acid CDS sequences.
-    :param gene_families: Check and load the gene families to write representative nucleotide sequences.
-    :param prot_families: Check and load the gene families to write representative amino acid sequences.
-    :param disable_bar: Disable progress bar
-
-    :raises AssertionError: if not any arguments to write any file is given
-    :raises ValueError: if the given filter is not recognized
-    :raises AttributeError: if the pangenome does not contain the required information
-    """
-    if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]):
-        raise AssertionError("You did not indicate what file you wanted to write.")
-
-    need_annotations = False
-    need_families = False
-    need_graph = False
-    need_partitions = False
-    need_spots = False
-    need_regions = False
-    need_modules = False
-
-    if prot_families is not None:
-        need_families = True
-        if prot_families in ["core", "softcore"]:
-            need_annotations = True
-
-    if any(x is not None for x in [regions, genes, genes_prot, gene_families]):
-        need_annotations = True
-        need_families = True
-    if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
-        need_annotations = True
-        need_regions = True
-    if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
-        need_partitions = True
-    for x in (genes, gene_families, prot_families):
-        if x is not None and 'module_' in x:
-            need_modules = True
-
-    if not (need_annotations or need_families or need_graph or need_partitions or
-            need_spots or need_regions or need_modules):
-        # then nothing is needed, then something is wrong.
-        # find which filter was provided
-        provided_filter = ''
-        if genes is not None:
-            provided_filter = genes
-        if genes_prot is not None:
-            provided_filter = genes_prot
-        if gene_families is not None:
-            provided_filter = gene_families
-        if prot_families is not None:
-            provided_filter = prot_families
-        if regions is not None:
-            provided_filter = regions
-        raise ValueError(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. "
-                         f"{poss_values_log}")
-
-    if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families):
-        raise AttributeError("The provided pangenome has no gene sequences. "
-                             "This is not compatible with any of the following options : --genes, --gene_families")
-    if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families:
-        raise AttributeError("The provided pangenome has no gene families. This is not compatible with any of "
-                             "the following options : --prot_families, --gene_families")
-
-    check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families,
-                         need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions,
-                         need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar)
-
-
 def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '',
                                           compress: bool = False, disable_bar: bool = False):
     """
@@ -222,7 +147,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     return outpath
 
 
-def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95,
+def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_filter: str, soft_core: float = 0.95,
                                  compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
                                  cpu: int = 1, code: int = 11, disable_bar: bool = False):
     """ Write all amino acid sequences from given genes in pangenome
@@ -241,12 +166,17 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s
     with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()),
                        basename="translateGenes", keep_tmp=keep_tmp) as tmpdir:
 
-        write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar)
+        write_genes_from_pangenome_file(pangenome_filename=pangenome_filename, gene_filter = gene_filter,
+                                                output=tmpdir, compress=compress, disable_bar=disable_bar)
 
-        pangenome_sequences = tmpdir / f"{proteins}_genes.fna{'.gz' if compress else ''}"
-        translate_db = translate_genes(sequences=pangenome_sequences, tmpdir=tmpdir,
+        genes_sequence_tmp_file = tmpdir / f"{gene_filter}_genes.fna{'.gz' if compress else ''}"
+        translate_db = translate_genes(sequences=genes_sequence_tmp_file, tmpdir=tmpdir,
                                        cpu=cpu, is_single_line_fasta=True, code=code)
-        outpath = output / f"{proteins}_protein_genes.fna"
+        
+        outpath = output / f"{gene_filter}_protein_genes.faa"
+        
+        logging.getLogger("PPanGGOLiN").info("Translating nucleotide gene sequence in protein sequences with mmseqs convert2fasta")
+
         cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath]))
         run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n")
         if compress:
@@ -254,9 +184,9 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s
                 with open(outpath) as sequence_file:
                     shutil.copyfileobj(sequence_file, compress_file)
             outpath.unlink()
-            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}.gz'")
+            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}.gz'")
         else:
-            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'")
+            logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}'")
 
 
 def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]:
@@ -549,7 +479,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
     """
 
 
-    if gene_families is not None: #and (gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(gene_families)):
+    if gene_families is not None:
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences "
                                             "of the gene families by reading the pangenome file directly.")
@@ -558,48 +488,40 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
                                                 output=output, compress=compress, disable_bar=disable_bar)
         gene_families = None
         
-    if prot_families is not None: #and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(prot_families)):
+    if prot_families is not None:
 
         logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences "
                                             "of the gene families by reading the pangenome file directly.")
-        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families,
+        write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families, soft_core=soft_core,
                                                 output=output, compress=compress, disable_bar=disable_bar)
 
         prot_families = None
 
 
-    if genes is not None and (genes in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(genes)):
+    if genes is not None:
 
         logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
-        write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes,
+        write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes, soft_core=soft_core,
                                                 output=output, compress=compress, disable_bar=disable_bar)
 
         genes = None
 
-    if proteins is not None and (proteins in ['all', 'persistent', 'shell', 'cloud', "rgp"]  or module_regex.match(proteins)):
-        pass
+    if proteins is not None:
+        
 
-        # logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.")
-        # write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
-        #                     **translate_kwgs)
+        logging.getLogger("PPanGGOLiN").info("Writing gene protein sequences by reading the pangenome file directly.")
+        write_gene_protein_sequences(pangenome_filename=pangenome.file, output=output, gene_filter=proteins, soft_core=soft_core, compress=compress, disable_bar=disable_bar,
+                                        **translate_kwgs)
+        
+        proteins = None
 
-        # write_genes_from_pangenome_file(pangenome_filename=pangenome, gene_filter = genes,
-        #                                         output=output, compress=compress, disable_bar=disable_bar)
+    if regions is not None:
+
+        # load pangenome when writing region sequence
+        check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_rgp=True, disable_bar=disable_bar)
 
-        # proteins = None
 
-    if any(x for x in [regions, genes, proteins, gene_families, prot_families]):
-        check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar)
 
-        if prot_families is not None:
-            write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar)
-        if gene_families is not None:
-            write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar)
-        if genes is not None:
-            write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar)
-        if proteins is not None:
-            write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar,
-                                        **translate_kwgs)
         if regions is not None:
             write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
  

From cfc9073aa4d797d068487fcd3e0e9d536d55d321 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Sep 2024 17:04:10 +0200
Subject: [PATCH 11/14] clean region seq writting

---
 ppanggolin/formats/writeSequences.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 83562ae6..90234462 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -516,14 +516,10 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         proteins = None
 
     if regions is not None:
-
         # load pangenome when writing region sequence
         check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_rgp=True, disable_bar=disable_bar)
 
-
-
-        if regions is not None:
-            write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
+        write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar)
  
 
 def launch(args: argparse.Namespace):

From bd592d091a4e19468f377b4081fcc870083fbaa4 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Thu, 19 Sep 2024 17:40:10 +0200
Subject: [PATCH 12/14] add docstring and clean fct not used

---
 ppanggolin/formats/readBinaries.py   | 118 +++++++++++++++++-----
 ppanggolin/formats/writeSequences.py | 145 ---------------------------
 2 files changed, 93 insertions(+), 170 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 4aec2c7a..5f1b79e9 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -311,17 +311,27 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa
                                           f"{output.absolute()}{'.gz' if compress else ''}")
 
 
-def read_rgp__genes_from_pangenome_file(h5f: tables.File) -> List[str]:
+def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> List[bytes]:
     """
+    Retrieves a list of RGP genes from the pangenome file.
+
+    :param h5f: The open HDF5 pangenome file containing RGP gene data.
+    :return: A list of gene names (as bytes) from the RGP.
     """
     rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)]
 
     return rgp_genes
 
 
-def get_families_from_genes(h5f: tables.File , genes:Set[str]):
+def get_families_from_genes(h5f: tables.File, genes: Set[bytes]) -> Set[bytes]:
     """
+    Retrieves gene families associated with a specified set of genes from the pangenome file.
+
+    :param h5f: The open HDF5 pangenome file containing gene family data.
+    :param genes: A set of gene names (as bytes) for which to retrieve the associated families.
+    :return: A set of gene family names (as bytes) associated with the specified genes.
     """
+    
     families = set()
     for row in read_chunks(h5f.root.geneFamilies, chunk=20000):
         if row['gene'] in genes:
@@ -329,10 +339,17 @@ def get_families_from_genes(h5f: tables.File , genes:Set[str]):
 
     return families
 
-def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str):
+def read_module_families_from_pangenome_file(h5f: tables.File, module_name: str) -> Set[bytes]:
     """
-    
+    Retrieves gene families associated with a specified module from the pangenome file.
+
+
+    :param h5f: The open HDF5 pangenome file containing module data.
+    :param module_name: The name of the module (as a string). The module ID is extracted from 
+                        the name by removing the "module_" prefix.
+    :return: A set of gene family names (as bytes) associated with the specified module.
     """
+    
     family_to_write = set()
     module_id = int(module_name[len("module_"):])
     module_table = h5f.root.modules
@@ -343,9 +360,16 @@ def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str):
 
     return family_to_write
 
-def get_families_matching_partition(h5f: tables.File, partition:str):
+def get_families_matching_partition(h5f: tables.File, partition: str) -> Set[bytes]:
     """
+    Retrieves gene families that match the specified partition.
+
+    :param h5f: The open HDF5 pangenome file containing gene family information.
+    :param partition: The partition name (as a string). If "all", all gene families are included. 
+                      Otherwise, it filters by the first letter of the partition.
+    :return: A set of gene family names (as bytes) that match the partition criteria.
     """
+
     family_to_write = set()
     
     gene_fam_info_table = h5f.root.geneFamiliesInfo
@@ -358,9 +382,18 @@ def get_families_matching_partition(h5f: tables.File, partition:str):
 
     return family_to_write
 
-def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]):
+def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[bytes]:
     """
+    Retrieves a set of genes that belong to the specified families.
+
+    This function reads the gene family data from an HDF5 pangenome file and returns
+    a set of genes that are part of the given list of gene families.
+
+    :param h5f: The open HDF5 pangenome file containing gene family data.
+    :param families: A list of gene families (as bytes) to filter genes by.
+    :return: A set of genes (as bytes) that belong to the specified families.
     """
+    
     matching_genes = set()
     
     gene_fam_table = h5f.root.geneFamilies
@@ -371,14 +404,22 @@ def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]):
 
     return matching_genes
 
-def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False):
+def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False) ->  Dict[int, List[str]]:
     """
+    Creates a mapping of sequence IDs to gene names.
+
+    :param h5f: The open HDF5 pangenome file containing gene sequence data.
+    :param genes: A list of gene names to include in the mapping (if `get_all_genes` is False).
+    :param get_all_genes: Boolean flag to indicate if all genes should be included in the mapping. 
+                          If set to True, all genes will be added regardless of the `genes` parameter.
+    :param disable_bar: Boolean flag to disable the progress bar if set to True.
+    :return: A dictionary mapping sequence IDs (integers) to lists of gene names (strings).
     """
 
     seq_id_to_genes = defaultdict(list)
     gene_seq_table = h5f.root.annotations.geneSequences
     match_count = 0
-    for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar):
+    for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="gene", disable=disable_bar):
         
         if get_all_genes or row['gene'] in genes:
             seq_id_to_genes[row['seqid']].append(row['gene'].decode())
@@ -388,25 +429,42 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F
 
     return seq_id_to_genes
 
-def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath, compress, seq_id_to_genes):
+def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compress: bool, seq_id_to_genes: Dict[int, List[str]], disable_bar:bool):
     """
+    Writes gene sequences from the pangenome file to an output file. 
+    
+    Only sequences whose IDs match the ones in `seq_id_to_genes` will be written.
+
+    :param h5f: The open HDF5 pangenome file containing sequence data.
+    :param outpath: The path to the output file where sequences will be written.
+    :param compress: Boolean flag to indicate whether output should be compressed.
+    :param seq_id_to_genes: A dictionary mapping sequence IDs to lists of gene names.
+    :param disable_bar: Boolean flag to disable the progress bar if set to True.
     """
     
     with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj:
+
         seq_table = h5f.root.annotations.sequences
-        #TODO add tqdm
 
-        for row in read_chunks(table=seq_table, chunk=20000):
-            if row["seqid"] in seq_id_to_genes:
+        with tqdm(total=len(seq_id_to_genes), unit="sequence", disable=disable_bar) as pbar:
 
-                for seq_name in  seq_id_to_genes[row["seqid"]]:
-                    
-                    file_obj.write(f">{seq_name}\n")
-                    file_obj.write(row["dna"].decode() + "\n")
+            for row in read_chunks(table=seq_table, chunk=20000):
+                if row["seqid"] in seq_id_to_genes:
+
+                    for seq_name in seq_id_to_genes[row["seqid"]]:
+                        file_obj.write(f">{seq_name}\n")
+                        file_obj.write(row["dna"].decode() + "\n")
+                
+
+                    pbar.update(1)
 
 
-def get_gene_to_genome(h5f):
+def get_gene_to_genome(h5f: tables.File) -> Dict[bytes, bytes]:
     """
+    Generates a mapping between gene IDs and their corresponding genome.
+
+    :param h5f: The open HDF5 pangenome file containing contig and gene annotations.
+    :return: A dictionary mapping gene IDs to genome names.
     """
     
     contig_id_to_genome = {row["ID"]:row['genome']  for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) }
@@ -416,8 +474,12 @@ def get_gene_to_genome(h5f):
     return gene_to_genome
 
 
-def get_family_to_genome_count(h5f) -> Dict[bytes, int]:
+def get_family_to_genome_count(h5f: tables.File) -> Dict[bytes, int]:
     """
+    Computes the number of unique genomes associated with each gene family.
+
+    :param h5f: The open HDF5 pangenome file containing contig, gene, and gene family data.
+    :return: A dictionary mapping gene family names (as bytes) to the count of unique genomes.
     """
     
     contig_id_to_genome = {row["ID"]:row['genome']  for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) }
@@ -432,8 +494,14 @@ def get_family_to_genome_count(h5f) -> Dict[bytes, int]:
 
     return family_to_genome_count
 
-def get_soft_core_families(h5f, soft_core:float):
+def get_soft_core_families(h5f: tables.File, soft_core: float) -> Set[bytes]:
     """
+    Identifies gene families that are present in at least a specified proportion of genomes.
+
+    :param h5f: The open HDF5 pangenome file containing gene family and genome data.
+    :param soft_core: The proportion of genomes (between 0 and 1) that a gene family must be present in
+                      to be considered a soft core family.
+    :return: A set of gene family names (as bytes) that are classified as soft core.
     """
     family_to_genome_count = get_family_to_genome_count(h5f)
     pangenome_info = read_info(h5f)
@@ -468,7 +536,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter)
         
         elif family_filter == "rgp":
-            rgp_genes = read_rgp__genes_from_pangenome_file(h5f)
+            rgp_genes = read_rgp_genes_from_pangenome_file(h5f)
             family_to_write = get_families_from_genes(h5f, rgp_genes)
 
         elif family_filter in ["softcore", "core"]:
@@ -483,7 +551,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
 
         seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write)
 
-        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
+        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar)
 
     logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
                                          f"of the gene families : '{outpath}{'.gz' if compress else ''}")
@@ -514,7 +582,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             parition_first_letter = family_filter[0].upper()
 
         elif family_filter == "rgp":
-            rgp_genes = read_rgp__genes_from_pangenome_file(h5f)
+            rgp_genes = read_rgp_genes_from_pangenome_file(h5f)
             family_to_write = get_families_from_genes(h5f, rgp_genes)
         
         elif family_filter.startswith("module_"):
@@ -528,7 +596,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa
 
         gene_fam_info_table = h5f.root.geneFamiliesInfo
 
-        # TODO improve tqdm with total being the number of family to write
+
         for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar):
             
             partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter))
@@ -577,7 +645,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_
             genes_to_write = get_genes_from_families(h5f, families)
 
         elif gene_filter == "rgp":
-            genes_to_write = read_rgp__genes_from_pangenome_file(h5f)
+            genes_to_write = read_rgp_genes_from_pangenome_file(h5f)
 
         elif gene_filter == "all":
             genes_to_write = []
@@ -585,7 +653,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_
 
         seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar)
 
-        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes)
+        write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar)
         
     logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
                                          f"of the gene families : '{outpath}{'.gz' if compress else ''}")
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 90234462..49ec4430 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -104,48 +104,6 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: i
     return seqdb
 
 
-def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95,
-                         compress: bool = False, disable_bar: bool = False) -> Path:
-    """
-    Write all nucleotide CDS sequences
-
-    :param pangenome: Pangenome object with gene families sequences
-    :param output: Path to output directory
-    :param genes: Selected partition of gene
-    :param soft_core: Soft core threshold to use
-    :param compress: Compress the file in .gz
-    :param disable_bar: Disable progress bar
-
-    :raises AttributeError: If the pangenome does not contain gene sequences
-    """
-
-    logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...")
-    outpath = output / f"{genes}_genes.fna"
-
-    genefams = set()
-    genes_to_write = []
-    if genes == "rgp":
-        logging.getLogger("PPanGGOLiN").info("Writing the gene nucleotide sequences in RGPs...")
-        for region in pangenome.regions:
-            genes_to_write.extend(region.genes)
-    else:
-        genefams = select_families(pangenome, genes, "gene nucleotide sequences", soft_core)
-
-    for fam in genefams:
-        genes_to_write.extend(fam.genes)
-
-    logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write")
-    if pangenome.status["geneSequences"] in ["inFile"]:
-        write_gene_sequences_from_pangenome_file(pangenome.file, outpath, set([gene.ID for gene in genes_to_write]),
-                                                 compress=compress, disable_bar=disable_bar)
-    elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
-        write_gene_sequences_from_annotations(genes_to_write, outpath, compress=compress, disable_bar=disable_bar)
-    else:
-        # this should never happen if the pangenome has been properly checked before launching this function.
-        raise AttributeError("The pangenome does not include gene sequences")
-    logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}{'.gz' if compress else ''}'")
-    return outpath
-
 
 def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_filter: str, soft_core: float = 0.95,
                                  compress: bool = False, keep_tmp: bool = False, tmp: Path = None,
@@ -189,109 +147,6 @@ def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_fil
             logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}'")
 
 
-def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]:
-    """
-    function used to filter down families to the given partition
-
-    :param pangenome: Pangenome object
-    :param partition: Selected partition
-    :param type_name: Which type of sequence we want. Gene families, protein, gene
-    :param soft_core: Soft core threshold to use
-
-    :return: Selected gene families
-    """
-    genefams = set()
-    if partition == 'all':
-        logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...")
-        genefams = pangenome.gene_families
-
-    elif partition in ['persistent', 'shell', 'cloud']:
-        logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...")
-        for fam in pangenome.gene_families:
-            if fam.named_partition == partition:
-                genefams.add(fam)
-
-    elif partition == "rgp":
-        logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...")
-        for region in pangenome.regions:
-            genefams |= set(region.families)
-
-    elif partition == "softcore":
-        logging.getLogger("PPanGGOLiN").info(
-            f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes")
-        threshold = pangenome.number_of_organisms * soft_core
-        for fam in pangenome.gene_families:
-            if fam.number_of_organisms >= threshold:
-                genefams.add(fam)
-
-    elif partition == "core":
-        logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} "
-                                             "gene families...")
-        for fam in pangenome.gene_families:
-            if fam.number_of_organisms == pangenome.number_of_organisms:
-                genefams.add(fam)
-
-    elif "module_" in partition:
-        logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...")
-        mod_id = int(partition.replace("module_", ""))
-        for mod in pangenome.modules:
-            # could be way more efficient with a dict structure instead of a set
-            if mod.ID == mod_id:
-                genefams |= set(mod.families)
-                break
-    return genefams
-
-
-def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, soft_core: float = 0.95,
-                         compress: bool = False, disable_bar=False):
-    """
-    Write representative nucleotide sequences of gene families
-
-    :param pangenome: Pangenome object with gene families sequences
-    :param output: Path to output directory
-    :param gene_families: Selected partition of gene families
-    :param soft_core: Soft core threshold to use
-    :param compress: Compress the file in .gz
-    :param disable_bar: Disable progress bar
-    """
-
-    outpath = output / f"{gene_families}_nucleotide_families.fasta"
-
-    genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families",
-                               soft_core)
-
-    write_gene_sequences_from_pangenome_file(pangenome.file, outpath, [fam.name for fam in genefams],
-                                             compress=compress, disable_bar=disable_bar)
-
-    logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences "
-                                         f"of the gene families : '{outpath}{'.gz' if compress else ''}")
-
-
-def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95,
-                         compress: bool = False, disable_bar: bool = False):
-    """
-    Write representative amino acid sequences of gene families.
-
-    :param pangenome: Pangenome object with gene families sequences
-    :param output: Path to output directory
-    :param prot_families: Selected partition of protein families
-    :param soft_core: Soft core threshold to use
-    :param compress: Compress the file in .gz
-    :param disable_bar: Disable progress bar
-    """
-
-    outpath = output / f"{prot_families}_protein_families.faa"
-
-    genefams = select_families(pangenome, prot_families, "representative amino acid sequences of the gene families",
-                               soft_core)
-
-    with write_compressed_or_not(outpath, compress) as fasta:
-        for fam in tqdm(genefams, unit="prot families", disable=disable_bar):
-            fasta.write('>' + fam.name + "\n")
-            fasta.write(fam.sequence + "\n")
-    logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:"
-                                         f"'{outpath}{'.gz' if compress else ''}'")
-
 
 def read_fasta_or_gff(file_path: Path) -> Dict[str, str]:
     """

From cd30d15c79491469fa78620713fd169a0e07f40e Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Fri, 20 Sep 2024 11:33:09 +0200
Subject: [PATCH 13/14] clean import

---
 ppanggolin/formats/writeSequences.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 49ec4430..e69f3a5a 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -5,7 +5,7 @@
 import logging
 import re
 from pathlib import Path
-from typing import Dict, Set, Iterable, Union
+from typing import Dict, Iterable, Union
 import tempfile
 import shutil
 
@@ -14,12 +14,11 @@
 
 # local libraries
 from ppanggolin.pangenome import Pangenome
-from ppanggolin.geneFamily import GeneFamily
 from ppanggolin.genome import Gene, Organism
 
 from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not,
                               restricted_float, detect_filetype, run_subprocess)
-from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_genes_from_pangenome_file,  write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file
+from ppanggolin.formats.readBinaries import check_pangenome_info, write_genes_from_pangenome_file,  write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file
 
 module_regex = re.compile(r'^module_\d+')  # \d == [0-9]
 poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]

From bc18e15d565eacd0c9eb55556e3816aeadbe8c44 Mon Sep 17 00:00:00 2001
From: JeanMainguy <jean.mainguy@outlook.fr>
Date: Fri, 20 Sep 2024 11:33:34 +0200
Subject: [PATCH 14/14] use set instead of list to speed gene selection

---
 ppanggolin/formats/readBinaries.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 5f1b79e9..b9499fe8 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -1,13 +1,10 @@
 #!/usr/bin/env python3
 
 # default libraries
-import dis
 import logging
-from optparse import Option
 from pathlib import Path
 from typing import Dict, Any, Iterator, Set, List, Tuple, Optional
 from collections import defaultdict
-import numpy
 
 # installed libraries
 from tqdm import tqdm
@@ -311,14 +308,14 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa
                                           f"{output.absolute()}{'.gz' if compress else ''}")
 
 
-def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> List[bytes]:
+def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> Set[bytes]:
     """
     Retrieves a list of RGP genes from the pangenome file.
 
     :param h5f: The open HDF5 pangenome file containing RGP gene data.
     :return: A list of gene names (as bytes) from the RGP.
     """
-    rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)]
+    rgp_genes = {row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)}
 
     return rgp_genes
 
@@ -404,7 +401,7 @@ def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[byte
 
     return matching_genes
 
-def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False) ->  Dict[int, List[str]]:
+def get_seqid_to_genes(h5f: tables.File, genes:Set[bytes], get_all_genes:bool = False, disable_bar:bool = False) ->  Dict[int, List[str]]:
     """
     Creates a mapping of sequence IDs to gene names.
 
@@ -429,6 +426,7 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F
 
     return seq_id_to_genes
 
+
 def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compress: bool, seq_id_to_genes: Dict[int, List[str]], disable_bar:bool):
     """
     Writes gene sequences from the pangenome file to an output file. 
@@ -549,7 +547,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa
             logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.")
             return
 
-        seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write)
+        seq_id_to_genes = get_seqid_to_genes(h5f, set(family_to_write))
 
         write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar)
 
@@ -627,6 +625,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_
     outpath = output / f"{gene_filter}_genes.fna"
     get_all_genes = False
 
+
     with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
 
         if gene_filter in ['persistent', 'shell', 'cloud', "softcore", "core"] or gene_filter.startswith("module_"):
@@ -648,7 +647,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_
             genes_to_write = read_rgp_genes_from_pangenome_file(h5f)
 
         elif gene_filter == "all":
-            genes_to_write = []
+            genes_to_write = set()
             get_all_genes = True
 
         seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar)