Merge pull request #133 from labgem/AnnotHDF5Reformat

Annot hdf5 reformat
labgem · Sep 19, 2023 · 5513cd7 · 5513cd7
2 parents c2e8ef7 + e992a17
commit 5513cd7
Show file tree

Hide file tree

Showing 11 changed files with 775 additions and 504 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.2.173
+1.2.187
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
@@ -9,7 +9,7 @@
 from subprocess import Popen, PIPE
 import ast
 from collections import defaultdict
-from typing import Union
+from typing import Dict, List, Union
 from pathlib import Path
 
 # local libraries
@@ -79,6 +79,7 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
 
     :return: Annotated genes in a list of gene objects
     """
+
     locustag = org.name
     cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"]))
     logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}")
@@ -151,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str,  kingdom: str = "
     return gene_objs
 
 
-def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int):
+def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
     """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
 
     :param org: Organism corresponding to fasta file
@@ -162,13 +163,12 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
     try:
         contigs = {}
         contig_seq = ""
-        all_contig_len = 0
         contig = None
         for line in fna_file:
             if line.startswith('>'):
                 if len(contig_seq) >= 1:  # contig filter = 1
                     contigs[contig.name] = contig_seq.upper()
-                    all_contig_len += len(contig_seq)
+                    contig.length = len(contig_seq)
                 contig_seq = ""
                 try:
                     contig = org.get(line.split()[0][1:])
@@ -179,15 +179,16 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
                 contig_seq += line.strip()
         if len(contig_seq) >= 1:  # processing the last contig
             contigs[contig.name] = contig_seq.upper()
-            all_contig_len += len(contig_seq)
+            contig.length = len(contig_seq)
+
     except AttributeError as e:
         raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
                              f"One possibility for this error is that the file did not start with a '>' "
                              f"as it would be expected from a fna file.")
     except Exception:  # To manage other exception which can occur
         raise Exception("Unexpected error. Please check your input file and if everything looks fine, "
                         "please post an issue on our github")
-    return contigs, all_contig_len
+    return contigs
 
 
 def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper:
@@ -290,7 +291,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str:
         return reverse_complement(contig_seq[gene.start - 1:gene.stop])
 
 
-def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str,
+def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str,
                       code: int = 11, norna: bool = False, kingdom: str = "bacteria",
                       allow_overlap: bool = False, procedure: str = None) -> Organism:
     """
@@ -312,10 +313,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
 
     fasta_file = read_compressed_or_not(file_name)
 
-    contig_sequences, all_contig_len = read_fasta(org, fasta_file)
+    contig_sequences = read_fasta(org, fasta_file)
     if is_compressed(file_name):  # TODO simply copy file with shutil.copyfileobj
         fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
     if procedure is None:  # prodigal procedure is not force by user
+        all_contig_len = sum(len(contig) for contig in org.contigs)
         logging.getLogger("PPanGGOLiN").debug(all_contig_len)
         if all_contig_len < 20000:  # case of short sequence
             procedure = "meta"
@@ -325,16 +327,13 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
     genes = overlap_filter(genes, allow_overlap=allow_overlap)
 
     for contig_name, genes in genes.items():
-        try:
-            contig = org.get(contig_name)
-        except KeyError:
-            contig = Contig(contig_name, True if contig_name in circular_contigs else False)
-            org.add(contig)
+        contig = org.get(contig_name)
+        contig.is_circular = True if contig.name in circular_contigs else False
         for gene in genes:
             gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
             gene.fill_parents(org, contig)
             if isinstance(gene, Gene):
-                contig[gene.start] = gene
+                contig.add(gene)
             elif isinstance(gene, RNA):
                 contig.add_rna(gene)
     return org
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
@@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     """
     Main function to cluster pangenome gene sequences into families
 
-    :param pangenome: Annoatated Pangenome
+    :param pangenome: Annotated Pangenome
     :param tmpdir: Path to temporary directory
     :param cpu: number of CPU cores to use
     :param defrag: Allow to remove fragment