Skip to content

Commit

Permalink
Merge pull request #133 from labgem/AnnotHDF5Reformat
Browse files: browse the repository at this point in the history
Annot hdf5 reformat
  • Loading branch information
axbazin authored Sep 19, 2023
2 parents c2e8ef7 + e992a17 commit 5513cd7
Show file tree
Hide file tree
Showing 11 changed files with 775 additions and 504 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.173
1.2.187
176 changes: 77 additions & 99 deletions ppanggolin/annotate/annotate.py

Large diffs are not rendered by default.

27 changes: 13 additions & 14 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from subprocess import Popen, PIPE
import ast
from collections import defaultdict
from typing import Union
from typing import Dict, List, Union
from pathlib import Path

# local libraries
Expand Down Expand Up @@ -79,6 +79,7 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str
:return: Annotated genes in a list of gene objects
"""

locustag = org.name
cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"]))
logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}")
Expand Down Expand Up @@ -151,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "
return gene_objs


def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int):
def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]:
""" Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value.
:param org: Organism corresponding to fasta file
Expand All @@ -162,13 +163,12 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
try:
contigs = {}
contig_seq = ""
all_contig_len = 0
contig = None
for line in fna_file:
if line.startswith('>'):
if len(contig_seq) >= 1: # contig filter = 1
contigs[contig.name] = contig_seq.upper()
all_contig_len += len(contig_seq)
contig.length = len(contig_seq)
contig_seq = ""
try:
contig = org.get(line.split()[0][1:])
Expand All @@ -179,15 +179,16 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in
contig_seq += line.strip()
if len(contig_seq) >= 1: # processing the last contig
contigs[contig.name] = contig_seq.upper()
all_contig_len += len(contig_seq)
contig.length = len(contig_seq)

except AttributeError as e:
raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
f"One possibility for this error is that the file did not start with a '>' "
f"as it would be expected from a fna file.")
except Exception: # To manage other exception which can occur
raise Exception("Unexpected error. Please check your input file and if everything looks fine, "
"please post an issue on our github")
return contigs, all_contig_len
return contigs


def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper:
Expand Down Expand Up @@ -290,7 +291,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str:
return reverse_complement(contig_seq[gene.start - 1:gene.stop])


def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str,
def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str,
code: int = 11, norna: bool = False, kingdom: str = "bacteria",
allow_overlap: bool = False, procedure: str = None) -> Organism:
"""
Expand All @@ -312,10 +313,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:

fasta_file = read_compressed_or_not(file_name)

contig_sequences, all_contig_len = read_fasta(org, fasta_file)
contig_sequences = read_fasta(org, fasta_file)
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
if procedure is None: # prodigal procedure is not forced by the user
all_contig_len = sum(len(contig) for contig in org.contigs)
logging.getLogger("PPanGGOLiN").debug(all_contig_len)
if all_contig_len < 20000: # case of short sequence
procedure = "meta"
Expand All @@ -325,16 +327,13 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir:
genes = overlap_filter(genes, allow_overlap=allow_overlap)

for contig_name, genes in genes.items():
try:
contig = org.get(contig_name)
except KeyError:
contig = Contig(contig_name, True if contig_name in circular_contigs else False)
org.add(contig)
contig = org.get(contig_name)
contig.is_circular = True if contig.name in circular_contigs else False
for gene in genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
gene.fill_parents(org, contig)
if isinstance(gene, Gene):
contig[gene.start] = gene
contig.add(gene)
elif isinstance(gene, RNA):
contig.add_rna(gene)
return org
2 changes: 1 addition & 1 deletion ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
"""
Main function to cluster pangenome gene sequences into families
:param pangenome: Annoatated Pangenome
:param pangenome: Annotated Pangenome
:param tmpdir: Path to temporary directory
:param cpu: number of CPU cores to use
:param defrag: Allow to remove fragment
Expand Down
Loading

0 comments on commit 5513cd7

Please sign in to comment.