diff --git a/VERSION b/VERSION index 52cd5461..0da01cf8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.173 +1.2.187 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index ad0dd348..6c5ca044 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -9,6 +9,7 @@ from pathlib import Path import tempfile import time +from typing import List, Set, Tuple # installed libraries from tqdm import tqdm @@ -21,7 +22,7 @@ from ppanggolin.formats import write_pangenome -def check_annotate_args(args): +def check_annotate_args(args: argparse.Namespace): """Check That the given arguments are usable :param args: All arguments provide by user @@ -32,14 +33,14 @@ def check_annotate_args(args): raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " "or a file with the --gff option to load annotations from.") - if hasattr(args, "fasta") and args.fasta is not None: check_input_files(args.fasta, True) if hasattr(args, "anno") and args.anno is not None: check_input_files(args.anno, True) -def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, + +def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: Set[str], start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = ""): """ @@ -82,7 +83,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, position=position, product=product, local_identifier=gene_id, genetic_code=genetic_code) - contig[new_gene.start] = new_gene + contig.add(new_gene) else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, @@ -91,20 +92,19 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file - :param organism: Organism name - :param gbff_file_path: Path to corresponding GFF file + :param organism_name: Organism name + :param gbff_file_path: Path to corresponding GBFF file :param circular_contigs: list of contigs :param pseudo: Allow to read pseudogène :return: Organism complete and true for sequence in file """ - org = Organism(organism) - + organism = Organism(organism_name) logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] @@ -113,36 +113,29 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p while len(lines) != 0: line = lines.pop() # beginning of contig - contig = None - set_contig = False is_circ = False - contig_locus_id = None + contig_id = None if line.startswith('LOCUS'): if "CIRCULAR" in line.upper(): # this line contains linear/circular word telling if the dna sequence is circularized or not is_circ = True - contig_locus_id = line.split()[1] + # TODO maybe it could be a good thing to add a elif for linear + # and if circular or linear are not found raise a warning + + contig_id = line.split()[1] # If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS while not line.startswith('FEATURES'): - if line.startswith('VERSION'): + if line.startswith('VERSION') and line[12:].strip() != "": contig_id = line[12:].strip() - if contig_id != "": - try: - contig = org.get(contig_id) - except KeyError: - contig = Contig(contig_id, True if contig_id in circular_contigs else False) - org.add(contig) - set_contig = True line = lines.pop() - if not set_contig: - # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. - # Should be unique in a dataset, but if there's an update the contig ID - # might still be the same even though it should not(?) - try: - contig = org.get(contig_locus_id) - except KeyError: - contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) - org.add(contig) + # If no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. + # Should be unique in a dataset, but if there's an update + # the contig ID might still be the same even though it should not(?) + try: + contig = organism.get(contig_id) + except KeyError: + contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False) + organism.add(contig) # start of the feature object. dbxref = set() gene_name = "" @@ -160,8 +153,8 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p curr_type = line[5:21].strip() if curr_type != "": if useful_info: - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - contig.number_of_genes, gene_name, product, genetic_code, protein_id) + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, + obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -192,6 +185,9 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p pass # don't know what to do with that, ignoring for now. # there is a protein with a frameshift mecanism. + elif curr_type == 'source': # Get Contig length + start, end = map(int, map(str.strip, line[21:].split('..'))) + contig.length = end - start + 1 elif useful_info: # current info goes to current objtype, if it's useful. if line[21:].startswith("/db_xref"): dbxref.add(line.split("=")[1].replace('"', '').strip()) @@ -220,7 +216,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # end of contig if useful_info: # saving the last element... - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 @@ -237,30 +233,31 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # get each gene's sequence. for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) - return org, True + return organism, True -def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): +def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read annotation from GFF file :param organism: Organism name - :param gff_file_path: Path to corresponding GFF file - :param circular_contigs: list of contigs + :param gff_file_path: Path corresponding to GFF file + :param circular_contigs: List of circular contigs :param pseudo: Allow to read pseudogène - :return: Organism object and if there are sequences associate or not + :return: Organism object and if there are sequences associated or not """ (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9) - # missing values : source, score, frame. They are unused. + # Missing values: source, score, frame. They are unused. def get_gff_attributes(gff_fields: list) -> dict: - """ - Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff. + """Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :return: attributes get + :param gff_fields: A gff line stored as a list. Each element of the list is a column of the gff. + + :return: Attributes get """ attributes_field = [f for f in gff_fields[gff_attribute].strip().split(';') if len(f) > 0] attributes_get = {} @@ -276,7 +273,8 @@ def get_id_attribute(attributes_dict: dict) -> str: """ Gets the ID of the element from which the provided attributes_get were extracted. Raises an error if no ID is found. - :param attributes_dict: attributes from one gff line + + :param attributes_dict: Attributes from one gff line :return: CDS identifier """ @@ -302,11 +300,9 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - try: - contig = org.get(fields[1]) - except KeyError: - contig = Contig(fields[1], True if fields[1] in circular_contigs else False) - org.add(contig) + contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + org.add(contig) + contig.length = int(fields[-1]) - int(fields[3]) + 1 continue elif line.startswith('#'): # comment lines to be ignores by parsers @@ -326,32 +322,14 @@ def get_id_attribute(attributes_dict: dict) -> str: # if it's not found, we get the one under the 'ID' field which must exist # (otherwise not a gff3 compliant file) gene_id = get_id_attribute(attributes) - try: - name = attributes.pop('NAME') - except KeyError: - try: - name = attributes.pop('GENE') - except KeyError: - name = "" + name = attributes.pop('NAME', attributes.pop('GENE', "")) if "pseudo" in attributes or "pseudogene" in attributes: pseudogene = True - try: - product = attributes.pop('PRODUCT') - except KeyError: - product = "" - - try: - genetic_code = int(attributes.pop("TRANSL_TABLE")) - except KeyError: - genetic_code = 11 + product = attributes.pop('PRODUCT', "") + genetic_code = int(attributes.pop("TRANSL_TABLE", 11)) if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig - try: - contig = org.get(fields_gff[gff_seqname]) - except KeyError: - contig = Contig(fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) - org.add(contig) + contig = org.get(fields_gff[gff_seqname]) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) @@ -372,7 +350,7 @@ def get_id_attribute(attributes_dict: dict) -> str: # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": - contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length + contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -381,7 +359,7 @@ def get_id_attribute(attributes_dict: dict) -> str: return org, has_fasta -def launch_read_anno(args: tuple) -> (Organism, bool): +def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]: """ Allow to launch in multiprocessing the read of genome annotation :param args: Pack of argument for annotate_organism function @@ -391,8 +369,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file for one organism @@ -419,7 +397,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, p "You may be able to use --fasta instead.") -def chose_gene_identifiers(pangenome) -> bool: +def chose_gene_identifiers(pangenome: Pangenome) -> bool: """ Parses the pangenome genes to decide whether to use local_identifiers or ppanggolin generated gene identifiers. If the local identifiers are unique within the pangenome they are picked, otherwise ppanggolin ones are used. @@ -490,15 +468,15 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.parameters["annotation"]["read_annotations_from_file"] = True -def get_gene_sequences_from_fastas(pangenome, fasta_file): +def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]): """ Get gene sequences from fastas - :param pangenome: input pangenome - :param fasta_file: list of fasta file + :param pangenome: Input pangenome + :param fasta_files: list of fasta file """ fasta_dict = {} - for line in read_compressed_or_not(fasta_file): + for line in read_compressed_or_not(fasta_files): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file") @@ -506,15 +484,15 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): try: org = pangenome.get_organism(elements[0]) except KeyError: - raise KeyError(f"One of the genome in your '{fasta_file}' was not found in the pan." + raise KeyError(f"One of the genome in your '{fasta_files}' was not found in the pan." f" This might mean that the genome names between your annotation file and " f"your fasta file are different.") with read_compressed_or_not(elements[1]) as currFastaFile: - fasta_dict[org], _ = read_fasta(org, currFastaFile) + fasta_dict[org] = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys())) + missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " - f"{missing} are missing (out of {pangenome.number_of_organisms()}).") + f"{missing} are missing (out of {pangenome.number_of_organisms}).") for org in pangenome.organisms: for contig in org.contigs: @@ -532,7 +510,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): pangenome.status["geneSequences"] = "Computed" -def launch_annotate_organism(pack: tuple) -> Organism: +def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism: """ Allow to launch in multiprocessing the genome annotation :param pack: Pack of argument for annotate_organism function @@ -543,8 +521,8 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, - disable_bar: bool = False): + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, + procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome @@ -570,14 +548,13 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) - + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, norna, kingdom, allow_overlap, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", @@ -616,13 +593,16 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.getLogger("PPanGGOLiN").warning( - "You provided gff files without sequences, and you did not provide " - "fasta sequences. Thus it was not possible to get the gene sequences.") - logging.getLogger("PPanGGOLiN").warning( - "You will be able to proceed with your analysis ONLY if you provide " - "the clustering results in the next step.") - + logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, " + "and you did not provide fasta sequences. " + "Thus it was not possible to get the gene sequences.") + logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis " + "ONLY if you provide the clustering results in the next step.") + else: + if args.fasta: + logging.getLogger("PPanGGOLiN").warning("You provided fasta sequences " + "but your gff files were already with sequences." + "PPanGGOLiN will use sequences in GFF and not from your fasta.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) @@ -684,8 +664,6 @@ def parser_annot(parser: argparse.ArgumentParser): if __name__ == '__main__': """To test local change and allow using debugger""" - import tempfile - from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index afad1cca..c21dbdfd 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -9,7 +9,7 @@ from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Union +from typing import Dict, List, Union from pathlib import Path # local libraries @@ -79,6 +79,7 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str :return: Annotated genes in a list of gene objects """ + locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}") @@ -151,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int): +def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. :param org: Organism corresponding to fasta file @@ -162,13 +163,12 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in try: contigs = {} contig_seq = "" - all_contig_len = 0 contig = None for line in fna_file: if line.startswith('>'): if len(contig_seq) >= 1: # contig filter = 1 contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) contig_seq = "" try: contig = org.get(line.split()[0][1:]) @@ -179,7 +179,8 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) + except AttributeError as e: raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " @@ -187,7 +188,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in except Exception: # To manage other exception which can occur raise Exception("Unexpected error. Please check your input file and if everything looks fine, " "please post an issue on our github") - return contigs, all_contig_len + return contigs def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper: @@ -290,7 +291,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: return reverse_complement(contig_seq[gene.start - 1:gene.stop]) -def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, +def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", allow_overlap: bool = False, procedure: str = None) -> Organism: """ @@ -312,10 +313,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: fasta_file = read_compressed_or_not(file_name) - contig_sequences, all_contig_len = read_fasta(org, fasta_file) + contig_sequences = read_fasta(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user + all_contig_len = sum(len(contig) for contig in org.contigs) logging.getLogger("PPanGGOLiN").debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" @@ -325,16 +327,13 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, True if contig_name in circular_contigs else False) - org.add(contig) + contig = org.get(contig_name) + contig.is_circular = True if contig.name in circular_contigs else False for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): - contig[gene.start] = gene + contig.add(gene) elif isinstance(gene, RNA): contig.add_rna(gene) return org diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 8d2763cb..77637015 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = """ Main function to cluster pangenome gene sequences into families - :param pangenome: Annoatated Pangenome + :param pangenome: Annotated Pangenome :param tmpdir: Path to temporary directory :param cpu: number of CPU cores to use :param defrag: Allow to remove fragment diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 900a8c4e..968c1f2c 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -5,10 +5,9 @@ import logging import sys from pathlib import Path -# installed libraries -from typing import TextIO -from typing import List +from typing import TextIO, List, Dict, Tuple +# installed libraries from tables import Table from tqdm import tqdm import tables @@ -26,19 +25,21 @@ class Genedata: """ This is a general class storing unique gene-related data to be written in a specific genedata table - - :param start: Start position of a gene - :param stop: Stop position of a gene - :param strand: associated strand - :param gene_type: Type of gene - :param position: Position of the gene on its contig - :param name: Name of the feature - :param product: Associated product - :param genetic_code: associated genetic code, if any """ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, genetic_code: int): + """Constructor method + + :param start: Gene start position + :param stop: Gene stop position + :param strand: Associated strand + :param gene_type: Gene type + :param position: Position of the gene on its contig + :param name: Name of the feature + :param product: Associated product + :param genetic_code: associated genetic code, if any + """ self.start = start self.stop = stop self.strand = strand @@ -164,10 +165,12 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000): yield row -def read_genedata(h5f: tables.File) -> dict: +def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: """ Reads the genedata table and returns a genedata_id2genedata dictionnary + :param h5f: the hdf5 file handler + :return: dictionnary linking genedata to the genedata identifier """ table = h5f.root.annotations.genedata @@ -192,7 +195,7 @@ def read_sequences(h5f: tables.File) -> dict: :param h5f: the hdf5 file handler :return: dictionnary linking sequences to the seq identifier """ - table = h5f.root.sequences + table = h5f.root.annotations.sequences seqid2seq = {} for row in read_chunks(table, chunk=20000): seqid2seq[row["seqid"]] = row['dna'].decode() @@ -204,6 +207,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. + :param filename: Name of the pangenome file :param file_obj: Name of the output file :param list_cds: An iterable object of CDS @@ -212,7 +216,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences list_cds = set(list_cds) if list_cds is not None else None seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -225,60 +229,6 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f.close() -def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict, - link: bool = False): - """ - Read information from pangenome to assign to organism object - - :param pangenome: Input pangenome - :param org_name: Name of the organism - :param contig_dict: Dictionary with all contig and associate genes - :param circular_contigs: Dictionary of contigs - :param genedata_dict: dictionnary linking genedata to the genedata identifier - :param link: get the gene object if the genes are clustered - """ - org = Organism(org_name) - gene, gene_type = (None, None) - for contig_name, gene_list in contig_dict.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) - org.add(contig) - for row in gene_list: - if link: # if the gene families are already computed/loaded the gene exists. - gene = pangenome.get_gene(row["ID"].decode()) - else: # else creating the gene. - curr_genedata = genedata_dict[row["genedata_id"]] - gene_type = curr_genedata.gene_type - if gene_type == "CDS": - gene = Gene(row["ID"].decode()) - elif "RNA" in gene_type: - gene = RNA(row["ID"].decode()) - try: - local = row["local"].decode() - except ValueError: - local = "" - if isinstance(gene, Gene): - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, position=curr_genedata.position, - genetic_code=curr_genedata.genetic_code, product=curr_genedata.product, - local_identifier=local) - else: - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, - product=curr_genedata.product, local_identifier=local) - gene.is_fragment = row["is_fragment"] - gene.fill_parents(org, contig) - if gene_type == "CDS": - contig[gene.start] = gene - elif "RNA" in gene_type: - contig.add_rna(gene) - else: - raise Exception(f"A strange type '{gene_type}', which we do not know what to do with, was met.") - pangenome.add_organism(org) - - def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read information about graph in pangenome hdf5 file to add in pangenome object @@ -350,6 +300,7 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read gene sequences in pangenome hdf5 file to add in pangenome object + :param pangenome: Pangenome object without gene sequence associate to gene :param h5f: Pangenome HDF5 file with gene sequence associate to gene :param disable_bar: Disable the progress bar @@ -357,7 +308,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the pangenome gene dna sequences " "if the annotations have not been loaded.") - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -437,8 +388,94 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" +def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): + """Read organism table in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Organism table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ + contig2organism = {} + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): + organism = Organism(row["name"].decode()) + pangenome.add_organism(organism) + + +def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): + """Read contig table in pangenome file to add them to the pangenome object -def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + :param pangenome: Pangenome object + :param table: Contig table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): + contig = Contig(name=row["name"].decode()) + contig.is_circular = row["is_circular"] + contig.length = int(row["length"]) + try: + organism = pangenome.get_organism(row["organism"].decode()) + except KeyError: + pass + else: + organism.add(contig) + +def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read genes in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Genes table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + gene = Gene(row["ID"].decode()) + genedata = genedata_dict[row["genedata_id"]] + try: + local = row["local"].decode() + except ValueError: + local = "" + gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, + genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local) + gene.is_fragment = row["is_fragment"] + if link: + contig = pangenome.get_contig(row["contig"].decode()) + gene.fill_parents(contig.organism, contig) + contig.add(gene) + + +def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read RNAs in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: RNAs table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + rna = RNA(row["ID"].decode()) + genedata = genedata_dict[row["genedata_id"]] + rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, + product=genedata.product) + if link: + contig = pangenome.get_contig(row["contig"].decode()) + rna.fill_parents(contig.organism, contig) + contig.add_rna(rna) + + +def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True, + load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -447,36 +484,24 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = :param disable_bar: Disable the progress bar """ annotations = h5f.root.annotations - - table = annotations.genes - pangenome_dict = {} - circular_contigs = {} - - genedata_dict = read_genedata(h5f) - - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - decode_org = row["organism"].decode() - try: - # new gene, seen contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()].append(row["gene"]) - except KeyError: - try: - # new contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]] - circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"] - except KeyError: - # new org - pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]} - circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]} - - link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False - - for org_name, contig_dict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), - unit="organism", disable=disable_bar): - read_organism(pangenome, org_name, contig_dict, circular_contigs[org_name], genedata_dict, link) + genedata_dict = None + if load_organisms: + read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar) + + if load_contigs: + read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar) + + if load_genes: + genedata_dict = read_genedata(h5f) + read_genes(pangenome, annotations.genes, genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) + if load_rnas: + read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" + def read_info(h5f: tables.File): """ Read the pangenome content @@ -556,7 +581,16 @@ def read_modules_info(h5f: tables.File): f"\t\t\t- mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}") -def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, sources: List[str] = None, disable_bar: bool = False): +def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, + sources: List[str] = None, disable_bar: bool = False): + """Read metadata to add them to the pangenome object + + :param pangenome: Pangenome object + :param h5f: Pangenome file + :param metatype: Object type to associate metadata + :param sources: Source name of metadata + :param disable_bar: Disable progress bar + """ metadata_group = h5f.root.metadata._f_get_child(metatype) for source in sources: source_table = metadata_group._f_get_child(source) @@ -630,12 +664,14 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa fix_partitioned(pangenome.file) h5f = tables.open_file(filename, "r") - if annotation: + + if annotation: # I place annotation here, to link gene to gene families if organism are not loaded if h5f.root.status._v_attrs.genomesAnnotated: logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") + if gene_sequences: if h5f.root.status._v_attrs.geneSequences: logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...") @@ -652,6 +688,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception( f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") + if graph: if h5f.root.status._v_attrs.NeighborsGraph: logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...") @@ -659,6 +696,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have graph information, " f"or has been improperly filled") + if rgp: if h5f.root.status._v_attrs.predictedRGP: logging.getLogger("PPanGGOLiN").info("Reading the RGP...") @@ -666,6 +704,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " f"or has been improperly filled") + if spots: if h5f.root.status._v_attrs.spots: logging.getLogger("PPanGGOLiN").info("Reading the spots...") @@ -673,6 +712,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled") + if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") @@ -680,6 +720,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have modules information, " f"or has been improperly filled") + if metadata: assert metatype is not None if sources is None: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py new file mode 100644 index 00000000..b09e4166 --- /dev/null +++ b/ppanggolin/formats/writeAnnotations.py @@ -0,0 +1,439 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import logging +from typing import Dict, Tuple, Union + +# installed libraries +from tqdm import tqdm +import tables + +# local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.genome import Gene, RNA +from ppanggolin.formats.readBinaries import Genedata + + +genedata_counter = 0 + + +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int]: + """ + Get the maximum size of each annotation information to optimize disk space + + :param pangenome: Annotated pangenome + + :return: Maximum size of each annotation + """ + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = 1, 1, 1, 1, 1 + for org in pangenome.organisms: + if len(org.name) > max_org_len: + max_org_len = len(org.name) + for contig in org.contigs: + if len(contig.name) > max_contig_len: + max_contig_len = len(contig.name) + for gene in contig.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.local_identifier) > max_gene_local_id: + max_gene_local_id = len(gene.local_identifier) + for rna in contig.RNAs: + if len(rna.ID) > max_rna_id_len: + max_rna_id_len = len(rna.ID) + + return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id + + +def organism_desc(org_len: int) -> Dict[str, tables.StringCol]: + """ + Table description to save organism-related information + + :param org_len: Maximum size of organism name. + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=org_len)} + + +def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + organism_desc: Dict[str, tables.StringCol], disable_bar=False): + """Write organisms information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param organism_desc: Organisms table description. + :param disable_bar: Allow disabling progress bar + """ + organism_table = h5f.create_table(annotation, "genomes", organism_desc, + expectedrows=pangenome.number_of_organisms) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") + organism_row = organism_table.row + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + organism_row["name"] = org.name + organism_row.append() + organism_table.flush() + + +def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: + """Table description to save contig-related information + + :param contig_len: Maximum size of contig name + :param org_len: Maximum size of organism name. + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=contig_len), + "is_circular": tables.BoolCol(dflt=False), + 'length': tables.UInt32Col(), + "organism": tables.StringCol(itemsize=org_len)} + + +def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]], + disable_bar=False): + """Write contigs information in the pangenome file + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param contig_desc: Contigs table description + :param disable_bar: Allow disabling progress bar + """ + contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") + contig_row = contig_table.row + for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["length"] = len(contig) + contig_row["organism"] = contig.organism.name + contig_row.append() + contig_table.flush() + + +def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: + """Table description to save gene-related information + + :param id_len: Maximum size of gene name + :param max_local_id: Maximum size of gene local identifier + :param max_contig_len: Maximum size of contig identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col(), + 'local': tables.StringCol(itemsize=max_local_id), + 'is_fragment': tables.BoolCol(dflt=False), + 'contig': tables.StringCol(itemsize=max_contig_len)} + + +def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], + disable_bar=False) -> Dict[Genedata, int]: + """Write genes information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param gene_desc: Genes table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to gene identifier + """ + global genedata_counter + genedata2gene = {} + gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + gene_row = gene_table.row + for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes, unit="gene", disable=disable_bar): + gene_row["ID"] = gene.ID + gene_row["is_fragment"] = gene.is_fragment + gene_row["local"] = gene.local_identifier + gene_row["contig"] = gene.contig.name + genedata = get_genedata(gene) + genedata_id = genedata2gene.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2gene[genedata] = genedata_id + genedata_counter += 1 + gene_row["genedata_id"] = genedata_id + gene_row.append() + gene_table.flush() + return genedata2gene + + +def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: + """Table description to save rna-related information + + :param id_len: Maximum size of RNA identifier + :param max_contig_len: Maximum size of contig identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col(), + 'contig': tables.StringCol(itemsize=max_contig_len)} + + +def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]], + disable_bar=False) -> Dict[Genedata, int]: + """Write RNAs information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param rna_desc: RNAs table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to RNA identifier + """ + global genedata_counter + genedata2rna = {} + rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + rna_row = rna_table.row + for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): + rna_row["ID"] = rna.ID + rna_row["contig"] = rna.contig.name + genedata = get_genedata(rna) + genedata_id = genedata2rna.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2rna[genedata] = genedata_id + genedata_counter += 1 + rna_row["genedata_id"] = genedata_id + rna_row.append() + rna_table.flush() + return genedata2rna + + +def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: + """ + Creates a table for gene-related data + + :param type_len: Maximum size of gene Type. + :param name_len: Maximum size of gene name + :param product_len: Maximum size of gene product + :return: Formatted table for gene metadata + """ + return { + 'genedata_id': tables.UInt32Col(), + 'start': tables.UInt32Col(), + 'stop': tables.UInt32Col(), + 'strand': tables.StringCol(itemsize=1), + 'gene_type': tables.StringCol(itemsize=type_len), + 'position': tables.UInt32Col(), + 'name': tables.StringCol(itemsize=name_len), + 'product': tables.StringCol(itemsize=product_len), + 'genetic_code': tables.UInt32Col(dflt=11), + } + + +def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: + """ + Get the maximum size of each gene data information to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_name_len = 1 + max_product_len = 1 + max_type_len = 1 + for org in pangenome.organisms: + for contig in org.contigs: + for gene in contig.genes: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = len(gene.type) + for gene in contig.RNAs: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = len(gene.type) + + return max_type_len, max_name_len, max_product_len + + +def get_genedata(feature: Union[Gene, RNA]) -> Genedata: + """ + Gets the genedata type of Feature + + :param feature: Gene or RNA object + + :return: Tuple with a Feature associated data + """ + position = None + genetic_code = 11 + if isinstance(feature, Gene): + position = feature.position + genetic_code = feature.genetic_code + return Genedata(feature.start, feature.stop, feature.strand, feature.type, position, feature.name, + feature.product, genetic_code) + + +def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + genedata2gene: Dict[Genedata, int], disable_bar=False): + """Writting genedata information in pangenome file + + :param pangenome: Pangenome object filled with annotation. + :param h5f: Pangenome file + :param annotation: Annotation group in Table + :param genedata2gene: Dictionnary linking genedata to gene identifier. + :param disable_bar: Allow disabling progress bar + """ + try: + genedata_table = annotation.genedata + except tables.exceptions.NoSuchNodeError: + genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene)) + logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " + "(can be lower than the number of genes)") + genedata_row = genedata_table.row + for genedata, genedata_id in tqdm(genedata2gene.items(), unit="genedata", disable=disable_bar): + genedata_row["genedata_id"] = genedata_id + genedata_row["start"] = genedata.start + genedata_row["stop"] = genedata.stop + genedata_row["strand"] = genedata.strand + genedata_row["gene_type"] = genedata.gene_type + if genedata.gene_type == "CDS": + genedata_row["position"] = genedata.position + genedata_row["genetic_code"] = genedata.genetic_code + genedata_row["name"] = genedata.name + genedata_row["product"] = genedata.product + genedata_row.append() + genedata_table.flush() + + +def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, rec_contigs: bool = True, + rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): + """Function writing all the pangenome annotations + + :param pangenome: Annotated pangenome + :param h5f: Pangenome HDF5 file + :param rec_organisms: Allow writing organisms in pangenomes + :param rec_contigs: Allow writing contigs in pangenomes + :param rec_genes: Allow writing genes in pangenomes + :param rec_rnas: Allow writing RNAs in pangenomes + :param disable_bar: Alow to disable progress bar + """ + annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") + + org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = get_max_len_annotations(pangenome) + + # I add these boolean in case we would one day only load organism, contig or genes, without the other. + + if rec_organisms: + desc = organism_desc(org_len) + write_organisms(pangenome, h5f, annotation, desc, disable_bar) + if rec_contigs: + desc = contig_desc(contig_len, org_len) + write_contigs(pangenome, h5f, annotation, desc, disable_bar) + if rec_genes: + desc = gene_desc(gene_id_len, gene_local_id, contig_len) + genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) + if rec_rnas: + desc = rna_desc(rna_id_len, contig_len) + genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) + + +def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: + """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_gene_id_len = 1 + max_gene_type = 1 + for gene in pangenome.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.type) > max_gene_type: + max_gene_type = len(gene.type) + return max_gene_id_len, max_gene_type + + +def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: + """ + Create table to save gene sequences + + :param gene_id_len: Maximum size of gene sequence identifier + :param gene_type_len: Maximum size of gene type + + :return: Formated table + """ + return { + "gene": tables.StringCol(itemsize=gene_id_len), + "seqid": tables.UInt32Col(), + "type": tables.StringCol(itemsize=gene_type_len) + } + + +def get_sequence_len(pangenome: Pangenome) -> int: + """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_seq_len = 1 + for gene in pangenome.genes: + if len(gene.dna) > max_seq_len: + max_seq_len = len(gene.dna) + return max_seq_len + + +def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: + """ + Table description to save sequences + :param max_seq_len: Maximum size of gene type + :return: Formated table + """ + return { + "seqid": tables.UInt32Col(), + "dna": tables.StringCol(itemsize=max_seq_len) + } + + +def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Function writing all the pangenome gene sequences + :param pangenome: Pangenome with gene sequences + :param h5f: Pangenome HDF5 file without sequences + :param disable_bar: Disable progress bar + """ + gene_seq = h5f.create_table("/annotations", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), + expectedrows=pangenome.number_of_genes) + # process sequences to save them only once + seq2seqid = {} + id_counter = 0 + gene_row = gene_seq.row + for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", + disable=disable_bar): + curr_seq_id = seq2seqid.get(gene.dna) + if curr_seq_id is None: + curr_seq_id = id_counter + seq2seqid[gene.dna] = id_counter + id_counter += 1 + gene_row["gene"] = gene.ID + gene_row["seqid"] = curr_seq_id + gene_row["type"] = gene.type + gene_row.append() + gene_seq.flush() + + seq_table = h5f.create_table("/annotations", "sequences", sequence_desc(get_sequence_len(pangenome)), + expectedrows=len(seq2seqid)) + + seq_row = seq_table.row + for seq, seqid in seq2seqid.items(): + seq_row["dna"] = seq + seq_row["seqid"] = seqid + seq_row.append() + seq_table.flush() diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 18821b6b..dd0123ca 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -5,7 +5,7 @@ import logging from collections import Counter, defaultdict import statistics -from typing import Tuple +from typing import Tuple, Union import pkg_resources # installed libraries @@ -15,286 +15,12 @@ # local libraries from ppanggolin.pangenome import Pangenome +from ppanggolin.formats.writeAnnotations import write_annotations, write_gene_sequences from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata -def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: - """ - Create a table to save gene-related information - - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs - :param id_len: Maximum size of gene ID - - :param max_local_id: Maximum size of gene local identifier - - :return: Formatted table - """ - return { - 'organism': tables.StringCol(itemsize=org_len), - "contig": { - 'name': tables.StringCol(itemsize=contig_len), - "is_circular": tables.BoolCol(dflt=False) - }, - "gene": { - 'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col(), - 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False) - } - } - - -def genedata_desc(type_len, name_len, product_len): - """ - Creates a table for gene-related data - - :param type_len: Maximum size of gene Type - :param name_len: Maximum size of gene name - :param product_len: Maximum size of gene product - :return: Formatted table for gene metadata - """ - return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'strand': tables.StringCol(itemsize=1), - 'gene_type': tables.StringCol(itemsize=type_len), - 'position': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=name_len), - 'product': tables.StringCol(itemsize=product_len), - 'genetic_code': tables.UInt32Col(dflt=11), - } - - -def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int]: - """ - Get the maximum size of each annotation information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_org_len = 1 - max_contig_len = 1 - max_gene_id_len = 1 - max_local_id = 1 - for org in pangenome.organisms: - if len(org.name) > max_org_len: - max_org_len = len(org.name) - for contig in org.contigs: - if len(contig.name) > max_contig_len: - max_contig_len = len(contig.name) - for gene in contig.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - for gene in contig.RNAs: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - - return max_org_len, max_contig_len, max_gene_id_len, max_local_id - - -def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: - """ - Get the maximum size of each gene data information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_name_len = 1 - max_product_len = 1 - max_type_len = 1 - for org in pangenome.organisms: - for contig in org.contigs: - for gene in contig.genes: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - for gene in contig.RNAs: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - - return max_type_len, max_name_len, max_product_len - - -def get_genedata(gene: Feature) -> Genedata: - """ - Gets the genedata type of Feature - - :param gene: a Feature - :return: Tuple with a Feature associated data - """ - position = None - genetic_code = 11 - if gene.type == "CDS": - gene: Gene - position = gene.position - genetic_code = gene.genetic_code - return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, - gene.product, genetic_code) - - -def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome annotations - - :param pangenome: Annotated pangenome - :param h5f: Pangenome HDF5 file - :param disable_bar: Alow to disable progress bar - """ - annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") - gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=pangenome.number_of_genes) - - logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") - - genedata2gene = {} - genedata_counter = 0 - - gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - for contig in org.contigs: - for gene in list(contig.genes) + list(contig.RNAs): - gene_row["organism"] = org.name - gene_row["contig/name"] = contig.name - gene_row["contig/is_circular"] = contig.is_circular - gene_row["gene/ID"] = gene.ID - gene_row["gene/is_fragment"] = gene.is_fragment - if gene.type == "CDS": - gene_row["gene/local"] = gene.local_identifier - genedata = get_genedata(gene) - genedata_id = genedata2gene.get(genedata) - if genedata_id is None: - genedata_id = genedata_counter - genedata2gene[genedata] = genedata_id - genedata_counter += 1 - gene_row["gene/genedata_id"] = genedata_id - gene_row.append() - gene_table.flush() - - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) - logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") - genedata_row = genedata_table.row - for genedata, genedata_id in genedata2gene.items(): - genedata_row["genedata_id"] = genedata_id - genedata_row["start"] = genedata.start - genedata_row["stop"] = genedata.stop - genedata_row["strand"] = genedata.strand - genedata_row["gene_type"] = genedata.gene_type - if genedata.gene_type == "CDS": - genedata_row["position"] = genedata.position - genedata_row["genetic_code"] = genedata.genetic_code - genedata_row["name"] = genedata.name - genedata_row["product"] = genedata.product - genedata_row.append() - genedata_table.flush() - - -def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_gene_id_len = 1 - max_gene_type = 1 - for gene in pangenome.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.type) > max_gene_type: - max_gene_type = len(gene.type) - return max_gene_id_len, max_gene_type - - -def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: - """ - Create table to save gene sequences - :param gene_id_len: Maximum size of gene sequence identifier - :param gene_type_len: Maximum size of gene type - :return: Formated table - """ - return { - "gene": tables.StringCol(itemsize=gene_id_len), - "seqid": tables.UInt32Col(), - "type": tables.StringCol(itemsize=gene_type_len) - } - - -def get_sequence_len(pangenome: Pangenome) -> int: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_seq_len = 1 - for gene in pangenome.genes: - if len(gene.dna) > max_seq_len: - max_seq_len = len(gene.dna) - return max_seq_len - - -def sequence_desc(max_seq_len: int) -> dict: - """ - Table description to save sequences - :param max_seq_len: Maximum size of gene type - :return: Formated table - """ - return { - "seqid": tables.UInt32Col(), - "dna": tables.StringCol(itemsize=max_seq_len) - } - - -def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome gene sequences - :param pangenome: Pangenome with gene sequences - :param h5f: Pangenome HDF5 file without sequences - :param disable_bar: Disable progress bar - """ - gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=pangenome.number_of_genes) - # process sequences to save them only once - seq2seqid = {} - id_counter = 0 - gene_row = gene_seq.row - for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", disable=disable_bar): - curr_seq_id = seq2seqid.get(gene.dna) - if curr_seq_id is None: - curr_seq_id = id_counter - seq2seqid[gene.dna] = id_counter - id_counter += 1 - gene_row["gene"] = gene.ID - gene_row["seqid"] = curr_seq_id - gene_row["type"] = gene.type - gene_row.append() - gene_seq.flush() - - seq_table = h5f.create_table("/", "sequences", sequence_desc(get_sequence_len(pangenome)), - expectedrows=len(seq2seqid)) - - seq_row = seq_table.row - for seq, seqid in seq2seqid.items(): - seq_row["dna"] = seq - seq_row["seqid"] = seqid - seq_row.append() - seq_table.flush() - - def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: """ Create a formated table for gene families description @@ -880,9 +606,9 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b table = h5f.root.annotations.genes for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar): - genedata_id = row['gene/genedata_id'] + genedata_id = row['genedata_id'] if genedataid2genedata[genedata_id].gene_type == 'CDS': - row['gene/is_fragment'] = pangenome.get_gene(row['gene/ID'].decode()).is_fragment + row['is_fragment'] = pangenome.get_gene(row['ID'].decode()).is_fragment row.update() table.flush() diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 8017266e..205b08dc 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -311,6 +311,7 @@ def __init__(self, name: str, is_circular: bool = False): self._genes_getter = {} self._genes_position = [] self._organism = None + self._length = None def __str__(self) -> str: return self.name @@ -342,6 +343,23 @@ def __setitem__(self, start: int, gene: Gene): # TODO define eq function + @property + def length(self): + if self._length is None: + logging.getLogger("PPanGGOLiN").warning("Contig length is unknown") + return self._length + + @length.setter + def length(self, contig_len: int): + if not isinstance(contig_len, int): + raise TypeError("Contig length is expected to be an integer") + if contig_len < 0: + raise ValueError("Contig length must be positive") + self._length = contig_len + + def __len__(self): + return self.length + # retrieve gene by start position def __getitem__(self, position: int) -> Gene: """Get the gene for the given position @@ -489,6 +507,12 @@ def RNAs(self) -> Generator[RNA, None, None]: """ yield from self._rna_getter + @property + def number_of_rnas(self) -> int: + """Get the number of RNA in the contig + """ + return len(self._rna_getter) + class Organism(MetaFeatures): """ @@ -582,6 +606,13 @@ def __delitem__(self, name): except KeyError: raise KeyError("Position of the gene in the contig does not exist") + def __len__(self): + """ Get number of contigs in organism + + :return: Number of contigs in organism + """ + return len(self._contigs_getter.keys()) + @property def families(self): """Return the gene families present in the organism @@ -626,13 +657,6 @@ def contigs(self) -> Generator[Contig, None, None]: """ yield from self._contigs_getter.values() - def number_of_contigs(self) -> int: - """ Get number of contigs in organism - - :return: Number of contigs in organism - """ - return len(self._contigs_getter) - def add(self, contig: Contig): """Add a contig to organism diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ba35e4c1..b5bf7c88 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -8,7 +8,7 @@ from pathlib import Path # local libraries -from ppanggolin.genome import Organism, Gene +from ppanggolin.genome import Organism, Contig, Gene from ppanggolin.region import Region, Spot, Module from ppanggolin.geneFamily import GeneFamily from ppanggolin.edge import Edge @@ -38,7 +38,6 @@ def __init__(self): self._regionGetter = {} self._spotGetter = {} self._moduleGetter = {} - self.status = { 'genomesAnnotated': "No", 'geneSequences': "No", @@ -146,6 +145,25 @@ def number_of_genes(self) -> int: self._mk_gene_getter() # make it return len(self._geneGetter) + """RNAs methods""" + @property + def RNAs(self) -> Generator[Gene, None, None]: + """Generator of genes in the pangenome. + + :return: gene generator + """ + for org in self.organisms: + for contig in org.contigs: + yield from contig.RNAs + + @property + def number_of_rnas(self) -> int: + """Returns the number of gene present in the pangenome + + :return: The number of genes + """ + return sum(ctg.number_of_rnas for ctg in self.contigs) + """Gene families methods""" @property def max_fam_id(self): @@ -281,6 +299,51 @@ def number_of_organisms(self) -> int: """ return len(self._orgGetter) + @property + def contigs(self) -> Generator[Contig, None, None]: + for organism in self.organisms: + yield from organism.contigs + @property + def number_of_contigs(self) -> int: + """Returns the number of contigs present in the pangenome + + :return: The number of contigs + """ + return sum(len(org) for org in self.organisms) + + def _mk_contig_getter(self): + """ + Builds the attribute _contig_getter of the pangenome + + Since the genes are never explicitly 'added' to a pangenome (but rather to an organism), + the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. + If at some point we want to extract contig from a pangenome we'll create a contig_getter. + The assumption behind this is that the pangenome has been filled and no more contig will be added. + """ + self._contig_getter = {} + for contig in self.contigs: + self._contig_getter[contig.name] = contig + + def get_contig(self, name: str) -> Contig: + """Returns the contig that has the given name + + :param name: The ,ame of the contig to look for + + :return: Returns the wanted contig + + :raises AssertionError: If the `gene_id` is not an integer + :raises KeyError: If the `gene_id` is not in the pangenome + """ + assert isinstance(name, str), "Contig name should be a string" + + try: + return self._contig_getter[name] + except AttributeError: + # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. + self._mk_contig_getter() # make it + return self.get_contig(name) # Return what was expected. If geneID does not exist it will raise an error. + except KeyError: + raise KeyError(f"Contig: {name}, does not exist in the pangenome.") def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list index 1537925a..d7fb3bf5 100644 --- a/testingDataset/organisms.gbff.list +++ b/testingDataset/organisms.gbff.list @@ -50,4 +50,4 @@ GCF_006508265.1 GBFF/GCF_006508265.1_ASM650826v1_genomic.gbff.gz GCF_000068585.1 GBFF/NC_010287.1.gff.gz GCF_000008725.1 GBFF/NC_000117.1.gbk.gz GCF_001183765.1_gff GBFF/PROKKA_12132021.gff.gz -GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz +GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz \ No newline at end of file diff --git a/tests/test_genome.py b/tests/test_genome.py index 4b511b98..4119c943 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -523,7 +523,8 @@ def test_number_of_contigs(self, organism): """ organism.add(Contig('contig1')) organism.add(Contig('contig2')) - assert organism.number_of_contigs() == 2 + assert isinstance(len(organism), int) + assert len(organism) == 2 def test_get_families(self, organism, contig, gene): """Tests that gene families in an organism can be retrieved