From bbef096786cb0754870e1e4b46e4924f80660cb1 Mon Sep 17 00:00:00 2001 From: jpjarnoux Date: Fri, 12 May 2023 15:56:26 +0200 Subject: [PATCH 01/75] Add pathlib to get path of input file in annot --- ppanggolin/annotate/annotate.py | 64 +++++------ ppanggolin/annotate/synta.py | 3 +- ppanggolin/formats/writeSequences.py | 3 +- ppanggolin/main.py | 21 ++-- ppanggolin/utils.py | 159 +++++++++++++++------------ 5 files changed, 126 insertions(+), 124 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 9faae5dc..7c5b240a 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -7,6 +7,8 @@ import logging import os import time +from pathlib import Path +from typing import List # installed libraries from tqdm import tqdm @@ -15,31 +17,14 @@ from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one +from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one, detect_filetype from ppanggolin.formats import write_pangenome -def detect_filetype(filename): - """ - Detects whether the current file is gff3, gbk/gbff, fasta or unknown. - If unknown, it will raise an error - - :param filename: path to file - - :return: current file type - """ - with read_compressed_or_not(filename) as f: - first_line = f.readline() - if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file - return "gbff" - elif first_line.startswith("##gff-version 3"): - return 'gff' - elif first_line.startswith(">"): - return 'fasta' - else: - raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') " - "nor gbff/gbk (file starts with 'LOCUS '). " - "Only those two file formats are supported (for now).") +def check_annotate_args(args): + if args.fasta is None and args.anno is None: + raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " + "or a file with the --gff option to load annotations from.") def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, @@ -94,7 +79,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): +def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file @@ -107,7 +92,7 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps """ org = Organism(organism) - logging.getLogger().debug(f"Extracting genes informations from the given gbff {gbff_file_path.split('/')[-1]}") + logging.getLogger().debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. 
lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] gene_counter = 0 @@ -238,7 +223,7 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps return org, True -def read_org_gff(organism: str, gff_file_path: str, circular_contigs, pseudo: bool = False) -> (Organism, bool): +def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): """ Read annotation from GFF file @@ -382,7 +367,7 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): """ Read a GBFF file for one organism @@ -434,7 +419,7 @@ def chose_gene_identifiers(pangenome) -> bool: return True -def read_annotations(pangenome: Pangenome, organisms_file: str, cpu: int = 1, pseudo: bool = False, +def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, pseudo: bool = False, disable_bar: bool = False): """ Read the annotation from GBFF file @@ -445,7 +430,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: str, cpu: int = 1, ps :param pseudo: allow to read pseudogène :param disable_bar: Disable the progresse bar """ - logging.getLogger().info("Reading " + organisms_file + " the list of organism files ...") + logging.getLogger().info(f"Reading {organisms_file.name} the list of organism files ...") pangenome.status["geneSequences"] = "Computed" # we assume there are gene sequences in the annotation files, @@ -455,7 +440,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: str, cpu: int = 1, ps elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: raise Exception(f"No tabulation separator found in given --fasta file: '{organisms_file}'") - args.append((elements[0], elements[1], elements[2:], pseudo)) + org_path = Path(elements[1]) + if not org_path.exists(): # Check tsv sanity test if it's not one it's the other + org_path = organisms_file.parent.joinpath(org_path) + args.append((elements[0], org_path, elements[2:], pseudo)) with get_context('fork').Pool(cpu) as p: for org, flag in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args), disable=disable_bar): @@ -531,7 +519,7 @@ def launch_annotate_organism(pack: tuple) -> Organism: return annotate_organism(*pack) -def annotate_pangenome(pangenome: Pangenome, fasta_list: str, tmpdir: str, cpu: int = 1, translation_table: int = 11, +def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, overlap: bool = True, contig_filter: int = 1, procedure: str = None, disable_bar: bool = False): """ @@ -555,9 +543,12 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: str, tmpdir: str, cpu: arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: + if len(elements) <= 1: # TODO remove ? 
Already tested by check TSV sanity raise Exception("No tabulation separator found in organisms file") - arguments.append((elements[0], elements[1], elements[2:], tmpdir, translation_table, + org_path = Path(elements[1]) + if not org_path.exists(): # Check tsv sanity test if it's not one it's the other + org_path = fasta_list.parent.joinpath(org_path) + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, norna, kingdom, overlap, contig_filter, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") @@ -587,8 +578,7 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - if not any([args.fasta, args.anno]): - raise Exception("At least one of --fasta or --anno must be given") + check_annotate_args(args) filename = mk_file_name(args.basename, args.output, args.force) pangenome = Pangenome() if args.fasta is not None and args.anno is None: @@ -630,16 +620,16 @@ def parser_annot(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('--fasta', required=False, type=str, + required.add_argument('--fasta', required=False, type=Path, help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " "sequence(s) (the fastas can be compressed with gzip). One line per organism.") - required.add_argument('--anno', required=False, type=str, + required.add_argument('--anno', required=False, type=Path, help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per organism. " "If this is provided, those annotations will be used.") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=str, + optional.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 23d5c195..5dd89468 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -10,6 +10,7 @@ import ast from collections import defaultdict from typing import Union +from pathlib import Path # local libraries from ppanggolin.genome import Organism, Gene, RNA @@ -291,7 +292,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: return reverse_complement(contig_seq[gene.start - 1:gene.stop]) -def annotate_organism(org_name: str, file_name: str, circular_contigs, tmpdir: str, +def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", overlap: bool = True, contig_filter: int = 1, procedure: str = None) -> Organism: """ diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 4f9a78b7..58f89da6 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -12,9 +12,8 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float +from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, 
get_gene_sequences_from_file -from ppanggolin.annotate.annotate import detect_filetype poss_values_log = "Possible values are 'all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', " \ "'core', 'module_X' with X being a module id." diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 87cfb0ce..b82b621f 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -13,7 +13,7 @@ # local modules import ppanggolin.pangenome -from ppanggolin.utils import check_log, check_input_files, set_verbosity_level +from ppanggolin.utils import check_log, check_input_files, set_verbosity_level, check_tsv_sanity import ppanggolin.nem.partition import ppanggolin.nem.rarefaction import ppanggolin.graph @@ -125,9 +125,6 @@ def cmd_line() -> argparse.Namespace: sys.exit(0) args = parser.parse_args() - if args.subcommand == "annotate" and args.fasta is None and args.anno is None: - raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " - "or a file with the --gff option to load annotations from.") return args @@ -138,15 +135,17 @@ def main(): """ args = cmd_line() - if hasattr(args, "pangenome"): - check_input_files(pangenome=args.pangenome) - if hasattr(args, "fasta"): - check_input_files(fasta=args.fasta) - if hasattr(args, "anno"): - check_input_files(anno=args.anno) - set_verbosity_level(args) + if hasattr(args, "pangenome"): + check_input_files(args.pangenome) + if hasattr(args, "fasta") and args.fasta is not None: + check_input_files(args.fasta) + check_tsv_sanity(args.fasta) + if hasattr(args, "anno") and args.anno is not None: + check_input_files(args.anno) + check_tsv_sanity(args.anno) + if args.subcommand == "annotate": ppanggolin.annotate.launch(args) elif args.subcommand == "cluster": diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index ecafa29e..9dd80eb3 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -43,56 +43,45 @@ def check_log(name: str) -> TextIO: return log_file -def check_tsv_sanity(tsv): +def check_tsv_sanity(tsv: Path): """ Check if the given tsv is readable for the next PPanGGOLiN step - :param tsv: Path to the input tsv + :param tsv: Path to the tsv containing organims informations """ - f = open(tsv, "r") - name_set = set() - duplicated_names = set() - non_existing_files = set() - for line in f: - elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given file: {tsv}") - if " " in elements[0]: - raise Exception(f"Your genome names contain spaces (The first encountered genome name that had this string:" - f" '{elements[0]}'). To ensure compatibility with all of the dependencies of PPanGGOLiN " - f"this is not allowed. Please remove spaces from your genome names.") - old_len = len(name_set) - name_set.add(elements[0]) - if len(name_set) == old_len: - duplicated_names.add(elements[0]) - if not os.path.exists(elements[1]): - non_existing_files.add(elements[1]) - if len(non_existing_files) != 0: - raise Exception(f"Some of the given files do not exist. The non-existing files are the following : " - f"'{' '.join(non_existing_files)}'") - if len(duplicated_names) != 0: - raise Exception(f"Some of your genomes have identical names. 
The duplicated names are the following : " - f"'{' '.join(duplicated_names)}'") - - -def check_input_files(anno: str = None, pangenome: str = None, fasta: str = None): + with open(tsv, "r") as organisms_file: + name_set = set() + duplicated_names = set() + non_existing_files = set() + for line in organisms_file: + elements = [el.strip() for el in line.split("\t")] + if len(elements) <= 1: + raise Exception(f"No tabulation separator found in given file: {tsv}") + if " " in elements[0]: + raise Exception(f"Your genome names contain spaces (The first encountered genome name that had this string:" + f" '{elements[0]}'). To ensure compatibility with all of the dependencies of PPanGGOLiN " + f"this is not allowed. Please remove spaces from your genome names.") + old_len = len(name_set) + name_set.add(elements[0]) + if len(name_set) == old_len: + duplicated_names.add(elements[0]) + org_path = Path(elements[1]) + if not org_path.exists() and not tsv.parent.joinpath(org_path).exists(): + non_existing_files.add(elements[1]) + if len(non_existing_files) != 0: + raise Exception(f"Some of the given files do not exist. The non-existing files are the following : " + f"'{' '.join(non_existing_files)}'") + if len(duplicated_names) != 0: + raise Exception(f"Some of your genomes have identical names. The duplicated names are the following : " + f"'{' '.join(duplicated_names)}'") + + +def check_input_files(file: Path): """ Checks if the provided input files exist and are of the proper format - :param anno: Path to the annotation file - :param pangenome: Path to the pangenome hdf5 file - :param fasta: path to the fasta file + :param file: Path to the file """ - if pangenome is not None and not os.path.exists(pangenome): - raise FileNotFoundError(f"No such file or directory: '{pangenome}'") - - if anno is not None: - if not os.path.exists(anno): - raise FileNotFoundError(f"No such file or directory: '{anno}'") - check_tsv_sanity(anno) - - if fasta is not None: - if not os.path.exists(fasta): - raise FileNotFoundError(f"No such file or directory: '{fasta}'") - check_tsv_sanity(fasta) + if not file.exists(): + raise FileNotFoundError(f"No such file or directory: '{file.absolute().as_posix()}'") def set_verbosity_level(args): @@ -138,7 +127,7 @@ def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: return similarities -def read_compressed_or_not(file_or_file_path: Union[str, BinaryIO, TextIOWrapper, TextIO]) -> Union[TextIOWrapper, +def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO]) -> Union[TextIOWrapper, BinaryIO, TextIO]: """ Reads a file object or file path, uncompresses it, if need be. 
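# [Editor's note] A minimal, hypothetical sketch of the idea behind read_compressed_or_not,
# whose refactored body follows in the next hunk: open the file in binary mode, peek at the
# first two bytes, and treat it as gzip only if they match the gzip magic number b'\x1f\x8b'.
# The name open_maybe_gzipped is illustrative and not part of the patch.
import gzip
from io import TextIOWrapper
from pathlib import Path

def open_maybe_gzipped(path: Path):
    raw = open(path, "rb")
    if raw.read(2) == b"\x1f\x8b":  # gzip streams always start with these two magic bytes
        raw.seek(0)
        return TextIOWrapper(gzip.open(raw, mode="r"))  # transparently decompress, read as text
    raw.close()
    return open(path, "r")  # not compressed: reopen as a plain text handle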
@@ -147,21 +136,21 @@ def read_compressed_or_not(file_or_file_path: Union[str, BinaryIO, TextIOWrapper :return: TextIO object in read only """ - file = file_or_file_path - if isinstance(file, str): - file = open(file, "rb") - else: + input_file = file_or_file_path + if isinstance(input_file, Path): + input_file = open(input_file, "rb") + else: # type BinaryIO, TextIOWrapper, TextIO try: - file = open(file.name, "rb") + input_file = open(input_file.name, "rb") except AttributeError: - return file - if file.read(2).startswith(b'\x1f\x8b'): - file.seek(0) - return TextIOWrapper(gzip.open(filename=file, mode="r")) + return input_file + if input_file.read(2).startswith(b'\x1f\x8b'): + input_file.seek(0) + return TextIOWrapper(gzip.open(filename=input_file, mode="r")) else: - file.close() - file = open(file.name, "r") - return file + input_file.close() + input_file = open(input_file.name, "r") + return input_file def write_compressed_or_not(file_path: str, compress: bool = False) -> Union[gzip.GzipFile, TextIO]: @@ -179,24 +168,24 @@ def write_compressed_or_not(file_path: str, compress: bool = False) -> Union[gzi return open(file_path, "w") -def is_compressed(file_or_file_path: Union[str, TextIO, gzip.GzipFile]): - """ Checks is a file, or file path given is compressed or not +def is_compressed(file_or_file_path: Union[Path, TextIO, gzip.GzipFile]): + """ Checks is a compressed_file, or compressed_file path given is compressed or not - :param file_or_file_path: Input file + :param file_or_file_path: Input compressed_file - :return: Get if the file is compressed + :return: Get if the compressed_file is compressed """ - file = file_or_file_path - if isinstance(file, str): - file = open(file, "rb") + compressed_file = file_or_file_path + if isinstance(compressed_file, Path): + compressed_file = open(compressed_file, "rb") else: try: - file = open(file.name, "rb") + compressed_file = open(compressed_file.name, "rb") except AttributeError: return False - if file.read(2).startswith(b'\x1f\x8b'): + if compressed_file.read(2).startswith(b'\x1f\x8b'): return True - file.close() + compressed_file.close() return False @@ -208,13 +197,15 @@ def mk_outdir(output, force): :raise FileExistError: The current path already exist and force is false """ - if not os.path.exists(output): - os.makedirs(output) - elif not force: - raise FileExistsError(f"{output} already exists. Use -f if you want to overwrite the files in the directory") + if not output.is_dir(): + logging.getLogger().debug(f"Create output directory {output.absolute().as_posix()}") + Path.mkdir(output) + else: + if not force: + raise FileExistsError(f"{output} already exists. Use -f if you want to overwrite the files in the directory") -def mk_file_name(basename: str, output: str, force: bool = False) -> Path: +def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: """Returns a usable filename for a ppanggolin output file, or crashes. :param basename: basename for the file @@ -223,7 +214,7 @@ def mk_file_name(basename: str, output: str, force: bool = False) -> Path: :return: Path to the file """ - filename = Path(output + "/" + basename) + filename = output/basename if filename.suffix != ".h5": filename = filename.with_suffix(".h5") @@ -234,6 +225,28 @@ def mk_file_name(basename: str, output: str, force: bool = False) -> Path: return filename +def detect_filetype(filename): + """ + Detects whether the current file is gff3, gbk/gbff, fasta or unknown. 
+ If unknown, it will raise an error + + :param filename: path to file + + :return: current file type + """ + with read_compressed_or_not(filename) as f: + first_line = f.readline() + if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file + return "gbff" + elif first_line.startswith("##gff-version 3"): + return 'gff' + elif first_line.startswith(">"): + return 'fasta' + else: + raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') " + "nor gbff/gbk (file starts with 'LOCUS '). " + "Only those two file formats are supported (for now).") + def restricted_float(x) -> float: """Decrease the choice possibility of float in argparse From 70f4e7166eca1144483a8ea1dccc5c9437600c96 Mon Sep 17 00:00:00 2001 From: jpjarnoux Date: Fri, 12 May 2023 16:07:05 +0200 Subject: [PATCH 02/75] Replace str by Path in cluster --- .github/workflows/main.yml | 2 +- ppanggolin/cluster/cluster.py | 9 +++++---- ppanggolin/formats/readBinaries.py | 4 ++-- ppanggolin/pangenome.py | 5 +++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1115de65..74af3a2f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -86,7 +86,7 @@ jobs: cd testingDataset ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang ppanggolin annotate --anno organisms.gbff.list --output readclusters - ppanggolin cluster --cluster clusters.tsv -p readclusters/pangenome.h5 + ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5 ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f cd - - name: testing align command diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index f3abf286..8366fefa 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -9,6 +9,7 @@ import os import argparse from typing import io +from pathlib import Path # installed libraries from networkx import Graph @@ -358,7 +359,7 @@ def infer_singletons(pangenome: Pangenome): logging.getLogger().info(f"Inferred {singleton_counter} singleton families") -def read_clustering(pangenome: Pangenome, families_tsv_file: str, infer_singleton: bool = False, force: bool = False, +def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, disable_bar: bool = False): """ Get the pangenome information, the gene families and the genes with an associated gene family. @@ -373,7 +374,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: str, infer_singleto check_pangenome_former_clustering(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) - logging.getLogger().info("Reading " + families_tsv_file + " the gene families file ...") + logging.getLogger().info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size families_tsv_file = read_compressed_or_not(families_tsv_file) frag = False # the genome annotations are necessarily loaded. 
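# [Editor's note] A condensed, hypothetical illustration of the path-resolution pattern added
# earlier in this series (annotate.read_annotations / annotate_pangenome and utils.check_tsv_sanity):
# a genome path listed in the organisms TSV is used as written if it exists, otherwise it is
# re-interpreted relative to the directory containing the TSV itself. The function name is illustrative.
from pathlib import Path

def resolve_org_path(tsv_file: Path, raw_path: str) -> Path:
    org_path = Path(raw_path)
    if not org_path.exists():  # not found as written, e.g. a path given relative to the TSV
        org_path = tsv_file.parent / raw_path
    return org_path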
@@ -471,7 +472,7 @@ def parser_clust(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") clust = parser.add_argument_group(title="Clustering arguments") clust.add_argument("--identity", required=False, type=restricted_float, default=0.8, help="Minimal identity percent for two proteins to be in the same cluster") @@ -488,7 +489,7 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") read = parser.add_argument_group(title="Read clustering arguments") - read.add_argument('--clusters', required=False, type=str, + read.add_argument('--clusters', required=False, type=Path, help="A tab-separated list containing the result of a clustering. One line per gene. " "First column is cluster ID, and second is gene ID") read.add_argument("--infer_singletons", required=False, action="store_true", diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index ffa5e48b..37cd35f6 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -4,7 +4,7 @@ # default libraries import logging import sys - +from pathlib import Path # installed libraries from typing import TextIO @@ -97,7 +97,7 @@ def fix_partitioned(pangenome: Pangenome, pangenome_file: str): del status_group._v_attrs.Partitionned h5f.close() -def get_status(pangenome: Pangenome, pangenome_file: str): +def get_status(pangenome: Pangenome, pangenome_file: Path): """ Checks which elements are already present in the file. diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 6cc43f3f..651d7e79 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -3,6 +3,7 @@ # default libraries from typing import Iterator, List, Union, Dict, Set, Iterable +from pathlib import Path # local libraries from ppanggolin.genome import Organism, Gene @@ -49,7 +50,7 @@ def __init__(self): } self.parameters = {} - def add_file(self, pangenome_file: str): + def add_file(self, pangenome_file: Path): """Links an HDF5 file to the pangenome. If needed elements will be loaded from this file, and anything that is computed will be saved to this file when :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. @@ -59,7 +60,7 @@ def add_file(self, pangenome_file: str): from ppanggolin.formats.readBinaries import get_status # importing on call instead of importing on top to avoid cross-reference problems. 
get_status(self, pangenome_file) - self.file = pangenome_file + self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" @property From 412bfbb09d51b8a729f3ce008fd9c1689219a777 Mon Sep 17 00:00:00 2001 From: jpjarnoux Date: Fri, 12 May 2023 16:07:57 +0200 Subject: [PATCH 03/75] Remove old option defrag --- ppanggolin/cluster/cluster.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 8366fefa..f05aa9d5 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -484,8 +484,6 @@ def parser_clust(parser: argparse.ArgumentParser): clust.add_argument('--no_defrag', required=False, default=False, action="store_true", help="DO NOT Use the defragmentation strategy to link potential fragments " "with their original gene family.") - clust.add_argument("--defrag", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old option "defrag" clust.add_argument("--translation_table", required=False, default="11", help="Translation table (genetic code) to use.") read = parser.add_argument_group(title="Read clustering arguments") From de84b00fb9c10768f808efd0e0a543ec0f84cb71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 30 May 2023 14:57:48 +0200 Subject: [PATCH 04/75] Rafactoring and change for pathlib --- VERSION | 2 +- ppanggolin/align/alignOnPang.py | 71 +++++++++++-------------- ppanggolin/context/searchGeneContext.py | 25 +++++---- ppanggolin/figures/draw_spot.py | 15 +++--- ppanggolin/utils.py | 4 +- 5 files changed, 54 insertions(+), 63 deletions(-) diff --git a/VERSION b/VERSION index d46997a2..4dbf8c6a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.106 +1.2.107 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 762917c9..f3eb54c8 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -8,7 +8,8 @@ import subprocess import argparse from collections import defaultdict -from typing import Tuple, Set, Dict +from typing import List, Tuple, Set, Dict, IO +from pathlib import Path # local libraries from ppanggolin.formats import check_pangenome_info @@ -19,7 +20,7 @@ from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph -def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> TextIOWrapper: +def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> IO: """ Create a MMseqs2 sequence database with the given fasta file @@ -35,9 +36,9 @@ def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> Te return seqdb -def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: str, +def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, no_defrag: bool = False, - identity: float = 0.8, coverage: float = 0.8) -> str: + identity: float = 0.8, coverage: float = 0.8) -> Path: """ Align pangenome sequences against fasta sequence @@ -64,8 +65,8 @@ def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: logging.getLogger().debug(" ".join(cmd)) logging.getLogger().info("Aligning sequences to cluster representatives...") subprocess.run(cmd, stdout=subprocess.DEVNULL) - outfile = output + "/input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results - cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile, "--format-mode", "2"] + outfile 
= output.absolute()/"input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results + cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.as_posix(), "--format-mode", "2"] logging.getLogger().debug(" ".join(cmd)) logging.getLogger().info("Extracting alignments...") subprocess.run(cmd, stdout=subprocess.DEVNULL) @@ -76,7 +77,7 @@ def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: return outfile -def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: +def read_alignments(aln_res: Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ Read alignment result to link input sequence to pangenome @@ -86,7 +87,7 @@ def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[Dict[str, GeneF :return: Dictionnary with sequence link to pangenome gene families and actual name of resulting alignment file """ seq2pang = {} - outname = open(aln_res.replace("_tmp", ""), "w") # write the actual result file + outname = open(aln_res.absolute().as_posix().replace("_tmp", ""), "w") # write the actual result file with open(aln_res, "r") as alnFile: for line in alnFile: line = line.replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id @@ -113,7 +114,7 @@ def get_seq(seq_file: TextIOWrapper) -> Set[str]: return seqset -def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: str = ""): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = ""): """ Export the sequence of genes in families @@ -127,7 +128,7 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: file_obj.flush() -def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: str) -> str: +def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> str: """ Project the partition of each sequence from the input file @@ -138,12 +139,12 @@ def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], out :return: Path to file which contain partition projection """ - partition_proj = output + "/sequences_partition_projection.tsv" + partition_proj = output.absolute()/"sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for key, pangFam in seq_to_pang.items(): - partProjFile.write(key + "\t" + pangFam.named_partition + "\n") - for remainingSeq in (seq_to_pang.keys() & seq_set): - partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. + for key, pang_fam in seq_to_pang.items(): + partProjFile.write(key + "\t" + pang_fam.named_partition + "\n") + for remaining_seq in (seq_to_pang.keys() & seq_set): + partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj @@ -166,7 +167,8 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: return fam2rgp -def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) -> Tuple[dict, dict]: +def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) -> Tuple[Dict[str, List[Spot]], + Dict[str, List[Spot]]]: """ Reads a pangenome object to link families and spots and indicate where each family is. 
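# [Editor's note] A small sketch (not part of the patch) of the pathlib idioms this commit
# substitutes for string concatenation in alignOnPang.py: the '/' operator joins path
# components, and .as_posix() converts back to a plain string where one is required, such as
# in an external command line. The MMseqs2 database names below are placeholders.
from pathlib import Path

output = Path("align_output")
outfile = output.absolute() / "input_to_pangenome_associations.blast-tab_tmp"
cmd = ["mmseqs", "convertalis", "seq_db", "pang_db", "aln_db", outfile.as_posix(), "--format-mode", "2"]
# subprocess.run(cmd) then receives ordinary strings, while outfile itself stays a Path object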
@@ -194,6 +196,7 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) -> Tuple def add_spot_str(spot: Spot) -> str: + # TODO define as self.__str__ in spot """ allow to map spot set @@ -204,7 +207,7 @@ def add_spot_str(spot: Spot) -> str: return "spot_" + str(spot.ID) -def draw_spot_gexf(spots: set, output: str, multigenics: set, fam_to_mod: dict, set_size: int = 3): +def draw_spot_gexf(spots: set, output: Path, multigenics: set, fam_to_mod: dict, set_size: int = 3): """ Draw a gexf graph of the spot @@ -215,11 +218,11 @@ def draw_spot_gexf(spots: set, output: str, multigenics: set, fam_to_mod: dict, :param set_size: """ for spot in spots: - fname = output + "/spot_" + str(spot.ID) + ".gexf" + fname = output/f"spot_{str(spot.ID)}.gexf" subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam_to_mod=fam_to_mod) -def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: str, draw_related: bool = False, disable_bar=False): +def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_related: bool = False, disable_bar=False): """ Get sequences information after alignment @@ -233,7 +236,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: str, draw_rela logging.getLogger().info("Writing RGP and spot information related to hits in the pan") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) - finfo = open(output + "/info_input_seq.tsv", "w") + finfo = open(output/"info_input_seq.tsv", "w") finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") fam2rgp = get_fam_to_rgp(pangenome, multigenics) fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) @@ -265,10 +268,10 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: str, draw_rela draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) logging.getLogger().info(f"File listing RGP and spots where sequences of interest are located : " - f"'{output + '/info_input_seq.tsv'}'") + f"{output/'info_input_seq.tsv'}") -def get_seq2pang(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: tempfile.TemporaryDirectory, +def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8) -> Tuple[set, str, dict]: """ @@ -300,7 +303,7 @@ def get_seq2pang(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: return seq_set, align_file, seq2pang -def align(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: str, identity: float = 0.8, +def align(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, draw_related: bool = False, disable_bar: bool = False): """ @@ -359,10 +362,6 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - if args.interest or args.fig_margin or args.label_priority: - logging.getLogger().warning("Options --interest, --fig_margin and --label_priority are deprecated, " - "and the actions they defined are now doable directly in the interactive figures " - "that are drawn") align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo, 
draw_related=args.draw_related, disable_bar=args.disable_prog_bar) @@ -389,16 +388,14 @@ def parser_align(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-S', '--sequences', required=True, type=str, + required.add_argument('-S', '--sequences', required=True, type=Path, help="sequences (nucleotides or amino acids) to align on the pangenome gene families") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") + required.add_argument('-o', '--output', required=True, type=Path, help="Output directory where the file(s) will be written") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--defrag", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old option "defrag" optional.add_argument('--no_defrag', required=False, action="store_true", help="DO NOT Realign gene families to link fragments with" "their non-fragmented gene family. (default: False)") @@ -414,14 +411,6 @@ def parser_align(parser: argparse.ArgumentParser): optional.add_argument("--draw_related", required=False, action="store_true", help="Draw figures and provide graphs in a gexf format of the eventual spots" " associated to the input sequences") - optional.add_argument("--interest", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option - optional.add_argument("--fig_margin", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option - optional.add_argument("--label_priority", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API # but does not use the option optional.add_argument("--use_pseudo", required=False, action="store_true", help="In the context of provided annotation, use this option to read pseudogenes. 
" @@ -440,7 +429,7 @@ def parser_align(parser: argparse.ArgumentParser): common = main_parser.add_argument_group(title="Common argument") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 75e2350c..91dfa419 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -6,7 +6,7 @@ import tempfile import time import logging - +from pathlib import Path # installed libraries from tqdm import tqdm import networkx as nx @@ -21,8 +21,8 @@ from ppanggolin.region import GeneContext -def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, - families: str = None, transitive: int = 4, identity: float = 0.5, +def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequences: Path = None, + families: Path = None, transitive: int = 4, identity: float = 0.5, coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False, cpu: int = 1, disable_bar=True): """ @@ -53,8 +53,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: if sequences is not None: # Alignment of sequences on pangenome families new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, - coverage) + seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, new_tmpdir, cpu, no_defrag, identity, coverage) project_partition(seq2pan, seq_set, output) new_tmpdir.cleanup() for k, v in seq2pan.items(): @@ -84,7 +83,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: if len(families) != 0: export_to_dataframe(families, common_components, fam_2_seq, output) else: - logging.getLogger().info(f"No gene contexts were found") + logging.getLogger().info("No gene contexts were found") logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") @@ -204,7 +203,7 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out """ Export the results into dataFrame :param families: Families related to the connected components - :param gene_contexts: connected components found in the pan + :param gene_contexts: connected components found in the pangenome :param fam_to_seq: Dictionary with gene families as keys and list of sequence ids as values :param output: output path """ @@ -270,14 +269,14 @@ def parser_context(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome.h5 file") + required.add_argument('-o', 
'--output', required=True, type=Path, help="Output directory where the file(s) will be written") onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") - onereq.add_argument('-S', '--sequences', required=False, type=str, + onereq.add_argument('-S', '--sequences', required=False, type=Path, help="Fasta file with the sequences of interest") - onereq.add_argument('-F', '--family', required=False, type=str, - help="List of family IDs of interest from the pan") + onereq.add_argument('-F', '--family', required=False, type=Path, + help="List of family IDs of interest from the pangenome") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('--no_defrag', required=False, action="store_true", @@ -306,7 +305,7 @@ def parser_context(parser: argparse.ArgumentParser): parser_context(main_parser) common = main_parser.add_argument_group(title="Common argument") - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 202dd102..09dd2ec2 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -8,6 +8,8 @@ import random from math import pi import sys +from typing import List, Set, Union +from pathlib import Path # installed libraries from scipy.spatial.distance import pdist @@ -548,8 +550,8 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam save(column(fig, row(labels_tools, gene_tools), row(genome_tools))) -def draw_selected_spots(selected_spots: list, pangenome: Pangenome, output: str, overlapping_match: int, - exact_match: int, set_size: int, disable_bar: bool = False): +def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: Pangenome, output: Path, + overlapping_match: int, exact_match: int, set_size: int, disable_bar: bool = False): """ Draw only the selected spot and give parameters @@ -573,10 +575,10 @@ def draw_selected_spots(selected_spots: list, pangenome: Pangenome, output: str, for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): - fname = output + '/spot_' + str(spot.ID) + fname = output/f"spot_{str(spot.ID)}" # write rgps representatives and the rgps they are identical to - out_struc = open(fname + '_identical_rgps.tsv', 'w') + out_struc = open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') out_struc.write('representative_rgp\trepresentative_rgp_organism\tidentical_rgp\tidentical_rgp_organism\n') for keyRGP, otherRGPs in spot.get_uniq_to_rgp().items(): for rgp in otherRGPs: @@ -618,8 +620,9 @@ def draw_selected_spots(selected_spots: list, pangenome: Pangenome, output: str, uniq_gene_lists.append(genelist) ordered_counts.append(curr_genelist_count) - draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname) - subgraph(spot, fname + ".gexf", set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) + draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix()) + subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size, + multigenics=multigenics, 
fam_to_mod=fam2mod) logging.getLogger().info(f"Done drawing spot(s), they can be found in the directory: '{output}'") diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 9dd80eb3..84c62cb5 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -128,7 +128,7 @@ def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrapper, TextIO]) -> Union[TextIOWrapper, - BinaryIO, TextIO]: + BinaryIO, TextIO]: """ Reads a file object or file path, uncompresses it, if need be. @@ -189,7 +189,7 @@ def is_compressed(file_or_file_path: Union[Path, TextIO, gzip.GzipFile]): return False -def mk_outdir(output, force): +def mk_outdir(output: Path, force: bool = False): """ Create a directory at the given output if it doesn't exist already :param output: Path where to create directory From 074e8b2ccaeb57274f7f310215a6d8d5ad8eb999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 6 Jun 2023 15:12:51 +0200 Subject: [PATCH 05/75] Refactor info --- VERSION | 2 +- ppanggolin/figures/draw_spot.py | 60 ++++++++++++++++----------------- ppanggolin/info/info.py | 6 ++-- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/VERSION b/VERSION index 4dbf8c6a..5870e331 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.107 +1.2.108 diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 09dd2ec2..ad360ec7 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -81,12 +81,12 @@ def row_order_gene_lists(gene_lists: list) -> list: :return : An ordered genes list """ fam_dict = defaultdict(set) - #if there is only one, ordering is useless + # if there is only one, ordering is useless if len(gene_lists) == 1: return gene_lists if len(gene_lists) > sys.getrecursionlimit(): - sys.setrecursionlimit(len(gene_lists))#we need the recursion limit to be higher than the number of regions. + sys.setrecursionlimit(len(gene_lists)) # we need the recursion limit to be higher than the number of regions. 
for index, genelist in enumerate([genelist[0] for genelist in gene_lists]): for gene in genelist: @@ -95,10 +95,10 @@ def row_order_gene_lists(gene_lists: list) -> list: all_indexes = [] all_columns = [] data = [] - for famIndex, RGPindexes in enumerate(fam_dict.values()): - all_indexes.extend([famIndex] * len(RGPindexes)) - all_columns.extend(RGPindexes) - data.extend([1.0] * len(RGPindexes)) + for fam_index, rgp_indexes in enumerate(fam_dict.values()): + all_indexes.extend([fam_index] * len(rgp_indexes)) + all_columns.extend(rgp_indexes) + data.extend([1.0] * len(rgp_indexes)) mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(fam_dict), len(gene_lists)), dtype='float') dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense()) @@ -126,29 +126,29 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: to_classify = set(range(1, len(gene_lists))) # the others may (or may not) have it while len(to_classify) != 0: - for classIndex in classified: - base_border1 = [gene.family for gene in gene_lists[classIndex][1][0]] - base_border2 = [gene.family for gene in gene_lists[classIndex][1][1]] - for unclassIndex in list(to_classify): - border1 = [gene.family for gene in gene_lists[unclassIndex][1][0]] - border2 = [gene.family for gene in gene_lists[unclassIndex][1][1]] + for class_index in classified: + base_border1 = [gene.family for gene in gene_lists[class_index][1][0]] + base_border2 = [gene.family for gene in gene_lists[class_index][1][1]] + for unclass_index in list(to_classify): + border1 = [gene.family for gene in gene_lists[unclass_index][1][0]] + border2 = [gene.family for gene in gene_lists[unclass_index][1][1]] if comp_border(base_border1, border1, overlapping_match, set_size, exact_match) and \ comp_border(base_border2, border2, overlapping_match, set_size, exact_match): - to_classify.discard(unclassIndex) - new_classify.add(unclassIndex) + to_classify.discard(unclass_index) + new_classify.add(unclass_index) elif comp_border(base_border2, border1, overlapping_match, set_size, exact_match) and \ comp_border(base_border1, border2, overlapping_match, set_size, exact_match): # reverse the order of the genes to match the 'reference' - gene_lists[unclassIndex][0] = gene_lists[unclassIndex][0][::-1] + gene_lists[unclass_index][0] = gene_lists[unclass_index][0][::-1] # inverse the borders - former_border_1 = gene_lists[unclassIndex][1][0] - former_border_2 = gene_lists[unclassIndex][1][1] - gene_lists[unclassIndex][1][0] = former_border_2 - gene_lists[unclassIndex][1][1] = former_border_1 + former_border_1 = gene_lists[unclass_index][1][0] + former_border_2 = gene_lists[unclass_index][1][1] + gene_lists[unclass_index][1][0] = former_border_2 + gene_lists[unclass_index][1][1] = former_border_1 # specify the new 'classified' and remove from unclassified - to_classify.discard(unclassIndex) - new_classify.add(unclassIndex) + to_classify.discard(unclass_index) + new_classify.add(unclass_index) classified |= new_classify # the newly classified will help to check the unclassified, # the formerly classified are not useful for what remains (if something remains) new_classify = set() @@ -228,8 +228,8 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD "family": [], "product": [], "x_label": [], "y_label": [], "label": [], "gene_type": [], 'gene_ID': [], "gene_local_ID": []} - for index, GeneList in enumerate(genelists): - genelist = GeneList[0] + for index, gene_list in enumerate(genelists): + genelist = gene_list[0] if 
genelist[0].start < genelist[1].start: # if the order has been inverted, positionning elements on the figure is different @@ -244,7 +244,7 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD df["strand"].append(gene.strand) df["start"].append(gene.start) df["stop"].append(gene.stop) - df["length"].append(max([gene.stop, gene.start])-min([gene.stop, gene.start])) + df["length"].append(max([gene.stop, gene.start]) - min([gene.stop, gene.start])) df["gene_type"].append(gene.type) df["product"].append(gene.product) df["gene_local_ID"].append(gene.local_identifier) @@ -425,8 +425,8 @@ def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, lis """ df = {"name": [], "width": [], "occurrences": [], 'x': [], 'y': [], "x_label": []} - for index, GeneList in enumerate(gene_lists): - genelist = GeneList[0] + for index, gene_list in enumerate(gene_lists): + genelist = gene_list[0] df["occurrences"].append(ordered_counts[index]) df["y"].append(index * 10) if genelist[0].start < genelist[1].start: @@ -575,14 +575,14 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): - fname = output/f"spot_{str(spot.ID)}" + fname = output / f"spot_{str(spot.ID)}" # write rgps representatives and the rgps they are identical to out_struc = open(fname.absolute().as_posix() + '_identical_rgps.tsv', 'w') out_struc.write('representative_rgp\trepresentative_rgp_organism\tidentical_rgp\tidentical_rgp_organism\n') - for keyRGP, otherRGPs in spot.get_uniq_to_rgp().items(): - for rgp in otherRGPs: - out_struc.write(f"{keyRGP.name}\t{keyRGP.organism.name}\t{rgp.name}\t{rgp.organism.name}\n") + for key_rgp, other_rgps in spot.get_uniq_to_rgp().items(): + for rgp in other_rgps: + out_struc.write(f"{key_rgp.name}\t{key_rgp.organism.name}\t{rgp.name}\t{rgp.organism.name}\n") out_struc.close() fams = set() diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index a21d2ea6..2ba22142 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -3,10 +3,8 @@ # default libraries import argparse - +from pathlib import Path # installed libraries -import time - import tables # local libraries @@ -97,7 +95,7 @@ def parser_info(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="The following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") options = parser.add_argument_group(title="optional arguments") options.add_argument("--parameters", required=False, action="store_true", From 8787c4adb98491f7fd891b4b7ec204f3893edb1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 15 Jun 2023 14:21:57 +0200 Subject: [PATCH 06/75] Refactor and replace str by path in arguments --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/RGP/spot.py | 2 +- ppanggolin/align/alignOnPang.py | 2 +- ppanggolin/annotate/annotate.py | 2 +- ppanggolin/context/searchGeneContext.py | 8 +- ppanggolin/edge.py | 2 +- ppanggolin/figures/draw_spot.py | 4 +- ppanggolin/figures/drawing.py | 9 +- ppanggolin/figures/tile_plot.py | 10 +- ppanggolin/figures/ucurve.py | 20 ++-- ppanggolin/formats/readBinaries.py | 95 ++++++++-------- ppanggolin/formats/writeBinaries.py | 80 ++++++------- 
ppanggolin/formats/writeFlat.py | 144 ++++++++++++------------ ppanggolin/formats/writeSequences.py | 2 +- ppanggolin/geneFamily.py | 2 +- ppanggolin/genome.py | 2 +- ppanggolin/info/info.py | 2 +- ppanggolin/main.py | 10 +- ppanggolin/metrics/fluidity.py | 8 +- ppanggolin/metrics/metrics.py | 6 +- ppanggolin/nem/rarefaction.py | 4 +- ppanggolin/pangenome.py | 33 ++++-- ppanggolin/region.py | 25 ++-- tests/test_Pangenome.py | 2 +- 25 files changed, 258 insertions(+), 220 deletions(-) diff --git a/VERSION b/VERSION index 5870e331..88941813 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.108 +1.2.109 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 99adb011..804b87c4 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -301,7 +301,7 @@ def parser_rgp(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pan', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('--persistent_penalty', required=False, type=int, default=3, diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 356273be..0fd596ae 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -191,7 +191,7 @@ def predict_hotspots(pangenome: Pangenome, output: str, spot_graph: bool = False logging.getLogger().info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) - logging.getLogger().info("Detecting hotspots in the pan...") + logging.getLogger().info("Detecting hotspots in the pangenome...") # predict spots spots = make_spot_graph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index f3eb54c8..a4450282 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -233,7 +233,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel :param disable_bar: disable progress bar :return: """ - logging.getLogger().info("Writing RGP and spot information related to hits in the pan") + logging.getLogger().info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) finfo = open(output/"info_input_seq.tsv", "w") diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 7c5b240a..d99ca4ed 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -411,7 +411,7 @@ def chose_gene_identifiers(pangenome) -> bool: if len(local_to_gene_id) != len(gene_id_2_local): # then, there are non unique local identifiers return False - # if we reach this line, local identifiers are unique within the pan + # if we reach this line, local identifiers are unique within the pangenome for gene in pangenome.genes: gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers gene.local_identifier = "" # this is now useless, setting it to default value diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 91dfa419..cbf8cfca 100644 --- a/ppanggolin/context/searchGeneContext.py +++ 
b/ppanggolin/context/searchGeneContext.py @@ -90,13 +90,13 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: """ - Construct the graph of gene contexts between families of the pan + Construct the graph of gene contexts between families of the pangenome :param families: Gene families of interest :param t: transitive value :param disable_bar: Prevents progress bar printing - :return: Graph of gene contexts between interesting gene families of the pan + :return: Graph of gene contexts between interesting gene families of the pangenome """ g = nx.Graph() @@ -114,7 +114,7 @@ def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos """ Compute graph of gene contexts between one gene and the other part of the contig - :param: Graph of gene contexts between interesting gene families of the pan + :param: Graph of gene contexts between interesting gene families of the pangenome :param env_gene: Gene of the current position :param contig: Current contig to search a gene context :param pos_r: Gene to search a gene context @@ -166,7 +166,7 @@ def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: """ Compute the gene contexts in the graph - :param g: Graph of gene contexts between interesting gene families of the pan + :param g: Graph of gene contexts between interesting gene families of the pangenome :param jaccard: Jaccard index :return: Set of gene contexts find in graph diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 6c9db0a8..eea36c73 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -9,7 +9,7 @@ class Edge: - """The Edge class represents an edge between two gene families in the pan graph. It is associated with all the + """The Edge class represents an edge between two gene families in the pangenome graph. It is associated with all the organisms in which the neighborship is found, and all the involved genes as well. 
:param source_gene: a first gene to initialize the edge diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index ad360ec7..0292938d 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -521,7 +521,7 @@ def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam # generate the figure and add some tools to it wheel_zoom = WheelZoomTool() fig = figure(title="spot graphic", plot_width=1600, plot_height=600, - tools=["pan", "box_zoom", "reset", "save", wheel_zoom, "ywheel_zoom", "xwheel_zoom"]) + tools=["pangenome", "box_zoom", "reset", "save", wheel_zoom, "ywheel_zoom", "xwheel_zoom"]) fig.axis.visible = True fig.toolbar.active_scroll = wheel_zoom @@ -626,7 +626,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: logging.getLogger().info(f"Done drawing spot(s), they can be found in the directory: '{output}'") -def draw_spots(pangenome: Pangenome, output: str, spot_list: str, disable_bar: bool = False): +def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: bool = False): """ Main function to draw spot diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index 231a55d9..f8905966 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -5,6 +5,7 @@ import argparse import time import os +from pathlib import Path # Installed libraries @@ -57,20 +58,20 @@ def parser_draw(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome.h5 file") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=str, + optional.add_argument('-o', '--output', required=False, type=Path, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", time.localtime()) + "_PID" + str(os.getpid()), help="Output directory") optional.add_argument("--tile_plot", required=False, default=False, action="store_true", - help="draw the tile plot of the pan") + help="draw the tile plot of the pangenome") optional.add_argument("--nocloud", required=False, default=False, action="store_true", help="Do not draw the cloud in the tile plot") optional.add_argument("--soft_core", required=False, default=0.95, help="Soft core threshold to use") optional.add_argument("--ucurve", required=False, default=False, action="store_true", - help="draw the U-curve of the pan") + help="draw the U-curve of the pangenome") optional.add_argument("--spots", required=False, type=str, default='', help="a comma-separated list of spots to draw (or 'all' to draw all spots)") diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 9a2b7caa..655ec083 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -4,6 +4,7 @@ # default libraries import logging from collections import defaultdict +from pathlib import Path # installed libraries import numpy @@ -13,13 +14,14 @@ import plotly.graph_objs as go import plotly.offline as out_plotly import colorlover as cl + # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.pangenome import Pangenome from ppanggolin.utils import jaccard_similarities -def draw_tile_plot(pangenome: 
Pangenome, output: str, nocloud: bool = False, disable_bar: bool = False): +def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, disable_bar: bool = False): """ Draw a tile plot from a partitioned pangenome @@ -49,7 +51,7 @@ def draw_tile_plot(pangenome: Pangenome, output: str, nocloud: bool = False, dis index2org = {} for org, index in org_index.items(): index2org[index] = org - colors = {"pan": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", + colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} @@ -171,5 +173,5 @@ def draw_tile_plot(pangenome: Pangenome, output: str, nocloud: bool = False, dis shapes=shapes, plot_bgcolor='#ffffff') logging.getLogger().info("Drawing the figure itself...") - out_plotly.plot(go.Figure(data=[heatmap], layout=layout), filename=output + "/tile_plot.html", auto_open=False) - logging.getLogger().info(f"Done with the tile plot : '{output + '/tile_plot.html'}' ") + out_plotly.plot(go.Figure(data=[heatmap], layout=layout), filename=output/"tile_plot.html", auto_open=False) + logging.getLogger().info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index a95ea288..a3974783 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -1,16 +1,18 @@ # default libraries import logging from collections import defaultdict +from pathlib import Path # installed libraries import plotly.graph_objs as go import plotly.offline as out_plotly + # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.pangenome import Pangenome -def draw_ucurve(pangenome: Pangenome, output: str, soft_core: float = 0.95, disable_bar: bool = False): +def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, disable_bar: bool = False): """ :param pangenome: Partitioned pangenome @@ -32,13 +34,13 @@ def draw_ucurve(pangenome: Pangenome, output: str, soft_core: float = 0.95, dis if fam.partition == "U": has_undefined = True count[nb_org][fam.named_partition] += 1 - count[nb_org]["pan"] += 1 - max_bar = count[nb_org]["pan"] if count[nb_org]["pan"] > max_bar else max_bar + count[nb_org]["pangenome"] += 1 + max_bar = count[nb_org]["pangenome"] if count[nb_org]["pangenome"] > max_bar else max_bar data_plot = [] chao = "NA" - if count[1]["pan"] > 0: - chao = round(len(pangenome.gene_families) + ((count[0]["pan"] ^ 2) / (count[1]["pan"] * 2)), 2) - colors = {"pan": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", + if count[1]["pangenome"] > 0: + chao = round(len(pangenome.gene_families) + ((count[0]["pangenome"] ^ 2) / (count[1]["pangenome"] * 2)), 2) + colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} @@ -57,7 +59,7 @@ def draw_ucurve(pangenome: Pangenome, output: str, soft_core: float = 0.95, dis data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=cloud_values, name='cloud', marker=dict(color=colors["cloud"]))) else: - text = 'undefined' if has_undefined else "pan" + text = 'undefined' if has_undefined else "pangenome" undefined_values = [] for nb_org in range(1, 
len(pangenome.organisms) + 1): undefined_values.append(count[nb_org][text]) @@ -73,5 +75,5 @@ def draw_ucurve(pangenome: Pangenome, output: str, soft_core: float = 0.95, dis plot_bgcolor='#ffffff') fig = go.Figure(data=data_plot, layout=layout) - out_plotly.plot(fig, filename=output + "/Ushaped_plot.html", auto_open=False) - logging.getLogger().info(f"Done drawing the U-shaped curve : '{output + '/Ushaped_plot.html'}'") + out_plotly.plot(fig, filename=output/"Ushaped_plot.html", auto_open=False) + logging.getLogger().info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 37cd35f6..764f493a 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -17,6 +17,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot, Module + class Genedata: """ This is a general class storing unique gene-related data to be written in a specific @@ -31,29 +32,31 @@ class Genedata: :param product: Associated product :param genetic_code: associated genetic code, if any """ - def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, genetic_code: int): - self.start= start + + def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str, + genetic_code: int): + self.start = start self.stop = stop self.strand = strand - self.gene_type= gene_type + self.gene_type = gene_type self.position = position self.name = name self.product = product self.genetic_code = genetic_code - + def __eq__(self, other): - return self.start == other.start \ - and self.stop == other.stop \ - and self.strand == other.strand \ - and self.gene_type == other.gene_type \ - and self.position == other.position \ - and self.name == other.name \ - and self.product == other.product \ - and self.genetic_code == other.genetic_code - + return self.start == other.start \ + and self.stop == other.stop \ + and self.strand == other.strand \ + and self.gene_type == other.gene_type \ + and self.position == other.position \ + and self.name == other.name \ + and self.product == other.product \ + and self.genetic_code == other.genetic_code + def __hash__(self): return hash((self.start, self.stop, self.strand, self.gene_type, self.position, - self.name, self.product, self.genetic_code)) + self.name, self.product, self.genetic_code)) def get_number_of_organisms(pangenome: Pangenome) -> int: @@ -78,12 +81,11 @@ def get_number_of_organisms(pangenome: Pangenome) -> int: return len(org_set) -def fix_partitioned(pangenome: Pangenome, pangenome_file: str): +def fix_partitioned(pangenome_file: str): """ - Fixes pangenomes with the 'partitionned' typo. + Fixes pangenomes with the 'partitionned' typo. - :param pangenome: a pangenome - :param pangenome_file: path to the pangenome file + :param pangenome_file: path to the pangenome file """ h5f = tables.open_file(pangenome_file, "a") status_group = h5f.root.status @@ -97,6 +99,7 @@ def fix_partitioned(pangenome: Pangenome, pangenome_file: str): del status_group._v_attrs.Partitionned h5f.close() + def get_status(pangenome: Pangenome, pangenome_file: Path): """ Checks which elements are already present in the file. 
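The figure-drawing hunks above illustrate the pathlib idiom this patch applies throughout: the -o/--output argument is now parsed with type=Path, output file names are joined with the / operator instead of string concatenation, and as_posix() (or .name) is used whenever a plain string is wanted for logging. A minimal sketch of the idiom; the variable values below are illustrative only, not taken from the codebase:

from pathlib import Path

output = Path("ppanggolin_output")      # what an argparse option declared with type=Path yields
outname = output / "tile_plot.html"     # portable join; replaces output + "/tile_plot.html"
print(f"Done with the tile plot : '{outname.as_posix()}'")  # POSIX-style string when a str is needed
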
@@ -104,7 +107,7 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): :param pangenome: Blank pangenome :param pangenome_file: path to the pangenome file """ - fix_partitioned(pangenome, pangenome_file) + fix_partitioned(pangenome_file) h5f = tables.open_file(pangenome_file, "r") logging.getLogger().info("Getting the current pangenome status") status_group = h5f.root.status @@ -119,8 +122,6 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): if status_group._v_attrs.NeighborsGraph: pangenome.status["neighborsGraph"] = "inFile" - - if status_group._v_attrs.Partitioned: pangenome.status["partitioned"] = "inFile" @@ -160,7 +161,7 @@ def read_genedata(h5f: tables.File) -> dict: """ table = h5f.root.annotations.genedata genedata_id2genedata = {} - for row in read_chunks(table,chunk=20000): + for row in read_chunks(table, chunk=20000): genedata = Genedata(start=row["start"], stop=row["stop"], strand=row["strand"].decode(), @@ -176,13 +177,13 @@ def read_genedata(h5f: tables.File) -> dict: def read_sequences(h5f: tables.File) -> dict: """ - Reads the sequences table and returns a seqid2seq dictionnary + Reads the sequences table and returns a sequence id to sequence dictionnary :param h5f: the hdf5 file handler :return: dictionnary linking sequences to the seq identifier """ table = h5f.root.sequences seqid2seq = {} - for row in read_chunks(table,chunk=20000): + for row in read_chunks(table, chunk=20000): seqid2seq[row["seqid"]] = row['dna'].decode() return seqid2seq @@ -213,7 +214,8 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f.close() -def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict:dict, link: bool = False): +def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict, + link: bool = False): """ Read information from pangenome to assign to organism object @@ -221,13 +223,14 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul :param org_name: Name of the organism :param contig_dict: Dictionary with all contig and associate genes :param circular_contigs: Dictionary of contigs + :param genedata_dict: dictionnary linking genedata to the genedata identifier :param link: get the gene object if the genes are clustered """ org = Organism(org_name) gene, gene_type = (None, None) - for contigName, geneList in contig_dict.items(): - contig = org.get_contig(contigName, is_circular=circular_contigs[contigName]) - for row in geneList: + for contig_name, gene_list in contig_dict.items(): + contig = org.get_contig(contig_name, is_circular=circular_contigs[contig_name]) + for row in gene_list: if link: # if the gene families are already computed/loaded the gene exists. gene = pangenome.get_gene(row["ID"].decode()) else: # else creating the gene. 
@@ -271,8 +274,8 @@ def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False """ table = h5f.root.edges - if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] or \ - not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: + if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"] or \ + pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the graph " "if the annotations and the gene families have not been loaded.") for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="contig adjacency", disable=disable_bar): @@ -332,7 +335,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo :param h5f: Pangenome HDF5 file with gene sequence associate to gene :param disable_bar: Disable the progress bar """ - if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: + if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the pangenome gene dna sequences " "if the annotations have not been loaded.") table = h5f.root.geneSequences @@ -352,8 +355,8 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): :param h5f: Pangenome HDF5 file with RGP computed :param disable_bar: Disable the progress bar """ - if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] or \ - not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: + if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"] or \ + pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the RGP " "if the annotations and the gene families have not been loaded.") table = h5f.root.RGP @@ -396,7 +399,7 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal :param h5f: Pangenome HDF5 file with modules computed :param disable_bar: Disable the progress bar """ - if not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: + if pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the modules if the gene families have not been loaded.") table = h5f.root.modules modules = {} # id2mod @@ -443,9 +446,9 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False - for orgName, contigDict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), - unit="organism", disable=disable_bar): - read_organism(pangenome, orgName, contigDict, circular_contigs[orgName], genedata_dict, link) + for org_name, contig_dict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), + unit="organism", disable=disable_bar): + read_organism(pangenome, org_name, contig_dict, circular_contigs[org_name], genedata_dict, link) pangenome.status["genomesAnnotated"] = "Loaded" @@ -544,7 +547,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa rgp: bool = False, spots: bool = False, gene_sequences: bool = False, modules: bool = False, disable_bar: bool = False): """ - Reads a previously written pan, with all of its parts, depending on what is asked, + Reads a previously written pangenome, with all of its parts, depending on what is asked, with regard to what is filled in the 'status' field of the hdf5 file. 
:param pangenome: Pangenome object without some information @@ -562,7 +565,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise FileNotFoundError("The provided pangenome does not have an associated .h5 file") - fix_partitioned(pangenome, pangenome.file) + fix_partitioned(pangenome.file) h5f = tables.open_file(filename, "r") if annotation: @@ -649,42 +652,42 @@ def check_pangenome_info(pangenome, need_annotations: bool = False, need_familie if need_annotations: if pangenome.status["genomesAnnotated"] == "inFile": annotation = True - elif not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: + elif pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome has no genes. See the 'annotate' subcommand.") if need_families: if pangenome.status["genesClustered"] == "inFile": gene_families = True - elif not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: + elif pangenome.status["genesClustered"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome has no gene families. See the 'cluster' subcommand.") if need_graph: if pangenome.status["neighborsGraph"] == "inFile": graph = True - elif not pangenome.status["neighborsGraph"] in ["Computed", "Loaded"]: + elif pangenome.status["neighborsGraph"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome does not have a graph (no edges). See the 'graph' subcommand.") if need_partitions and pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: raise Exception("Your pangenome has not been partitioned. See the 'partition' subcommand") if need_rgp: if pangenome.status["predictedRGP"] == "inFile": rgp = True - elif not pangenome.status["predictedRGP"] in ["Computed", "Loaded"]: + elif pangenome.status["predictedRGP"] not in ["Computed", "Loaded"]: raise Exception( "Your pangenome regions of genomic plasticity have not been predicted. See the 'rgp' subcommand") if need_spots: if pangenome.status["spots"] == "inFile": spots = True - elif not pangenome.status["spots"] in ["Computed", "Loaded"]: + elif pangenome.status["spots"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome spots of insertion have not been predicted. See the 'spot' subcommand") if need_gene_sequences: if pangenome.status["geneSequences"] == "inFile": gene_sequences = True - elif not pangenome.status["geneSequences"] in ["Computed", "Loaded"]: + elif pangenome.status["geneSequences"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome does not include gene sequences. " "This is possible only if you provided your own cluster file with the 'cluster' subcommand") if need_modules: if pangenome.status["modules"] == "inFile": modules = True - elif not pangenome.status["modules"] in ["Computed", "Loaded"]: + elif pangenome.status["modules"] not in ["Computed", "Loaded"]: raise Exception("Your pangenome modules have not been predicted. 
See the 'module' subcommand") if annotation or gene_families or graph or rgp or spots or gene_sequences or modules: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index a8784444..119d0bf9 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -13,9 +13,9 @@ import tables from gmpy2 import popcount -#local libraries +# local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.genome import Feature +from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata @@ -45,6 +45,7 @@ def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: } } + def genedata_desc(type_len, name_len, product_len): """ Creates a table for gene-related data @@ -55,15 +56,15 @@ def genedata_desc(type_len, name_len, product_len): :return: Formatted table for gene metadata """ return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'strand': tables.StringCol(itemsize=1), - 'gene_type': tables.StringCol(itemsize=type_len), - 'position': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=name_len), - 'product': tables.StringCol(itemsize=product_len), - 'genetic_code': tables.UInt32Col(dflt=11), + 'genedata_id': tables.UInt32Col(), + 'start': tables.UInt32Col(), + 'stop': tables.UInt32Col(), + 'strand': tables.StringCol(itemsize=1), + 'gene_type': tables.StringCol(itemsize=type_len), + 'position': tables.UInt32Col(), + 'name': tables.StringCol(itemsize=name_len), + 'product': tables.StringCol(itemsize=product_len), + 'genetic_code': tables.UInt32Col(dflt=11), } @@ -128,9 +129,9 @@ def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: return max_type_len, max_name_len, max_product_len -def get_genedata(gene:Feature) -> Tuple[int, str, str, int, str, str, int]: +def get_genedata(gene: Feature) -> Genedata: """ - Gets the genedata type of a Feature + Gets the genedata type of Feature :param gene: a Feature :return: Tuple with a Feature associated data @@ -138,6 +139,7 @@ def get_genedata(gene:Feature) -> Tuple[int, str, str, int, str, str, int]: position = None genetic_code = 11 if gene.type == "CDS": + gene: Gene position = gene.position genetic_code = gene.genetic_code return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, @@ -177,13 +179,13 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool if genedata_id is None: genedata_id = genedata_counter genedata2gene[genedata] = genedata_id - genedata_counter+=1 + genedata_counter += 1 gene_row["gene/genedata_id"] = genedata_id gene_row.append() gene_table.flush() genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) + expectedrows=len(genedata2gene)) logging.getLogger().debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") genedata_row = genedata_table.row for genedata, genedata_id in genedata2gene.items(): @@ -230,6 +232,7 @@ def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: "type": tables.StringCol(itemsize=gene_type_len) } + def get_sequence_len(pangenome: Pangenome) -> int: """ Get the maximum size of gene sequences to optimize disk space @@ -242,6 +245,7 @@ def get_sequence_len(pangenome: Pangenome) -> int: max_seq_len = len(gene.dna) return max_seq_len + def sequence_desc(max_seq_len: int) -> dict: """ Table description to save sequences @@ -253,6 
+257,7 @@ def sequence_desc(max_seq_len: int) -> dict: "dna": tables.StringCol(itemsize=max_seq_len) } + def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Function writing all the pangenome gene sequences @@ -262,7 +267,7 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo """ gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), expectedrows=len(pangenome.genes)) - #process sequences to save them only once + # process sequences to save them only once seq2seqid = {} id_counter = 0 gene_row = gene_seq.row @@ -271,14 +276,14 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo if curr_seq_id is None: curr_seq_id = id_counter seq2seqid[gene.dna] = id_counter - id_counter+=1 + id_counter += 1 gene_row["gene"] = gene.ID gene_row["seqid"] = curr_seq_id gene_row["type"] = gene.type gene_row.append() gene_seq.flush() - seq_table = h5f.create_table("/","sequences", sequence_desc(get_sequence_len(pangenome)), + seq_table = h5f.create_table("/", "sequences", sequence_desc(get_sequence_len(pangenome)), expectedrows=len(seq2seqid)) seq_row = seq_table.row @@ -377,10 +382,10 @@ def get_gene_to_fam_len(pangenome: Pangenome): """ max_gene_fam_name = 1 max_gene_id = 1 - for geneFam in pangenome.gene_families: - if len(geneFam.name) > max_gene_fam_name: - max_gene_fam_name = len(geneFam.name) - for gene in geneFam.genes: + for family in pangenome.gene_families: + if len(family.name) > max_gene_fam_name: + max_gene_fam_name = len(family.name) + for gene in family.genes: if len(gene.ID) > max_gene_id: max_gene_id = len(gene.ID) return max_gene_fam_name, max_gene_id @@ -400,11 +405,11 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. 
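Both write_annotations and write_gene_sequences above use the same space-saving pattern: a record is stored only once, and every duplicate is mapped to the integer id of its first occurrence through an ordinary dict (genedata2gene with genedata_counter, seq2seqid with id_counter). This is also why the Genedata class in the readBinaries.py hunks implements __eq__ and __hash__: its instances have to work as dictionary keys. A standalone sketch of the pattern with hypothetical input values:

# Map each distinct value to a stable integer id, reusing the id for duplicates.
value2id = {}                                  # plays the role of genedata2gene / seq2seqid
counter = 0
for value in ["ATG", "ATG", "TTG"]:            # hypothetical inputs; the real code iterates over genes
    value_id = value2id.get(value)
    if value_id is None:                       # first time this value is seen: register it
        value_id = counter
        value2id[value] = counter
        counter += 1
    print(value, "->", value_id)               # the per-gene table stores only value_id;
                                               # the value itself is written once per id afterwards
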
gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row - for geneFam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", - disable=disable_bar): - for gene in geneFam.genes: + for family in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", + disable=disable_bar): + for gene in family.genes: gene_row["gene"] = gene.ID - gene_row["geneFam"] = geneFam.name + gene_row["geneFam"] = family.name gene_row.append() gene_families.flush() @@ -457,8 +462,8 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis expectedrows=len(pangenome.edges)) edge_row = edge_table.row for edge in tqdm(pangenome.edges, total=pangenome.number_of_edge(), unit="edge", disable=disable_bar): - for genePairs in edge.organisms.values(): - for gene1, gene2 in genePairs: + for gene_pairs in edge.organisms.values(): + for gene1, gene2 in gene_pairs: edge_row["geneTarget"] = gene1.ID edge_row["geneSource"] = gene2.ID edge_row.append() @@ -894,7 +899,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo :param spots: remove spots information :param modules: remove modules information """ - + h5f = tables.open_file(pangenome.file, "a") status_group = h5f.root.status info_group = h5f.root.info @@ -978,17 +983,16 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable :param disable_bar: Allow to disable progress bar """ - if pangenome.status["genomesAnnotated"] == "Computed": - compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') - h5f = tables.open_file(filename, "w", filters=compression_filter) - logging.getLogger().info("Writing genome annotations...") + if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded", "inFile"]: + if pangenome.status["genomesAnnotated"] == "Computed": + compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') + h5f = tables.open_file(filename, "w", filters=compression_filter) + logging.getLogger().info("Writing genome annotations...") - write_annotations(pangenome, h5f, disable_bar=disable_bar) + write_annotations(pangenome, h5f, disable_bar=disable_bar) - pangenome.status["genomesAnnotated"] = "Loaded" - h5f.close() - elif pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"]: - pass + pangenome.status["genomesAnnotated"] = "Loaded" + h5f.close() else: # if the pangenome is not Computed or not Loaded, it's probably not really in a good state # (or something new was coded). diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 14686964..3201751f 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -6,6 +6,7 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging +from pathlib import Path from typing import TextIO import pkg_resources from statistics import median, mean, stdev @@ -19,8 +20,8 @@ from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info -# global variable to store the pan -pan = Pangenome() # TODO change to pan:Pangenome = Pangenome=() ? +# global variable to store the pangenome +pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? 
needAnnotations = False needFamilies = False needGraph = False @@ -108,9 +109,9 @@ def write_json_nodes(json: TextIO): fam_list = list(pan.gene_families) first_fam = fam_list[0] write_json_gene_fam(first_fam, json) - for geneFam in fam_list[1:]: + for family in fam_list[1:]: json.write(', ') - write_json_gene_fam(geneFam, json) + write_json_gene_fam(family, json) json.write(']') @@ -148,20 +149,20 @@ def write_json_edges(json): json.write(']') -def write_json(output: str, compress: bool = False): +def write_json(output: Path, compress: bool = False): """Writes the graph in a json file format :param output: Path to output directory :param compress: Compress the file in .gz """ logging.getLogger().info("Writing the json file for the pangenome graph...") - outname = output + "/pangenomeGraph.json" + outname = output / "pangenomeGraph.json" with write_compressed_or_not(outname, compress) as json: write_json_header(json) write_json_nodes(json) write_json_edges(json) json.write("}") - logging.getLogger().info(f"Done writing the json file : '{outname}'") + logging.getLogger().info(f"Done writing the json file : '{outname.as_posix()}'") def write_gexf_header(gexf: TextIO, light: bool = True): @@ -189,15 +190,15 @@ def write_gexf_header(gexf: TextIO, light: bool = True): gexf.write(' \n') gexf.write(' \n') if not light: - for org, orgIndex in index.items(): - gexf.write(f' \n') + for org, org_idx in index.items(): + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') gexf.write(' \n') if not light: - for org, orgIndex in index.items(): - gexf.write(f' \n') + for org, org_idx in index.items(): + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') gexf.write(f' PPanGGOLiN {pkg_resources.get_distribution("ppanggolin").version}\n') @@ -232,7 +233,7 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') @@ -253,8 +254,8 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): f' \n') - gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(' \n') + gexf.write(' \n') gexf.write(' \n') @@ -292,7 +293,7 @@ def write_gexf_end(gexf: TextIO): gexf.write("") -def write_gexf(output: str, light: bool = True, compress: bool = False): +def write_gexf(output: Path, light: bool = True, compress: bool = False): """Write the node of pangenome in gexf file :param output: Path to output directory @@ -303,7 +304,7 @@ def write_gexf(output: str, light: bool = True, compress: bool = False): txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." 
logging.getLogger().info(txt) - outname = output + "/pangenomeGraph" + outname = output / "pangenomeGraph" outname += "_light" if light else "" outname += ".gexf" with write_compressed_or_not(outname, compress) as gexf: @@ -311,10 +312,10 @@ def write_gexf(output: str, light: bool = True, compress: bool = False): write_gexf_nodes(gexf, light) write_gexf_edges(gexf, light) write_gexf_end(gexf) - logging.getLogger().info(f"Done writing the gexf file : '{outname}'") + logging.getLogger().info(f"Done writing the gexf file : '{outname.as_posix()}'") -def write_matrix(output: str, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): +def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): """ Write a csv file format as used by Roary, among others. The alternative gene ID will be the partition, if there is one @@ -326,7 +327,7 @@ def write_matrix(output: str, sep: str = ',', ext: str = 'csv', compress: bool = :param gene_names: write the genes name if there are saved in pangenome """ logging.getLogger().info(f"Writing the .{ext} file ...") - outname = output + "/matrix." + ext + outname = output / f"matrix.{ext}" with write_compressed_or_not(outname, compress) as matrix: index_org = {} @@ -386,18 +387,18 @@ def write_matrix(output: str, sep: str = ',', ext: str = 'csv', compress: bool = '"' + str(max(lis)) + '"', # 13 '"' + str(round(sum(lis) / len(lis), 2)) + '"'] # 14 + genes) + "\n") # 15 - logging.getLogger().info(f"Done writing the matrix : '{outname}'") + logging.getLogger().info(f"Done writing the matrix : '{outname.as_posix()}'") -def write_gene_presence_absence(output: str, compress: bool = False): +def write_gene_presence_absence(output: Path, compress: bool = False): """ Write the gene presence absence matrix :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info(f"Writing the gene presence absence file ...") - outname = output + "/gene_presence_absence.Rtab" + logging.getLogger().info("Writing the gene presence absence file ...") + outname = output / "gene_presence_absence.Rtab" with write_compressed_or_not(outname, compress) as matrix: index_org = {} default_dat = [] @@ -416,10 +417,10 @@ def write_gene_presence_absence(output: str, compress: bool = False): matrix.write('\t'.join([fam.name] # 14 + genes) + "\n") # 15 - logging.getLogger().info(f"Done writing the gene presence absence file : '{outname}'") + logging.getLogger().info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") -def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): +def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): """ Write pangenome statistics @@ -431,7 +432,7 @@ def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, logging.getLogger().info("Writing pangenome statistics...") logging.getLogger().info("Writing statistics on persistent duplication...") single_copy_markers = set() # could use bitarrays if speed is needed - with write_compressed_or_not(output + "/mean_persistent_duplication.tsv", compress) as outfile: + with write_compressed_or_not(output / "mean_persistent_duplication.tsv", compress) as outfile: outfile.write(f"#duplication_margin={round(dup_margin, 3)}\n") outfile.write("\t".join(["persistent_family", "duplication_ratio", "mean_presence", "is_single_copy_marker"]) + "\n") @@ -461,7 +462,7 @@ def 
write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, if len(fam.organisms) == pan.number_of_organisms(): core.add(fam) - with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile: + with write_compressed_or_not(output / "organisms_statistics.tsv", compress) as outfile: outfile.write(f"#soft_core={round(soft_core, 3)}\n") outfile.write(f"#duplication_margin={round(dup_margin, 3)}\n") outfile.write("\t".join( @@ -520,7 +521,7 @@ def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, logging.getLogger().info("Done writing genome per genome statistics") -def write_org_file(org: Organism, output: str, compress: bool = False): +def write_org_file(org: Organism, output: Path, compress: bool = False): """ Write the projection of pangenome for one organism @@ -528,7 +529,7 @@ def write_org_file(org: Organism, output: str, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - with write_compressed_or_not(output + "/" + org.name + ".tsv", compress) as outfile: + with write_compressed_or_not(output / f"{org.name}.tsv", compress) as outfile: header = ["gene", "contig", "start", "stop", "strand", "family", "nb_copy_in_org", "partition", "persistent_neighbors", "shell_neighbors", "cloud_neighbors"] if needRegions: @@ -572,7 +573,7 @@ def write_org_file(org: Organism, output: str, compress: bool = False): outfile.write("\t".join(map(str, row)) + "\n") -def write_projections(output: str, compress: bool = False): +def write_projections(output: Path, compress: bool = False): """ Write the projection of pangenome for all organisms @@ -580,7 +581,7 @@ def write_projections(output: str, compress: bool = False): :param compress: Compress the file in .gz """ logging.getLogger().info("Writing the projection files...") - outdir = output + "/projection" + outdir = output / "projection" if not os.path.exists(outdir): os.makedirs(outdir) for org in pan.organisms: @@ -588,7 +589,7 @@ def write_projections(output: str, compress: bool = False): logging.getLogger().info("Done writing the projection files") -def write_parts(output: str, soft_core: float = 0.95): +def write_parts(output: Path, soft_core: float = 0.95): """ Write the list of gene families for each partition @@ -596,13 +597,13 @@ def write_parts(output: str, soft_core: float = 0.95): :param soft_core: Soft core threshold to use """ logging.getLogger().info("Writing the list of gene families for each partition ...") - if not os.path.exists(output + "/partitions"): - os.makedirs(output + "/partitions") + if not os.path.exists(output / "partitions"): + os.makedirs(output / "partitions") part_sets = defaultdict(set) # initializing key, value pairs so that files exist even if they are empty - for neededKey in ["undefined", "soft_core", "exact_core", "exact_accessory", "soft_accessory", "persistent", - "shell", "cloud"]: - part_sets[neededKey] = set() + for needed_key in ["undefined", "soft_core", "exact_core", "exact_accessory", + "soft_accessory", "persistent", "shell", "cloud"]: + part_sets[needed_key] = set() for fam in pan.gene_families: part_sets[fam.named_partition].add(fam.name) if fam.partition.startswith("S"): @@ -618,14 +619,14 @@ def write_parts(output: str, soft_core: float = 0.95): part_sets["exact_accessory"].add(fam.name) for key, val in part_sets.items(): - curr_key_file = open(output + "/partitions/" + key + ".txt", "w") + curr_key_file = open(output / f"partitions/{key}.txt", "w") if len(val) > 0: 
curr_key_file.write('\n'.join(val) + "\n") curr_key_file.close() logging.getLogger().info("Done writing the list of gene families for each partition") -def write_gene_families_tsv(output: str, compress: bool = False): +def write_gene_families_tsv(output: Path, compress: bool = False): """ Write the file providing the association between genes and gene families @@ -633,7 +634,7 @@ def write_gene_families_tsv(output: str, compress: bool = False): :param compress: Compress the file in .gz """ logging.getLogger().info("Writing the file providing the association between genes and gene families...") - outname = output + "/gene_families.tsv" + outname = output / "gene_families.tsv" with write_compressed_or_not(outname, compress) as tsv: for fam in pan.gene_families: for gene in fam.genes: @@ -643,14 +644,14 @@ def write_gene_families_tsv(output: str, compress: bool = False): f"gene families : '{outname}'") -def write_regions(output, compress=False): +def write_regions(output: Path, compress: bool = False): """ Write the file providing information about RGP content :param output: Path to output directory :param compress: Compress the file in .gz """ - fname = output + "/plastic_regions.tsv" + fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) @@ -659,7 +660,7 @@ def write_regions(output, compress=False): len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") -def summarize_spots(spots: set, output: str, compress: bool = False): +def summarize_spots(spots: set, output: Path, compress: bool = False): """ Write a file providing summarize information about hotspots @@ -667,11 +668,12 @@ def summarize_spots(spots: set, output: str, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ + def r_and_s(value: float): """rounds to dp figures and returns a str of the provided value""" return str(round(value, 3)) if isinstance(value, float) else str(value) - with write_compressed_or_not(output + "/summarize_spots.tsv", compress) as fout: + with write_compressed_or_not(output / "summarize_spots.tsv", compress) as fout: fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n") for spot in sorted(spots, key=lambda x: len(x.regions), reverse=True): @@ -688,24 +690,24 @@ def r_and_s(value: float): min_size = min(size_list) fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") - logging.getLogger().info(f"Done writing spots in : '{output + '/summarize_spots.tsv'}'") + logging.getLogger().info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") -def spot2rgp(spots: set, output: str, compress: bool = False): +def spot2rgp(spots: set, output: Path, compress: bool = False): """Write a tsv file providing association between spot and rgp :param spots: set of spots in pangenome :param output: Path to output directory :param compress: Compress the file in .gz """ - with write_compressed_or_not(output + "/spots.tsv", compress) as fout: + with write_compressed_or_not(output / "spots.tsv", compress) as fout: fout.write("spot_id\trgp_id\n") for spot in spots: for rgp in spot.regions: fout.write(f"spot_{spot.ID}\t{rgp.name}\n") -def 
write_spots(output, compress): +def write_spots(output: Path, compress: bool = False): """ Write tsv files providing spots information and association with RGP :param output: Path to output directory @@ -716,7 +718,7 @@ def write_spots(output, compress): summarize_spots(pan.spots, output, compress) -def write_borders(output: str, dup_margin: float = 0.05, compress: bool = False): +def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False): """Write all gene families bordering each spot :param output: Path to output directory @@ -725,7 +727,7 @@ def write_borders(output: str, dup_margin: float = 0.05, compress: bool = False) """ multigenics = pan.get_multigenics(dup_margin=dup_margin) all_fams = set() - with write_compressed_or_not(output + "/spot_borders.tsv", compress) as fout: + with write_compressed_or_not(output / "spot_borders.tsv", compress) as fout: fout.write("spot_id\tnumber\tborder1\tborder2\n") for spot in sorted(pan.spots, key=lambda x: len(x.regions), reverse=True): curr_borders = spot.borders(pan.parameters["spots"]["set_size"], multigenics) @@ -736,13 +738,13 @@ def write_borders(output: str, dup_margin: float = 0.05, compress: bool = False) all_fams |= set(border[1]) fout.write(f"{spot.ID}\t{c}\t{famstring1}\t{famstring2}\n") - with write_compressed_or_not(output + "/border_protein_genes.fasta", compress) as fout: + with write_compressed_or_not(output / "border_protein_genes.fasta", compress) as fout: for fam in all_fams: fout.write(f">{fam.name}\n") fout.write(f"{fam.sequence}\n") -def write_module_summary(output: str, compress: bool = False): +def write_module_summary(output: Path, compress: bool = False): """ Write a file providing summarize information about modules @@ -750,7 +752,7 @@ def write_module_summary(output: str, compress: bool = False): :param compress: Compress the file in .gz """ logging.getLogger().info("Writing functional modules summary...") - with write_compressed_or_not(output + "/modules_summary.tsv", compress) as fout: + with write_compressed_or_not(output / "modules_summary.tsv", compress) as fout: fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n") for mod in pan.modules: org_dict = defaultdict(set) @@ -764,34 +766,34 @@ def write_module_summary(output: str, compress: bool = False): f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod.families), 3)}\n") fout.close() - logging.getLogger().info(f"Done writing module summary: '{output + '/modules_summary.tsv'}'") + logging.getLogger().info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") -def write_modules(output: str, compress: bool = False): +def write_modules(output: Path, compress: bool = False): """Write a tsv file providing association between modules and gene families :param output: Path to output directory :param compress: Compress the file in .gz """ logging.getLogger().info("Writing functional modules...") - with write_compressed_or_not(output + "/functional_modules.tsv", compress) as fout: + with write_compressed_or_not(output / "functional_modules.tsv", compress) as fout: fout.write("module_id\tfamily_id\n") for mod in pan.modules: for family in mod.families: fout.write(f"module_{mod.ID}\t{family.name}\n") fout.close() - logging.getLogger().info(f"Done writing functional modules to: '{output + '/functional_modules.tsv'}'") + logging.getLogger().info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") -def 
write_org_modules(output, compress): +def write_org_modules(output: Path, compress: bool = False): """Write a tsv file providing association between modules and organisms :param output: Path to output directory :param compress: Compress the file in .gz """ logging.getLogger().info("Writing modules to organisms associations...") - with write_compressed_or_not(output + "/modules_in_organisms.tsv", compress) as fout: + with write_compressed_or_not(output / "modules_in_organisms.tsv", compress) as fout: fout.write("module_id\torganism\tcompletion\n") for mod in pan.modules: mod_orgs = set() @@ -802,10 +804,10 @@ def write_org_modules(output, compress): fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() logging.getLogger().info( - f"Done writing modules to organisms associations to: '{output + '/modules_in_organisms.tsv'}'") + f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'") -def write_spot_modules(output, compress): +def write_spot_modules(output: Path, compress: bool = False): """Write a tsv file providing association between modules and spots :param output: Path to output directory @@ -818,7 +820,7 @@ def write_spot_modules(output, compress): for fam in mod.families: fam2mod[fam] = mod - with write_compressed_or_not(output + "/modules_spots.tsv", compress) as fout: + with write_compressed_or_not(output / "modules_spots.tsv", compress) as fout: fout.write("module_id\tspot_id\n") for spot in pan.spots: @@ -834,10 +836,10 @@ def write_spot_modules(output, compress): # if all the families in the module are found in the spot, write the association fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") - logging.getLogger().info(f"Done writing module to spot associations to: {output + '/modules_spots.tsv'}") + logging.getLogger().info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") -def write_rgp_modules(output, compress): +def write_rgp_modules(output: Path, compress: bool = False): """Write a tsv file providing association between modules and RGP :param output: Path to output directory @@ -845,7 +847,7 @@ def write_rgp_modules(output, compress): """ logging.getLogger().info("Clustering RGPs based on module content...") - lists = write_compressed_or_not(output + "/modules_RGP_lists.tsv", compress) + lists = write_compressed_or_not(output / "modules_RGP_lists.tsv", compress) lists.write("representative_RGP\tnb_spots\tmod_list\tRGP_list\n") fam2mod = {} for mod in pan.modules: @@ -878,10 +880,12 @@ def write_rgp_modules(output, compress): f"{','.join([reg.name for reg in regions])}\n") lists.close() - logging.getLogger().info(f"RGP and associated modules are listed in : {output + '/modules_RGP_lists.tsv'}") + logging.getLogger().info( + f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") -def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, +def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, + dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, light_gexf: bool = False, projection: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, @@ -949,7 +953,7 @@ def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: needSpots = True if pan.status["spots"] == 
"inFile" else False needModules = True if pan.status["modules"] == "inFile" else False - check_pangenome_info(pan, need_annotations=needAnnotations, need_families=needFamilies, need_graph=needGraph, + check_pangenome_info(pangenome, need_annotations=needAnnotations, need_families=needFamilies, need_graph=needGraph, need_partitions=needPartitions, need_rgp=needRegions, need_spots=needSpots, need_modules=needModules, disable_bar=disable_bar) @@ -1028,8 +1032,8 @@ def parser_flat(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") + required.add_argument('-o', '--output', required=True, type=Path, help="Output directory where the file(s) will be written") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 58f89da6..1c93ca4e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -404,7 +404,7 @@ def launch(args: argparse.Namespace): """ if args.regions is not None and args.fasta is None and args.anno is None: raise Exception("The --regions options requires the use of --anno or --fasta " - "(You need to provide the same file used to compute the pan)") + "(You need to provide the same file used to compute the pangenome)") mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index de4c2af0..e1eb549a 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -90,7 +90,7 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence/absence of the family in the pangenome using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. - :param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` + :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` :param partition: partition used to compute bitarray """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 29517a57..fd4c2c78 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -265,7 +265,7 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
:param partition: Filter partition - :param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` + :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index 2ba22142..ebd90e77 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -101,7 +101,7 @@ def parser_info(parser: argparse.ArgumentParser): options.add_argument("--parameters", required=False, action="store_true", help="Shows the parameters used (or computed) for each step of the pangenome generation") options.add_argument("--content", required=False, action="store_true", - help="Shows detailled informations about the pan's content") + help="Shows detailled informations about the pangenome's content") options.add_argument("--status", required=False, action="store_true", help="Shows informations about the statuses of the different elements of the pangenome " "(what has been computed, or not)") diff --git a/ppanggolin/main.py b/ppanggolin/main.py index b82b621f..86b6d1ce 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -59,16 +59,16 @@ def cmd_line() -> argparse.Namespace: desc += " Output:\n" desc += " draw Draw figures representing the pangenome through different aspects\n" desc += " write Writes 'flat' files representing the pangenome that can be used with other software\n" - desc += " fasta Writes fasta files for different elements of the pan\n" + desc += " fasta Writes fasta files for different elements of the pangenome\n" desc += " info Prints information about a given pangenome graph file\n" - desc += " metrics Compute several metrics on a given pan\n" + desc += " metrics Compute several metrics on a given pangenome\n" desc += " \n" desc += " Regions of genomic Plasticity:\n" desc += " align aligns a genome or a set of proteins to the pangenome gene families representatives and "\ "predict information from it\n" - desc += " rgp predicts Regions of Genomic Plasticity in the genomes of your pan\n" - desc += " spot predicts spots in your pan\n" - desc += " module Predicts functional modules in your pan\n" + desc += " rgp predicts Regions of Genomic Plasticity in the genomes of your pangenome\n" + desc += " spot predicts spots in your pangenome\n" + desc += " module Predicts functional modules in your pangenome\n" desc += " \n" desc += " Genomic context:\n" desc += " context Local genomic context analysis\n" diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index f7fd1282..a9a4e77c 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -15,7 +15,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: - """ Compute the genomes' fluidity from the pan + """ Compute the genomes' fluidity from the pangenome :param pangenome: pangenome which will be used to compute the genomes' fluidity :param disable_bar: Disable the progress bar @@ -24,7 +24,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ # check statuses and load info - logging.getLogger().info("Check information in pan") + logging.getLogger().info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): @@ -66,7 +66,7 @@ def nb_fam_per_org(pangenome: Pangenome, disable_bar: bool = False) 
-> dict: # TODO Function to compute mash distance between genome for normalization def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: - """ Compute the family fluidity from the pan + """ Compute the family fluidity from the pangenome :param pangenome: pangenome which will be used to compute the genomes' fluidity :param disable_bar: Disable the progress bar @@ -74,7 +74,7 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: family fluidity value from the pangenome for each partition """ # check statuses and load info - logging.getLogger().info("Check information in pan") + logging.getLogger().info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index cade18e5..c9ca4713 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -81,15 +81,15 @@ def write_metrics(pangenome: Pangenome, metrics_dict: dict, no_print_info: bool info_group = h5f.root.info logging.getLogger().debug("H5f open") if 'genomes_fluidity' in metrics_dict.keys(): - logging.getLogger().info("Writing genome fluidity in pan") + logging.getLogger().info("Writing genome fluidity in pangenome") info_group._v_attrs.genomes_fluidity = metrics_dict['genomes_fluidity'] if 'families_fluidity' in metrics_dict.keys(): - logging.getLogger().info("Writing family fluidity in pan") + logging.getLogger().info("Writing family fluidity in pangenome") info_group._v_attrs.families_fluidity = metrics_dict['families_fluidity'] if 'info_modules' in metrics_dict.keys(): - logging.getLogger().info("Writing modules information in pan") + logging.getLogger().info("Writing modules information in pangenome") write_info_modules(pangenome, h5f) # After all metrics was written diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index c20e1441..c2895734 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -29,7 +29,7 @@ from ppanggolin.formats import check_pangenome_info import ppanggolin.nem.partition as ppp -# import this way to use the global variable pan defined in ppanggolin.nem.partition +# import this way to use the global variable pangenome defined in ppanggolin.nem.partition samples = [] @@ -353,7 +353,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: """ if krange is None: krange = [3, -1] - ppp.pan = pangenome # use the global from partition to store the pan, so that it is usable + ppp.pan = pangenome # use the global from partition to store the pangenome, so that it is usable try: krange[0] = ppp.pan.parameters["partition"]["K"] if krange[0] < 0 else krange[0] diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 651d7e79..9b60afe1 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -15,8 +15,8 @@ class Pangenome: """ This is a class representing your pangenome. It is used as a basic unit for all the analysis to access to the - different elements of your pan, such as organisms, contigs, genes or gene families. It has setter and getter - methods for most elements in your pan, and you can use those to add new elements to it, + different elements of your pangenome, such as organisms, contigs, genes or gene families. 
It has setter and getter + methods for most elements in your pangenome, and you can use those to add new elements to it, or get objects that have a specific identifier to manipulate them directly. """ @@ -33,8 +33,8 @@ def __init__(self): self._orgGetter = {} self._edgeGetter = {} self._regionGetter = {} - self.spots = set() - self.modules = set() + self._spots = set() + self._modules = set() self.status = { 'genomesAnnotated': "No", @@ -63,6 +63,7 @@ def add_file(self, pangenome_file: Path): self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" + @property def genes(self) -> list: """Creates the geneGetter if it does not exist, and returns all the genes of all organisms in the pangenome. @@ -134,6 +135,7 @@ def number_of_gene(self) -> int: return len(self._geneGetter) """Gene families methods""" + @property def gene_families(self) -> List[GeneFamily]: """returns all the gene families in the pangenome @@ -185,6 +187,7 @@ def add_gene_family(self, name: str): return fam """Graph methods""" + @property def edges(self) -> list: """returns all the edges in the pangenome graph @@ -220,6 +223,7 @@ def number_of_edge(self) -> int: return len(self._edgeGetter) """Organism methods""" + @property def organisms(self) -> List[Organism]: """returns all the organisms in the pangenome @@ -341,6 +345,7 @@ def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: return self._fam_index """RGP methods""" + @property def regions(self) -> list: """returns all the regions (RGP) in the pangenome @@ -404,34 +409,46 @@ def add_regions(self, region_group: Union[Region, Iterable[Region]]): f"but you provided a {type(region_group)} type object") def number_of_rgp(self) -> int: - """Returns the number of gene families present in the pan + """Returns the number of gene families present in the pangenome :return: the number of gene families """ return len(self._regionGetter) """Spot methods""" + + @property + def spots(self) -> Set[Spot]: + # TODO made as generator + return self._spots + def add_spots(self, spots: Iterable[Spot]): """Adds the given iterable of spots to the pangenome. :param spots: An iterable of :class:`ppanggolin.region.Spot`. 
""" - self.spots |= set(spots) + self._spots |= set(spots) def number_of_spots(self) -> int: - """Returns the number of gene families present in the pan + """Returns the number of gene families present in the pangenome :return: the number of gene families """ return len(self.spots) """Modules methods""" + + @property + def modules(self) -> Set[Module]: + # TODO made as generator + return self._modules + def add_modules(self, modules: Iterable[Module]): """Adds the given iterable of modules to the pangenome :param modules: an iterable of :class:`ppanggolin.module.Module` """ - self.modules |= set(modules) + self._modules |= set(modules) def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: """Based on the index generated by get_fam_index, generated a bitarray diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 700f5bf1..3977829b 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -7,7 +7,7 @@ from collections.abc import Iterable # installed libraries -from typing import Dict +from typing import Dict, Set import gmpy2 @@ -59,11 +59,11 @@ def append(self, value): self.genes.append(value) value.RGP.add(self) else: - raise TypeError("Unexpected class / type for " + type(value) + - " when adding it to a region of genomic plasticity") + raise TypeError(f"Unexpected class / type for {type(value)} " + f"when adding it to a region of genomic plasticity") @property - def families(self) -> set: + def families(self) -> Set[GeneFamily]: """Get the gene families in the RGP :return: Set of gene families @@ -373,14 +373,19 @@ def __init__(self, module_id: int, families: set = None): but do not define it. """ self.ID = module_id - self.families = set() + self._families = set() if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception(f"You provided elements that were not GeneFamily object." - f" Modules are only made of GeneFamily") - self.families |= set(families) + raise Exception("You provided elements that were not GeneFamily object. " + "Modules are only made of GeneFamily") + self._families |= set(families) self.bitarray = None + @property + def families(self) -> Set[GeneFamily]: + # TODO made as generator + return self._families + def add_family(self, family: GeneFamily): """ Add a family to the module @@ -390,14 +395,14 @@ def add_family(self, family: GeneFamily): if not isinstance(family, GeneFamily): raise Exception("You did not provide a GenFamily object. Modules are only made of GeneFamily") family.modules.add(self) - self.families.add(family) + self._families.add(family) def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
:param partition: filter module by partition - :param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` + :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py index beca8cbc..daf85439 100644 --- a/tests/test_Pangenome.py +++ b/tests/test_Pangenome.py @@ -215,7 +215,7 @@ def test_genes_organism_debug(o_pang, make_org_with_genes): def test_genes_genefamilies(o_pang, fill_fam_with_genes): - """Genes are added in pan through their family.""" + """Genes are added in pangenome through their family.""" # geneFamily with genes. o_fam = o_pang.add_gene_family("fam1") l_genes = fill_fam_with_genes(o_fam) # the list of genes, and the geneFam are supposed to be the same From 0c9c91121d44bde4640b93c4716a7c39065afd06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 15 Jun 2023 16:54:04 +0200 Subject: [PATCH 07/75] Refactor and replace str by path in arguments for write files --- VERSION | 2 +- ppanggolin/formats/writeMSA.py | 204 ++++++++++++++------------- ppanggolin/formats/writeSequences.py | 97 +++++++------ ppanggolin/utils.py | 2 +- 4 files changed, 164 insertions(+), 141 deletions(-) diff --git a/VERSION b/VERSION index 88941813..f50de059 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.109 +1.2.110 diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index e23931c7..37a2fc4f 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -8,6 +8,8 @@ import subprocess import time from multiprocessing import get_context +from pathlib import Path +from typing import Dict, Set, List, Tuple # installed libraries from tqdm import tqdm @@ -20,25 +22,27 @@ from ppanggolin.genetic_codes import genetic_codes -def is_single_copy(fam, dup_margin): +def is_single_copy(family: GeneFamily, dup_margin: float = 0.95) -> bool: """ Check if a gene family can be considered 'single copy' or not - :param fam: GeneFamily object + :param family: GeneFamily object :param dup_margin: maximal number of genomes in which the gene family can have multiple members and still be considered a 'single copy' gene family + + :return: True if gene family is single copy else False """ nb_multi = 0 - for gene_list in fam.get_org_dict().values(): + for gene_list in family.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / len(fam.organisms) + dup_ratio = nb_multi / len(family.organisms) if dup_ratio < dup_margin: return True return False -def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0.95, single_copy=True): - +def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", soft_core: float = 0.95, + dup_margin: float = 0.95, single_copy: bool = True) -> Set[GeneFamily]: """ Get families corresponding to the given partition @@ -50,44 +54,48 @@ def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0 :return: set of families unique to one partition """ - fams = set() + families = set() nb_org = pangenome.number_of_organisms() if partition_filter == "all": return set(pangenome.gene_families) if partition_filter in ["persistent", "shell", "cloud"]: - for fam in pangenome.gene_families: - if fam.named_partition == partition_filter: - if single_copy and is_single_copy(fam, dup_margin): - fams.add(fam) - elif not single_copy: - fams.add(fam) + for family in 
pangenome.gene_families: + if family.named_partition == partition_filter: + if single_copy: + if is_single_copy(family, dup_margin): + families.add(family) + else: + families.add(family) elif partition_filter in ["core", "accessory", "softcore"]: if partition_filter == "core": - for fam in pangenome.gene_families: - if len(fam.organisms) == nb_org: - if single_copy and is_single_copy(fam, dup_margin): - fams.add(fam) - elif not single_copy: - fams.add(fam) + for family in pangenome.gene_families: + if len(family.organisms) == nb_org: + if single_copy: + if is_single_copy(family, dup_margin): + families.add(family) + else: + families.add(family) elif partition_filter == "accessory": - for fam in pangenome.gene_families: - if len(fam.organisms) < nb_org: - if single_copy and is_single_copy(fam, dup_margin): - fams.add(fam) - elif not single_copy: - fams.add(fam) + for family in pangenome.gene_families: + if len(family.organisms) < nb_org: + if single_copy: + if is_single_copy(family, dup_margin): + families.add(family) + else: + families.add(family) elif partition_filter == "softcore": - for fam in pangenome.gene_families: - if len(fam.organisms) >= nb_org * soft_core: - if single_copy and is_single_copy(fam, dup_margin): - fams.add(fam) - elif not single_copy: - fams.add(fam) - return fams + for family in pangenome.gene_families: + if len(family.organisms) >= nb_org * soft_core: + if single_copy: + if is_single_copy(family, dup_margin): + families.add(family) + else: + families.add(family) + return families -def translate(seq: str, code: dict): +def translate(seq: str, code: Dict[str, str]) -> str: """translates the given dna sequence with the given translation table :param seq: given dna sequence @@ -112,8 +120,8 @@ def translate(seq: str, code: dict): return protein -def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: dict, - source: str = 'protein', use_gene_id: bool = False): +def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: Dict[str, str], + source: str = 'protein', use_gene_id: bool = False) -> Path: """Write fasta files for each gene family :param family: gene family to write @@ -126,7 +134,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory """ # have a directory for each gene family, to make deletion of tmp files simpler - f_name = tmpdir.name + "/" + family.name + ".fasta" + f_name = Path(tmpdir.name) / f"{family.name}.fasta" f_obj = open(f_name, "w") # get genes that are present in only one copy for our family in each organism. 
single_copy_genes = [] @@ -150,7 +158,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory return f_name -def launch_mafft(fname, output, fam_name): +def launch_mafft(fname: Path, output: Path, fam_name: str): """ Compute the MSA with mafft @@ -158,13 +166,13 @@ def launch_mafft(fname, output, fam_name): :param output: directory to save alignment :param fam_name: Name of the gene family """ - outname = output + "/" + fam_name + ".aln" - cmd = ["mafft", "--thread", "1", fname] + outname = output/f"{fam_name}.aln" + cmd = ["mafft", "--thread", "1", fname.absolute().as_posix()] logging.getLogger().debug("command: " + " ".join(cmd)) subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True) -def launch_multi_mafft(args): +def launch_multi_mafft(args: List[Tuple[Path, Path, str]]): """ Allow to launch mafft in multiprocessing :param args: Pack of argument for launch_mafft @@ -174,7 +182,7 @@ def launch_multi_mafft(args): launch_mafft(*args) -def compute_msa(families: set, output: str, tmpdir: str, cpu: int = 1, source: str = "protein", +def compute_msa(families: set, output: Path, tmpdir: Path, cpu: int = 1, source: str = "protein", use_gene_id: bool = False, code: int = 11, disable_bar: bool = False): """ Compute MSA between pangenome gene families @@ -209,7 +217,7 @@ def compute_msa(families: set, output: str, tmpdir: str, cpu: int = 1, source: s bar.close() -def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, outname: str, +def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, outdir: Path, use_gene_id: bool = False): """ Writes a whole genome msa file for additional phylogenetic analysis @@ -217,7 +225,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, :param pangenome: Pangenome object :param families: Set of families specific to given partition :param phylo_name: output file name for phylo alignment - :param outname: output directory name for families alignment + :param outdir: output directory name for families alignment :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA """ phylo_dict = {} @@ -225,55 +233,54 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, phylo_dict[org.name] = "" for fam in families: missing_genomes = set(phylo_dict.keys()) - fin = open(outname + "/" + fam.name + ".aln", "r") - genome_id = "" - seq = "" - curr_len = 0 - dup_gene = 0 - curr_phylo_dict = {} - - for line in fin: - if line.startswith('>'): - if genome_id != "": - if genome_id not in missing_genomes: - dup_gene += 1 - # duplicated genes. Replacing them with gaps. - curr_phylo_dict[genome_id] = "-" * curr_len + with open(outdir / f"{fam.name}.aln", "r") as fin: + genome_id = "" + seq = "" + curr_len = 0 + dup_gene = 0 # TODO Remove ? + curr_phylo_dict = {} + + for line in fin: + if line.startswith('>'): + if genome_id != "": + if genome_id not in missing_genomes: + dup_gene += 1 + # duplicated genes. Replacing them with gaps. 
+ curr_phylo_dict[genome_id] = "-" * curr_len + else: + curr_phylo_dict[genome_id] = seq + missing_genomes -= {genome_id} + curr_len = len(seq) + if use_gene_id: + genome_id = pangenome.get_gene(line[1:].strip()).organism.name else: - curr_phylo_dict[genome_id] = seq - missing_genomes -= {genome_id} - curr_len = len(seq) - if use_gene_id: - genome_id = pangenome.get_gene(line[1:].strip()).organism.name + genome_id = line[1:].strip() + seq = "" + else: + seq += line.strip() + if genome_id != "": + if genome_id not in missing_genomes: + # duplicated genes. Replacing them with gaps. + curr_phylo_dict[genome_id] = "-" * curr_len else: - genome_id = line[1:].strip() - seq = "" - else: - seq += line.strip() - if genome_id != "": - if genome_id not in missing_genomes: - # duplicated genes. Replacing them with gaps. - curr_phylo_dict[genome_id] = "-" * curr_len - else: - curr_phylo_dict[genome_id] = seq - curr_len = len(seq) - fin.close() + curr_phylo_dict[genome_id] = seq + curr_len = len(seq) for genome in missing_genomes: curr_phylo_dict[genome] = "-" * curr_len for key, val in curr_phylo_dict.items(): phylo_dict[key] += val - fout = open(phylo_name, "w") - for key, val in phylo_dict.items(): - fout.write(">" + key + "\n") - fout.write(val + "\n") - fout.close() + with open(phylo_name, "w") as fout: + for key, val in phylo_dict.items(): + fout.write(">" + key + "\n") + fout.write(val + "\n") -def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", source="protein", soft_core=0.95, - phylo=False, use_gene_id=False, translation_table="11", dup_margin = 0.95, single_copy=True, force=False, disable_bar=False): - +def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: str = "core", tmpdir: Path = None, + source: str = "protein", soft_core: float = 0.95, phylo: bool = False, use_gene_id: bool = False, + translation_table: str = "11", dup_margin: float = 0.95, single_copy: bool = True, + force: bool = False, disable_bar: bool = False): """ Main function to write MSA files @@ -292,17 +299,20 @@ def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", sou :param force: force to write in the directory :param disable_bar: Disable progress bar """ + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + need_partitions = False if partition in ["persistent", "shell", "cloud"]: need_partitions = True - outname = output + f"/msa_{partition}_{source}/" - mk_outdir(outname, force=force) + outdir = output / f"msa_{partition}_{source}/" + mk_outdir(outdir, force=force) check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=need_partitions, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger().info(f"Doing MSA for {partition} families...") - families = getFamiliesToWrite(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) + families = get_families_to_write(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, + single_copy=single_copy) # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: @@ -312,17 +322,17 @@ def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", sou f"is different than the one provided now ('{translation_table}')") code = translation_table - compute_msa(families, outname, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, + compute_msa(families, outdir, 
cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, disable_bar=disable_bar) - logging.getLogger().info(f"Done writing all {partition} MSA in: {outname}") + logging.getLogger().info(f"Done writing all {partition} MSA in: {outdir}") if phylo: logging.getLogger().info("Writing the whole genome msa file") if partition == "softcore": - phylo_name = output + f"/{partition}_{soft_core}_genome_alignment.aln" + phylo_name = output / f"{partition}_{soft_core}_genome_alignment.aln" else: - phylo_name = output + f"/{partition}_genome_alignment.aln" - write_whole_genome_msa(pangenome, families, phylo_name, outname, use_gene_id=use_gene_id) + phylo_name = output / f"{partition}_genome_alignment.aln" + write_whole_genome_msa(pangenome, families, phylo_name, outdir, use_gene_id=use_gene_id) logging.getLogger().info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") @@ -335,10 +345,10 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - writeMSAFiles(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, - source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, - translation_table=args.translation_table, dup_margin=args.dup_margin, - single_copy=args.single_copy, force=args.force, disable_bar=args.disable_prog_bar) + write_msa_files(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, + source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, + translation_table=args.translation_table, dup_margin=args.dup_margin, single_copy=args.single_copy, + force=args.force, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -362,8 +372,8 @@ def parser_msa(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="The following arguments are required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") + required.add_argument('-o', '--output', required=True, type=Path, help="Output directory where the file(s) will be written") optional = parser.add_argument_group(title="Optional arguments. Indicating 'all' writes all elements. 
" @@ -402,7 +412,7 @@ def parser_msa(parser: argparse.ArgumentParser): parser_msa(main_parser) common = main_parser.add_argument_group(title="Common argument") - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 1c93ca4e..59aedcdc 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -4,23 +4,26 @@ # default libraries import argparse import logging +import re +from pathlib import Path +from typing import TextIO, Dict, Set # installed libraries -from typing import TextIO - from tqdm import tqdm # local libraries from ppanggolin.pangenome import Pangenome +from ppanggolin.geneFamily import GeneFamily from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file -poss_values_log = "Possible values are 'all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', " \ - "'core', 'module_X' with X being a module id." +module_regex = re.compile(r'^module_[0-9]+') +poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] +poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." -def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None, add: str = '', - disable_bar: bool = False): +def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None, + add: str = '', disable_bar: bool = False): """ Writes the CDS sequences given through list_CDS of the Pangenome object to a tmpFile object, and adds the str provided through add in front of it. @@ -32,7 +35,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO :param add: Add prefix to gene ID :param disable_bar: Disable progress bar """ - counter = 0 + counter = 0 # TODO remove ? 
if list_cds is None: list_cds = pangenome.genes logging.getLogger().info("Writing all of the CDS sequences...") @@ -44,7 +47,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO file_obj.flush() -def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_core: float = 0.95, +def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, compress: bool = False, disable_bar: bool = False): """ Write all nucleotide CDS sequences @@ -56,8 +59,10 @@ def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_cor :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ + assert genes in poss_values, f"Selected part to write genes not in {poss_values}" + logging.getLogger().info("Writing all the gene nucleotide sequences...") - outname = output + f"/{genes}_genes.fna" + outpath = output / f"{genes}_genes.fna" genefams = select_families(pangenome, genes, "gene nucleotide sequences", soft_core) genes_to_write = [] @@ -66,7 +71,7 @@ def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_cor genes_to_write.extend(fam.genes) logging.getLogger().info(f"There are {len(genes_to_write)} genes to write") - with write_compressed_or_not(outname, compress) as fasta: + with write_compressed_or_not(outpath, compress) as fasta: if pangenome.status["geneSequences"] in ["inFile"]: get_gene_sequences_from_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), disable_bar=disable_bar) @@ -75,10 +80,10 @@ def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_cor else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") - logging.getLogger().info(f"Done writing the gene sequences : '{outname}'") + logging.getLogger().info(f"Done writing the gene sequences : '{outpath}'") -def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> set: +def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: """ function used to filter down families to the given partition @@ -125,7 +130,7 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c return genefams -def write_fasta_gene_fam(pangenome: Pangenome, output: str, gene_families: str, soft_core: float = 0.95, +def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, soft_core: float = 0.95, compress: bool = False, disable_bar=False): """ Write representative nucleotide sequences of gene families @@ -137,18 +142,20 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: str, gene_families: str, :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - outname = output + f"/{gene_families}_nucleotide_families.fasta" + assert gene_families in poss_values, f"Selected part to write gene families not in {poss_values}" + + outpath = output / f"{gene_families}_nucleotide_families.fasta" genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families", soft_core) - with write_compressed_or_not(outname, compress) as fasta: + with write_compressed_or_not(outpath, compress) as fasta: get_gene_sequences_from_file(pangenome.file, fasta, [fam.name for fam in genefams], disable_bar=disable_bar) - logging.getLogger().info(f"Done writing the representative 
nucleotide sequences of the gene families : '{outname}'") + logging.getLogger().info(f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") -def write_fasta_prot_fam(pangenome: Pangenome, output: str, prot_families: str, soft_core: float = 0.95, +def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, compress: bool = False, disable_bar: bool = False): """ Write representative amino acid sequences of gene families. @@ -160,23 +167,25 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: str, prot_families: str, :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ - outname = output + f"/{prot_families}_protein_families.faa" + assert prot_families in poss_values, f"Selected part: {prot_families} to write protein families not in {poss_values}" + + outpath = output / f"{prot_families}_protein_families.faa" genefams = select_families(pangenome, prot_families, "representative amino acid sequences of the gene families", soft_core) - with write_compressed_or_not(outname, compress) as fasta: + with write_compressed_or_not(outpath, compress) as fasta: for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - logging.getLogger().info(f"Done writing the representative amino acid sequences of the gene families : '{outname}'") + logging.getLogger().info(f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") -def read_fasta_or_gff(filename: str) -> dict: +def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: """ Read the genome file in fasta or gbff format - :param filename: Path to genome file + :param file_path: Path to genome file :return: Dictionary with all sequences associated to contig """ @@ -184,7 +193,7 @@ def read_fasta_or_gff(filename: str) -> dict: seqname = "" seq = "" z = False - with read_compressed_or_not(filename) as f: + with read_compressed_or_not(file_path) as f: for line in f: if line.startswith(">"): z = True @@ -200,24 +209,24 @@ def read_fasta_or_gff(filename: str) -> dict: return sequence_dict -def read_fasta_gbk(filename): +def read_fasta_gbk(file_path: Path) -> Dict[str, str]: """ Read the genome file in gbk format - :param filename: Path to genome file + :param file_path: Path to genome file :return: Dictionary with all sequences associated to contig """ # line.startswith("ORIGIN"): sequence_dict = {} - lines = read_compressed_or_not(filename).readlines()[::-1] + lines = read_compressed_or_not(file_path).readlines()[::-1] contig_id, contig_locus_id = ("", "") while len(lines) != 0: line = lines.pop() # beginning of contig if line.startswith('LOCUS'): contig_locus_id = line.split()[1] - # If contig_id is not specified in VERSION afterwards like with Prokka, + # If contig_id is not specified in VERSION afterward like with Prokka, # in that case we use the one in LOCUS. 
while not line.startswith('FEATURES'): if line.startswith('VERSION'): @@ -238,7 +247,7 @@ def read_fasta_gbk(filename): return sequence_dict -def read_genome_file(file_dict: dict, genome_name: str) -> dict: +def read_genome_file(file_dict: Dict[str, Path], genome_name: str) -> Dict[str, str]: """ Read the genome file associated to organism @@ -256,7 +265,7 @@ def read_genome_file(file_dict: dict, genome_name: str) -> dict: raise Exception(f"Unknown filetype detected: '{file_dict[genome_name]}'") -def write_spaced_fasta(sequence: str, space: int = 60): +def write_spaced_fasta(sequence: str, space: int = 60) -> str: """Write a maximum of element per line :param sequence: sequence to write @@ -272,7 +281,7 @@ def write_spaced_fasta(sequence: str, space: int = 60): return seq -def write_regions_sequences(pangenome: Pangenome, output: str, regions: str, fasta: str, anno: str, +def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fasta: Path = None, anno: Path = None, compress: bool = False, disable_bar: bool = False): """ Write representative amino acid sequences of gene families. @@ -285,13 +294,17 @@ def write_regions_sequences(pangenome: Pangenome, output: str, regions: str, fas :param compress: Compress the file in .gz :param disable_bar: Disable progress bar """ + assert fasta is not None or anno is not None, "Write regions requires to use anno or fasta, not any provided" + organisms_file = fasta if fasta is not None else anno org_dict = {} for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") - org_dict[elements[0]] = elements[1] + org_dict[elements[0]] = Path(elements[1]) + if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other + org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) logging.getLogger().info(f"Writing {regions} rgp genomic sequences...") regions_to_write = [] @@ -305,7 +318,7 @@ def write_regions_sequences(pangenome: Pangenome, output: str, regions: str, fas regions_to_write = sorted(regions_to_write, key=lambda x: x.organism.name) # order regions by organism, so that we only have to read one genome at the time - outname = output + f"/{regions}_rgp_genomic_sequences.fasta" + outname = output/f"{regions}_rgp_genomic_sequences.fasta" with write_compressed_or_not(outname, compress) as fasta: loaded_genome = "" for region in tqdm(regions_to_write, unit="rgp", disable=disable_bar): @@ -317,7 +330,7 @@ def write_regions_sequences(pangenome: Pangenome, output: str, regions: str, fas logging.getLogger().info(f"Done writing the regions nucleotide sequences: '{outname}'") -def write_sequence_files(pangenome: Pangenome, output: str, fasta: str = None, anno: str = None, +def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, soft_core: float = 0.95, regions: str = None, genes: str = None, gene_families: str = None, prot_families: str = None, compress: bool = False, disable_bar: bool = False): """ @@ -377,9 +390,9 @@ def write_sequence_files(pangenome: Pangenome, output: str, fasta: str = None, a ex_gene_family_sequences = Exception("The provided pangenome has no gene families. 
" "This is not compatible with any of the following options : " "--prot_families, --gene_families") - if not pangenome.status["geneSequences"] in ["inFile"] and (genes or gene_families): + if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): raise ex_gene_sequences - if not pangenome.status["geneFamilySequences"] in ["Loaded", "Computed", "inFile"] and prot_families: + if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: raise ex_gene_family_sequences check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, @@ -434,16 +447,16 @@ def parser_seq(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") - required.add_argument('-o', '--output', required=True, type=str, + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") + required.add_argument('-o', '--output', required=True, type=Path, help="Output directory where the file(s) will be written") context = parser.add_argument_group(title="Contextually required arguments", description="With --regions, the following arguments are required:") - context.add_argument('--fasta', required=False, type=str, + context.add_argument('--fasta', required=False, type=Path, help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " "sequence(s) (the fastas can be compressed with gzip). One line per organism.") - context.add_argument('--anno', required=False, type=str, + context.add_argument('--anno', required=False, type=Path, help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per organism. " "If this is provided, those annotations will be used.") @@ -453,11 +466,11 @@ def parser_seq(parser: argparse.ArgumentParser): "('persistent', 'shell' or 'cloud') write the elements associated " "to said partition. Writing 'rgp' writes elements associated to RGPs" ) - onereq.add_argument("--genes", required=False, type=str, + onereq.add_argument("--genes", required=False, type=str, choices=poss_values, help=f"Write all nucleotide CDS sequences. {poss_values_log}") - onereq.add_argument("--prot_families", required=False, type=str, + onereq.add_argument("--prot_families", required=False, type=str, choices=poss_values, help=f"Write representative amino acid sequences of gene families. {poss_values_log}") - onereq.add_argument("--gene_families", required=False, type=str, + onereq.add_argument("--gene_families", required=False, type=str, choices=poss_values, help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") optional = parser.add_argument_group(title="Optional arguments") # could make choice to allow customization diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 84c62cb5..7d286668 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -225,7 +225,7 @@ def mk_file_name(basename: str, output: Path, force: bool = False) -> Path: return filename -def detect_filetype(filename): +def detect_filetype(filename: Path) -> str: """ Detects whether the current file is gff3, gbk/gbff, fasta or unknown. 
If unknown, it will raise an error From 991dc87da4b8cfcaee704234f143a54ed8ae87c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 15 Jun 2023 16:56:50 +0200 Subject: [PATCH 08/75] Remove getLogger from logging --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 8 ++-- ppanggolin/RGP/spot.py | 16 +++---- ppanggolin/align/alignOnPang.py | 22 ++++----- ppanggolin/annotate/annotate.py | 20 ++++----- ppanggolin/annotate/synta.py | 8 ++-- ppanggolin/cluster/cluster.py | 60 ++++++++++++------------- ppanggolin/context/searchGeneContext.py | 12 ++--- ppanggolin/figures/draw_spot.py | 8 ++-- ppanggolin/figures/tile_plot.py | 16 +++---- ppanggolin/figures/ucurve.py | 4 +- ppanggolin/formats/readBinaries.py | 18 ++++---- ppanggolin/formats/writeBinaries.py | 52 ++++++++++----------- ppanggolin/formats/writeFlat.py | 60 ++++++++++++------------- ppanggolin/formats/writeMSA.py | 16 +++---- ppanggolin/formats/writeSequences.py | 28 ++++++------ ppanggolin/geneFamily.py | 6 +-- ppanggolin/genome.py | 6 +-- ppanggolin/graph/makeGraph.py | 4 +- ppanggolin/metrics/fluidity.py | 16 +++---- ppanggolin/metrics/metrics.py | 14 +++--- ppanggolin/mod/module.py | 10 ++--- ppanggolin/nem/partition.py | 38 ++++++++-------- ppanggolin/nem/rarefaction.py | 22 ++++----- ppanggolin/pangenome.py | 2 +- ppanggolin/region.py | 8 ++-- ppanggolin/utils.py | 6 +-- ppanggolin/workflow/all.py | 18 ++++---- ppanggolin/workflow/panModule.py | 14 +++--- ppanggolin/workflow/panRGP.py | 16 +++---- 30 files changed, 265 insertions(+), 265 deletions(-) diff --git a/VERSION b/VERSION index f50de059..0e2d0606 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.110 +1.2.111 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 804b87c4..11264525 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -208,7 +208,7 @@ def naming_scheme(pangenome: Pangenome): oldlen = len(contigsids) contigsids.add(contig.name) if oldlen == len(contigsids): - logging.getLogger().warning("You have contigs with identical identifiers in your assemblies. " + logging.warning("You have contigs with identical identifiers in your assemblies. 
" "identifiers will be supplemented with your provided organism names.") return "organism" return "contig" @@ -247,14 +247,14 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, disable_bar=disable_bar) - logging.getLogger().info("Detecting multigenic families...") + logging.info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) - logging.getLogger().info("Compute Regions of Genomic Plasticity ...") + logging.info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme)) - logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP") + logging.info(f"Predicted {len(pangenome.regions)} RGP") # save parameters and save status pangenome.parameters["RGP"] = {} diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 0fd596ae..00649157 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -115,11 +115,11 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): else: used += 1 add_new_node(graph_spot, rgp, border) - logging.getLogger().info(f"{lost} RGPs were not used as they are on a contig border (or have less than {set_size} " + logging.info(f"{lost} RGPs were not used as they are on a contig border (or have less than {set_size} " f"persistent gene families until the contig border)") - logging.getLogger().info(f"{used} RGPs are being used to predict spots of insertion") + logging.info(f"{used} RGPs are being used to predict spots of insertion") node_list = list(graph_spot.nodes) - logging.getLogger().info(f"{len(node_list)} number of different pairs of flanking gene families") + logging.info(f"{len(node_list)} number of different pairs of flanking gene families") for i, nodei in enumerate(node_list[:-1]): for nodej in node_list[i + 1:]: node_obj_i = graph_spot.nodes[nodei] @@ -188,19 +188,19 @@ def predict_hotspots(pangenome: Pangenome, output: str, spot_graph: bool = False need_rgp=True, disable_bar=disable_bar) # get multigenic gene families - logging.getLogger().info("Detecting multigenic families...") + logging.info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) - logging.getLogger().info("Detecting hotspots in the pangenome...") + logging.info("Detecting hotspots in the pangenome...") # predict spots spots = make_spot_graph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, exact_match) if len(spots) == 0: - logging.getLogger().warning("No spots were detected.") + logging.warning("No spots were detected.") else: - logging.getLogger().info(f"{len(spots)} spots were detected") + logging.info(f"{len(spots)} spots were detected") pangenome.add_spots(spots) pangenome.status["spots"] = "Computed" @@ -221,7 +221,7 @@ def launch(args: argparse.Namespace): if args.spot_graph: mk_outdir(args.output, args.force) if args.draw_hotspots or args.interest or args.fig_margin or args.priority: - logging.getLogger().warning( + logging.warning( "Options to draw the spots with the 'ppanggolin spot' subcommand have been deprecated, " "and are now dealt with in a dedicated subcommand 'ppanggolin drawspot'.") 
predict_hotspots(pangenome, args.output, force=args.force, spot_graph=args.spot_graph, diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index a4450282..ef5ed3fc 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -31,7 +31,7 @@ def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> IO """ seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0'] - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) return seqdb @@ -62,13 +62,13 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, aln_db = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) cmd = ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir.name, "-a", "--min-seq-id", str(identity), "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Aligning sequences to cluster representatives...") + logging.debug(" ".join(cmd)) + logging.info("Aligning sequences to cluster representatives...") subprocess.run(cmd, stdout=subprocess.DEVNULL) outfile = output.absolute()/"input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results cmd = ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile.as_posix(), "--format-mode", "2"] - logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Extracting alignments...") + logging.debug(" ".join(cmd)) + logging.info("Extracting alignments...") subprocess.run(cmd, stdout=subprocess.DEVNULL) pang_db.close() seq_db.close() @@ -233,7 +233,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel :param disable_bar: disable progress bar :return: """ - logging.getLogger().info("Writing RGP and spot information related to hits in the pangenome") + logging.info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) finfo = open(output/"info_input_seq.tsv", "w") @@ -253,7 +253,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel for spot in spot_list: if len(spot.get_uniq_ordered_set()) > 1: drawn_spots.add(spot) - logging.getLogger().info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " + logging.info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " f"related to hits of the input sequences...") draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spots"]["overlapping_match"], pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"], @@ -267,7 +267,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) - logging.getLogger().info(f"File listing RGP and spots where sequences of interest are located : " + logging.info(f"File listing RGP and spots where sequences of interest are located : " f"{output/'info_input_seq.tsv'}") @@ -346,9 +346,9 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_partition(seq2pang, seq_set, output) # write the partition 
assignation only - logging.getLogger().info(f"sequences partition projection : '{part_proj}'") - logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'") + logging.info(f"sequences partition projection : '{part_proj}'") + logging.info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") + logging.info(f"Blast-tab file of the alignment : '{align_file}'") new_tmpdir.cleanup() diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index d99ca4ed..28a872b7 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -92,7 +92,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p """ org = Organism(organism) - logging.getLogger().debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") + logging.debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] gene_counter = 0 @@ -430,7 +430,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p :param pseudo: allow to read pseudogène :param disable_bar: Disable the progresse bar """ - logging.getLogger().info(f"Reading {organisms_file.name} the list of organism files ...") + logging.info(f"Reading {organisms_file.name} the list of organism files ...") pangenome.status["geneSequences"] = "Computed" # we assume there are gene sequences in the annotation files, @@ -454,10 +454,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p # decide whether we use local ids or ppanggolin ids. 
used_local_identifiers = chose_gene_identifiers(pangenome) if used_local_identifiers: - logging.getLogger().info("gene identifiers used in the provided annotation files were unique, " + logging.info("gene identifiers used in the provided annotation files were unique, " "PPanGGOLiN will use them.") else: - logging.getLogger().info("gene identifiers used in the provided annotation files were not unique, " + logging.info("gene identifiers used in the provided annotation files were not unique, " "PPanGGOLiN will use self-generated identifiers.") pangenome.status["genomesAnnotated"] = "Computed" @@ -478,7 +478,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): for line in read_compressed_or_not(fasta_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.getLogger().error("No tabulation separator found in organisms file") + logging.error("No tabulation separator found in organisms file") exit(1) try: org = pangenome.get_organism(elements[0]) @@ -538,7 +538,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param disable_bar: Disable the progresse bar """ - logging.getLogger().info(f"Reading {fasta_list} the list of organism files") + logging.info(f"Reading {fasta_list} the list of organism files") arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): @@ -552,7 +552,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: norna, kingdom, overlap, contig_filter, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.getLogger().info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") + logging.info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", total=len(arguments), disable=disable_bar): @@ -560,7 +560,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: p.close() p.join() - logging.getLogger().info("Done annotating genomes") + logging.info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. pangenome.parameters["annotation"] = {} @@ -591,9 +591,9 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.getLogger().warning("You provided gff files without sequences, and you did not provide " + logging.warning("You provided gff files without sequences, and you did not provide " "fasta sequences. 
Thus it was not possible to get the gene sequences.") - logging.getLogger().warning("You will be able to proceed with your analysis ONLY if you provide " + logging.warning("You will be able to proceed with your analysis ONLY if you provide " "the clustering results in the next step.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 5dd89468..4d6c7017 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -45,7 +45,7 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: """ locustag = org.name cmd = ["aragorn", "-t", "-gcbact", "-l", "-w", fna_file] - logging.getLogger().debug(f"aragorn command : {' '.join(cmd)}") + logging.debug(f"aragorn command : {' '.join(cmd)}") p = Popen(cmd, stdout=PIPE) # loading the whole thing, reverting it to 'pop' in order. file_data = p.communicate()[0].decode().split("\n")[:: -1] @@ -81,7 +81,7 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str """ locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) - logging.getLogger().debug(f"prodigal command : {' '.join(cmd)}") + logging.debug(f"prodigal command : {' '.join(cmd)}") p = Popen(cmd, stdout=PIPE) gene_objs = defaultdict(set) @@ -124,7 +124,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " tmp_file = tempfile.NamedTemporaryFile(mode="r", dir=tmpdir) cmd = ["cmscan", "--tblout", tmp_file.name, "--hmmonly", "--cpu", str(1), "--noali", modelfile, fna_file] - logging.getLogger().debug(f"infernal command : {' '.join(cmd)}") + logging.debug(f"infernal command : {' '.join(cmd)}") p = Popen(cmd, stdout=open(os.devnull, "w"), stderr=PIPE) err = p.communicate()[1].decode().split() if err: @@ -319,7 +319,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user - logging.getLogger().debug(all_contig_len) + logging.debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" else: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index f05aa9d5..afc4ab05 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -83,33 +83,33 @@ def first_clustering(sequences: io.TextIO, tmpdir: tempfile.TemporaryDirectory, """ seq_nucdb = tmpdir.name + '/nucleotid_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) - logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Creating sequence database...") + logging.debug(" ".join(cmd)) + logging.info("Creating sequence database...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().debug("Translate sequence ...") + logging.debug("Translate sequence ...") seqdb = tmpdir.name + '/aa_db' cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().info("Clustering sequences...") + logging.info("Clustering sequences...") cludb = tmpdir.name + '/cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, 
tmpdir.name, "--cluster-mode", mode, "--min-seq-id", identity, "-c", coverage, "--threads", cpu, "--kmer-per-seq", 80, "--max-seqs", 300])) - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().info("Extracting cluster representatives...") + logging.info("Extracting cluster representatives...") repdb = tmpdir.name + '/representative_db' cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb])) - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) reprfa = tmpdir.name + '/representative_sequences.fasta' cmd = ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"] - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().info("Writing gene to family informations") + logging.info("Writing gene to family informations") outtsv = tmpdir.name + '/families_tsv' cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return reprfa, outtsv @@ -146,21 +146,21 @@ def align_rep(faa_file: str, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, :return: Result of alignment """ - logging.getLogger().debug("Create database") + logging.debug("Create database") seqdb = tmpdir.name + '/rep_sequence_db' cmd = ["mmseqs", "createdb", faa_file, seqdb] - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().info("Aligning cluster representatives...") + logging.info("Aligning cluster representatives...") alndb = tmpdir.name + '/rep_alignment_db' cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir.name, "-a", "--min-seq-id", identity, "-c", coverage, "--cov-mode", 1, "--threads", cpu])) - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.getLogger().info("Extracting alignments...") + logging.info("Extracting alignments...") outfile = tmpdir.name + '/rep_families.tsv' cmd = ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, "--format-output", "query,target,qlen,tlen,bits"] - logging.getLogger().debug(" ".join(cmd)) + logging.debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return outfile @@ -195,7 +195,7 @@ def refine_clustering(tsv: str, aln_file: str, fam_to_seq: dict) -> (dict, dict) """ simgraph = Graph() genes2fam, fam2genes = read_tsv(tsv) - logging.getLogger().info(f"Starting with {len(fam_to_seq)} families") + logging.info(f"Starting with {len(fam_to_seq)} families") # create the nodes for fam, genes in fam2genes.items(): simgraph.add_node(fam, nbgenes=len(genes)) @@ -229,7 +229,7 @@ def refine_clustering(tsv: str, aln_file: str, fam_to_seq: dict) -> (dict, dict) new_fam_to_seq = {} for fam in fam2genes: new_fam_to_seq[fam] = fam_to_seq[fam] - logging.getLogger().info(f"Ending with {len(new_fam_to_seq)} gene families") + logging.info(f"Ending with {len(new_fam_to_seq)} gene families") return genes2fam, new_fam_to_seq @@ -240,7 +240,7 @@ def read_fam2seq(pangenome: Pangenome, fam_to_seq: dict): :param pangenome: Annotated pangenome :param fam_to_seq: Dictionary which link families 
and sequences """ - logging.getLogger().info("Adding protein sequences to the gene families") + logging.info("Adding protein sequences to the gene families") for family, protein in fam_to_seq.items(): fam = pangenome.add_gene_family(family) fam.add_sequence(protein) @@ -254,7 +254,7 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F :param gene_to_fam: Dictionary which link gene to families :param disable_bar: Allow to disable progress bar """ - logging.getLogger().info(f"Adding {len(gene_to_fam)} genes to the gene families") + logging.info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False if link: @@ -295,15 +295,15 @@ def clustering(pangenome: Pangenome, tmpdir: str, cpu: int = 1, defrag: bool = T newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) with open(newtmpdir.name + '/nucleotid_sequences', "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) - logging.getLogger().info("Clustering all of the genes sequences...") + logging.info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_file, newtmpdir, cpu, code, coverage, identity, mode) fam2seq = read_faa(rep) if not defrag: - logging.getLogger().debug("No defragmentation") + logging.debug("No defragmentation") genes2fam, _ = read_tsv(tsv) else: - logging.getLogger().info("Associating fragments to their original gene family...") + logging.info("Associating fragments to their original gene family...") aln = align_rep(rep, newtmpdir, cpu, coverage, identity) genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) pangenome.status["defragmented"] = "Computed" @@ -356,7 +356,7 @@ def infer_singletons(pangenome: Pangenome): if gene.family is None: pangenome.add_gene_family(gene.ID).add_gene(gene) singleton_counter += 1 - logging.getLogger().info(f"Inferred {singleton_counter} singleton families") + logging.info(f"Inferred {singleton_counter} singleton families") def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, @@ -374,7 +374,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet check_pangenome_former_clustering(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) - logging.getLogger().info(f"Reading {families_tsv_file.name} the gene families file ...") + logging.info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size families_tsv_file = read_compressed_or_not(families_tsv_file) frag = False # the genome annotations are necessarily loaded. @@ -436,18 +436,18 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.clusters is None: if args.infer_singletons is True: - logging.getLogger().warning("--infer_singletons option is not compatible with clustering creation. " + logging.warning("--infer_singletons option is not compatible with clustering creation. 
" "To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, disable_bar=args.disable_prog_bar) - logging.getLogger().info("Done with the clustering") + logging.info("Done with the clustering") else: if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: - logging.getLogger().warning("You are using an option compatible only with clustering creation.") + logging.warning("You are using an option compatible only with clustering creation.") read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) - logging.getLogger().info("Done reading the cluster file") + logging.info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index cbf8cfca..abda03f0 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -67,11 +67,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: # Compute the graph with transitive closure size provided as parameter start_time = time.time() - logging.getLogger().info("Building the graph...") + logging.info("Building the graph...") g = compute_gene_context_graph(families=gene_families, t=transitive, disable_bar=disable_bar) - logging.getLogger().info( + logging.info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.getLogger().debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + logging.debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") # extract the modules from the graph common_components = compute_gene_context(g, jaccard) @@ -83,9 +83,9 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: if len(families) != 0: export_to_dataframe(families, common_components, fam_2_seq, output) else: - logging.getLogger().info("No gene contexts were found") + logging.info("No gene contexts were found") - logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") + logging.info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: @@ -208,7 +208,7 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out :param output: output path """ - logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") + logging.debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") lines = [] for gene_context in gene_contexts: diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 0292938d..84032291 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -564,7 +564,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: :param disable_bar: Allow preventing bar progress print """ - logging.getLogger().info("Ordering genes among regions, and drawing spots...") + logging.info("Ordering genes among regions, and drawing spots...") multigenics = 
pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) @@ -623,7 +623,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix()) subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) - logging.getLogger().info(f"Done drawing spot(s), they can be found in the directory: '{output}'") + logging.info(f"Done drawing spot(s), they can be found in the directory: '{output}'") def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: bool = False): @@ -653,10 +653,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: else: selected_spots = [s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_list] if len(selected_spots) < 10: - logging.getLogger().info(f"Drawing the following spots: " + logging.info(f"Drawing the following spots: " f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") else: - logging.getLogger().info(f"Drawing {len(selected_spots)} spots") + logging.info(f"Drawing {len(selected_spots)} spots") draw_selected_spots(selected_spots, pangenome, output, overlapping_match=pangenome.parameters["spots"]["overlapping_match"], diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 655ec083..e661fb56 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -35,9 +35,9 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") if len(pangenome.organisms) > 500 and nocloud is False: - logging.getLogger().warning("You asked to draw a tile plot for a lot of organisms (>500). " + logging.warning("You asked to draw a tile plot for a lot of organisms (>500). 
" "Your browser will probably not be able to open it.") - logging.getLogger().info("Drawing the tile plot...") + logging.info("Drawing the tile plot...") data = [] all_indexes = [] all_columns = [] @@ -55,7 +55,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} - logging.getLogger().info("start with matrice") + logging.info("start with matrice") for row, fam in enumerate(families): new_col = [org_index[org] for org in fam.organisms] @@ -71,7 +71,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di hc = linkage(dist, 'single') dendro = dendrogram(hc, no_plot=True) - logging.getLogger().info("done with making the dendrogram to order the organisms on the plot") + logging.info("done with making the dendrogram to order the organisms on the plot") order_organisms = [index2org[index] for index in dendro["leaves"]] @@ -105,7 +105,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di ordered_nodes += ordered_nodes_c separators.append(separators[len(separators) - 1] + len(ordered_nodes_c)) - logging.getLogger().info("Getting the gene name(s) and the number for each tile of the plot ...") + logging.info("Getting the gene name(s) and the number for each tile of the plot ...") for node in ordered_nodes: fam_order.append('\u200c' + node.name) data = node.organisms @@ -115,7 +115,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di xaxis_values = ['\u200c' + org.name for org in order_organisms] - logging.getLogger().info("Done extracting names and numbers. Making the heatmap ...") + logging.info("Done extracting names and numbers. 
Making the heatmap ...") heatmap = go.Heatmap(z=binary_data, x=xaxis_values, @@ -172,6 +172,6 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di tickfont=dict(size=10)), shapes=shapes, plot_bgcolor='#ffffff') - logging.getLogger().info("Drawing the figure itself...") + logging.info("Drawing the figure itself...") out_plotly.plot(go.Figure(data=[heatmap], layout=layout), filename=output/"tile_plot.html", auto_open=False) - logging.getLogger().info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") + logging.info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index a3974783..2dba54c8 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -22,7 +22,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di :return: """ check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) - logging.getLogger().info("Drawing the U-shaped curve...") + logging.info("Drawing the U-shaped curve...") max_bar = 0 count = defaultdict(lambda: defaultdict(int)) is_partitioned = False @@ -76,4 +76,4 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di fig = go.Figure(data=data_plot, layout=layout) out_plotly.plot(fig, filename=output/"Ushaped_plot.html", auto_open=False) - logging.getLogger().info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") + logging.info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 764f493a..66067b8e 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -109,7 +109,7 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): """ fix_partitioned(pangenome_file) h5f = tables.open_file(pangenome_file, "r") - logging.getLogger().info("Getting the current pangenome status") + logging.info("Getting the current pangenome status") status_group = h5f.root.status if status_group._v_attrs.genomesAnnotated: pangenome.status["genomesAnnotated"] = "inFile" @@ -199,7 +199,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter :param add: Add a prefix to sequence header :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger().info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") + logging.info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences list_cds = set(list_cds) if list_cds is not None else None @@ -570,13 +570,13 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa h5f = tables.open_file(filename, "r") if annotation: if h5f.root.status._v_attrs.genomesAnnotated: - logging.getLogger().info("Reading pangenome annotations...") + logging.info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") if gene_sequences: if h5f.root.status._v_attrs.geneSequences: - logging.getLogger().info("Reading pangenome gene dna sequences...") + logging.info("Reading pangenome gene dna sequences...") read_gene_sequences(pangenome, h5f, 
disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have gene sequences, " @@ -584,7 +584,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa if gene_families: if h5f.root.status._v_attrs.genesClustered: - logging.getLogger().info("Reading pangenome gene families...") + logging.info("Reading pangenome gene families...") read_gene_families(pangenome, h5f, disable_bar=disable_bar) read_gene_families_info(pangenome, h5f, disable_bar=disable_bar) else: @@ -592,28 +592,28 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") if graph: if h5f.root.status._v_attrs.NeighborsGraph: - logging.getLogger().info("Reading the neighbors graph edges...") + logging.info("Reading the neighbors graph edges...") read_graph(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have graph information, " f"or has been improperly filled") if rgp: if h5f.root.status._v_attrs.predictedRGP: - logging.getLogger().info("Reading the RGP...") + logging.info("Reading the RGP...") read_rgp(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " f"or has been improperly filled") if spots: if h5f.root.status._v_attrs.spots: - logging.getLogger().info("Reading the spots...") + logging.info("Reading the spots...") read_spots(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled") if modules: if h5f.root.status._v_attrs.modules: - logging.getLogger().info("Reading the modules...") + logging.info("Reading the modules...") read_modules(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have modules information, " diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 119d0bf9..961ffa9e 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -158,7 +158,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), expectedrows=len(pangenome.genes)) - logging.getLogger().debug(f"Writing {len(pangenome.genes)} genes") + logging.debug(f"Writing {len(pangenome.genes)} genes") genedata2gene = {} genedata_counter = 0 @@ -186,7 +186,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), expectedrows=len(genedata2gene)) - logging.getLogger().debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") + logging.debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") genedata_row = genedata_table.row for genedata, genedata_id in genedata2gene.items(): genedata_row["genedata_id"] = genedata_id @@ -342,7 +342,7 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param disable_bar: Disable progress bar """ if '/geneFamiliesInfo' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computed gene family representative sequences...") + logging.info("Erasing the formerly computed gene family 
representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. gene_fam_seq = h5f.create_table("/", "geneFamiliesInfo", gene_fam_desc(*get_gene_fam_len(pangenome)), expectedrows=len(pangenome.gene_families)) @@ -401,7 +401,7 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param disable_bar: Disable progress bar """ if '/geneFamilies' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") + logging.info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row @@ -456,7 +456,7 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis # consumming parts to read), it might be good to add the organism name in the table here. # for now, forcing the read of annotations. if '/edges' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computed edges") + logging.info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), expectedrows=len(pangenome.edges)) @@ -514,7 +514,7 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab :param disable_bar: Disable progress bar """ if '/RGP' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computer RGP") + logging.info("Erasing the formerly computer RGP") h5f.remove_node('/', 'RGP') rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), @@ -568,7 +568,7 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis :param disable_bar: Disable progress bar """ if '/spots' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computed spots") + logging.info("Erasing the formerly computed spots") h5f.remove_node("/", "spots") spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), @@ -622,7 +622,7 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d :param disable_bar: Disable progress bar """ if '/modules' in h5f and force is True: - logging.getLogger().info("Erasing the formerly computed modules") + logging.info("Erasing the formerly computed modules") h5f.remove_node("/", "modules") mod_table = h5f.create_table('/', 'modules', mod_desc(get_mod_desc(pangenome)), @@ -859,7 +859,7 @@ def update_gene_fam_partition(pangenome: Pangenome, h5f: tables.File, disable_ba :param h5f: HDF5 file with gene families :param disable_bar: Allow to disable progress bar """ - logging.getLogger().info("Updating gene families with partition information") + logging.info("Updating gene families with partition information") table = h5f.root.geneFamiliesInfo for row in tqdm(table, total=table.nrows, unit="gene family", disable=disable_bar): row["partition"] = pangenome.get_gene_family(row["name"].decode()).partition @@ -874,7 +874,7 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b :param h5f: HDF5 pangenome file :param disable_bar: Allow to disable progress bar """ - logging.getLogger().info("Updating annotations with fragment information") + logging.info("Updating annotations with fragment information") genedataid2genedata = read_genedata(h5f) table = 
h5f.root.annotations.genes @@ -905,13 +905,13 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo info_group = h5f.root.info if '/edges' in h5f and (graph or gene_families): - logging.getLogger().info("Erasing the formerly computed edges") + logging.info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") status_group._v_attrs.NeighborsGraph = False pangenome.status["neighborsGraph"] = "No" h5f.del_node_attr(info_group, "numberOfEdges") if '/geneFamilies' in h5f and gene_families: - logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") + logging.info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. pangenome.status["defragmented"] = "No" pangenome.status["genesClustered"] = "No" @@ -921,12 +921,12 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfClusters") if '/geneFamiliesInfo' in h5f and gene_families: - logging.getLogger().info("Erasing the formerly computed gene family representative sequences...") + logging.info("Erasing the formerly computed gene family representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. pangenome.status["geneFamilySequences"] = "No" status_group._v_attrs.geneFamilySequences = False if partition: - logging.getLogger().info("Erasing former partitions...") + logging.info("Erasing former partitions...") pangenome.status["partitioned"] = "No" status_group._v_attrs.Partitioned = False if 'Partitioned' in status_group._v_attrs._f_list(): @@ -942,7 +942,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfSubpartitions") if '/RGP' in h5f and (gene_families or partition or rgp): - logging.getLogger().info("Erasing the formerly computer RGP...") + logging.info("Erasing the formerly computer RGP...") pangenome.status["predictedRGP"] = "No" status_group._v_attrs.predictedRGP = False h5f.remove_node("/", "RGP") @@ -950,7 +950,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfRGP") if '/spots' in h5f and (gene_families or partition or rgp or spots): - logging.getLogger().info("Erasing the formerly computed spots...") + logging.info("Erasing the formerly computed spots...") pangenome.status["spots"] = "No" status_group._v_attrs.spots = False h5f.remove_node("/", "spots") @@ -958,7 +958,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfSpots") if '/modules' in h5f and (gene_families or partition or modules): - logging.getLogger().info("Erasing the formerly computed modules...") + logging.info("Erasing the formerly computed modules...") pangenome.status["modules"] = "No" status_group._v_attrs.modules = False h5f.remove_node("/", "modules") @@ -987,7 +987,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable if pangenome.status["genomesAnnotated"] == "Computed": compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') h5f = tables.open_file(filename, "w", filters=compression_filter) - logging.getLogger().info("Writing genome annotations...") + logging.info("Writing genome annotations...") write_annotations(pangenome, h5f, disable_bar=disable_bar) @@ -1003,14 
+1003,14 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f = tables.open_file(filename, "a") if pangenome.status["geneSequences"] == "Computed": - logging.getLogger().info("writing the protein coding gene dna sequences") + logging.info("writing the protein coding gene dna sequences") write_gene_sequences(pangenome, h5f, disable_bar=disable_bar) pangenome.status["geneSequences"] = "Loaded" if pangenome.status["genesClustered"] == "Computed": - logging.getLogger().info("Writing gene families and gene associations...") + logging.info("Writing gene families and gene associations...") write_gene_families(pangenome, h5f, force, disable_bar=disable_bar) - logging.getLogger().info("Writing gene families information...") + logging.info("Writing gene families information...") write_gene_fam_info(pangenome, h5f, force, disable_bar=disable_bar) if pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] and \ pangenome.status["defragmented"] == "Computed": @@ -1019,7 +1019,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable update_gene_fragments(pangenome, h5f, disable_bar=disable_bar) pangenome.status["genesClustered"] = "Loaded" if pangenome.status["neighborsGraph"] == "Computed": - logging.getLogger().info("Writing the edges...") + logging.info("Writing the edges...") write_graph(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["neighborsGraph"] = "Loaded" if pangenome.status["partitioned"] == "Computed" and \ @@ -1028,17 +1028,17 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable pangenome.status["partitioned"] = "Loaded" if pangenome.status['predictedRGP'] == "Computed": - logging.getLogger().info("Writing Regions of Genomic Plasticity...") + logging.info("Writing Regions of Genomic Plasticity...") write_rgp(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['predictedRGP'] = "Loaded" if pangenome.status["spots"] == "Computed": - logging.getLogger().info("Writing Spots of Insertion...") + logging.info("Writing Spots of Insertion...") write_spots(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['spots'] = "Loaded" if pangenome.status["modules"] == "Computed": - logging.getLogger().info("Writing Modules...") + logging.info("Writing Modules...") write_modules(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["modules"] = "Loaded" @@ -1046,4 +1046,4 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable write_info(pangenome, h5f) h5f.close() - logging.getLogger().info(f"Done writing the pangenome. It is in file : {filename}") + logging.info(f"Done writing the pangenome. 
It is in file : {filename}") diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 3201751f..c6858807 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -155,14 +155,14 @@ def write_json(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing the json file for the pangenome graph...") + logging.info("Writing the json file for the pangenome graph...") outname = output / "pangenomeGraph.json" with write_compressed_or_not(outname, compress) as json: write_json_header(json) write_json_nodes(json) write_json_edges(json) json.write("}") - logging.getLogger().info(f"Done writing the json file : '{outname.as_posix()}'") + logging.info(f"Done writing the json file : '{outname.as_posix()}'") def write_gexf_header(gexf: TextIO, light: bool = True): @@ -303,7 +303,7 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): txt = "Writing the " txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." - logging.getLogger().info(txt) + logging.info(txt) outname = output / "pangenomeGraph" outname += "_light" if light else "" outname += ".gexf" @@ -312,7 +312,7 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): write_gexf_nodes(gexf, light) write_gexf_edges(gexf, light) write_gexf_end(gexf) - logging.getLogger().info(f"Done writing the gexf file : '{outname.as_posix()}'") + logging.info(f"Done writing the gexf file : '{outname.as_posix()}'") def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): @@ -326,7 +326,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool :param compress: Compress the file in .gz :param gene_names: write the genes name if there are saved in pangenome """ - logging.getLogger().info(f"Writing the .{ext} file ...") + logging.info(f"Writing the .{ext} file ...") outname = output / f"matrix.{ext}" with write_compressed_or_not(outname, compress) as matrix: @@ -387,7 +387,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool '"' + str(max(lis)) + '"', # 13 '"' + str(round(sum(lis) / len(lis), 2)) + '"'] # 14 + genes) + "\n") # 15 - logging.getLogger().info(f"Done writing the matrix : '{outname.as_posix()}'") + logging.info(f"Done writing the matrix : '{outname.as_posix()}'") def write_gene_presence_absence(output: Path, compress: bool = False): @@ -397,7 +397,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing the gene presence absence file ...") + logging.info("Writing the gene presence absence file ...") outname = output / "gene_presence_absence.Rtab" with write_compressed_or_not(outname, compress) as matrix: index_org = {} @@ -417,7 +417,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): matrix.write('\t'.join([fam.name] # 14 + genes) + "\n") # 15 - logging.getLogger().info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") + logging.info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): @@ -429,8 +429,8 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, 
:param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing pangenome statistics...") - logging.getLogger().info("Writing statistics on persistent duplication...") + logging.info("Writing pangenome statistics...") + logging.info("Writing statistics on persistent duplication...") single_copy_markers = set() # could use bitarrays if speed is needed with write_compressed_or_not(output / "mean_persistent_duplication.tsv", compress) as outfile: outfile.write(f"#duplication_margin={round(dup_margin, 3)}\n") @@ -452,8 +452,8 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, str(round(dup_ratio, 3)), str(round(mean_pres, 3)), str(is_scm)]) + "\n") - logging.getLogger().info("Done writing stats on persistent duplication") - logging.getLogger().info("Writing genome per genome statistics (completeness and counts)...") + logging.info("Done writing stats on persistent duplication") + logging.info("Writing genome per genome statistics (completeness and counts)...") soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: @@ -518,7 +518,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, completeness, len(fams & single_copy_markers)])) + "\n") - logging.getLogger().info("Done writing genome per genome statistics") + logging.info("Done writing genome per genome statistics") def write_org_file(org: Organism, output: Path, compress: bool = False): @@ -580,13 +580,13 @@ def write_projections(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing the projection files...") + logging.info("Writing the projection files...") outdir = output / "projection" if not os.path.exists(outdir): os.makedirs(outdir) for org in pan.organisms: write_org_file(org, outdir, compress) - logging.getLogger().info("Done writing the projection files") + logging.info("Done writing the projection files") def write_parts(output: Path, soft_core: float = 0.95): @@ -596,7 +596,7 @@ def write_parts(output: Path, soft_core: float = 0.95): :param output: Path to output directory :param soft_core: Soft core threshold to use """ - logging.getLogger().info("Writing the list of gene families for each partition ...") + logging.info("Writing the list of gene families for each partition ...") if not os.path.exists(output / "partitions"): os.makedirs(output / "partitions") part_sets = defaultdict(set) @@ -623,7 +623,7 @@ def write_parts(output: Path, soft_core: float = 0.95): if len(val) > 0: curr_key_file.write('\n'.join(val) + "\n") curr_key_file.close() - logging.getLogger().info("Done writing the list of gene families for each partition") + logging.info("Done writing the list of gene families for each partition") def write_gene_families_tsv(output: Path, compress: bool = False): @@ -633,14 +633,14 @@ def write_gene_families_tsv(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing the file providing the association between genes and gene families...") + logging.info("Writing the file providing the association between genes and gene families...") outname = output / "gene_families.tsv" with write_compressed_or_not(outname, compress) as tsv: for fam in pan.gene_families: for gene in fam.genes: 
tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""]) + "\n") - logging.getLogger().info("Done writing the file providing the association between genes and " + logging.info("Done writing the file providing the association between genes and " f"gene families : '{outname}'") @@ -690,7 +690,7 @@ def r_and_s(value: float): min_size = min(size_list) fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") - logging.getLogger().info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") + logging.info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") def spot2rgp(spots: set, output: Path, compress: bool = False): @@ -751,7 +751,7 @@ def write_module_summary(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing functional modules summary...") + logging.info("Writing functional modules summary...") with write_compressed_or_not(output / "modules_summary.tsv", compress) as fout: fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n") for mod in pan.modules: @@ -766,7 +766,7 @@ def write_module_summary(output: Path, compress: bool = False): f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod.families), 3)}\n") fout.close() - logging.getLogger().info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") + logging.info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") def write_modules(output: Path, compress: bool = False): @@ -775,7 +775,7 @@ def write_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing functional modules...") + logging.info("Writing functional modules...") with write_compressed_or_not(output / "functional_modules.tsv", compress) as fout: fout.write("module_id\tfamily_id\n") for mod in pan.modules: @@ -783,7 +783,7 @@ def write_modules(output: Path, compress: bool = False): fout.write(f"module_{mod.ID}\t{family.name}\n") fout.close() - logging.getLogger().info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") + logging.info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") def write_org_modules(output: Path, compress: bool = False): @@ -792,7 +792,7 @@ def write_org_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing modules to organisms associations...") + logging.info("Writing modules to organisms associations...") with write_compressed_or_not(output / "modules_in_organisms.tsv", compress) as fout: fout.write("module_id\torganism\tcompletion\n") for mod in pan.modules: @@ -803,7 +803,7 @@ def write_org_modules(output: Path, compress: bool = False): completion = round(len(org.families & mod.families) / len(mod.families), 2) fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() - logging.getLogger().info( + logging.info( f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'") @@ -813,7 +813,7 @@ def write_spot_modules(output: Path, compress: bool = False): :param 
output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Writing modules to spot associations...") + logging.info("Writing modules to spot associations...") fam2mod = {} for mod in pan.modules: @@ -836,7 +836,7 @@ def write_spot_modules(output: Path, compress: bool = False): # if all the families in the module are found in the spot, write the association fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") - logging.getLogger().info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") + logging.info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") def write_rgp_modules(output: Path, compress: bool = False): @@ -845,7 +845,7 @@ def write_rgp_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger().info("Clustering RGPs based on module content...") + logging.info("Clustering RGPs based on module content...") lists = write_compressed_or_not(output / "modules_RGP_lists.tsv", compress) lists.write("representative_RGP\tnb_spots\tmod_list\tRGP_list\n") @@ -880,7 +880,7 @@ def write_rgp_modules(output: Path, compress: bool = False): f"{','.join([reg.name for reg in regions])}\n") lists.close() - logging.getLogger().info( + logging.info( f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 37a2fc4f..be3fd9b0 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -168,7 +168,7 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): """ outname = output/f"{fam_name}.aln" cmd = ["mafft", "--thread", "1", fname.absolute().as_posix()] - logging.getLogger().debug("command: " + " ".join(cmd)) + logging.debug("command: " + " ".join(cmd)) subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True) @@ -200,7 +200,7 @@ def compute_msa(families: set, output: Path, tmpdir: Path, cpu: int = 1, source: write_total = 0 args = [] - logging.getLogger().info("Preparing input files for MSA...") + logging.info("Preparing input files for MSA...") code_table = genetic_codes(str(code)) for family in tqdm(families, unit="family", disable=disable_bar): @@ -209,7 +209,7 @@ def compute_msa(families: set, output: Path, tmpdir: Path, cpu: int = 1, source: write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) - logging.getLogger().info("Computing the MSA ...") + logging.info("Computing the MSA ...") bar = tqdm(range(len(families)), unit="family", disable=disable_bar) with get_context('fork').Pool(cpu) as p: for _ in p.imap_unordered(launch_multi_mafft, args): @@ -310,30 +310,30 @@ def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=need_partitions, need_gene_sequences=True, disable_bar=disable_bar) - logging.getLogger().info(f"Doing MSA for {partition} families...") + logging.info(f"Doing MSA for {partition} families...") families = get_families_to_write(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: if pangenome.parameters["cluster"]["translation_table"] != 
translation_table: - logging.getLogger().warning("The translation table used during clustering " + logging.warning("The translation table used during clustering " f"('{pangenome.parameters['cluster']['translation_table']}') " f"is different than the one provided now ('{translation_table}')") code = translation_table compute_msa(families, outdir, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, disable_bar=disable_bar) - logging.getLogger().info(f"Done writing all {partition} MSA in: {outdir}") + logging.info(f"Done writing all {partition} MSA in: {outdir}") if phylo: - logging.getLogger().info("Writing the whole genome msa file") + logging.info("Writing the whole genome msa file") if partition == "softcore": phylo_name = output / f"{partition}_{soft_core}_genome_alignment.aln" else: phylo_name = output / f"{partition}_genome_alignment.aln" write_whole_genome_msa(pangenome, families, phylo_name, outdir, use_gene_id=use_gene_id) - logging.getLogger().info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") + logging.info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") def launch(args: argparse.Namespace): diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 59aedcdc..83ed2705 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -38,7 +38,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO counter = 0 # TODO remove ? if list_cds is None: list_cds = pangenome.genes - logging.getLogger().info("Writing all of the CDS sequences...") + logging.info("Writing all of the CDS sequences...") for gene in tqdm(list_cds, unit="gene", disable=disable_bar): if gene.type == "CDS": counter += 1 @@ -61,7 +61,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co """ assert genes in poss_values, f"Selected part to write genes not in {poss_values}" - logging.getLogger().info("Writing all the gene nucleotide sequences...") + logging.info("Writing all the gene nucleotide sequences...") outpath = output / f"{genes}_genes.fna" genefams = select_families(pangenome, genes, "gene nucleotide sequences", soft_core) @@ -70,7 +70,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co for fam in genefams: genes_to_write.extend(fam.genes) - logging.getLogger().info(f"There are {len(genes_to_write)} genes to write") + logging.info(f"There are {len(genes_to_write)} genes to write") with write_compressed_or_not(outpath, compress) as fasta: if pangenome.status["geneSequences"] in ["inFile"]: get_gene_sequences_from_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), @@ -80,7 +80,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co else: # this should never happen if the pangenome has been properly checked before launching this function. 
raise Exception("The pangenome does not include gene sequences") - logging.getLogger().info(f"Done writing the gene sequences : '{outpath}'") + logging.info(f"Done writing the gene sequences : '{outpath}'") def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: @@ -96,31 +96,31 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c """ genefams = set() if partition == 'all': - logging.getLogger().info(f"Writing all of the {type_name}...") + logging.info(f"Writing all of the {type_name}...") genefams = pangenome.gene_families elif partition in ['persistent', 'shell', 'cloud']: - logging.getLogger().info(f"Writing the {type_name} of the {partition}...") + logging.info(f"Writing the {type_name} of the {partition}...") for fam in pangenome.gene_families: if fam.named_partition == partition: genefams.add(fam) elif partition == "rgp": - logging.getLogger().info(f"Writing the {type_name} in RGPs...") + logging.info(f"Writing the {type_name} in RGPs...") for region in pangenome.regions: genefams |= region.families elif partition == "softcore": - logging.getLogger().info( + logging.info( f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") threshold = pangenome.number_of_organisms() * soft_core for fam in pangenome.gene_families: if len(fam.organisms) >= threshold: genefams.add(fam) elif partition == "core": - logging.getLogger().info(f"Writing the representative {type_name} of the {partition} gene families...") + logging.info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: if len(fam.organisms) == pangenome.number_of_organisms(): genefams.add(fam) elif "module_" in partition: - logging.getLogger().info(f"Writing the representation {type_name} of {partition} gene families...") + logging.info(f"Writing the representation {type_name} of {partition} gene families...") mod_id = int(partition.replace("module_", "")) for mod in pangenome.modules: # could be way more efficient with a dict structure instead of a set @@ -152,7 +152,7 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, with write_compressed_or_not(outpath, compress) as fasta: get_gene_sequences_from_file(pangenome.file, fasta, [fam.name for fam in genefams], disable_bar=disable_bar) - logging.getLogger().info(f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") + logging.info(f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, @@ -178,7 +178,7 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - logging.getLogger().info(f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") + logging.info(f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: @@ -306,7 +306,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) - 
logging.getLogger().info(f"Writing {regions} rgp genomic sequences...") + logging.info(f"Writing {regions} rgp genomic sequences...") regions_to_write = [] if regions == "complete": for region in pangenome.regions: @@ -327,7 +327,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa genome_sequence = read_genome_file(org_dict, loaded_genome) fasta.write(f">{region.name}\n") fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - logging.getLogger().info(f"Done writing the regions nucleotide sequences: '{outname}'") + logging.info(f"Done writing the regions nucleotide sequences: '{outname}'") def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index e1eb549a..137dc14b 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -95,16 +95,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger().debug(f"all") + logging.debug(f"all") for org in self.organisms: self.bitarray[index[org]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger().debug(f"shell, cloud") + logging.debug(f"shell, cloud") if self.named_partition == partition: for org in self.organisms: self.bitarray[index[org]] = 1 elif partition == 'accessory': - logging.getLogger().debug(f"accessory") + logging.debug(f"accessory") if self.named_partition in ['shell', 'cloud']: for org in self.organisms: self.bitarray[index[org]] = 1 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index fd4c2c78..7058de0f 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -270,16 +270,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger().debug(f"all") + logging.debug(f"all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger().debug(f"shell, cloud") + logging.debug(f"shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.getLogger().debug(f"accessory") + logging.debug(f"accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index aef655b3..175e0817 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -84,7 +84,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, if remove_copy_number > 0: remove_high_copy_number(pangenome, remove_copy_number) - logging.getLogger().info("Computing the neighbors graph...") + logging.info("Computing the neighbors graph...") bar = tqdm(pangenome.organisms, total=len(pangenome.organisms), unit="organism", disable=disable_bar) for org in bar: bar.set_description(f"Processing {org.name}") @@ -105,7 +105,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, if prev is not None and contig.is_circular and len(contig.genes) > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added pangenome.add_edge(contig.genes[0], prev) - logging.getLogger().info("Done making the neighbors graph.") + logging.info("Done making the neighbors graph.") 
pangenome.status["neighborsGraph"] = "Computed" pangenome.parameters["graph"] = {} diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index a9a4e77c..b0784c43 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -24,17 +24,17 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ # check statuses and load info - logging.getLogger().info("Check information in pangenome") + logging.info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): - logging.getLogger().debug(f"Compute binaries for {subset} partition") + logging.debug(f"Compute binaries for {subset} partition") pangenome.compute_org_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms g_sum = 0 - logging.getLogger().debug("Get number of families in each organisms") + logging.debug("Get number of families in each organisms") org2_nb_fam = nb_fam_per_org(pangenome, disable_bar) - logging.getLogger().info(f"Compute rate of unique family for each genome combination in {subset}") + logging.info(f"Compute rate of unique family for each genome combination in {subset}") for c_organisms in tqdm(list(combinations(pangenome.organisms, 2)), unit="combination", disable=disable_bar): tot_fam = org2_nb_fam.get(c_organisms[0].name) + org2_nb_fam.get(c_organisms[1].name) common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 @@ -74,17 +74,17 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: family fluidity value from the pangenome for each partition """ # check statuses and load info - logging.getLogger().info("Check information in pangenome") + logging.info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): - logging.getLogger().debug(f"Compute binaries for {subset} partition") + logging.debug(f"Compute binaries for {subset} partition") pangenome.compute_family_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms f_sum = 0 - logging.getLogger().debug("Get number of families in each organisms") + logging.debug("Get number of families in each organisms") fam_2_nb_org = nb_org_per_fam(pangenome, disable_bar) - logging.getLogger().info("Compute rate of unique organism for each family combination") + logging.info("Compute rate of unique organism for each family combination") for c_fam in tqdm(list(combinations(pangenome.gene_families, 2)), unit="combination", disable=disable_bar): tot_org = fam_2_nb_org.get(c_fam[0].name) + fam_2_nb_org.get(c_fam[1].name) common_fam = popcount(c_fam[0].bitarray & c_fam[1].bitarray) - 1 diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index c9ca4713..fbfaf6e7 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -79,17 +79,17 @@ def write_metrics(pangenome: Pangenome, metrics_dict: dict, no_print_info: bool """ with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info - logging.getLogger().debug("H5f open") + logging.debug("H5f open") if 'genomes_fluidity' in metrics_dict.keys(): - logging.getLogger().info("Writing genome 
fluidity in pangenome") + logging.info("Writing genome fluidity in pangenome") info_group._v_attrs.genomes_fluidity = metrics_dict['genomes_fluidity'] if 'families_fluidity' in metrics_dict.keys(): - logging.getLogger().info("Writing family fluidity in pangenome") + logging.info("Writing family fluidity in pangenome") info_group._v_attrs.families_fluidity = metrics_dict['families_fluidity'] if 'info_modules' in metrics_dict.keys(): - logging.getLogger().info("Writing modules information in pangenome") + logging.info("Writing modules information in pangenome") write_info_modules(pangenome, h5f) # After all metrics was written @@ -115,12 +115,12 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - logging.getLogger().debug("Check if one of the metrics was already compute") + logging.debug("Check if one of the metrics was already compute") if not args.force: check_metric(pangenome, **args_dict) - logging.getLogger().info("Metrics computation begin") + logging.info("Metrics computation begin") metrics_dictionary = compute_metrics(pangenome, disable_bar=args.disable_prog_bar, **args_dict) - logging.getLogger().info("Metrics computation done") + logging.info("Metrics computation done") write_metrics(pangenome, metrics_dictionary, no_print_info=args.no_print_info) diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index b37ce976..f494cde5 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -118,10 +118,10 @@ def predict_modules(pangenome: Pangenome, tmpdir: str, cpu: int = 1, dup_margin: # compute the graph with transitive closure size provided as parameter start_time = time.time() - logging.getLogger().info("Building the graph...") + logging.info("Building the graph...") g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) - logging.getLogger().info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") - logging.getLogger().info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + logging.info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") + logging.info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") start_time = time.time() # get all multigenic gene families @@ -134,8 +134,8 @@ def predict_modules(pangenome: Pangenome, tmpdir: str, cpu: int = 1, dup_margin: for mod in modules: fams |= mod.families - logging.getLogger().info(f"There are {len(fams)} families among {len(modules)} modules") - logging.getLogger().info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") + logging.info(f"There are {len(fams)} families among {len(modules)} modules") + logging.info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") pangenome.add_modules(modules) diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 79637079..49de793f 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -52,7 +52,7 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis :return: Nem parameters and if not just log likelihood the families associated to partition """ - logging.getLogger().debug("run_partitioning...") + logging.debug("run_partitioning...") if init == "param_file": with open(nem_dir_path + "/nem_file_init_" + str(kval) + ".m", "w") as m_file: m_file.write("1 ") # 1 to initialize parameter, @@ -86,8 +86,8 @@ def run_partitioning(nem_dir_path: str, nb_org: int, 
beta: float = 2.5, free_dis convergence_th = 0.01 # (INIT_SORT, init_random, init_param_file, INIT_FILE, INIT_LABEL, INIT_NB) = range(0,6) init_random, init_param_file = range(1, 3) - logging.getLogger().debug("Running NEM...") - logging.getLogger().debug([nem_dir_path.encode('ascii') + b"/nem_file", kval, algo, beta, convergence, + logging.debug("Running NEM...") + logging.debug([nem_dir_path.encode('ascii') + b"/nem_file", kval, algo, beta, convergence, convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, init_param_file if init in ["param_file", "init_from_old"] else init_random, nem_dir_path.encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", @@ -101,16 +101,16 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis out_file_prefix=nem_dir_path.encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), seed=seed) - logging.getLogger().debug("After running NEM...") + logging.debug("After running NEM...") no_nem = False if os.path.isfile(nem_dir_path + "/nem_file_" + str(kval) + ".uf"): - logging.getLogger().debug("Reading NEM results...") + logging.debug("Reading NEM results...") elif not just_log_likelihood: - # logging.getLogger().warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") + # logging.warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") no_nem = True else: - logging.getLogger().debug("No NEM output file found: " + nem_dir_path + "/nem_file_" + str(kval) + ".uf") + logging.debug("No NEM output file found: " + nem_dir_path + "/nem_file_" + str(kval) + ".uf") no_nem = True index_fam = [] @@ -163,7 +163,7 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis else: partitions_list[i] = parti[positions_max_prob.pop()] except IOError: - logging.getLogger().debug("partitioning did not work (the number of organisms used is probably too low), " + logging.debug("partitioning did not work (the number of organisms used is probably too low), " "see logs here to obtain more details " + nem_dir_path + "/nem_file_" + str(kval) + ".log") return {}, None, None # return empty objects @@ -251,7 +251,7 @@ def write_nem_input_files(tmpdir: str, organisms: set, sm_degree: int = 10) -> ( with open(tmpdir + "/column_org_file", "w") as org_file: org_file.write(" ".join([f'"{org.name}"' for org in organisms]) + "\n") - logging.getLogger().debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") + logging.debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") with open(tmpdir + "/nem_file.str", "w") as str_file, \ open(tmpdir + "/nem_file.index", "w") as index_file, \ open(tmpdir + "/nem_file.nei", "w") as nei_file, \ @@ -466,7 +466,7 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl tmp_dir = tempfile.TemporaryDirectory(dir=tmpdir) if len(organisms) <= 10: - logging.getLogger().warning(f"The number of selected organisms is too low ({len(organisms)} " + logging.warning(f"The number of selected organisms is too low ({len(organisms)} " f"organisms used) to robustly partition the graph") pangenome.parameters["partition"] = {} @@ -479,10 +479,10 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl if kval < 2: pangenome.parameters["partition"]["computed_K"] = True - logging.getLogger().info("Estimating the optimal number of partitions...") + logging.info("Estimating the optimal number of partitions...") kval = 
evaluate_nb_partitions(organisms, tmp_dir.name, outputdir, sm_degree, free_dispersion, chunk_size, kmm, icl_margin, draw_icl, cpu, seed, disable_bar=disable_bar) - logging.getLogger().info(f"The number of partitions has been evaluated at {kval}") + logging.info(f"The number of partitions has been evaluated at {kval}") pangenome.parameters["partition"]["K"] = kval init = "param_file" @@ -500,7 +500,7 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0} start_partitioning = time.time() - logging.getLogger().info("Partitioning...") + logging.info("Partitioning...") pansize = len(families) if chunk_size < len(organisms): validated = set() @@ -544,7 +544,7 @@ def validate_family(res): args.append((i, tmp_dir.name, kval, beta, sm_degree, free_dispersion, seed, init, keep_tmp_files)) - logging.getLogger().info("Launching NEM") + logging.info("Launching NEM") with get_context('fork').Pool(processes=cpu) as p: # launch partitioning bar = tqdm(range(len(args)), unit=" samples partitioned", disable=disable_bar) @@ -554,7 +554,7 @@ def validate_family(res): bar.close() condition += 1 # if len(validated) < pan_size, we will want to resample more. - logging.getLogger().debug(f"There are {len(validated)} validated families out of {pansize} families.") + logging.debug(f"There are {len(validated)} validated families out of {pansize} families.") p.close() p.join() for fam, data in cpt_partition.items(): @@ -563,7 +563,7 @@ def validate_family(res): # need to compute the median vectors of each partition ??? partitioning_results = [partitioning_results, []] # introduces a 'non feature'. - logging.getLogger().info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " + logging.info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") else: edges_weight, nb_fam = write_nem_input_files(tmp_dir.name + "/" + str(cpt) + "/", organisms, @@ -575,7 +575,7 @@ def validate_family(res): raise Exception("Statistical partitioning does not work on your data. 
" "This usually happens because you used very few (<15) genomes.") cpt += 1 - logging.getLogger().info(f"Partitioned {len(organisms)} genomes in " + logging.info(f"Partitioned {len(organisms)} genomes in " f"{round(time.time() - start_partitioning, 2)} seconds.") # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitioning_results[1], chunk_size) @@ -603,9 +603,9 @@ def launch(args: argparse.Namespace): partition(pan, args.tmpdir, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, args.chunk_size, args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, args.keep_tmp_files, args.force, disable_bar=args.disable_prog_bar) - logging.getLogger().debug("Write partition in pangenome") + logging.debug("Write partition in pangenome") write_pangenome(pan, pan.file, args.force, disable_bar=args.disable_prog_bar) - logging.getLogger().debug("Partitioning is finished") + logging.debug("Partitioning is finished") def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index c2895734..930ef3ec 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -155,7 +155,7 @@ def draw_curve(output: str, data: list, max_sampling: int = 10): :param max_sampling: Maximum number of organisms in a sample :param data: """ - logging.getLogger().info("Drawing the rarefaction curve ...") + logging.info("Drawing the rarefaction curve ...") raref_name = output + "/rarefaction.csv" raref = open(raref_name, "w") raref.write(",".join(["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", @@ -373,24 +373,24 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: if kval < 3 and kestimate is False: # estimate K once and for all. try: kval = ppp.pan.parameters["partition"]["K"] - logging.getLogger().info(f"Reuse the number of partitions {kval}") + logging.info(f"Reuse the number of partitions {kval}") except KeyError: - logging.getLogger().info("Estimating the number of partitions...") + logging.info("Estimating the number of partitions...") kval = ppp.evaluate_nb_partitions(set(pangenome.organisms), tmpdir, None, sm_degree, free_dispersion, chunk_size, krange, 0.05, False, cpu, seed) - logging.getLogger().info(f"The number of partitions has been evaluated at {kval}") + logging.info(f"The number of partitions has been evaluated at {kval}") - logging.getLogger().info("Extracting samples ...") + logging.info("Extracting samples ...") all_samples = [] for i in range(min_sampling, max_sampling): # each point for _ in range(depth): # number of samples per points all_samples.append(set(random.sample(set(pangenome.organisms), i + 1))) - logging.getLogger().info(f"Done sampling organisms in the pan, there are {len(all_samples)} samples") + logging.info(f"Done sampling organisms in the pan, there are {len(all_samples)} samples") samp_nb_per_part = [] - logging.getLogger().info("Computing bitarrays for each family...") + logging.info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() - logging.getLogger().info("Done computing bitarrays. Comparing them to get exact and soft core stats for " + logging.info("Done computing bitarrays. 
Comparing them to get exact and soft core stats for " f"{len(all_samples)} samples...") bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) for samp in all_samples: @@ -431,7 +431,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: with get_context('fork').Pool(processes=cpu) as p: # launch partitioning - logging.getLogger().info(" Partitioning all samples...") + logging.info(" Partitioning all samples...") bar = tqdm(range(len(args)), unit="samples partitioned", disable=disable_bar) random.shuffle(args) # shuffling the processing so that the progress bar is closer to reality. for result in p.imap_unordered(launch_raref_nem, args): @@ -439,12 +439,12 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: bar.update() bar.close() - logging.getLogger().info("Done partitioning everything") + logging.info("Done partitioning everything") warnings.filterwarnings("ignore") draw_curve(output, samp_nb_per_part, max_sampling) warnings.resetwarnings() tmpdir_obj.cleanup() - logging.getLogger().info("Done making the rarefaction curves") + logging.info("Done making the rarefaction curves") def launch(args: argparse.Namespace): diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 9b60afe1..2c07f51d 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -385,7 +385,7 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen len([gene for gene in genes if not gene.is_fragment]) > 1]) if (dup / len(fam.organisms)) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) - # logging.getLogger().info(f"{len(multigenics)} gene families are defined as being multigenic. + # logging.info(f"{len(multigenics)} gene families are defined as being multigenic. 
# (duplicated in more than {dup_margin} of the genomes)") return multigenics diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 3977829b..88ece3e0 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -406,21 +406,21 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger().debug(f"all") + logging.debug(f"all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition == 'persistent': - logging.getLogger().debug(f"persistent") + logging.debug(f"persistent") for fam in self.families: if fam.named_partition in ['persistent']: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger().debug(f"shell, cloud") + logging.debug(f"shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.getLogger().debug(f"accessory") + logging.debug(f"accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 7d286668..9d1af6f5 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -102,8 +102,8 @@ def set_verbosity_level(args): logging.basicConfig(stream=args.log, level=level, format='%(asctime)s %(filename)s:l%(lineno)d %(levelname)s\t%(message)s', datefmt='%Y-%m-%d %H:%M:%S') - logging.getLogger().info("Command: " + " ".join([arg for arg in sys.argv])) - logging.getLogger().info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) + logging.info("Command: " + " ".join([arg for arg in sys.argv])) + logging.info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: @@ -198,7 +198,7 @@ def mk_outdir(output: Path, force: bool = False): :raise FileExistError: The current path already exist and force is false """ if not output.is_dir(): - logging.getLogger().debug(f"Create output directory {output.absolute().as_posix()}") + logging.debug(f"Create output directory {output.absolute().as_posix()}") Path.mkdir(output) else: if not force: diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 3a081a53..45cec674 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -122,16 +122,16 @@ def launch(args: argparse.Namespace): borders=True, modules=True, spot_modules=True) desc_time = time.time() - start_desc - logging.getLogger().info(f"Annotation took : {round(anno_time, 2)} seconds") - logging.getLogger().info(f"Clustering took : {round(clust_time, 2)} seconds") - logging.getLogger().info(f"Building the graph took : {round(graph_time, 2)} seconds") - logging.getLogger().info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") - logging.getLogger().info(f"Predicting RGP took : {round(regions_time, 2)} seconds") - logging.getLogger().info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") - logging.getLogger().info(f"Predicting modules took : {round(mod_time, 2)} seconds") - logging.getLogger().info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") + logging.info(f"Annotation took : {round(anno_time, 2)} seconds") + logging.info(f"Clustering took : {round(clust_time, 2)} seconds") + logging.info(f"Building the graph took : {round(graph_time, 2)} seconds") + logging.info(f"Partitioning the pangenome 
took : {round(part_time, 2)} seconds") + logging.info(f"Predicting RGP took : {round(regions_time, 2)} seconds") + logging.info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") + logging.info(f"Predicting modules took : {round(mod_time, 2)} seconds") + logging.info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") if not args.only_pangenome: - logging.getLogger().info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") + logging.info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") print_info(filename, content=True) diff --git a/ppanggolin/workflow/panModule.py b/ppanggolin/workflow/panModule.py index 2b90f52c..003d0bea 100644 --- a/ppanggolin/workflow/panModule.py +++ b/ppanggolin/workflow/panModule.py @@ -101,13 +101,13 @@ def launch(args: argparse.Namespace): projection=True, stats=True, json=True, partitions=True, modules=True) desc_time = time.time() - start_desc - logging.getLogger().info(f"Annotation took : {round(anno_time, 2)} seconds") - logging.getLogger().info(f"Clustering took : {round(clust_time, 2)} seconds") - logging.getLogger().info(f"Building the graph took : {round(graph_time, 2)} seconds") - logging.getLogger().info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") - logging.getLogger().info(f"Predicting modules took : {round(mod_time, 2)} seconds") - logging.getLogger().info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") - logging.getLogger().info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") + logging.info(f"Annotation took : {round(anno_time, 2)} seconds") + logging.info(f"Clustering took : {round(clust_time, 2)} seconds") + logging.info(f"Building the graph took : {round(graph_time, 2)} seconds") + logging.info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") + logging.info(f"Predicting modules took : {round(mod_time, 2)} seconds") + logging.info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") + logging.info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") print_info(filename, content=True) diff --git a/ppanggolin/workflow/panRGP.py b/ppanggolin/workflow/panRGP.py index 7c0a8445..2e06814d 100644 --- a/ppanggolin/workflow/panRGP.py +++ b/ppanggolin/workflow/panRGP.py @@ -111,14 +111,14 @@ def launch(args: argparse.Namespace): projection=True, stats=True, json=True, partitions=True, regions=True, spots=True) desc_time = time.time() - start_desc - logging.getLogger().info(f"Annotation took : {round(anno_time, 2)} seconds") - logging.getLogger().info(f"Clustering took : {round(clust_time, 2)} seconds") - logging.getLogger().info(f"Building the graph took : {round(graph_time, 2)} seconds") - logging.getLogger().info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") - logging.getLogger().info(f"Predicting RGP took : {round(regions_time, 2)} seconds") - logging.getLogger().info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") - logging.getLogger().info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") - logging.getLogger().info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") + logging.info(f"Annotation took : {round(anno_time, 2)} seconds") + logging.info(f"Clustering took : {round(clust_time, 2)} seconds") + logging.info(f"Building the graph took : {round(graph_time, 2)} seconds") + 
logging.info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") + logging.info(f"Predicting RGP took : {round(regions_time, 2)} seconds") + logging.info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") + logging.info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") + logging.info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") print_info(filename, content=True) From d8f04c211fc353eaa2e1a90875a8a8ad7b106e7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 15 Jun 2023 22:03:09 +0200 Subject: [PATCH 09/75] Refactor partition and rarefaction --- VERSION | 2 +- ppanggolin/graph/makeGraph.py | 3 +- ppanggolin/info/info.py | 16 +--- ppanggolin/main.py | 3 +- ppanggolin/mod/module.py | 20 ++--- ppanggolin/nem/partition.py | 150 ++++++++++++++++--------------- ppanggolin/nem/rarefaction.py | 58 ++++++------ ppanggolin/workflow/all.py | 2 +- ppanggolin/workflow/panModule.py | 2 +- ppanggolin/workflow/panRGP.py | 2 +- ppanggolin/workflow/workflow.py | 2 +- 11 files changed, 128 insertions(+), 132 deletions(-) diff --git a/VERSION b/VERSION index 0e2d0606..6efbab83 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.111 +1.2.112 diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 175e0817..0d27d6d1 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -4,6 +4,7 @@ # default libraries import logging import argparse +from pathlib import Path # installed libraries from tqdm import tqdm @@ -146,7 +147,7 @@ def parser_graph(parser: argparse.ArgumentParser): :param parser: parser for align argument """ - parser.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + parser.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") parser.add_argument('-r', '--remove_high_copy_number', type=int, default=0, help="Positive Number: Remove families having a number of copy of gene in a single organism " "above or equal to this threshold in at least one organism " diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index ebd90e77..7b6d57f2 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -8,10 +8,10 @@ import tables # local libraries -from ppanggolin.formats import read_info, read_parameters +from ppanggolin.formats import read_info, read_parameters, fix_partitioned -def print_info(pangenome: str, status: bool = False, content: bool = False, parameters: bool = False): +def print_info(pangenome: Path, status: bool = False, content: bool = False, parameters: bool = False): """ Main function to return information about pangenome @@ -20,6 +20,7 @@ def print_info(pangenome: str, status: bool = False, content: bool = False, para :param content: Get pangenome content :param parameters: Get pangenome parameters """ + fix_partitioned(pangenome) if status or content or parameters: h5f = tables.open_file(pangenome, "r+") if status: @@ -30,16 +31,6 @@ def print_info(pangenome: str, status: bool = False, content: bool = False, para print(f"gene families have their sequences : " f"{'true' if status_group._v_attrs.geneFamilySequences else 'false'}") print(f"neighbors graph : {'true' if status_group._v_attrs.NeighborsGraph else 'false'}") - if 'Partitionned' in status_group._v_attrs._f_list(): - # Partitionned keep working with older version - h5f.close() - h5f = tables.open_file(pangenome, "a") - status_group = h5f.root.status - if 
status_group._v_attrs.Partitionned: - status_group._v_attrs.Partitioned = True - else: - status_group._v_attrs.Partitioned = False - del status_group._v_attrs.Partitionned if status_group._v_attrs.Partitioned: print("pangenome partitioned : true") else: @@ -55,7 +46,6 @@ def print_info(pangenome: str, status: bool = False, content: bool = False, para if hasattr(status_group._v_attrs, "version"): print(f"PPanGGOLiN version : {status_group._v_attrs.version}") - if content: read_info(h5f) if parameters: diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 86b6d1ce..f93fe39f 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -10,6 +10,7 @@ import argparse import pkg_resources import tempfile +from pathlib import Path # local modules import ppanggolin.pangenome @@ -105,7 +106,7 @@ def cmd_line() -> argparse.Namespace: for sub in subs: # add options common to all subcommands common = sub._action_groups.pop(1) # get the 'optional arguments' action group. common.title = "Common arguments" - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index f494cde5..1ca695ac 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -5,24 +5,18 @@ import logging import argparse import time -import os import tempfile -import subprocess -from itertools import combinations -from statistics import mean, median -from collections import defaultdict +from pathlib import Path # installed libraries from tqdm import tqdm import networkx as nx -from gmpy2 import xmpz, popcount # pylint: disable=no-name-in-module # local libraries -from ppanggolin.genome import Organism from ppanggolin.pangenome import Pangenome from ppanggolin.region import Module from ppanggolin.formats import check_pangenome_info, write_pangenome, erase_pangenome -from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components +from ppanggolin.utils import restricted_float, add_gene, connected_components def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): @@ -76,7 +70,7 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int :param multi: a set of families :class:`ppanggolin.geneFamily.GeneFamily` considered multigenic :param weight: the minimal jaccard under which edges are not considered :param min_fam: the minimal number of presence under which the family is not considered - : param size: Minimal number of gene family in a module + :param size: Minimal number of gene family in a module """ # removing families with low presence @@ -94,15 +88,13 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int return modules -def predict_modules(pangenome: Pangenome, tmpdir: str, cpu: int = 1, dup_margin: float = 0.05, +def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = 3, min_presence: int = 2, transitive: int = 4, jaccard: float = 0.85, force: bool = False, disable_bar: bool = False): """ Main function to predict module :param pangenome: Pangenome object with Gene Families, Annotation and Partition - :param tmpdir: Path to temporary directory - :param cpu: Number of available core 
:param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated :param size: Minimal number of gene family in a module :param min_presence: Minimum number of times the module needs to be present in the pangenome to be reported. @@ -156,7 +148,7 @@ def launch(args: argparse.Namespace): """ pangenome = Pangenome() pangenome.add_file(args.pangenome) - predict_modules(pangenome=pangenome, tmpdir=args.tmpdir, cpu=args.cpu, dup_margin=args.dup_margin, size=args.size, + predict_modules(pangenome=pangenome, dup_margin=args.dup_margin, size=args.size, min_presence=args.min_presence, transitive=args.transitive, jaccard=args.jaccard, force=args.force, disable_bar=args.disable_prog_bar) write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) @@ -183,7 +175,7 @@ def parser_module(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("--size", required=False, type=int, default=3, diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 49de793f..0bfd3360 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -12,9 +12,10 @@ from collections import defaultdict, Counter import math from shutil import copytree +from pathlib import Path # installed libraries -from typing import Union, Tuple +from typing import Union, Tuple, List from tqdm import tqdm import plotly.offline as out_plotly @@ -32,10 +33,10 @@ samples = [] -def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dispersion: bool = False, kval: int = 3, +def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_dispersion: bool = False, kval: int = 3, seed: int = 42, init: str = "param_file", keep_files: bool = False, itermax: int = 100, - just_log_likelihood: bool = False) -> Union[Tuple[dict, None, None], Tuple[int, float, float], - Tuple[dict, dict, float]]: + just_log_likelihood: bool = False) \ + -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ Main function to make partitionning @@ -54,7 +55,7 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis """ logging.debug("run_partitioning...") if init == "param_file": - with open(nem_dir_path + "/nem_file_init_" + str(kval) + ".m", "w") as m_file: + with open(nem_dir_path / f"nem_file_init_{str(kval)}.m", "w") as m_file: m_file.write("1 ") # 1 to initialize parameter, m_file.write(" ".join([str(round(1 / float(kval), 2))] * (kval - 1)) + " ") # 1/K give the initial proportion to each class @@ -87,34 +88,35 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis # (INIT_SORT, init_random, init_param_file, INIT_FILE, INIT_LABEL, INIT_NB) = range(0,6) init_random, init_param_file = range(1, 3) logging.debug("Running NEM...") - logging.debug([nem_dir_path.encode('ascii') + b"/nem_file", kval, algo, beta, convergence, - convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, - init_param_file if init in ["param_file", "init_from_old"] else init_random, - nem_dir_path.encode('ascii') + b"/nem_file_init_" + 
str(kval).encode('ascii') + b".m", - nem_dir_path.encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), - seed]) - nem_stats.nem(Fname=nem_dir_path.encode('ascii') + b"/nem_file", nk=kval, algo=algo, beta=beta, + logging.debug([nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, + convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, + init_param_file if init in ["param_file", "init_from_old"] else init_random, + nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", + nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), + seed]) + nem_stats.nem(Fname=nem_dir_path.as_posix().encode('ascii') + b"/nem_file", nk=kval, algo=algo, beta=beta, convergence=convergence, convergence_th=convergence_th, format=b"fuzzy", it_max=itermax, dolog=True, model_family=model, proportion=proportion, dispersion=variance_model, init_mode=init_param_file if init in ["param_file", "init_from_old"] else init_random, - init_file=nem_dir_path.encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", - out_file_prefix=nem_dir_path.encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), + init_file=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", + out_file_prefix=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), seed=seed) logging.debug("After running NEM...") no_nem = False - if os.path.isfile(nem_dir_path + "/nem_file_" + str(kval) + ".uf"): + nem_out_path = nem_dir_path / f"nem_file_{str(kval)}.uf" + if nem_out_path.is_file(): logging.debug("Reading NEM results...") elif not just_log_likelihood: # logging.warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") no_nem = True else: - logging.debug("No NEM output file found: " + nem_dir_path + "/nem_file_" + str(kval) + ".uf") + logging.debug(f"No NEM output file found: {nem_out_path.absolute().as_posix()}") no_nem = True index_fam = [] - with open(nem_dir_path + "/nem_file.index", "r") as index_nem_file: + with open(nem_dir_path / "nem_file.index", "r") as index_nem_file: for line in index_nem_file: index_fam.append(line.split("\t")[1].strip()) @@ -123,8 +125,8 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis log_likelihood = None entropy = None try: - with open(nem_dir_path + "/nem_file_" + str(kval) + ".uf", "r") as partitions_nem_file, open( - nem_dir_path + "/nem_file_" + str(kval) + ".mf", "r") as parameters_nem_file: + with open(nem_dir_path / f"nem_file_{str(kval)}.uf", "r") as partitions_nem_file, \ + open(nem_dir_path / f"nem_file_{str(kval)}.mf", "r") as parameters_nem_file: parameters = parameters_nem_file.readlines() log_likelihood = float(parameters[2].split()[3]) @@ -164,23 +166,23 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis partitions_list[i] = parti[positions_max_prob.pop()] except IOError: logging.debug("partitioning did not work (the number of organisms used is probably too low), " - "see logs here to obtain more details " + nem_dir_path + "/nem_file_" + - str(kval) + ".log") + "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + + str(kval) + ".log") return {}, None, None # return empty objects except ValueError: # return the default partitions_list which correspond to undefined pass if not keep_files and no_nem is False: - os.remove(nem_dir_path + "/nem_file_" + str(kval) + ".uf") - 
os.remove(nem_dir_path + "/nem_file_" + str(kval) + ".mf") - os.remove(nem_dir_path + "/nem_file_" + str(kval) + ".log") - os.remove(nem_dir_path + "/nem_file_" + str(kval) + ".stderr") - os.remove(nem_dir_path + "/nem_file_init_" + str(kval) + ".m") - os.remove(nem_dir_path + "/nem_file.index") - os.remove(nem_dir_path + "/nem_file.dat") - os.remove(nem_dir_path + "/nem_file.nei") - os.remove(nem_dir_path + "/nem_file.str") + os.remove(nem_dir_path / f"nem_file_{str(kval)}.uf") + os.remove(nem_dir_path / f"nem_file_{str(kval)}.mf") + os.remove(nem_dir_path / f"nem_file_{str(kval)}.log") + os.remove(nem_dir_path / f"nem_file_{str(kval)}.stderr") + os.remove(nem_dir_path / f"nem_file_init_{str(kval)}.m") + os.remove(nem_dir_path / "nem_file.index") + os.remove(nem_dir_path / "nem_file.dat") + os.remove(nem_dir_path / "nem_file.nei") + os.remove(nem_dir_path / "nem_file.str") if just_log_likelihood: return kval, log_likelihood, entropy @@ -188,7 +190,8 @@ def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dis return dict(zip(index_fam, partitions_list)), all_parameters, log_likelihood -def nem_single(args: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: +def nem_single(args: List[Tuple[Path, int, float, bool, int, int, str, bool, int, bool]]) \ + -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ Allow to run partitioning in multiprocessing to evaluate partition number @@ -199,10 +202,10 @@ def nem_single(args: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, return run_partitioning(*args) -def partition_nem(index: int, tmpdir: str, kval: int, beta: float = 2.5, sm_degree: int = 10, +def partition_nem(index: int, kval: int, beta: float = 2.5, sm_degree: int = 10, free_dispersion: bool = False, seed: int = 42, init: str = "param_file", - keep_tmp_files: bool = False) -> Union[Tuple[dict, None, None], Tuple[int, float, float], - Tuple[dict, dict, float]]: + tmpdir: Path = None, keep_tmp_files: bool = False) \ + -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ :param index: Index of the sample group @@ -217,11 +220,10 @@ def partition_nem(index: int, tmpdir: str, kval: int, beta: float = 2.5, sm_degr :return: """ - currtmpdir = tmpdir + "/" + str(index) # unique directory name + currtmpdir = tmpdir / f"{str(index)}" # unique directory name samp = samples[index] # org_samples accessible because it is a global variable. 
edges_weight, nb_fam = write_nem_input_files(tmpdir=currtmpdir, organisms=samp, sm_degree=sm_degree) - return run_partitioning(currtmpdir, len(samp), beta * (nb_fam / edges_weight), free_dispersion, kval=kval, seed=seed, init=init, keep_files=keep_tmp_files) @@ -235,7 +237,7 @@ def nem_samples(pack: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, return partition_nem(*pack) -def write_nem_input_files(tmpdir: str, organisms: set, sm_degree: int = 10) -> (float, int): +def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> Tuple[float, int]: """ Create and format input files for partitioning with NEM @@ -248,14 +250,14 @@ def write_nem_input_files(tmpdir: str, organisms: set, sm_degree: int = 10) -> ( mk_outdir(tmpdir, force=False) total_edges_weight = 0 - with open(tmpdir + "/column_org_file", "w") as org_file: + with open(tmpdir / "column_org_file", "w") as org_file: org_file.write(" ".join([f'"{org.name}"' for org in organisms]) + "\n") logging.debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") - with open(tmpdir + "/nem_file.str", "w") as str_file, \ - open(tmpdir + "/nem_file.index", "w") as index_file, \ - open(tmpdir + "/nem_file.nei", "w") as nei_file, \ - open(tmpdir + "/nem_file.dat", "w") as dat_file: + with open(tmpdir / "nem_file.str", "w") as str_file, \ + open(tmpdir / "nem_file.index", "w") as index_file, \ + open(tmpdir / "nem_file.nei", "w") as nei_file, \ + open(tmpdir / "nem_file.dat", "w") as dat_file: nei_file.write("1\n") index_fam = {} @@ -303,16 +305,15 @@ def write_nem_input_files(tmpdir: str, organisms: set, sm_degree: int = 10) -> ( return total_edges_weight / 2, len(index_fam) -def evaluate_nb_partitions(organisms: set, tmpdir: str, outputdir: str = None, sm_degree: int = 10, - free_dispersion: bool = False, chunk_size: int = 500, krange: list = None, - icl_margin: float = 0.05, draw_icl: bool = False, cpu: int = 1, seed: int = 42, - disable_bar: bool = False) -> int: +def evaluate_nb_partitions(organisms: set, output: Path = None, sm_degree: int = 10, free_dispersion: bool = False, + chunk_size: int = 500, krange: list = None, icl_margin: float = 0.05, draw_icl: bool = False, + cpu: int = 1, seed: int = 42, tmpdir: Path = None, disable_bar: bool = False) -> int: """ Evaluate the optimal number of partition for the pangenome :param organisms: Set of organisms from pangenome :param tmpdir: temporary directory path - :param outputdir: output directory path to draw ICL + :param output: output directory path to draw ICL :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. :param chunk_size: Size of the chunks when performing partitioning using chunks of organisms. 
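Besides the Path conversion, the write_nem_input_files signature above replaces the return annotation (float, int) with Tuple[float, int]. The parenthesized form simply evaluates to a tuple of the two classes and is not accepted by static type checkers such as mypy as a two-element return type; the typing form is the conventional spelling. A short, self-contained illustration (function name and values are made up):

    from typing import Tuple

    # "-> (float, int)" evaluates to the tuple (float, int) at definition
    # time and is rejected as a type by checkers such as mypy.
    # Tuple[float, int] (or tuple[float, int] on Python >= 3.9) is the
    # standard way to declare a (float, int) pair return value.
    def edges_weight_and_family_count() -> Tuple[float, int]:
        return 12.5, 42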
@@ -325,8 +326,8 @@ def evaluate_nb_partitions(organisms: set, tmpdir: str, outputdir: str = None, s :return: Ideal number of partition computed """ - - newtmpdir = tmpdir + "/eval_partitions" + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir + newtmpdir = tmpdir / "eval_partitions" if len(organisms) > chunk_size: select_organisms = set(random.sample(set(organisms), chunk_size)) @@ -410,7 +411,8 @@ def evaluate_nb_partitions(organisms: set, tmpdir: str, outputdir: str = None, s dict(type='line', x0=2, x1=krange[1], y0=all_icls[best_k], y1=all_icls[best_k], line=dict(dict(width=1, dash='dashdot', color="black")))]) fig = go.Figure(data=traces, layout=layout) - out_plotly.plot(fig, filename=outputdir + "/ICL_curve_K" + str(best_k) + ".html", auto_open=False) + out_plot = output / f"ICL_curve_K{str(best_k)}.html" + out_plotly.plot(fig, filename=out_plot.as_posix(), auto_open=False) return chosen_k @@ -428,16 +430,16 @@ def check_pangenome_former_partition(pangenome: Pangenome, force: bool = False): erase_pangenome(pangenome, partition=True) -def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: float = 2.5, sm_degree: int = 10, +def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_degree: int = 10, free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, icl_margin: float = 0.05, draw_icl: bool = False, cpu: int = 1, seed: int = 42, - keep_tmp_files: bool = False, force: bool = False, disable_bar: bool = False): + tmpdir: Path = None, keep_tmp_files: bool = False, force: bool = False, disable_bar: bool = False): """ Partitioning the pangenome :param pangenome: Pangenome containing GeneFamilies to align with sequence set :param tmpdir: temporary directory path - :param outputdir: output directory path to draw ICL + :param output: output directory path to draw ICL :param beta: strength of the smoothing using the graph topology during partitioning. 0 deactivate spatial smoothing :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. 
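The refactored evaluate_nb_partitions above no longer requires a tmpdir argument; it takes tmpdir: Path = None and falls back to the system temporary directory when the caller passes nothing. A minimal sketch of that default pattern on a hypothetical function (Optional[Path] is the stricter annotation for a parameter that may be None):

    import tempfile
    from pathlib import Path
    from typing import Optional

    def evaluate(tmpdir: Optional[Path] = None) -> Path:
        # Resolve the default inside the body, so the temporary directory is
        # looked up at call time rather than when the module is imported.
        tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
        return tmpdir / "eval_partitions"

    print(evaluate())   # e.g. /tmp/eval_partitions on Linux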
@@ -452,22 +454,24 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl :param force: Allow to force write on Pangenome file :param disable_bar: Disable progress bar """ + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir kmm = [3, 20] if krange is None else krange global samples global pan pan = pangenome - if draw_icl and outputdir is None: + if draw_icl and output is None: raise Exception("Combination of option impossible: " "You asked to draw the ICL curves but did not provide an output directory!") check_pangenome_former_partition(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) organisms = set(pangenome.organisms) tmp_dir = tempfile.TemporaryDirectory(dir=tmpdir) + tmp_path = Path(tmp_dir.name) if len(organisms) <= 10: logging.warning(f"The number of selected organisms is too low ({len(organisms)} " - f"organisms used) to robustly partition the graph") + f"organisms used) to robustly partition the graph") pangenome.parameters["partition"] = {} pangenome.parameters["partition"]["beta"] = beta @@ -480,8 +484,8 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl if kval < 2: pangenome.parameters["partition"]["computed_K"] = True logging.info("Estimating the optimal number of partitions...") - kval = evaluate_nb_partitions(organisms, tmp_dir.name, outputdir, sm_degree, free_dispersion, chunk_size, kmm, - icl_margin, draw_icl, cpu, seed, disable_bar=disable_bar) + kval = evaluate_nb_partitions(organisms, output, sm_degree, free_dispersion, chunk_size, kmm, + icl_margin, draw_icl, cpu, seed, tmp_path, disable_bar) logging.info(f"The number of partitions has been evaluated at {kval}") pangenome.parameters["partition"]["K"] = kval @@ -541,8 +545,8 @@ def validate_family(res): args = [] # tmpdir, beta, sm_degree, free_dispersion, K, seed for i, _ in enumerate(samples[prev:], start=prev): - args.append((i, tmp_dir.name, kval, beta, sm_degree, free_dispersion, seed, init, - keep_tmp_files)) + args.append((i, kval, beta, sm_degree, free_dispersion, seed, init, + tmp_path, keep_tmp_files)) logging.info("Launching NEM") with get_context('fork').Pool(processes=cpu) as p: @@ -564,11 +568,11 @@ def validate_family(res): partitioning_results = [partitioning_results, []] # introduces a 'non feature'. 
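partition() above keeps its tempfile.TemporaryDirectory handle but now wraps the directory location in a Path (tmp_path) so per-sample subdirectories can be built with the / operator. A compact sketch of that pairing, with hypothetical names:

    import tempfile
    from pathlib import Path

    # TemporaryDirectory exposes its location only as a string in .name;
    # wrapping it once in Path gives pathlib semantics to everything below.
    tmp_dir = tempfile.TemporaryDirectory()
    tmp_path = Path(tmp_dir.name)

    sample_dir = tmp_path / "0"   # e.g. one directory per sample index
    sample_dir.mkdir()

    # ... write nem_file.* inputs under sample_dir here ...

    tmp_dir.cleanup()             # removes the whole tree; the patch instead
                                  # copytree()s it to output when tmp files are kept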
logging.info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " - f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") + f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") else: - edges_weight, nb_fam = write_nem_input_files(tmp_dir.name + "/" + str(cpt) + "/", organisms, + edges_weight, nb_fam = write_nem_input_files(tmp_path / f"{str(cpt)}", organisms, sm_degree=sm_degree) - partitioning_results = run_partitioning(tmp_dir.name + "/" + str(cpt) + "/", len(organisms), + partitioning_results = run_partitioning(tmp_path / f"{str(cpt)}", len(organisms), beta * (nb_fam / edges_weight), free_dispersion, kval=kval, seed=seed, init=init, keep_files=keep_tmp_files) if partitioning_results == [{}, None, None]: @@ -576,18 +580,18 @@ def validate_family(res): "This usually happens because you used very few (<15) genomes.") cpt += 1 logging.info(f"Partitioned {len(organisms)} genomes in " - f"{round(time.time() - start_partitioning, 2)} seconds.") + f"{round(time.time() - start_partitioning, 2)} seconds.") # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitioning_results[1], chunk_size) - for famName, part in partitioning_results[0].items(): - pangenome.get_gene_family(famName).partition = part + for fam_name, part in partitioning_results[0].items(): + pangenome.get_gene_family(fam_name).partition = part pangenome.status["partitioned"] = "Computed" if not keep_tmp_files: tmp_dir.cleanup() else: - copytree(tmp_dir.name, outputdir + "/NEM_files/") + copytree(tmp_path, output / "NEM_files/") def launch(args: argparse.Namespace): @@ -600,8 +604,8 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) global pan pan.add_file(args.pangenome) - partition(pan, args.tmpdir, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, - args.chunk_size, args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, + partition(pan, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, args.chunk_size, + args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, args.tmpdir, args.keep_tmp_files, args.force, disable_bar=args.disable_prog_bar) logging.debug("Write partition in pangenome") write_pangenome(pan, pan.file, args.force, disable_bar=args.disable_prog_bar) @@ -629,7 +633,7 @@ def parser_partition(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome.h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("-b", "--beta", required=False, default=2.5, type=float, @@ -637,9 +641,9 @@ def parser_partition(parser: argparse.ArgumentParser): "0 will deactivate spatial smoothing.") optional.add_argument("-ms", "--max_degree_smoothing", required=False, default=10, type=float, help="max. 
degree of the nodes to be included in the smoothing process.") - optional.add_argument('-o', '--output', required=False, type=str, - default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), + optional.add_argument('-o', '--output', required=False, type=Path, + default=Path(f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", help="use if the dispersion around the centroid vector of each partition during must be free." @@ -679,7 +683,7 @@ def parser_partition(parser: argparse.ArgumentParser): common = main_parser.add_argument_group(title="Common argument") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index 930ef3ec..a22239e6 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -11,10 +11,10 @@ from multiprocessing import get_context import os import warnings +from pathlib import Path +from typing import Union, Tuple, Dict # installed libraries -from typing import Union, Tuple - from tqdm import tqdm import gmpy2 import numpy @@ -34,8 +34,9 @@ samples = [] -def raref_nem(index: int, tmpdir: str, beta: float = 2.5, sm_degree: int = 10, free_dispersion: bool = False, - chunk_size: int = 500, kval: int = -1, krange: list = None, seed: int = 42) -> (dict, int): +def raref_nem(index: int, tmpdir: Path, beta: float = 2.5, sm_degree: int = 10, + free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, + krange: list = None, seed: int = 42) -> Tuple[Dict[str, int], int]: """ :param index: Index of the sample group organisms @@ -51,12 +52,14 @@ def raref_nem(index: int, tmpdir: str, beta: float = 2.5, sm_degree: int = 10, f :return: Count of each partition and paremeters for the given sample index """ samp = samples[index] - currtmpdir = tmpdir + "/" + str(index) + "/" + currtmpdir = tmpdir / f"{str(index)}" + kmm = [3, 20] if krange is None else krange if kval < 3: - kval = ppp.evaluate_nb_partitions(samp, tmpdir + "/" + str(index) + "_eval", None, sm_degree, free_dispersion, - chunk_size, kmm, 0.05, False, 1, seed) + kval = ppp.evaluate_nb_partitions(organisms=samp, sm_degree=sm_degree, free_dispersion=free_dispersion, + chunk_size=chunk_size, krange=kmm, seed=seed, + tmpdir=tmpdir / f"{str(index)}_eval") if len(samp) <= chunk_size: # all good, just write stuff. 
edges_weight, nb_fam = ppp.write_nem_input_files(tmpdir=currtmpdir, organisms=set(samp), @@ -109,9 +112,11 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo shuffled_orgs = shuffled_orgs[chunk_size:] # making arguments for all samples: for samp in org_samples: - edges_weight, nb_fam = ppp.write_nem_input_files(currtmpdir + "/" + str(cpt) + "/", samp, + if not currtmpdir.exists(): + mk_outdir(currtmpdir) + edges_weight, nb_fam = ppp.write_nem_input_files(currtmpdir / f"{str(cpt)}", samp, sm_degree=sm_degree) - validate_family(ppp.run_partitioning(currtmpdir + "/" + str(cpt) + "/", len(samp), + validate_family(ppp.run_partitioning(currtmpdir / f"{str(cpt)}", len(samp), beta * (nb_fam / edges_weight), free_dispersion, kval=kval, seed=seed, init="param_file")) cpt += 1 @@ -136,7 +141,7 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo return counts, index -def launch_raref_nem(args: tuple) -> (dict, int): +def launch_raref_nem(args: Tuple[int, Path, float, int, bool, int, int, list, int]) -> Tuple[Tuple[Dict[str, int], int]]: """ Launch raref_nem in multiprocessing @@ -147,7 +152,7 @@ def launch_raref_nem(args: tuple) -> (dict, int): return raref_nem(*args) -def draw_curve(output: str, data: list, max_sampling: int = 10): +def draw_curve(output: Path, data: list, max_sampling: int = 10): """ Draw the rarefaction curve and associated data @@ -156,7 +161,7 @@ def draw_curve(output: str, data: list, max_sampling: int = 10): :param data: """ logging.info("Drawing the rarefaction curve ...") - raref_name = output + "/rarefaction.csv" + raref_name = output/"rarefaction.csv" raref = open(raref_name, "w") raref.write(",".join(["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", "soft_accessory", "pangenome", "K"]) + "\n") @@ -176,7 +181,7 @@ def poly_area(p_x: list, p_y: list) -> float: annotations = [] traces = [] data_raref = read_csv(raref_name, index_col=False) - params_file = open(output + "/rarefaction_parameters" + ".csv", "w") + params_file = open(output/"rarefaction_parameters.csv", "w") params_file.write("partition,kappa,gamma,kappa_std_error,gamma_std_error,IQR_area\n") for partition in ["persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", "soft_accessory", "pangenome"]: @@ -321,11 +326,11 @@ def poly_area(p_x: list, p_y: list) -> float: annotations=annotations, plot_bgcolor='#ffffff') fig = go.Figure(data=traces, layout=layout) - out_plotly.plot(fig, filename=output + "/rarefaction_curve.html", auto_open=False) + out_plotly.plot(fig, filename=output.as_posix() + "/rarefaction_curve.html", auto_open=False) params_file.close() -def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: float = 2.5, depth: int = 30, +def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = None, beta: float = 2.5, depth: int = 30, min_sampling: int = 1, max_sampling: int = 100, sm_degree: int = 10, free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, cpu: int = 1, seed: int = 42, kestimate: bool = False, soft_core: float = 0.95, @@ -351,6 +356,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: :param soft_core: Soft core threshold :param disable_bar: Disable progress bar """ + tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir if krange is None: krange = [3, -1] ppp.pan = pangenome # use the global from partition to store the 
pangenome, so that it is usable @@ -363,7 +369,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) tmpdir_obj = tempfile.TemporaryDirectory(dir=tmpdir) - tmpdir = tmpdir_obj.name + tmp_path = Path(tmpdir_obj.name) if float(len(pangenome.organisms)) < max_sampling: max_sampling = len(pangenome.organisms) @@ -376,8 +382,9 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: logging.info(f"Reuse the number of partitions {kval}") except KeyError: logging.info("Estimating the number of partitions...") - kval = ppp.evaluate_nb_partitions(set(pangenome.organisms), tmpdir, None, sm_degree, free_dispersion, - chunk_size, krange, 0.05, False, cpu, seed) + kval = ppp.evaluate_nb_partitions(organisms=set(pangenome.organisms), sm_degree=sm_degree, + free_dispersion=free_dispersion, chunk_size=chunk_size, krange=krange, + cpu=cpu, seed=seed, tmpdir=tmp_path) logging.info(f"The number of partitions has been evaluated at {kval}") logging.info("Extracting samples ...") @@ -391,7 +398,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: logging.info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() logging.info("Done computing bitarrays. Comparing them to get exact and soft core stats for " - f"{len(all_samples)} samples...") + f"{len(all_samples)} samples...") bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) for samp in all_samples: # make the sample's organism bitarray. @@ -427,7 +434,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: args = [] for index, samp in enumerate(samples): - args.append((index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, kval, krange, seed)) + args.append((index, tmp_path, beta, sm_degree, free_dispersion, chunk_size, kval, krange, seed)) with get_context('fork').Pool(processes=cpu) as p: # launch partitioning @@ -486,7 +493,7 @@ def parser_rarefaction(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("-b", "--beta", required=False, default=2.5, type=float, @@ -501,9 +508,10 @@ def parser_rarefaction(parser: argparse.ArgumentParser): optional.add_argument("-ms", "--max_degree_smoothing", required=False, default=10, type=float, help="max. degree of the nodes to be included in the smoothing process.") - optional.add_argument('-o', '--output', required=False, type=str, - default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), + optional.add_argument('-o', '--output', required=False, type=Path, + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", help="use if the dispersion around the centroid vector of each partition during must be free." 
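The sampling loop shown earlier in make_rarefaction_curve draws depth random subsets of organisms for every sample size between min_sampling and max_sampling. A small stand-alone sketch of that scheme with made-up organism names; note that passing a set to random.sample() is deprecated since Python 3.9 and rejected in 3.11, so the sketch converts to a list first:

    import random

    organisms = {f"genome_{i}" for i in range(50)}   # hypothetical pangenome
    min_sampling, max_sampling, depth = 1, 10, 30

    all_samples = []
    for i in range(min_sampling, max_sampling):      # one rarefaction point per size
        for _ in range(depth):                       # depth subsets per point
            all_samples.append(set(random.sample(list(organisms), i + 1)))

    # len(all_samples) == (max_sampling - min_sampling) * depth == 270 here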
@@ -538,7 +546,7 @@ def parser_rarefaction(parser: argparse.ArgumentParser): common = main_parser.add_argument_group(title="Common argument") common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--tmpdir", required=False, type=str, default=tempfile.gettempdir(), + common.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 45cec674..1ec6d6bb 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -79,7 +79,7 @@ def launch(args: argparse.Namespace): graph_time = time.time() - start_graph start_part = time.time() - partition(pangenome, tmpdir=args.tmpdir, kval=args.nb_of_partitions, cpu=args.cpu, + partition(pangenome, kval=args.nb_of_partitions, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) part_time = time.time() - start_part diff --git a/ppanggolin/workflow/panModule.py b/ppanggolin/workflow/panModule.py index 003d0bea..9083aa7c 100644 --- a/ppanggolin/workflow/panModule.py +++ b/ppanggolin/workflow/panModule.py @@ -74,7 +74,7 @@ def launch(args: argparse.Namespace): graph_time = time.time() - start_graph start_part = time.time() - partition(pangenome, tmpdir=args.tmpdir, kval=args.nb_of_partitions, cpu=args.cpu, + partition(pangenome, kval=args.nb_of_partitions, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) part_time = time.time() - start_part diff --git a/ppanggolin/workflow/panRGP.py b/ppanggolin/workflow/panRGP.py index 2e06814d..1e23a419 100644 --- a/ppanggolin/workflow/panRGP.py +++ b/ppanggolin/workflow/panRGP.py @@ -75,7 +75,7 @@ def launch(args: argparse.Namespace): graph_time = time.time() - start_graph start_part = time.time() - partition(pangenome, tmpdir=args.tmpdir, kval=args.nb_of_partitions, cpu=args.cpu, + partition(pangenome, kval=args.nb_of_partitions, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) part_time = time.time() - start_part diff --git a/ppanggolin/workflow/workflow.py b/ppanggolin/workflow/workflow.py index b6ad7000..8bcaaaf7 100644 --- a/ppanggolin/workflow/workflow.py +++ b/ppanggolin/workflow/workflow.py @@ -59,7 +59,7 @@ def launch(args: argparse.Namespace): compute_neighbors_graph(pangenome, disable_bar=args.disable_prog_bar) - partition(pangenome, tmpdir=args.tmpdir, kval=args.nb_of_partitions, cpu=args.cpu, + partition(pangenome, kval=args.nb_of_partitions, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) From 459fb3f3d12f6d97b4fd2291bbab787dbe15cf20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 15 Jun 2023 22:10:57 +0200 Subject: [PATCH 10/75] Refactor region and spot --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 9 +++++---- ppanggolin/RGP/spot.py | 31 ++++++++----------------------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/VERSION b/VERSION index 6efbab83..86c98d12 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.112 +1.2.113 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 
11264525..87452cde 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -4,7 +4,7 @@ # default libraries import logging import argparse - +from pathlib import Path # installed libraries from tqdm import tqdm @@ -163,6 +163,7 @@ def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, :return: """ + def max_index_node(lst): """gets the last node with the highest score from a list of matriceNode""" if isinstance(lst, list): @@ -209,7 +210,7 @@ def naming_scheme(pangenome: Pangenome): contigsids.add(contig.name) if oldlen == len(contigsids): logging.warning("You have contigs with identical identifiers in your assemblies. " - "identifiers will be supplemented with your provided organism names.") + "identifiers will be supplemented with your provided organism names.") return "organism" return "contig" @@ -273,7 +274,7 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ pangenome = Pangenome() - pangenome.add_file(args.pan) + pangenome.add_file(args.pangenome) predict_rgp(pangenome, persistent_penalty=args.persistent_penalty, variable_gain=args.variable_gain, min_length=args.min_length, min_score=args.min_score, dup_margin=args.dup_margin, force=args.force, disable_bar=args.disable_prog_bar) @@ -301,7 +302,7 @@ def parser_rgp(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('--persistent_penalty', required=False, type=int, default=3, diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 00649157..f4bf49ff 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -6,6 +6,7 @@ import argparse import time import os +from pathlib import Path # installed libraries import networkx as nx @@ -69,7 +70,7 @@ def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2 return False -def make_spot_graph(rgps: list, multigenics: set, output: str, spot_graph: bool = False, overlapping_match: int = 2, +def make_spot_graph(rgps: list, multigenics: set, output: Path, spot_graph: bool = False, overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1) -> list: """ Create a spot graph from pangenome RGP @@ -142,7 +143,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): del graph_spot.nodes[node]["border1"] del graph_spot.nodes[node]["rgp"] - nx.readwrite.gexf.write_gexf(graph_spot, output + "/spotGraph.gexf") + nx.readwrite.gexf.write_gexf(graph_spot, output.as_posix() + "/spotGraph.gexf") return spots @@ -160,7 +161,7 @@ def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): erase_pangenome(pangenome, spots=True) -def predict_hotspots(pangenome: Pangenome, output: str, spot_graph: bool = False, overlapping_match: int = 2, +def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False, overlapping_match: int = 2, set_size: int = 3, exact_match: int = 1, force: bool = False, disable_bar: bool = False): """ Main function to predict hotspot @@ -220,10 +221,6 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.spot_graph: mk_outdir(args.output, args.force) - if args.draw_hotspots or 
args.interest or args.fig_margin or args.priority: - logging.warning( - "Options to draw the spots with the 'ppanggolin spot' subcommand have been deprecated, " - "and are now dealt with in a dedicated subcommand 'ppanggolin drawspot'.") predict_hotspots(pangenome, args.output, force=args.force, spot_graph=args.spot_graph, overlapping_match=args.overlapping_match, set_size=args.set_size, exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar) @@ -251,11 +248,11 @@ def parser_spot(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=Path, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument('-o', '--output', required=False, type=str, - default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", - time.localtime()) + "_PID" + str(os.getpid()), + optional.add_argument('-o', '--output', required=False, type=Path, + default=Path(f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("--spot_graph", required=False, action="store_true", help="Writes a graph in .gexf format of pairs of blocks of single copy markers flanking RGPs," @@ -270,18 +267,6 @@ def parser_spot(parser: argparse.ArgumentParser): help="Number of perfectly matching flanking single copy markers required to associate RGPs " "during hotspot computation (Ex: If set to 1, two RGPs are in the same hotspot " "if both their 1st flanking genes are the same)") - optional.add_argument("--draw_hotspots", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option - optional.add_argument("--interest", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option - optional.add_argument("--fig_margin", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option - optional.add_argument("--priority", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old API - # but does not use the option if __name__ == '__main__': From 6623c84834b3321e2f1e5d365f011e5ea0445658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 19 Jun 2023 14:54:28 +0200 Subject: [PATCH 11/75] Add getLogger(PPanggolin) for other tools utilization --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 8 ++-- ppanggolin/RGP/spot.py | 14 +++--- ppanggolin/align/alignOnPang.py | 22 ++++----- ppanggolin/annotate/annotate.py | 20 ++++----- ppanggolin/annotate/synta.py | 8 ++-- ppanggolin/cluster/cluster.py | 60 ++++++++++++------------- ppanggolin/context/searchGeneContext.py | 14 +++--- ppanggolin/figures/draw_spot.py | 14 +++--- ppanggolin/figures/tile_plot.py | 16 +++---- ppanggolin/figures/ucurve.py | 4 +- ppanggolin/formats/readBinaries.py | 18 ++++---- ppanggolin/formats/writeBinaries.py | 52 ++++++++++----------- ppanggolin/formats/writeFlat.py | 60 ++++++++++++------------- ppanggolin/formats/writeMSA.py | 16 +++---- ppanggolin/formats/writeSequences.py | 28 ++++++------ ppanggolin/geneFamily.py 
| 6 +-- ppanggolin/genome.py | 6 +-- ppanggolin/graph/makeGraph.py | 4 +- ppanggolin/metrics/fluidity.py | 16 +++---- ppanggolin/metrics/metrics.py | 14 +++--- ppanggolin/mod/module.py | 10 ++--- ppanggolin/nem/partition.py | 38 ++++++++-------- ppanggolin/nem/rarefaction.py | 22 ++++----- ppanggolin/pangenome.py | 2 +- ppanggolin/region.py | 8 ++-- ppanggolin/utility/utils.py | 2 +- ppanggolin/utils.py | 30 ++++++------- ppanggolin/workflow/all.py | 22 ++++----- 29 files changed, 267 insertions(+), 269 deletions(-) diff --git a/VERSION b/VERSION index 4e6afac6..ff2f04c5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.114 +1.2.115 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 76969e18..8f117ba0 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -209,7 +209,7 @@ def naming_scheme(pangenome: Pangenome): oldlen = len(contigsids) contigsids.add(contig.name) if oldlen == len(contigsids): - logging.warning("You have contigs with identical identifiers in your assemblies. " + logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your assemblies. " "identifiers will be supplemented with your provided organism names.") return "organism" return "contig" @@ -248,14 +248,14 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, disable_bar=disable_bar) - logging.info("Detecting multigenic families...") + logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) - logging.info("Compute Regions of Genomic Plasticity ...") + logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme)) - logging.info(f"Predicted {len(pangenome.regions)} RGP") + logging.getLogger("PPanGGOLiN").info(f"Predicted {len(pangenome.regions)} RGP") # save parameters and save status pangenome.parameters["RGP"] = {} diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 0bd975f8..93944c8f 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -116,11 +116,11 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): else: used += 1 add_new_node(graph_spot, rgp, border) - logging.info(f"{lost} RGPs were not used as they are on a contig border (or have less than {set_size} " + logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have less than {set_size} " f"persistent gene families until the contig border)") - logging.info(f"{used} RGPs are being used to predict spots of insertion") + logging.getLogger("PPanGGOLiN").info(f"{used} RGPs are being used to predict spots of insertion") node_list = list(graph_spot.nodes) - logging.info(f"{len(node_list)} number of different pairs of flanking gene families") + logging.getLogger("PPanGGOLiN").info(f"{len(node_list)} number of different pairs of flanking gene families") for i, nodei in enumerate(node_list[:-1]): for nodej in node_list[i + 1:]: node_obj_i = graph_spot.nodes[nodei] @@ -189,19 +189,19 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals need_rgp=True, 
disable_bar=disable_bar) # get multigenic gene families - logging.info("Detecting multigenic families...") + logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) - logging.info("Detecting hotspots in the pangenome...") + logging.getLogger("PPanGGOLiN").info("Detecting hotspots in the pangenome...") # predict spots spots = make_spot_graph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, exact_match) if len(spots) == 0: - logging.warning("No spots were detected.") + logging.getLogger("PPanGGOLiN").warning("No spots were detected.") else: - logging.info(f"{len(spots)} spots were detected") + logging.getLogger("PPanGGOLiN").info(f"{len(spots)} spots were detected") pangenome.add_spots(spots) pangenome.status["spots"] = "Computed" diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 5447bc4e..74432507 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -31,7 +31,7 @@ def createdb(file_obj: TextIOWrapper, tmpdir: Path) -> IO: """ seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir) cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0'] - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) return seqdb @@ -62,14 +62,14 @@ def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path, aln_db = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir) cmd = list(map(str, ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir, "-a", "--min-seq-id", identity, "-c", coverage, "--cov-mode", cov_mode, "--threads", cpu])) - logging.debug(" ".join(cmd)) - logging.info("Aligning sequences to cluster representatives...") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Aligning sequences to cluster representatives...") subprocess.run(cmd, stdout=subprocess.DEVNULL) outfile = output.absolute() / "input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results cmd = list(map(str, ["mmseqs", "convertalis", seq_db.name, pang_db.name, aln_db.name, outfile, "--format-mode", "2"])) - logging.debug(" ".join(cmd)) - logging.info("Extracting alignments...") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Extracting alignments...") subprocess.run(cmd, stdout=subprocess.DEVNULL) pang_db.close() seq_db.close() @@ -234,7 +234,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel :param disable_bar: disable progress bar :return: """ - logging.info("Writing RGP and spot information related to hits in the pangenome") + logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) finfo = open(output / "info_input_seq.tsv", "w") @@ -254,7 +254,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel for spot in spot_list: if len(spot.get_uniq_ordered_set()) > 1: drawn_spots.add(spot) - logging.info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " + logging.getLogger("PPanGGOLiN").info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " f"related to hits of the input sequences...") draw_selected_spots(drawn_spots, pangenome, output, 
pangenome.parameters["spots"]["overlapping_match"], pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"], @@ -268,7 +268,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) - logging.info(f"File listing RGP and spots where sequences of interest are located : " + logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " f"{output / 'info_input_seq.tsv'}") @@ -348,9 +348,9 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_partition(seq2pang, seq_set, output) # write the partition assignation only - logging.info(f"sequences partition projection : '{part_proj}'") - logging.info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") - logging.info(f"Blast-tab file of the alignment : '{align_file}'") + logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'") + logging.getLogger("PPanGGOLiN").info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") + logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'") new_tmpdir.cleanup() diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index a92d62b7..5b537e46 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -99,7 +99,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p """ org = Organism(organism) - logging.debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") + logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] gene_counter = 0 @@ -438,7 +438,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p :param pseudo: allow to read pseudogène :param disable_bar: Disable the progresse bar """ - logging.info(f"Reading {organisms_file.name} the list of organism files ...") + logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...") pangenome.status["geneSequences"] = "Computed" # we assume there are gene sequences in the annotation files, @@ -462,10 +462,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p # decide whether we use local ids or ppanggolin ids. 
used_local_identifiers = chose_gene_identifiers(pangenome) if used_local_identifiers: - logging.info("gene identifiers used in the provided annotation files were unique, " + logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were unique, " "PPanGGOLiN will use them.") else: - logging.info("gene identifiers used in the provided annotation files were not unique, " + logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were not unique, " "PPanGGOLiN will use self-generated identifiers.") pangenome.status["genomesAnnotated"] = "Computed" @@ -486,7 +486,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): for line in read_compressed_or_not(fasta_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.error("No tabulation separator found in organisms file") + logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file") exit(1) try: org = pangenome.get_organism(elements[0]) @@ -545,7 +545,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param disable_bar: Disable the progresse bar """ - logging.info(f"Reading {fasta_list} the list of organism files") + logging.getLogger("PPanGGOLiN").info(f"Reading {fasta_list} the list of organism files") arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): @@ -559,7 +559,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: norna, kingdom, overlap, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") + logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", total=len(arguments), disable=disable_bar): @@ -567,7 +567,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: p.close() p.join() - logging.info("Done annotating genomes") + logging.getLogger("PPanGGOLiN").info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. pangenome.parameters["annotation"] = {} @@ -597,9 +597,9 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.warning("You provided gff files without sequences, and you did not provide " + logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, and you did not provide " "fasta sequences. 
Thus it was not possible to get the gene sequences.") - logging.warning("You will be able to proceed with your analysis ONLY if you provide " + logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis ONLY if you provide " "the clustering results in the next step.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 14dcc796..6658d0d7 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -45,7 +45,7 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: """ locustag = org.name cmd = ["aragorn", "-t", "-gcbact", "-l", "-w", fna_file] - logging.debug(f"aragorn command : {' '.join(cmd)}") + logging.getLogger("PPanGGOLiN").debug(f"aragorn command : {' '.join(cmd)}") p = Popen(cmd, stdout=PIPE) # loading the whole thing, reverting it to 'pop' in order. file_data = p.communicate()[0].decode().split("\n")[:: -1] @@ -81,7 +81,7 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str """ locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) - logging.debug(f"prodigal command : {' '.join(cmd)}") + logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}") p = Popen(cmd, stdout=PIPE) gene_objs = defaultdict(set) @@ -124,7 +124,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " tmp_file = tempfile.NamedTemporaryFile(mode="r", dir=tmpdir) cmd = ["cmscan", "--tblout", tmp_file.name, "--hmmonly", "--cpu", str(1), "--noali", modelfile, fna_file] - logging.debug(f"infernal command : {' '.join(cmd)}") + logging.getLogger("PPanGGOLiN").debug(f"infernal command : {' '.join(cmd)}") p = Popen(cmd, stdout=open(os.devnull, "w"), stderr=PIPE) err = p.communicate()[1].decode().split() if err: @@ -317,7 +317,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user - logging.debug(all_contig_len) + logging.getLogger("PPanGGOLiN").debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" else: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 49f1f97b..b7235de2 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -83,33 +83,33 @@ def first_clustering(sequences: TextIO, tmpdir: Path, cpu: int = 1, code: int = """ seq_nucdb = tmpdir/'nucleotid_sequences_db' cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) - logging.debug(" ".join(cmd)) - logging.info("Creating sequence database...") + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").info("Creating sequence database...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.debug("Translate sequence ...") + logging.getLogger("PPanGGOLiN").debug("Translate sequence ...") seqdb = tmpdir/'aa_db' cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.info("Clustering sequences...") + logging.getLogger("PPanGGOLiN").info("Clustering 
sequences...") cludb = tmpdir/'cluster_db' cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir, "--cluster-mode", mode, "--min-seq-id", identity, "-c", coverage, "--threads", cpu, "--kmer-per-seq", 80, "--max-seqs", 300])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.info("Extracting cluster representatives...") + logging.getLogger("PPanGGOLiN").info("Extracting cluster representatives...") repdb = tmpdir/'representative_db' cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) reprfa = tmpdir/'representative_sequences.fasta' cmd = list(map(str, ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.info("Writing gene to family informations") + logging.getLogger("PPanGGOLiN").info("Writing gene to family informations") outtsv = tmpdir/'families_tsv' cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return reprfa, outtsv @@ -145,22 +145,22 @@ def align_rep(faa_file: Path, tmpdir: Path, cpu: int = 1, coverage: float = 0.8, :return: Result of alignment """ - logging.debug("Create database") + logging.getLogger("PPanGGOLiN").debug("Create database") seqdb = tmpdir/'rep_sequence_db' cmd = list(map(str, ["mmseqs", "createdb", faa_file, seqdb])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.info("Aligning cluster representatives...") + logging.getLogger("PPanGGOLiN").info("Aligning cluster representatives...") alndb = tmpdir/'rep_alignment_db' cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir, "-a", "--min-seq-id", identity, "-c", coverage, "--cov-mode", 1, "--threads", cpu])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - logging.info("Extracting alignments...") + logging.getLogger("PPanGGOLiN").info("Extracting alignments...") outfile = tmpdir/'rep_families.tsv' cmd = list(map(str, ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, "--format-output", "query,target,qlen,tlen,bits"])) - logging.debug(" ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return outfile @@ -195,7 +195,7 @@ def refine_clustering(tsv: str, aln_file: str, fam_to_seq: dict) -> Tuple[Dict[s """ simgraph = Graph() genes2fam, fam2genes = read_tsv(tsv) - logging.info(f"Starting with {len(fam_to_seq)} families") + logging.getLogger("PPanGGOLiN").info(f"Starting with {len(fam_to_seq)} families") # create the nodes for fam, genes in fam2genes.items(): simgraph.add_node(fam, nbgenes=len(genes)) @@ -229,7 +229,7 @@ def refine_clustering(tsv: str, aln_file: str, fam_to_seq: dict) -> Tuple[Dict[s new_fam_to_seq = {} for fam in fam2genes: new_fam_to_seq[fam] = fam_to_seq[fam] - logging.info(f"Ending with {len(new_fam_to_seq)} gene families") + 
logging.getLogger("PPanGGOLiN").info(f"Ending with {len(new_fam_to_seq)} gene families") return genes2fam, new_fam_to_seq @@ -240,7 +240,7 @@ def read_fam2seq(pangenome: Pangenome, fam_to_seq: Dict[str, str]): :param pangenome: Annotated pangenome :param fam_to_seq: Dictionary which link families and sequences """ - logging.info("Adding protein sequences to the gene families") + logging.getLogger("PPanGGOLiN").info("Adding protein sequences to the gene families") for family, protein in fam_to_seq.items(): fam = pangenome.add_gene_family(family) fam.add_sequence(protein) @@ -254,7 +254,7 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F :param gene_to_fam: Dictionary which link gene to families :param disable_bar: Allow to disable progress bar """ - logging.info(f"Adding {len(gene_to_fam)} genes to the gene families") + logging.getLogger("PPanGGOLiN").info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False if link and len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs @@ -294,15 +294,15 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = tmp_path = Path(newtmpdir.name) with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) - logging.info("Clustering all of the genes sequences...") + logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") rep, tsv = first_clustering(sequence_file, tmp_path, cpu, code, coverage, identity, mode) fam2seq = read_faa(rep) if not defrag: - logging.debug("No defragmentation") + logging.getLogger("PPanGGOLiN").debug("No defragmentation") genes2fam, _ = read_tsv(tsv) else: - logging.info("Associating fragments to their original gene family...") + logging.getLogger("PPanGGOLiN").info("Associating fragments to their original gene family...") aln = align_rep(rep, tmp_path, cpu, coverage, identity) genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq) pangenome.status["defragmented"] = "Computed" @@ -355,7 +355,7 @@ def infer_singletons(pangenome: Pangenome): if gene.family is None: pangenome.add_gene_family(gene.ID).add_gene(gene) singleton_counter += 1 - logging.info(f"Inferred {singleton_counter} singleton families") + logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singleton: bool = False, force: bool = False, @@ -373,7 +373,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet check_pangenome_former_clustering(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) - logging.info(f"Reading {families_tsv_file.name} the gene families file ...") + logging.getLogger("PPanGGOLiN").info(f"Reading {families_tsv_file.name} the gene families file ...") filesize = os.stat(families_tsv_file).st_size families_tsv_file = read_compressed_or_not(families_tsv_file) frag = False # the genome annotations are necessarily loaded. @@ -435,18 +435,18 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.clusters is None: if args.infer_singletons is True: - logging.warning("--infer_singletons option is not compatible with clustering creation. 
" + logging.getLogger("PPanGGOLiN").warning("--infer_singletons option is not compatible with clustering creation. " "To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, disable_bar=args.disable_prog_bar) - logging.info("Done with the clustering") + logging.getLogger("PPanGGOLiN").info("Done with the clustering") else: if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, args.coverage, args.identity, args.mode]: - logging.warning("You are using an option compatible only with clustering creation.") + logging.getLogger("PPanGGOLiN").warning("You are using an option compatible only with clustering creation.") read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) - logging.info("Done reading the cluster file") + logging.getLogger("PPanGGOLiN").info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 5657f158..eb39ffe6 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -69,11 +69,11 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: # Compute the graph with transitive closure size provided as parameter start_time = time.time() - logging.info("Building the graph...") + logging.getLogger("PPanGGOLiN").info("Building the graph...") g = compute_gene_context_graph(families=gene_families, t=transitive, disable_bar=disable_bar) - logging.info( + logging.getLogger("PPanGGOLiN").info( f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts") - logging.debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + logging.getLogger("PPanGGOLiN").debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") # extract the modules from the graph common_components = compute_gene_context(g, jaccard) @@ -85,9 +85,9 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: if len(families) != 0: export_to_dataframe(families, common_components, fam_2_seq, output) else: - logging.info("No gene contexts were found") + logging.getLogger("PPanGGOLiN").info("No gene contexts were found") - logging.info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: @@ -210,7 +210,7 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out :param output: output path """ - logging.debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") + logging.getLogger("PPanGGOLiN").debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") lines = [] for gene_context in gene_contexts: @@ -227,7 +227,7 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out ).set_index("GeneContext ID") df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last').to_csv( path_or_buf=f"{output}/gene_contexts.tsv", sep="\t", na_rep='NA') - logging.getLogger(f"detected gene context(s) are 
listed in: '{output}/gene_contexts.tsv'") + logging.getLogger("PPanGGOLiN").info(f"detected gene context(s) are listed in: '{output}/gene_contexts.tsv'") def launch(args: argparse.Namespace): diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 7ab4cf36..322e40ee 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -564,7 +564,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: :param disable_bar: Allow preventing bar progress print """ - logging.info("Ordering genes among regions, and drawing spots...") + logging.getLogger("PPanGGOLiN").info("Ordering genes among regions, and drawing spots...") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) @@ -623,7 +623,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname.absolute().as_posix()) subgraph(spot, fname.absolute().as_posix() + ".gexf", set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) - logging.info(f"Done drawing spot(s), they can be found in the directory: '{output}'") + logging.getLogger("PPanGGOLiN").info(f"Done drawing spot(s), they can be found in the directory: '{output}'") def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: bool = False): @@ -647,23 +647,23 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar) if spot_list == 'all' or any(x == 'all' for x in spot_list): - logging.debug("all is found in spot list, all spot are drawn.") + logging.getLogger("PPanGGOLiN").debug("all is found in spot list, all spot are drawn.") selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] else: curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list} - logging.debug(f'Required spots to draw: {curated_spot_list}') + logging.getLogger("PPanGGOLiN").debug(f'Required spots to draw: {curated_spot_list}') selected_spots = [s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_list] if len(selected_spots) != len(curated_spot_list): existing_spots = {"spot_" + str(s.ID) for s in pangenome.spots} required_non_existing_spots = curated_spot_list - existing_spots - logging.warning( + logging.getLogger("PPanGGOLiN").warning( f'{len(required_non_existing_spots)} required spots to draw do not exist: {" ".join(required_non_existing_spots)} ') if len(selected_spots) < 10: - logging.info(f"Drawing the following spots: " + logging.getLogger("PPanGGOLiN").info(f"Drawing the following spots: " f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") else: - logging.info(f"Drawing {len(selected_spots)} spots") + logging.getLogger("PPanGGOLiN").info(f"Drawing {len(selected_spots)} spots") draw_selected_spots(selected_spots, pangenome, output, overlapping_match=pangenome.parameters["spots"]["overlapping_match"], diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 6c76f97c..0b26b9ea 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -35,9 +35,9 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") if len(pangenome.organisms) > 500 and nocloud is False: - 
logging.warning("You asked to draw a tile plot for a lot of organisms (>500). " + logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of organisms (>500). " "Your browser will probably not be able to open it.") - logging.info("Drawing the tile plot...") + logging.getLogger("PPanGGOLiN").info("Drawing the tile plot...") data = [] all_indexes = [] all_columns = [] @@ -55,7 +55,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} - logging.info("start with matrice") + logging.getLogger("PPanGGOLiN").info("start with matrice") for row, fam in enumerate(families): new_col = [org_index[org] for org in fam.organisms] @@ -71,7 +71,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di hc = linkage(dist, 'single') dendro = dendrogram(hc, no_plot=True) - logging.info("done with making the dendrogram to order the organisms on the plot") + logging.getLogger("PPanGGOLiN").info("done with making the dendrogram to order the organisms on the plot") order_organisms = [index2org[index] for index in dendro["leaves"]] @@ -105,7 +105,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di ordered_nodes += ordered_nodes_c separators.append(separators[len(separators) - 1] + len(ordered_nodes_c)) - logging.info("Getting the gene name(s) and the number for each tile of the plot ...") + logging.getLogger("PPanGGOLiN").info("Getting the gene name(s) and the number for each tile of the plot ...") for node in ordered_nodes: fam_order.append('\u200c' + node.name) data = node.organisms @@ -115,7 +115,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di xaxis_values = ['\u200c' + org.name for org in order_organisms] - logging.info("Done extracting names and numbers. Making the heatmap ...") + logging.getLogger("PPanGGOLiN").info("Done extracting names and numbers. 
Making the heatmap ...") heatmap = go.Heatmap(z=binary_data, x=xaxis_values, @@ -172,7 +172,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di tickfont=dict(size=10)), shapes=shapes, plot_bgcolor='#ffffff') - logging.info("Drawing the figure itself...") + logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...") out_plotly.plot(go.Figure(data=[heatmap], layout=layout), filename=output.as_posix() + "/tile_plot.html", auto_open=False) - logging.info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") + logging.getLogger("PPanGGOLiN").info(f"Done with the tile plot : '{output.as_posix() + '/tile_plot.html'}' ") diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index 1c6f4344..4d31f2d4 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -22,7 +22,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di :return: """ check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) - logging.info("Drawing the U-shaped curve...") + logging.getLogger("PPanGGOLiN").info("Drawing the U-shaped curve...") max_bar = 0 count = defaultdict(lambda: defaultdict(int)) is_partitioned = False @@ -76,4 +76,4 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di fig = go.Figure(data=data_plot, layout=layout) out_plotly.plot(fig, filename=output.as_posix() + "/Ushaped_plot.html", auto_open=False) - logging.info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") + logging.getLogger("PPanGGOLiN").info(f"Done drawing the U-shaped curve : '{output.as_posix() + '/Ushaped_plot.html'}'") diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 66067b8e..93ee2d83 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -109,7 +109,7 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): """ fix_partitioned(pangenome_file) h5f = tables.open_file(pangenome_file, "r") - logging.info("Getting the current pangenome status") + logging.getLogger("PPanGGOLiN").info("Getting the current pangenome status") status_group = h5f.root.status if status_group._v_attrs.genomesAnnotated: pangenome.status["genomesAnnotated"] = "inFile" @@ -199,7 +199,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter :param add: Add a prefix to sequence header :param disable_bar: Prevent to print disable progress bar """ - logging.info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") + logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences list_cds = set(list_cds) if list_cds is not None else None @@ -570,13 +570,13 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa h5f = tables.open_file(filename, "r") if annotation: if h5f.root.status._v_attrs.genomesAnnotated: - logging.info("Reading pangenome annotations...") + logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") if gene_sequences: if h5f.root.status._v_attrs.geneSequences: - logging.info("Reading pangenome gene dna 
sequences...") + logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...") read_gene_sequences(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have gene sequences, " @@ -584,7 +584,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa if gene_families: if h5f.root.status._v_attrs.genesClustered: - logging.info("Reading pangenome gene families...") + logging.getLogger("PPanGGOLiN").info("Reading pangenome gene families...") read_gene_families(pangenome, h5f, disable_bar=disable_bar) read_gene_families_info(pangenome, h5f, disable_bar=disable_bar) else: @@ -592,28 +592,28 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") if graph: if h5f.root.status._v_attrs.NeighborsGraph: - logging.info("Reading the neighbors graph edges...") + logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...") read_graph(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have graph information, " f"or has been improperly filled") if rgp: if h5f.root.status._v_attrs.predictedRGP: - logging.info("Reading the RGP...") + logging.getLogger("PPanGGOLiN").info("Reading the RGP...") read_rgp(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " f"or has been improperly filled") if spots: if h5f.root.status._v_attrs.spots: - logging.info("Reading the spots...") + logging.getLogger("PPanGGOLiN").info("Reading the spots...") read_spots(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled") if modules: if h5f.root.status._v_attrs.modules: - logging.info("Reading the modules...") + logging.getLogger("PPanGGOLiN").info("Reading the modules...") read_modules(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' does not have modules information, " diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 961ffa9e..84ca3f64 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -158,7 +158,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), expectedrows=len(pangenome.genes)) - logging.debug(f"Writing {len(pangenome.genes)} genes") + logging.getLogger("PPanGGOLiN").debug(f"Writing {len(pangenome.genes)} genes") genedata2gene = {} genedata_counter = 0 @@ -186,7 +186,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), expectedrows=len(genedata2gene)) - logging.debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") + logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") genedata_row = genedata_table.row for genedata, genedata_id in genedata2gene.items(): genedata_row["genedata_id"] = genedata_id @@ -342,7 +342,7 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param disable_bar: Disable 
progress bar """ if '/geneFamiliesInfo' in h5f and force is True: - logging.info("Erasing the formerly computed gene family representative sequences...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. gene_fam_seq = h5f.create_table("/", "geneFamiliesInfo", gene_fam_desc(*get_gene_fam_len(pangenome)), expectedrows=len(pangenome.gene_families)) @@ -401,7 +401,7 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param disable_bar: Disable progress bar """ if '/geneFamilies' in h5f and force is True: - logging.info("Erasing the formerly computed gene family to gene associations...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row @@ -456,7 +456,7 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis # consumming parts to read), it might be good to add the organism name in the table here. # for now, forcing the read of annotations. if '/edges' in h5f and force is True: - logging.info("Erasing the formerly computed edges") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), expectedrows=len(pangenome.edges)) @@ -514,7 +514,7 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab :param disable_bar: Disable progress bar """ if '/RGP' in h5f and force is True: - logging.info("Erasing the formerly computer RGP") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computer RGP") h5f.remove_node('/', 'RGP') rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), @@ -568,7 +568,7 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis :param disable_bar: Disable progress bar """ if '/spots' in h5f and force is True: - logging.info("Erasing the formerly computed spots") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed spots") h5f.remove_node("/", "spots") spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), @@ -622,7 +622,7 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d :param disable_bar: Disable progress bar """ if '/modules' in h5f and force is True: - logging.info("Erasing the formerly computed modules") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed modules") h5f.remove_node("/", "modules") mod_table = h5f.create_table('/', 'modules', mod_desc(get_mod_desc(pangenome)), @@ -859,7 +859,7 @@ def update_gene_fam_partition(pangenome: Pangenome, h5f: tables.File, disable_ba :param h5f: HDF5 file with gene families :param disable_bar: Allow to disable progress bar """ - logging.info("Updating gene families with partition information") + logging.getLogger("PPanGGOLiN").info("Updating gene families with partition information") table = h5f.root.geneFamiliesInfo for row in tqdm(table, total=table.nrows, unit="gene family", disable=disable_bar): row["partition"] = pangenome.get_gene_family(row["name"].decode()).partition @@ -874,7 +874,7 @@ def update_gene_fragments(pangenome: Pangenome, h5f: 
tables.File, disable_bar: b :param h5f: HDF5 pangenome file :param disable_bar: Allow to disable progress bar """ - logging.info("Updating annotations with fragment information") + logging.getLogger("PPanGGOLiN").info("Updating annotations with fragment information") genedataid2genedata = read_genedata(h5f) table = h5f.root.annotations.genes @@ -905,13 +905,13 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo info_group = h5f.root.info if '/edges' in h5f and (graph or gene_families): - logging.info("Erasing the formerly computed edges") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") status_group._v_attrs.NeighborsGraph = False pangenome.status["neighborsGraph"] = "No" h5f.del_node_attr(info_group, "numberOfEdges") if '/geneFamilies' in h5f and gene_families: - logging.info("Erasing the formerly computed gene family to gene associations...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. pangenome.status["defragmented"] = "No" pangenome.status["genesClustered"] = "No" @@ -921,12 +921,12 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfClusters") if '/geneFamiliesInfo' in h5f and gene_families: - logging.info("Erasing the formerly computed gene family representative sequences...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. pangenome.status["geneFamilySequences"] = "No" status_group._v_attrs.geneFamilySequences = False if partition: - logging.info("Erasing former partitions...") + logging.getLogger("PPanGGOLiN").info("Erasing former partitions...") pangenome.status["partitioned"] = "No" status_group._v_attrs.Partitioned = False if 'Partitioned' in status_group._v_attrs._f_list(): @@ -942,7 +942,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfSubpartitions") if '/RGP' in h5f and (gene_families or partition or rgp): - logging.info("Erasing the formerly computer RGP...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computer RGP...") pangenome.status["predictedRGP"] = "No" status_group._v_attrs.predictedRGP = False h5f.remove_node("/", "RGP") @@ -950,7 +950,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfRGP") if '/spots' in h5f and (gene_families or partition or rgp or spots): - logging.info("Erasing the formerly computed spots...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed spots...") pangenome.status["spots"] = "No" status_group._v_attrs.spots = False h5f.remove_node("/", "spots") @@ -958,7 +958,7 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo h5f.del_node_attr(info_group, "numberOfSpots") if '/modules' in h5f and (gene_families or partition or modules): - logging.info("Erasing the formerly computed modules...") + logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed modules...") pangenome.status["modules"] = "No" status_group._v_attrs.modules = False h5f.remove_node("/", "modules") @@ -987,7 +987,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable if 
pangenome.status["genomesAnnotated"] == "Computed": compression_filter = tables.Filters(complevel=1, shuffle=True, bitshuffle=True, complib='blosc:zstd') h5f = tables.open_file(filename, "w", filters=compression_filter) - logging.info("Writing genome annotations...") + logging.getLogger("PPanGGOLiN").info("Writing genome annotations...") write_annotations(pangenome, h5f, disable_bar=disable_bar) @@ -1003,14 +1003,14 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f = tables.open_file(filename, "a") if pangenome.status["geneSequences"] == "Computed": - logging.info("writing the protein coding gene dna sequences") + logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences") write_gene_sequences(pangenome, h5f, disable_bar=disable_bar) pangenome.status["geneSequences"] = "Loaded" if pangenome.status["genesClustered"] == "Computed": - logging.info("Writing gene families and gene associations...") + logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations...") write_gene_families(pangenome, h5f, force, disable_bar=disable_bar) - logging.info("Writing gene families information...") + logging.getLogger("PPanGGOLiN").info("Writing gene families information...") write_gene_fam_info(pangenome, h5f, force, disable_bar=disable_bar) if pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] and \ pangenome.status["defragmented"] == "Computed": @@ -1019,7 +1019,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable update_gene_fragments(pangenome, h5f, disable_bar=disable_bar) pangenome.status["genesClustered"] = "Loaded" if pangenome.status["neighborsGraph"] == "Computed": - logging.info("Writing the edges...") + logging.getLogger("PPanGGOLiN").info("Writing the edges...") write_graph(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["neighborsGraph"] = "Loaded" if pangenome.status["partitioned"] == "Computed" and \ @@ -1028,17 +1028,17 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable pangenome.status["partitioned"] = "Loaded" if pangenome.status['predictedRGP'] == "Computed": - logging.info("Writing Regions of Genomic Plasticity...") + logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity...") write_rgp(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['predictedRGP'] = "Loaded" if pangenome.status["spots"] == "Computed": - logging.info("Writing Spots of Insertion...") + logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion...") write_spots(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status['spots'] = "Loaded" if pangenome.status["modules"] == "Computed": - logging.info("Writing Modules...") + logging.getLogger("PPanGGOLiN").info("Writing Modules...") write_modules(pangenome, h5f, force, disable_bar=disable_bar) pangenome.status["modules"] = "Loaded" @@ -1046,4 +1046,4 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable write_info(pangenome, h5f) h5f.close() - logging.info(f"Done writing the pangenome. It is in file : {filename}") + logging.getLogger("PPanGGOLiN").info(f"Done writing the pangenome. 
It is in file : {filename}") diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 9941eed0..838887e3 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -155,14 +155,14 @@ def write_json(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing the json file for the pangenome graph...") + logging.getLogger("PPanGGOLiN").info("Writing the json file for the pangenome graph...") outname = output / "pangenomeGraph.json" with write_compressed_or_not(outname, compress) as json: write_json_header(json) write_json_nodes(json) write_json_edges(json) json.write("}") - logging.info(f"Done writing the json file : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the json file : '{outname.as_posix()}'") def write_gexf_header(gexf: TextIO, light: bool = True): @@ -303,14 +303,14 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False): txt = "Writing the " txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." - logging.info(txt) + logging.getLogger("PPanGGOLiN").info(txt) outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf" with write_compressed_or_not(outname, compress) as gexf: write_gexf_header(gexf, light) write_gexf_nodes(gexf, light) write_gexf_edges(gexf, light) write_gexf_end(gexf) - logging.info(f"Done writing the gexf file : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{outname.as_posix()}'") def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): @@ -324,7 +324,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool :param compress: Compress the file in .gz :param gene_names: write the genes name if there are saved in pangenome """ - logging.info(f"Writing the .{ext} file ...") + logging.getLogger("PPanGGOLiN").info(f"Writing the .{ext} file ...") outname = output / f"matrix.{ext}" with write_compressed_or_not(outname, compress) as matrix: @@ -385,7 +385,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool '"' + str(max(lis)) + '"', # 13 '"' + str(round(sum(lis) / len(lis), 2)) + '"'] # 14 + genes) + "\n") # 15 - logging.info(f"Done writing the matrix : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the matrix : '{outname.as_posix()}'") def write_gene_presence_absence(output: Path, compress: bool = False): @@ -395,7 +395,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing the gene presence absence file ...") + logging.getLogger("PPanGGOLiN").info("Writing the gene presence absence file ...") outname = output / "gene_presence_absence.Rtab" with write_compressed_or_not(outname, compress) as matrix: index_org = {} @@ -415,7 +415,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): matrix.write('\t'.join([fam.name] # 14 + genes) + "\n") # 15 - logging.info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene presence absence file : '{outname.as_posix()}'") def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): @@ -427,8 +427,8 @@ def 
write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated :param compress: Compress the file in .gz """ - logging.info("Writing pangenome statistics...") - logging.info("Writing statistics on persistent duplication...") + logging.getLogger("PPanGGOLiN").info("Writing pangenome statistics...") + logging.getLogger("PPanGGOLiN").info("Writing statistics on persistent duplication...") single_copy_markers = set() # could use bitarrays if speed is needed with write_compressed_or_not(output / "mean_persistent_duplication.tsv", compress) as outfile: outfile.write(f"#duplication_margin={round(dup_margin, 3)}\n") @@ -450,8 +450,8 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, str(round(dup_ratio, 3)), str(round(mean_pres, 3)), str(is_scm)]) + "\n") - logging.info("Done writing stats on persistent duplication") - logging.info("Writing genome per genome statistics (completeness and counts)...") + logging.getLogger("PPanGGOLiN").info("Done writing stats on persistent duplication") + logging.getLogger("PPanGGOLiN").info("Writing genome per genome statistics (completeness and counts)...") soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: @@ -516,7 +516,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, completeness, len(fams & single_copy_markers)])) + "\n") - logging.info("Done writing genome per genome statistics") + logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") def write_org_file(org: Organism, output: Path, compress: bool = False): @@ -578,13 +578,13 @@ def write_projections(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing the projection files...") + logging.getLogger("PPanGGOLiN").info("Writing the projection files...") outdir = output / "projection" if not os.path.exists(outdir): os.makedirs(outdir) for org in pan.organisms: write_org_file(org, outdir, compress) - logging.info("Done writing the projection files") + logging.getLogger("PPanGGOLiN").info("Done writing the projection files") def write_parts(output: Path, soft_core: float = 0.95): @@ -594,7 +594,7 @@ def write_parts(output: Path, soft_core: float = 0.95): :param output: Path to output directory :param soft_core: Soft core threshold to use """ - logging.info("Writing the list of gene families for each partition ...") + logging.getLogger("PPanGGOLiN").info("Writing the list of gene families for each partition ...") if not os.path.exists(output / "partitions"): os.makedirs(output / "partitions") part_sets = defaultdict(set) @@ -621,7 +621,7 @@ def write_parts(output: Path, soft_core: float = 0.95): if len(val) > 0: curr_key_file.write('\n'.join(val) + "\n") curr_key_file.close() - logging.info("Done writing the list of gene families for each partition") + logging.getLogger("PPanGGOLiN").info("Done writing the list of gene families for each partition") def write_gene_families_tsv(output: Path, compress: bool = False): @@ -631,14 +631,14 @@ def write_gene_families_tsv(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing the file providing the association between genes and gene families...") + logging.getLogger("PPanGGOLiN").info("Writing the file providing the 
association between genes and gene families...") outname = output / "gene_families.tsv" with write_compressed_or_not(outname, compress) as tsv: for fam in pan.gene_families: for gene in fam.genes: tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""]) + "\n") - logging.info("Done writing the file providing the association between genes and " + logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and " f"gene families : '{outname}'") @@ -688,7 +688,7 @@ def r_and_s(value: float): min_size = min(size_list) fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") - logging.info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") def spot2rgp(spots: set, output: Path, compress: bool = False): @@ -749,7 +749,7 @@ def write_module_summary(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing functional modules summary...") + logging.getLogger("PPanGGOLiN").info("Writing functional modules summary...") with write_compressed_or_not(output / "modules_summary.tsv", compress) as fout: fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n") for mod in pan.modules: @@ -764,7 +764,7 @@ def write_module_summary(output: Path, compress: bool = False): f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod.families), 3)}\n") fout.close() - logging.info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") def write_modules(output: Path, compress: bool = False): @@ -773,7 +773,7 @@ def write_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing functional modules...") + logging.getLogger("PPanGGOLiN").info("Writing functional modules...") with write_compressed_or_not(output / "functional_modules.tsv", compress) as fout: fout.write("module_id\tfamily_id\n") for mod in pan.modules: @@ -781,7 +781,7 @@ def write_modules(output: Path, compress: bool = False): fout.write(f"module_{mod.ID}\t{family.name}\n") fout.close() - logging.info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") def write_org_modules(output: Path, compress: bool = False): @@ -790,7 +790,7 @@ def write_org_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing modules to organisms associations...") + logging.getLogger("PPanGGOLiN").info("Writing modules to organisms associations...") with write_compressed_or_not(output / "modules_in_organisms.tsv", compress) as fout: fout.write("module_id\torganism\tcompletion\n") for mod in pan.modules: @@ -801,7 +801,7 @@ def write_org_modules(output: Path, compress: bool = False): completion = round(len(org.families & mod.families) / len(mod.families), 2) 
fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() - logging.info( + logging.getLogger("PPanGGOLiN").info( f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'") @@ -811,7 +811,7 @@ def write_spot_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Writing modules to spot associations...") + logging.getLogger("PPanGGOLiN").info("Writing modules to spot associations...") fam2mod = {} for mod in pan.modules: @@ -834,7 +834,7 @@ def write_spot_modules(output: Path, compress: bool = False): # if all the families in the module are found in the spot, write the association fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") - logging.info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") + logging.getLogger("PPanGGOLiN").info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") def write_rgp_modules(output: Path, compress: bool = False): @@ -843,7 +843,7 @@ def write_rgp_modules(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.info("Clustering RGPs based on module content...") + logging.getLogger("PPanGGOLiN").info("Clustering RGPs based on module content...") lists = write_compressed_or_not(output / "modules_RGP_lists.tsv", compress) lists.write("representative_RGP\tnb_spots\tmod_list\tRGP_list\n") @@ -878,7 +878,7 @@ def write_rgp_modules(output: Path, compress: bool = False): f"{','.join([reg.name for reg in regions])}\n") lists.close() - logging.info(f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") + logging.getLogger("PPanGGOLiN").info(f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 4bc170ac..8b347021 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -168,7 +168,7 @@ def launch_mafft(fname: Path, output: Path, fam_name: str): """ outname = output / f"{fam_name}.aln" cmd = ["mafft", "--thread", "1", fname.absolute().as_posix()] - logging.debug("command: " + " ".join(cmd)) + logging.getLogger("PPanGGOLiN").debug("command: " + " ".join(cmd)) subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True) @@ -200,7 +200,7 @@ def compute_msa(families: set, output: Path, tmpdir: Path, cpu: int = 1, source: write_total = 0 args = [] - logging.info("Preparing input files for MSA...") + logging.getLogger("PPanGGOLiN").info("Preparing input files for MSA...") code_table = genetic_codes(str(code)) for family in tqdm(families, unit="family", disable=disable_bar): @@ -209,7 +209,7 @@ def compute_msa(families: set, output: Path, tmpdir: Path, cpu: int = 1, source: write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) - logging.info("Computing the MSA ...") + logging.getLogger("PPanGGOLiN").info("Computing the MSA ...") bar = tqdm(range(len(families)), unit="family", disable=disable_bar) with get_context('fork').Pool(cpu) as p: for _ in p.imap_unordered(launch_multi_mafft, args): @@ -310,30 +310,30 @@ def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: check_pangenome_info(pangenome, 
need_annotations=True, need_families=True, need_partitions=need_partitions, need_gene_sequences=True, disable_bar=disable_bar) - logging.info(f"Doing MSA for {partition} families...") + logging.getLogger("PPanGGOLiN").info(f"Doing MSA for {partition} families...") families = get_families_to_write(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: if pangenome.parameters["cluster"]["translation_table"] != translation_table: - logging.warning("The translation table used during clustering " + logging.getLogger("PPanGGOLiN").warning("The translation table used during clustering " f"('{pangenome.parameters['cluster']['translation_table']}') " f"is different than the one provided now ('{translation_table}')") code = translation_table compute_msa(families, outdir, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, disable_bar=disable_bar) - logging.info(f"Done writing all {partition} MSA in: {outdir}") + logging.getLogger("PPanGGOLiN").info(f"Done writing all {partition} MSA in: {outdir}") if phylo: - logging.info("Writing the whole genome msa file") + logging.getLogger("PPanGGOLiN").info("Writing the whole genome msa file") if partition == "softcore": phylo_name = output / f"{partition}_{soft_core}_genome_alignment.aln" else: phylo_name = output / f"{partition}_genome_alignment.aln" write_whole_genome_msa(pangenome, families, phylo_name, outdir, use_gene_id=use_gene_id) - logging.info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") def launch(args: argparse.Namespace): diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index b0431697..2614589c 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -38,7 +38,7 @@ def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO counter = 0 # TODO remove ? 
if list_cds is None: list_cds = pangenome.genes - logging.info("Writing all of the CDS sequences...") + logging.getLogger("PPanGGOLiN").info("Writing all of the CDS sequences...") for gene in tqdm(list_cds, unit="gene", disable=disable_bar): if gene.type == "CDS": counter += 1 @@ -61,7 +61,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co """ assert genes in poss_values, f"Selected part to write genes not in {poss_values}" - logging.info("Writing all the gene nucleotide sequences...") + logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...") outpath = output / f"{genes}_genes.fna" genefams = select_families(pangenome, genes, "gene nucleotide sequences", soft_core) @@ -70,7 +70,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co for fam in genefams: genes_to_write.extend(fam.genes) - logging.info(f"There are {len(genes_to_write)} genes to write") + logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write") with write_compressed_or_not(outpath, compress) as fasta: if pangenome.status["geneSequences"] in ["inFile"]: get_gene_sequences_from_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]), @@ -80,7 +80,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co else: # this should never happen if the pangenome has been properly checked before launching this function. raise Exception("The pangenome does not include gene sequences") - logging.info(f"Done writing the gene sequences : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: @@ -96,31 +96,31 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c """ genefams = set() if partition == 'all': - logging.info(f"Writing all of the {type_name}...") + logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...") genefams = pangenome.gene_families elif partition in ['persistent', 'shell', 'cloud']: - logging.info(f"Writing the {type_name} of the {partition}...") + logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...") for fam in pangenome.gene_families: if fam.named_partition == partition: genefams.add(fam) elif partition == "rgp": - logging.info(f"Writing the {type_name} in RGPs...") + logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...") for region in pangenome.regions: genefams |= region.families elif partition == "softcore": - logging.info( + logging.getLogger("PPanGGOLiN").info( f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") threshold = pangenome.number_of_organisms() * soft_core for fam in pangenome.gene_families: if len(fam.organisms) >= threshold: genefams.add(fam) elif partition == "core": - logging.info(f"Writing the representative {type_name} of the {partition} gene families...") + logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: if len(fam.organisms) == pangenome.number_of_organisms(): genefams.add(fam) elif "module_" in partition: - logging.info(f"Writing the representation {type_name} of {partition} gene families...") + logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...") mod_id = 
int(partition.replace("module_", "")) for mod in pangenome.modules: # could be way more efficient with a dict structure instead of a set @@ -152,7 +152,7 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, with write_compressed_or_not(outpath, compress) as fasta: get_gene_sequences_from_file(pangenome.file, fasta, [fam.name for fam in genefams], disable_bar=disable_bar) - logging.info(f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the representative nucleotide sequences of the gene families : '{outpath}'") def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, @@ -178,7 +178,7 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - logging.info(f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families : '{outpath}'") def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: @@ -306,7 +306,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa if not org_dict[elements[0]].exists(): # Check tsv sanity test if it's not one it's the other org_dict[elements[0]] = organisms_file.parent.joinpath(org_dict[elements[0]]) - logging.info(f"Writing {regions} rgp genomic sequences...") + logging.getLogger("PPanGGOLiN").info(f"Writing {regions} rgp genomic sequences...") regions_to_write = [] if regions == "complete": for region in pangenome.regions: @@ -327,7 +327,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa genome_sequence = read_genome_file(org_dict, loaded_genome) fasta.write(f">{region.name}\n") fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - logging.info(f"Done writing the regions nucleotide sequences: '{outname}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, anno: Path = None, diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 137dc14b..bdca29c3 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -95,16 +95,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.debug(f"all") + logging.getLogger("PPanGGOLiN").debug(f"all") for org in self.organisms: self.bitarray[index[org]] = 1 elif partition in ['shell', 'cloud']: - logging.debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") if self.named_partition == partition: for org in self.organisms: self.bitarray[index[org]] = 1 elif partition == 'accessory': - logging.debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug(f"accessory") if self.named_partition in ['shell', 'cloud']: for org in self.organisms: self.bitarray[index[org]] = 1 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 7058de0f..bc6bafd7 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -270,16 +270,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): self.bitarray = gmpy2.xmpz() # pylint: 
disable=no-member if partition == 'all': - logging.debug(f"all") + logging.getLogger("PPanGGOLiN").debug(f"all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug(f"accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index bb4dfba6..7fef1187 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -85,7 +85,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, if remove_copy_number > 0: remove_high_copy_number(pangenome, remove_copy_number) - logging.info("Computing the neighbors graph...") + logging.getLogger("PPanGGOLiN").info("Computing the neighbors graph...") bar = tqdm(pangenome.organisms, total=len(pangenome.organisms), unit="organism", disable=disable_bar) for org in bar: bar.set_description(f"Processing {org.name}") @@ -106,7 +106,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, if prev is not None and contig.is_circular and len(contig.genes) > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added pangenome.add_edge(contig.genes[0], prev) - logging.info("Done making the neighbors graph.") + logging.getLogger("PPanGGOLiN").info("Done making the neighbors graph.") pangenome.status["neighborsGraph"] = "Computed" pangenome.parameters["graph"] = {} diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index b0784c43..b8aaf46b 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -24,17 +24,17 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ # check statuses and load info - logging.info("Check information in pangenome") + logging.getLogger("PPanGGOLiN").info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): - logging.debug(f"Compute binaries for {subset} partition") + logging.getLogger("PPanGGOLiN").debug(f"Compute binaries for {subset} partition") pangenome.compute_org_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms g_sum = 0 - logging.debug("Get number of families in each organisms") + logging.getLogger("PPanGGOLiN").debug("Get number of families in each organisms") org2_nb_fam = nb_fam_per_org(pangenome, disable_bar) - logging.info(f"Compute rate of unique family for each genome combination in {subset}") + logging.getLogger("PPanGGOLiN").info(f"Compute rate of unique family for each genome combination in {subset}") for c_organisms in tqdm(list(combinations(pangenome.organisms, 2)), unit="combination", disable=disable_bar): tot_fam = org2_nb_fam.get(c_organisms[0].name) + org2_nb_fam.get(c_organisms[1].name) common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 @@ -74,17 +74,17 @@ def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: :return: family fluidity value from the pangenome for each partition """ # check statuses and 
load info - logging.info("Check information in pangenome") + logging.getLogger("PPanGGOLiN").info("Check information in pangenome") check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar) fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None} for subset in fluidity_dict.keys(): - logging.debug(f"Compute binaries for {subset} partition") + logging.getLogger("PPanGGOLiN").debug(f"Compute binaries for {subset} partition") pangenome.compute_family_bitarrays(part=subset) # Compute binaries corresponding to presence / absence of families in organisms f_sum = 0 - logging.debug("Get number of families in each organisms") + logging.getLogger("PPanGGOLiN").debug("Get number of families in each organisms") fam_2_nb_org = nb_org_per_fam(pangenome, disable_bar) - logging.info("Compute rate of unique organism for each family combination") + logging.getLogger("PPanGGOLiN").info("Compute rate of unique organism for each family combination") for c_fam in tqdm(list(combinations(pangenome.gene_families, 2)), unit="combination", disable=disable_bar): tot_org = fam_2_nb_org.get(c_fam[0].name) + fam_2_nb_org.get(c_fam[1].name) common_fam = popcount(c_fam[0].bitarray & c_fam[1].bitarray) - 1 diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index 0c97e09a..0bd87b2b 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -79,17 +79,17 @@ def write_metrics(pangenome: Pangenome, metrics_dict: dict, no_print_info: bool """ with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info - logging.debug("H5f open") + logging.getLogger("PPanGGOLiN").debug("H5f open") if 'genomes_fluidity' in metrics_dict.keys(): - logging.info("Writing genome fluidity in pangenome") + logging.getLogger("PPanGGOLiN").info("Writing genome fluidity in pangenome") info_group._v_attrs.genomes_fluidity = metrics_dict['genomes_fluidity'] if 'families_fluidity' in metrics_dict.keys(): - logging.info("Writing family fluidity in pangenome") + logging.getLogger("PPanGGOLiN").info("Writing family fluidity in pangenome") info_group._v_attrs.families_fluidity = metrics_dict['families_fluidity'] if 'info_modules' in metrics_dict.keys(): - logging.info("Writing modules information in pangenome") + logging.getLogger("PPanGGOLiN").info("Writing modules information in pangenome") write_info_modules(pangenome, h5f) # After all metrics was written @@ -115,12 +115,12 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - logging.debug("Check if one of the metrics was already compute") + logging.getLogger("PPanGGOLiN").debug("Check if one of the metrics was already compute") if not args.force: check_metric(pangenome, **args_dict) - logging.info("Metrics computation begin") + logging.getLogger("PPanGGOLiN").info("Metrics computation begin") metrics_dictionary = compute_metrics(pangenome, disable_bar=args.disable_prog_bar, **args_dict) - logging.info("Metrics computation done") + logging.getLogger("PPanGGOLiN").info("Metrics computation done") write_metrics(pangenome, metrics_dictionary, no_print_info=args.no_print_info) diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index 1aad6bb1..b8b09c07 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -108,10 +108,10 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = # compute the graph with transitive closure size provided as parameter start_time = time.time() - 
logging.info("Building the graph...") + logging.getLogger("PPanGGOLiN").info("Building the graph...") g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) - logging.info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") - logging.info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + logging.getLogger("PPanGGOLiN").info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") + logging.getLogger("PPanGGOLiN").info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") start_time = time.time() # get all multigenic gene families @@ -124,8 +124,8 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = for mod in modules: fams |= mod.families - logging.info(f"There are {len(fams)} families among {len(modules)} modules") - logging.info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"There are {len(fams)} families among {len(modules)} modules") + logging.getLogger("PPanGGOLiN").info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") pangenome.add_modules(modules) diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index c419c49d..3732de4d 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -53,7 +53,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di :return: Nem parameters and if not just log likelihood the families associated to partition """ - logging.debug("run_partitioning...") + logging.getLogger("PPanGGOLiN").debug("run_partitioning...") if init == "param_file": with open(nem_dir_path / f"nem_file_init_{str(kval)}.m", "w") as m_file: m_file.write("1 ") # 1 to initialize parameter, @@ -87,8 +87,8 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di convergence_th = 0.01 # (INIT_SORT, init_random, init_param_file, INIT_FILE, INIT_LABEL, INIT_NB) = range(0,6) init_random, init_param_file = range(1, 3) - logging.debug("Running NEM...") - logging.debug([nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, + logging.getLogger("PPanGGOLiN").debug("Running NEM...") + logging.getLogger("PPanGGOLiN").debug([nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, init_param_file if init in ["param_file", "init_from_old"] else init_random, nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", @@ -102,17 +102,17 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di out_file_prefix=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), seed=seed) - logging.debug("After running NEM...") + logging.getLogger("PPanGGOLiN").debug("After running NEM...") no_nem = False nem_out_path = nem_dir_path / f"nem_file_{str(kval)}.uf" if nem_out_path.is_file(): - logging.debug("Reading NEM results...") + logging.getLogger("PPanGGOLiN").debug("Reading NEM results...") elif not just_log_likelihood: - # logging.warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") + # logging.getLogger("PPanGGOLiN").warning("No NEM output file found: "+ nem_dir_path+"/nem_file_"+str(K)+".uf") no_nem = True else: - logging.debug(f"No NEM output file found: {nem_out_path.absolute().as_posix()}") + 
logging.getLogger("PPanGGOLiN").debug(f"No NEM output file found: {nem_out_path.absolute().as_posix()}") no_nem = True index_fam = [] @@ -165,7 +165,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di else: partitions_list[i] = parti[positions_max_prob.pop()] except IOError: - logging.debug("partitioning did not work (the number of organisms used is probably too low), " + logging.getLogger("PPanGGOLiN").debug("partitioning did not work (the number of organisms used is probably too low), " "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + str(kval) + ".log") return {}, None, None # return empty objects @@ -253,7 +253,7 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> with open(tmpdir / "column_org_file", "w") as org_file: org_file.write(" ".join([f'"{org.name}"' for org in organisms]) + "\n") - logging.debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") + logging.getLogger("PPanGGOLiN").debug("Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files") with open(tmpdir / "nem_file.str", "w") as str_file, \ open(tmpdir / "nem_file.index", "w") as index_file, \ open(tmpdir / "nem_file.nei", "w") as nei_file, \ @@ -470,7 +470,7 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d tmp_path = Path(tmp_dir.name) if len(organisms) <= 10: - logging.warning(f"The number of selected organisms is too low ({len(organisms)} " + logging.getLogger("PPanGGOLiN").warning(f"The number of selected organisms is too low ({len(organisms)} " f"organisms used) to robustly partition the graph") pangenome.parameters["partition"] = {} @@ -483,10 +483,10 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d if kval < 2: pangenome.parameters["partition"]["computed_K"] = True - logging.info("Estimating the optimal number of partitions...") + logging.getLogger("PPanGGOLiN").info("Estimating the optimal number of partitions...") kval = evaluate_nb_partitions(organisms, output, sm_degree, free_dispersion, chunk_size, kmm, icl_margin, draw_icl, cpu, seed, tmp_path, disable_bar) - logging.info(f"The number of partitions has been evaluated at {kval}") + logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}") pangenome.parameters["partition"]["K"] = kval init = "param_file" @@ -504,7 +504,7 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0} start_partitioning = time.time() - logging.info("Partitioning...") + logging.getLogger("PPanGGOLiN").info("Partitioning...") pansize = len(families) if chunk_size < len(organisms): validated = set() @@ -548,7 +548,7 @@ def validate_family(res): args.append((i, kval, beta, sm_degree, free_dispersion, seed, init, tmp_path, keep_tmp_files)) - logging.info("Launching NEM") + logging.getLogger("PPanGGOLiN").info("Launching NEM") with get_context('fork').Pool(processes=cpu) as p: # launch partitioning bar = tqdm(range(len(args)), unit=" samples partitioned", disable=disable_bar) @@ -558,7 +558,7 @@ def validate_family(res): bar.close() condition += 1 # if len(validated) < pan_size, we will want to resample more. 
- logging.debug(f"There are {len(validated)} validated families out of {pansize} families.") + logging.getLogger("PPanGGOLiN").debug(f"There are {len(validated)} validated families out of {pansize} families.") p.close() p.join() for fam, data in cpt_partition.items(): @@ -567,7 +567,7 @@ def validate_family(res): # need to compute the median vectors of each partition ??? partitioning_results = [partitioning_results, []] # introduces a 'non feature'. - logging.info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " + logging.getLogger("PPanGGOLiN").info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") else: edges_weight, nb_fam = write_nem_input_files(tmp_path / f"{str(cpt)}", organisms, @@ -579,7 +579,7 @@ def validate_family(res): raise Exception("Statistical partitioning does not work on your data. " "This usually happens because you used very few (<15) genomes.") cpt += 1 - logging.info(f"Partitioned {len(organisms)} genomes in " + logging.getLogger("PPanGGOLiN").info(f"Partitioned {len(organisms)} genomes in " f"{round(time.time() - start_partitioning, 2)} seconds.") # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitioning_results[1], chunk_size) @@ -607,9 +607,9 @@ def launch(args: argparse.Namespace): partition(pan, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, args.chunk_size, args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, args.tmpdir, args.keep_tmp_files, args.force, disable_bar=args.disable_prog_bar) - logging.debug("Write partition in pangenome") + logging.getLogger("PPanGGOLiN").debug("Write partition in pangenome") write_pangenome(pan, pan.file, args.force, disable_bar=args.disable_prog_bar) - logging.debug("Partitioning is finished") + logging.getLogger("PPanGGOLiN").debug("Partitioning is finished") def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index 9624e199..5dd59ae3 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -160,7 +160,7 @@ def draw_curve(output: Path, data: list, max_sampling: int = 10): :param max_sampling: Maximum number of organisms in a sample :param data: """ - logging.info("Drawing the rarefaction curve ...") + logging.getLogger("PPanGGOLiN").info("Drawing the rarefaction curve ...") raref_name = output/"rarefaction.csv" raref = open(raref_name, "w") raref.write(",".join(["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", @@ -379,25 +379,25 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No if kval < 3 and kestimate is False: # estimate K once and for all. 
try: kval = ppp.pan.parameters["partition"]["K"] - logging.info(f"Reuse the number of partitions {kval}") + logging.getLogger("PPanGGOLiN").info(f"Reuse the number of partitions {kval}") except KeyError: - logging.info("Estimating the number of partitions...") + logging.getLogger("PPanGGOLiN").info("Estimating the number of partitions...") kval = ppp.evaluate_nb_partitions(organisms=set(pangenome.organisms), sm_degree=sm_degree, free_dispersion=free_dispersion, chunk_size=chunk_size, krange=krange, cpu=cpu, seed=seed, tmpdir=tmp_path) - logging.info(f"The number of partitions has been evaluated at {kval}") + logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}") - logging.info("Extracting samples ...") + logging.getLogger("PPanGGOLiN").info("Extracting samples ...") all_samples = [] for i in range(min_sampling, max_sampling): # each point for _ in range(depth): # number of samples per points all_samples.append(set(random.sample(set(pangenome.organisms), i + 1))) - logging.info(f"Done sampling organisms in the pan, there are {len(all_samples)} samples") + logging.getLogger("PPanGGOLiN").info(f"Done sampling organisms in the pan, there are {len(all_samples)} samples") samp_nb_per_part = [] - logging.info("Computing bitarrays for each family...") + logging.getLogger("PPanGGOLiN").info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() - logging.info("Done computing bitarrays. Comparing them to get exact and soft core stats for " + logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats for " f"{len(all_samples)} samples...") bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) for samp in all_samples: @@ -438,7 +438,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No with get_context('fork').Pool(processes=cpu) as p: # launch partitioning - logging.info(" Partitioning all samples...") + logging.getLogger("PPanGGOLiN").info(" Partitioning all samples...") bar = tqdm(range(len(args)), unit="samples partitioned", disable=disable_bar) random.shuffle(args) # shuffling the processing so that the progress bar is closer to reality. for result in p.imap_unordered(launch_raref_nem, args): @@ -446,12 +446,12 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No bar.update() bar.close() - logging.info("Done partitioning everything") + logging.getLogger("PPanGGOLiN").info("Done partitioning everything") warnings.filterwarnings("ignore") draw_curve(output, samp_nb_per_part, max_sampling) warnings.resetwarnings() tmpdir_obj.cleanup() - logging.info("Done making the rarefaction curves") + logging.getLogger("PPanGGOLiN").info("Done making the rarefaction curves") def launch(args: argparse.Namespace): diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 2c07f51d..475c53b0 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -385,7 +385,7 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen len([gene for gene in genes if not gene.is_fragment]) > 1]) if (dup / len(fam.organisms)) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) - # logging.info(f"{len(multigenics)} gene families are defined as being multigenic. + # logging.getLogger("PPanGGOLiN").info(f"{len(multigenics)} gene families are defined as being multigenic. 
# (duplicated in more than {dup_margin} of the genomes)") return multigenics diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 88ece3e0..452dccff 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -406,21 +406,21 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.debug(f"all") + logging.getLogger("PPanGGOLiN").debug(f"all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition == 'persistent': - logging.debug(f"persistent") + logging.getLogger("PPanGGOLiN").debug(f"persistent") for fam in self.families: if fam.named_partition in ['persistent']: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug(f"accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index e34302b0..8e4a6b99 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -212,7 +212,7 @@ def launch_default_config(args: argparse.Namespace): arg_lines += get_default_argument_lines(specific_actions) mk_outdir(args.output.parent, args.force) - logging.info(f'Writting default config in {args.output}') + logging.getLogger("PPanGGOLiN").info(f'Writting default config in {args.output}') with open(args.output, 'w') as fl: fl.write('\n'.join(arg_lines) + '\n') diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 0793ad2f..a80bd510 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -151,16 +151,14 @@ def set_verbosity_level(args): # use stream logging.basicConfig(stream=args.log, level=level, format=str_format, - datefmt=datefmt, - force=True) + datefmt=datefmt) else: # log is written in a files. basic condif uses filename logging.basicConfig(filename=args.log, level=level, format=str_format, - datefmt=datefmt, - force=True) - logging.info("Command: " + " ".join([arg for arg in sys.argv])) - logging.info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) + datefmt=datefmt) + logging.getLogger("PPanGGOLiN").info("Command: " + " ".join([arg for arg in sys.argv])) + logging.getLogger("PPanGGOLiN").info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: @@ -254,7 +252,7 @@ def mk_outdir(output: Path, force: bool = False): :raise FileExistError: The current path already exist and force is false """ if not output.is_dir(): - logging.debug(f"Create output directory {output.absolute().as_posix()}") + logging.getLogger("PPanGGOLiN").debug(f"Create output directory {output.absolute().as_posix()}") Path.mkdir(output) else: if not force: @@ -493,7 +491,7 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names setattr(args, param, cli_val) if default_val != cli_val: - logging.debug( + logging.getLogger("PPanGGOLiN").debug( f'Parameter "--{param} {get_arg_name(cli_val)}" has been specified in command line.' 
f' Its value overwrites putative config values.') @@ -502,7 +500,7 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names setattr(args, param, config_val) if default_val != config_val: - logging.debug( + logging.getLogger("PPanGGOLiN").debug( f'Parameter "{param}: {get_arg_name(config_val)}" has been specified in config file with non default value.' f' Its value overwrites default value ({get_arg_name(default_val)}).') else: @@ -614,7 +612,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ if params_that_differ: params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) - logging.debug( + logging.getLogger("PPanGGOLiN").debug( f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}") # manage workflow command @@ -624,7 +622,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ (workflow_step == "module" and subcommand in ["workflow", "panmodule"]): continue - logging.debug(f'Parsing {workflow_step} arguments in config file.') + logging.getLogger("PPanGGOLiN").debug(f'Parsing {workflow_step} arguments in config file.') step_subparser = subcommand_to_subparser[workflow_step] default_step_args = get_default_args(workflow_step, step_subparser, unwanted_args=all_unspecific_params) @@ -651,7 +649,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) - logging.debug(f"{len(step_params_that_differ)} {workflow_step} " + logging.getLogger("PPanGGOLiN").debug(f"{len(step_params_that_differ)} {workflow_step} " f"parameters have a non-default value: {step_params_that_differ_str}") # add step name to differentiate the params @@ -664,7 +662,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ setattr(args, workflow_step, step_args) if params_that_differ: - logging.info(f'{len(params_that_differ)} parameters have a non-default value.') + logging.getLogger("PPanGGOLiN").info(f'{len(params_that_differ)} parameters have a non-default value.') check_config_consistency(config, ALL_WORKFLOW_DEPENDENCIES) @@ -675,7 +673,7 @@ def check_config_consistency(config: dict, workflow_steps: list): """ Check that the same parameter used in different subcommand inside a workflow has the same value. - If not, the function throw a logging.warning. + If not, the function throw a logging.getLogger("PPanGGOLiN").warning. :params config_dict: config dict with as key the section of the config file and as value another dict pairing name and value of parameters. :params workflow_steps: list of subcommand names used in the workflow execution. 
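# --- Illustrative sketch (editor's example, not part of the patch): the docstring above
# --- describes a consistency check over the per-subcommand config sections. The helper
# --- names below (count_distinct, warn_on_inconsistent_params) are hypothetical; only the
# --- behaviour they sketch (warn via the "PPanGGOLiN" logger when one parameter takes
# --- different values in different workflow sections) comes from the surrounding code.
import logging
from typing import Iterable, List, Tuple, Union

def count_distinct(values: Iterable[Union[int, str, Tuple, List]]) -> int:
    # lists and tuples compare element-wise, so normalise them to tuples before hashing
    return len({tuple(v) if isinstance(v, (list, tuple)) else v for v in values})

def warn_on_inconsistent_params(config: dict, workflow_steps: list):
    # gather, for each parameter name, the value it takes in every workflow section
    param_to_steps = {}
    for step in workflow_steps:
        for param, value in config.get(step, {}).items():
            param_to_steps.setdefault(param, {})[step] = value
    for param, step_to_value in param_to_steps.items():
        if len(step_to_value) > 1 and count_distinct(step_to_value.values()) > 1:
            logging.getLogger("PPanGGOLiN").warning(
                f"The parameter {param} used in multiple subcommands of the workflow "
                f"is specified with different values in config file: {step_to_value}.")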
@@ -704,7 +702,7 @@ def count_different_values(values: Iterable[Union[int, str, Tuple, List]]) -> in duplicate_param in param_to_value} if count_different_values(step_to_value.values()) > 1: - logging.warning( + logging.getLogger("PPanGGOLiN").warning( f'The parameter {duplicate_param} used in multiple subcommands of the workflow is specified with different values in config file: {step_to_value}.') @@ -826,7 +824,7 @@ def get_config_args(subcommand: str, subparser_fct: Callable, config_dict: dict, config = {name: value for name, value in config.items() if name in expected_args_names} if unexpected_config: - logging.info( + logging.getLogger("PPanGGOLiN").info( f'While parsing {config_section} section in config file, {len(unexpected_config)} unexpected parameters ' f'were ignored : {" ".join(unexpected_config)}') else: diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 4e2bde31..a60dc3fa 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -190,7 +190,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, nocloud = args.draw.nocloud if len(pangenome.organisms) < 500 else True draw_tile_plot(pangenome, args.output, nocloud=nocloud, disable_bar=args.disable_prog_bar) else: - logging.warning( + logging.getLogger("PPanGGOLiN").warning( 'Tile plot output have been requested but there are too many organisms to produce a viewable tile plot.') if args.draw.ucurve: @@ -227,26 +227,26 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, compress=args.write.compress, spot_modules=spot_modules, regions=regions, modules=modules, spots=spots, borders=borders) else: - logging.info('No flat file output has been requested in config file. Writing output flat file is skipped.') + logging.getLogger("PPanGGOLiN").info('No flat file output has been requested in config file. 
Writing output flat file is skipped.') desc_time = time.time() - start_desc - logging.info(f"Annotation took : {round(anno_time, 2)} seconds") - logging.info(f"Clustering took : {round(clust_time, 2)} seconds") - logging.info(f"Building the graph took : {round(graph_time, 2)} seconds") - logging.info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Annotation took : {round(anno_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Clustering took : {round(clust_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Building the graph took : {round(graph_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds") if panrgp: - logging.info(f"Predicting RGP took : {round(regions_time, 2)} seconds") - logging.info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Predicting RGP took : {round(regions_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds") if panmodule: - logging.info(f"Predicting modules took : {round(mod_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Predicting modules took : {round(mod_time, 2)} seconds") - logging.info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds") if not args.no_flat_files: - logging.info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") + logging.getLogger("PPanGGOLiN").info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds") print_info(filename, content=True) From 7f37edbb9658e6b8368e9799da18e541ae8d4b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 19 Jun 2023 15:08:10 +0200 Subject: [PATCH 12/75] Refactor to respect PEP --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 5 ++-- ppanggolin/RGP/spot.py | 4 +-- ppanggolin/align/alignOnPang.py | 7 ++--- ppanggolin/annotate/annotate.py | 14 +++++----- ppanggolin/annotate/synta.py | 4 +-- ppanggolin/cluster/cluster.py | 6 ++--- ppanggolin/figures/draw_spot.py | 2 +- ppanggolin/figures/tile_plot.py | 2 +- ppanggolin/formats/writeFlat.py | 14 ++++++---- ppanggolin/formats/writeMSA.py | 7 ++--- ppanggolin/formats/writeSequences.py | 2 +- ppanggolin/graph/makeGraph.py | 2 +- ppanggolin/nem/partition.py | 39 ++++++++++++++++------------ ppanggolin/nem/rarefaction.py | 4 +-- ppanggolin/region.py | 29 ++++++++++----------- ppanggolin/utils.py | 7 ++--- ppanggolin/workflow/all.py | 1 - 18 files changed, 82 insertions(+), 69 deletions(-) diff --git a/VERSION b/VERSION index ff2f04c5..c69d1350 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.115 +1.2.116 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 8f117ba0..f07116de 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -209,8 +209,9 @@ def naming_scheme(pangenome: Pangenome): oldlen = len(contigsids) contigsids.add(contig.name) if oldlen == len(contigsids): - logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your assemblies. " - "identifiers will be supplemented with your provided organism names.") + logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your " + "assemblies. 
identifiers will be supplemented with your " + "provided organism names.") return "organism" return "contig" diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 93944c8f..c42ff920 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -116,8 +116,8 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): else: used += 1 add_new_node(graph_spot, rgp, border) - logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have less than {set_size} " - f"persistent gene families until the contig border)") + logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have " + f"less than {set_size} persistent gene families until the contig border)") logging.getLogger("PPanGGOLiN").info(f"{used} RGPs are being used to predict spots of insertion") node_list = list(graph_spot.nodes) logging.getLogger("PPanGGOLiN").info(f"{len(node_list)} number of different pairs of flanking gene families") diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 74432507..126b0047 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -255,7 +255,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel if len(spot.get_uniq_ordered_set()) > 1: drawn_spots.add(spot) logging.getLogger("PPanGGOLiN").info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " - f"related to hits of the input sequences...") + f"related to hits of the input sequences...") draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spots"]["overlapping_match"], pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"], disable_bar=disable_bar) @@ -269,7 +269,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) logging.getLogger("PPanGGOLiN").info(f"File listing RGP and spots where sequences of interest are located : " - f"{output / 'info_input_seq.tsv'}") + f"{output / 'info_input_seq.tsv'}") def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1, @@ -349,7 +349,8 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_partition(seq2pang, seq_set, output) # write the partition assignation only logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'") - logging.getLogger("PPanGGOLiN").info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") + logging.getLogger("PPanGGOLiN").info( + f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.") logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'") new_tmpdir.cleanup() diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 5b537e46..fb20c5ce 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -463,10 +463,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p used_local_identifiers = chose_gene_identifiers(pangenome) if used_local_identifiers: logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were unique, " - "PPanGGOLiN will use them.") + "PPanGGOLiN will use them.") else: 
logging.getLogger("PPanGGOLiN").info("gene identifiers used in the provided annotation files were not unique, " - "PPanGGOLiN will use self-generated identifiers.") + "PPanGGOLiN will use self-generated identifiers.") pangenome.status["genomesAnnotated"] = "Computed" pangenome.parameters["annotation"] = {} @@ -597,10 +597,12 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, and you did not provide " - "fasta sequences. Thus it was not possible to get the gene sequences.") - logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis ONLY if you provide " - "the clustering results in the next step.") + logging.getLogger("PPanGGOLiN").warning( + "You provided gff files without sequences, and you did not provide " + "fasta sequences. Thus it was not possible to get the gene sequences.") + logging.getLogger("PPanGGOLiN").warning( + "You will be able to proceed with your analysis ONLY if you provide " + "the clustering results in the next step.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 6658d0d7..42953743 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -325,8 +325,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, procedure) genes = overlap_filter(genes, overlap) - for contigName, genes in genes.items(): - contig = org.get_contig(contigName) + for contig_name, genes in genes.items(): + contig = org.get_contig(contig_name) if contig.name in circular_contigs: contig.is_circular = True for gene in genes: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index b7235de2..5c3be9f4 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -435,8 +435,8 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) if args.clusters is None: if args.infer_singletons is True: - logging.getLogger("PPanGGOLiN").warning("--infer_singletons option is not compatible with clustering creation. " - "To infer singleton you should give a clustering") + logging.getLogger("PPanGGOLiN").warning("--infer_singletons option is not compatible with clustering " + "creation. 
To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, disable_bar=args.disable_prog_bar) @@ -502,7 +502,7 @@ def parser_clust(parser: argparse.ArgumentParser): if __name__ == '__main__': """To test local change and allow using debugger""" - from ppanggolin.utils import check_log, set_verbosity_level, add_common_arguments + from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 322e40ee..e0403bcb 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -661,7 +661,7 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: if len(selected_spots) < 10: logging.getLogger("PPanGGOLiN").info(f"Drawing the following spots: " - f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") + f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") else: logging.getLogger("PPanGGOLiN").info(f"Drawing {len(selected_spots)} spots") diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 0b26b9ea..d9d559ef 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -36,7 +36,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") if len(pangenome.organisms) > 500 and nocloud is False: logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of organisms (>500). 
" - "Your browser will probably not be able to open it.") + "Your browser will probably not be able to open it.") logging.getLogger("PPanGGOLiN").info("Drawing the tile plot...") data = [] all_indexes = [] diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 838887e3..7bca515e 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -631,7 +631,8 @@ def write_gene_families_tsv(output: Path, compress: bool = False): :param output: Path to output directory :param compress: Compress the file in .gz """ - logging.getLogger("PPanGGOLiN").info("Writing the file providing the association between genes and gene families...") + logging.getLogger("PPanGGOLiN").info( + "Writing the file providing the association between genes and gene families...") outname = output / "gene_families.tsv" with write_compressed_or_not(outname, compress) as tsv: for fam in pan.gene_families: @@ -639,7 +640,7 @@ def write_gene_families_tsv(output: Path, compress: bool = False): tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""]) + "\n") logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and " - f"gene families : '{outname}'") + f"gene families : '{outname}'") def write_regions(output: Path, compress: bool = False): @@ -781,7 +782,8 @@ def write_modules(output: Path, compress: bool = False): fout.write(f"module_{mod.ID}\t{family.name}\n") fout.close() - logging.getLogger("PPanGGOLiN").info(f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") + logging.getLogger("PPanGGOLiN").info( + f"Done writing functional modules to: '{output.as_posix() + '/functional_modules.tsv'}'") def write_org_modules(output: Path, compress: bool = False): @@ -834,7 +836,8 @@ def write_spot_modules(output: Path, compress: bool = False): # if all the families in the module are found in the spot, write the association fout.write(f"module_{mod.ID}\tspot_{spot.ID}\n") - logging.getLogger("PPanGGOLiN").info(f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") + logging.getLogger("PPanGGOLiN").info( + f"Done writing module to spot associations to: {output.as_posix() + '/modules_spots.tsv'}") def write_rgp_modules(output: Path, compress: bool = False): @@ -878,7 +881,8 @@ def write_rgp_modules(output: Path, compress: bool = False): f"{','.join([reg.name for reg in regions])}\n") lists.close() - logging.getLogger("PPanGGOLiN").info(f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") + logging.getLogger("PPanGGOLiN").info( + f"RGP and associated modules are listed in : {output.as_posix() + '/modules_RGP_lists.tsv'}") def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 8b347021..0f08bde7 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -318,8 +318,8 @@ def write_msa_files(pangenome: Pangenome, output: Path, cpu: int = 1, partition: if 'translation_table' in pangenome.parameters["cluster"]: if pangenome.parameters["cluster"]["translation_table"] != translation_table: logging.getLogger("PPanGGOLiN").warning("The translation table used during clustering " - f"('{pangenome.parameters['cluster']['translation_table']}') " - f"is different than the one provided now ('{translation_table}')") + 
f"('{pangenome.parameters['cluster']['translation_table']}') " + f"is different than the one provided now ('{translation_table}')") code = translation_table compute_msa(families, outdir, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, @@ -404,6 +404,7 @@ def parser_msa(parser: argparse.ArgumentParser): optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()), help="directory for storing temporary files") + if __name__ == '__main__': """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments @@ -415,4 +416,4 @@ def parser_msa(parser: argparse.ArgumentParser): parser_msa(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) - launch(main_parser.parse_args()) \ No newline at end of file + launch(main_parser.parse_args()) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 2614589c..74dda3a2 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -493,4 +493,4 @@ def parser_seq(parser: argparse.ArgumentParser): parser_seq(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) - launch(main_parser.parse_args()) \ No newline at end of file + launch(main_parser.parse_args()) diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 7fef1187..10e8bdfc 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -165,4 +165,4 @@ def parser_graph(parser: argparse.ArgumentParser): parser_graph(main_parser) add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) - launch(main_parser.parse_args()) \ No newline at end of file + launch(main_parser.parse_args()) diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 3732de4d..eccde1b1 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -88,17 +88,19 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di # (INIT_SORT, init_random, init_param_file, INIT_FILE, INIT_LABEL, INIT_NB) = range(0,6) init_random, init_param_file = range(1, 3) logging.getLogger("PPanGGOLiN").debug("Running NEM...") - logging.getLogger("PPanGGOLiN").debug([nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, - convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, - init_param_file if init in ["param_file", "init_from_old"] else init_random, - nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", - nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), - seed]) + logging.getLogger("PPanGGOLiN").debug( + [nem_dir_path.as_posix().encode('ascii') + b"/nem_file", kval, algo, beta, convergence, + convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, + init_param_file if init in ["param_file", "init_from_old"] else init_random, + nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", + nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), + seed]) nem_stats.nem(Fname=nem_dir_path.as_posix().encode('ascii') + b"/nem_file", nk=kval, algo=algo, beta=beta, convergence=convergence, convergence_th=convergence_th, format=b"fuzzy", it_max=itermax, dolog=True, model_family=model, proportion=proportion, dispersion=variance_model, 
init_mode=init_param_file if init in ["param_file", "init_from_old"] else init_random, - init_file=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", + init_file=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_init_" + str(kval).encode( + 'ascii') + b".m", out_file_prefix=nem_dir_path.as_posix().encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), seed=seed) @@ -116,7 +118,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di no_nem = True index_fam = [] - with open(nem_dir_path / "nem_file.index", "r") as index_nem_file: + with open(nem_dir_path / "nem_file.index", "r") as index_nem_file: for line in index_nem_file: index_fam.append(line.split("\t")[1].strip()) @@ -165,9 +167,10 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di else: partitions_list[i] = parti[positions_max_prob.pop()] except IOError: - logging.getLogger("PPanGGOLiN").debug("partitioning did not work (the number of organisms used is probably too low), " - "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + - str(kval) + ".log") + logging.getLogger("PPanGGOLiN").debug( + "partitioning did not work (the number of organisms used is probably too low), " + "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + + str(kval) + ".log") return {}, None, None # return empty objects except ValueError: # return the default partitions_list which correspond to undefined @@ -471,7 +474,7 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d if len(organisms) <= 10: logging.getLogger("PPanGGOLiN").warning(f"The number of selected organisms is too low ({len(organisms)} " - f"organisms used) to robustly partition the graph") + f"organisms used) to robustly partition the graph") pangenome.parameters["partition"] = {} pangenome.parameters["partition"]["beta"] = beta @@ -558,7 +561,8 @@ def validate_family(res): bar.close() condition += 1 # if len(validated) < pan_size, we will want to resample more. - logging.getLogger("PPanGGOLiN").debug(f"There are {len(validated)} validated families out of {pansize} families.") + logging.getLogger("PPanGGOLiN").debug( + f"There are {len(validated)} validated families out of {pansize} families.") p.close() p.join() for fam, data in cpt_partition.items(): @@ -568,7 +572,7 @@ def validate_family(res): partitioning_results = [partitioning_results, []] # introduces a 'non feature'. logging.getLogger("PPanGGOLiN").info(f"Did {len(samples)} partitioning with chunks of size {chunk_size} among " - f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") + f"{len(organisms)} genomes in {round(time.time() - start_partitioning, 2)} seconds.") else: edges_weight, nb_fam = write_nem_input_files(tmp_path / f"{str(cpt)}", organisms, sm_degree=sm_degree) @@ -580,7 +584,7 @@ def validate_family(res): "This usually happens because you used very few (<15) genomes.") cpt += 1 logging.getLogger("PPanGGOLiN").info(f"Partitioned {len(organisms)} genomes in " - f"{round(time.time() - start_partitioning, 2)} seconds.") + f"{round(time.time() - start_partitioning, 2)} seconds.") # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitioning_results[1], chunk_size) @@ -642,8 +646,9 @@ def parser_partition(parser: argparse.ArgumentParser): optional.add_argument("-ms", "--max_degree_smoothing", required=False, default=10, type=float, help="max. 
degree of the nodes to be included in the smoothing process.") optional.add_argument('-o', '--output', required=False, type=Path, - default=Path(f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" - f"_PID{str(os.getpid())}"), + default=Path( + f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}" + f"_PID{str(os.getpid())}"), help="Output directory") optional.add_argument("-fd", "--free_dispersion", required=False, default=False, action="store_true", help="use if the dispersion around the centroid vector of each partition during must be free." diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index 5dd59ae3..d13ea370 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -397,8 +397,8 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No logging.getLogger("PPanGGOLiN").info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() - logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats for " - f"{len(all_samples)} samples...") + logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats " + f"for {len(all_samples)} samples...") bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) for samp in all_samples: # make the sample's organism bitarray. diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 452dccff..abee2592 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -136,9 +136,8 @@ def is_contig_border(self) -> bool: """ if len(self.genes) == 0: raise Exception("Your region has no genes. Something wrong happenned.") - if self.start_gene.position == 0 and not self.contig.is_circular: - return True - elif self.stop_gene.position == len(self.contig.genes) - 1 and not self.contig.is_circular: + if (self.start_gene.position == 0 and not self.contig.is_circular) or \ + (self.stop_gene.position == len(self.contig.genes) - 1 and not self.contig.is_circular): return True return False @@ -283,10 +282,10 @@ def _mk_uniq_ordered_set_obj(self): """cluster RGP into groups that have an identical synteny""" for rgp in self.regions: z = True - for seenRgp in self._uniqOrderedSet: - if rgp == seenRgp: + for seen_rgp in self._uniqOrderedSet: + if rgp == seen_rgp: z = False - self._uniqOrderedSet[seenRgp].add(rgp) + self._uniqOrderedSet[seen_rgp].add(rgp) if z: self._uniqOrderedSet[rgp] = {rgp} @@ -294,10 +293,10 @@ def _mk_uniq_content(self): """cluster RGP into groups that have identical gene content""" for rgp in self.regions: z = True - for seenRgp in self._uniqContent: - if rgp.families == seenRgp.families: + for seen_rgp in self._uniqContent: + if rgp.families == seen_rgp.families: z = False - self._uniqContent[seenRgp].add(rgp) + self._uniqContent[seen_rgp].add(rgp) if z: self._uniqContent[rgp] = {rgp} @@ -406,21 +405,21 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger("PPanGGOLiN").debug(f"all") + logging.getLogger("PPanGGOLiN").debug("all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition == 'persistent': - logging.getLogger("PPanGGOLiN").debug(f"persistent") + logging.getLogger("PPanGGOLiN").debug("persistent") for fam in self.families: if fam.named_partition in ['persistent']: 
self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug("shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug("accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 @@ -441,8 +440,8 @@ def __init__(self, gc_id: int, families: set = None): self.families = set() if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception(f"You provided elements that were not GeneFamily object." - f" GeneContext are only made of GeneFamily") + raise Exception("You provided elements that were not GeneFamily object." + " GeneContext are only made of GeneFamily") self.families |= set(families) def add_family(self, family: GeneFamily): diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index a80bd510..28e19a1c 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -158,7 +158,8 @@ def set_verbosity_level(args): format=str_format, datefmt=datefmt) logging.getLogger("PPanGGOLiN").info("Command: " + " ".join([arg for arg in sys.argv])) - logging.getLogger("PPanGGOLiN").info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) + logging.getLogger("PPanGGOLiN").info( + "PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: @@ -649,8 +650,8 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ if step_params_that_differ: step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) - logging.getLogger("PPanGGOLiN").debug(f"{len(step_params_that_differ)} {workflow_step} " - f"parameters have a non-default value: {step_params_that_differ_str}") + logging.getLogger("PPanGGOLiN").debug(f"{len(step_params_that_differ)} {workflow_step} parameters have " + f"a non-default value: {step_params_that_differ_str}") # add step name to differentiate the params step_params_that_differ = {f'{workflow_step}:{param}': value for param, value in diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index a60dc3fa..6fa51524 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -6,7 +6,6 @@ import os import time import argparse -from collections.abc import Callable from pathlib import Path import tempfile From 185acb097ddc3bfc2fbbd1ea320d3f388746037f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <39793176+jpjarnoux@users.noreply.github.com> Date: Tue, 20 Jun 2023 11:30:06 +0200 Subject: [PATCH 13/75] Update requirements.txt --- requirements.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index c9d610cd..61774df7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,16 @@ tqdm>=4.64 +pytables>=3.7 +prodigal>=2.6.3 +aragorn>=1.2.41 +infernal>=1.1.4 +mmseqs2>=13.45111 networkx>=2.3 +dataclasses>=0.8 scipy>=1.7.3 plotly>=4.14.3 gmpy2>=2.1.2 pandas>=0.25.3 colorlover>=0.3.0 -numpy>=1.20.3 +mafft>=7.505 +numpy>=1.21.6 bokeh>=2.4.2,<3 -ppanggolin>=1.2.112 -setuptools>=67.1.0 -yaml>=0.2.5 \ No newline at end of file From 74c5c40f9f16239415dfd1ae4c41fab76fd26c06 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 20 Jun 2023 13:05:48 +0200 Subject: [PATCH 14/75] Refactor Metric --- VERSION | 2 +- ppanggolin/metrics/metrics.py | 21 ++++----------------- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/VERSION b/VERSION index c69d1350..b0d271d7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.116 +1.2.117 diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index 0bd87b2b..0a1f41e8 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -5,7 +5,7 @@ import argparse import tables import logging - +from pathlib import Path # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, read_info @@ -146,15 +146,13 @@ def parser_metrics(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :") onereq.add_argument('--genome_fluidity', required=False, action="store_true", default=False, help="Compute the pangenome genomic fluidity.") # help="Compute the pangenome genomic and/or family fluidity.") onereq.add_argument('--info_modules', required=False, action='store_true', default=False, help='Compute more information about modules') - onereq.add_argument('--family_fluidity', required=False, action="store_true", default=False, - help=argparse.SUPPRESS) onereq.add_argument('--all', required=False, action="store_true", default=False, help="Compute all the metrics") optional = parser.add_argument_group(title="Optional arguments", @@ -163,28 +161,17 @@ def parser_metrics(parser: argparse.ArgumentParser): optional.add_argument('--no_print_info', required=False, action="store_true", default=False, help="Don't show the metrics result. 
" "All the metric are saved in your pangenome and visible with ppanggolin info.") - # optional.add_argument('--genome_only', required=False, action="store_true", default=False, - # help="Compute the genome fluidity only") - # optional.add_argument('--family_only', required=False, action="store_true", default=False, - # help="Compute the genome fluidity only") if __name__ == '__main__': """To test local change and allow using debugger""" - from ppanggolin.utils import check_log, set_verbosity_level + from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", formatter_class=argparse.RawTextHelpFormatter) parser_metrics(main_parser) - common = main_parser.add_argument_group(title="Common argument") - common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2], - help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)") - common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file") - common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true", - help="disables the progress bars") - common.add_argument('-f', '--force', action="store_true", - help="Force writing in output directory and in pangenome output file.") + add_common_arguments(main_parser) set_verbosity_level(main_parser.parse_args()) launch(main_parser.parse_args()) From 9d7a7c4518f5c6781d076668a272d5ca3d85c069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 21 Jun 2023 11:55:42 +0200 Subject: [PATCH 15/75] Fix bug with removed option --- VERSION | 2 +- ppanggolin/metrics/metrics.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/VERSION b/VERSION index b0d271d7..6be47f33 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.117 +1.2.118 diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index 0a1f41e8..eeb5d674 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -84,10 +84,6 @@ def write_metrics(pangenome: Pangenome, metrics_dict: dict, no_print_info: bool logging.getLogger("PPanGGOLiN").info("Writing genome fluidity in pangenome") info_group._v_attrs.genomes_fluidity = metrics_dict['genomes_fluidity'] - if 'families_fluidity' in metrics_dict.keys(): - logging.getLogger("PPanGGOLiN").info("Writing family fluidity in pangenome") - info_group._v_attrs.families_fluidity = metrics_dict['families_fluidity'] - if 'info_modules' in metrics_dict.keys(): logging.getLogger("PPanGGOLiN").info("Writing modules information in pangenome") write_info_modules(pangenome, h5f) @@ -103,10 +99,9 @@ def launch(args: argparse.Namespace): :param args: All arguments provide by user """ - if not any(x for x in [args.genome_fluidity, args.family_fluidity, args.info_modules, args.all]): + if not any(x for x in [args.genome_fluidity, args.info_modules, args.all]): raise Exception("You did not indicate which metric you want to compute.") args_dict = {'genomes_fluidity': args.genome_fluidity, - 'families_fluidity': args.family_fluidity, 'info_modules': args.info_modules} if args.all: for arg in args_dict.keys(): From 70dd691e2bebf6c96362879b6cba45b946a3e2bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 29 Jun 2023 17:10:25 +0200 Subject: [PATCH 16/75] Change for unit test --- VERSION | 2 +- ppanggolin/cluster/cluster.py | 33 +++++++++++++------- 
ppanggolin/figures/tile_plot.py | 2 +- ppanggolin/formats/readBinaries.py | 6 ++-- ppanggolin/formats/writeMSA.py | 2 +- ppanggolin/pangenome.py | 48 ++++++++++++++---------------- 6 files changed, 52 insertions(+), 41 deletions(-) diff --git a/VERSION b/VERSION index e829013f..e144fab6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.132 +1.2.133 diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 3f5b3365..7d18f18a 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -17,6 +17,7 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Gene +from ppanggolin.geneFamily import GeneFamily from ppanggolin.utils import read_compressed_or_not, restricted_float from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file @@ -241,8 +242,9 @@ def read_fam2seq(pangenome: Pangenome, fam_to_seq: dict): """ logging.getLogger().info("Adding protein sequences to the gene families") for family, protein in fam_to_seq.items(): - fam = pangenome.add_gene_family(family) + fam = GeneFamily(pangenome.max_fam_id, family) fam.add_sequence(protein) + pangenome.add_gene_family(fam) def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = False): @@ -256,15 +258,18 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F logging.getLogger().info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - if link: - if len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs - raise Exception("Something unexpected happened during clustering " - "(have less genes clustered than genes in the pangenome). " - "A probable reason is that two genes in two different organisms have the same IDs; " - "If you are sure that all of your genes have non identical IDs, " - "please post an issue at https://github.com/labgem/PPanGGOLiN/") + if link and len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs + raise Exception("Something unexpected happened during clustering " + "(have less genes clustered than genes in the pangenome). " + "A probable reason is that two genes in two different organisms have the same IDs; " + "If you are sure that all of your genes have non identical IDs, " + "please post an issue at https://github.com/labgem/PPanGGOLiN/") for gene, (family, is_frag) in tqdm(gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar): - fam = pangenome.add_gene_family(family) + try: + fam = pangenome.get_gene_family(family) + except KeyError: # Family not found so create and add + fam = GeneFamily(pangenome.max_fam_id, family) + pangenome.add_gene_family(fam) if link: # doing the linking if the annotations are loaded. 
gene_obj = pangenome.get_gene(gene) else: @@ -353,7 +358,9 @@ def infer_singletons(pangenome: Pangenome): singleton_counter = 0 for gene in pangenome.genes: if gene.family is None: - pangenome.add_gene_family(gene.ID).add_gene(gene) + fam = GeneFamily(family_id=pangenome.max_fam_id, name=gene.ID) + fam.add_gene(gene) + pangenome.add_gene_family(fam) singleton_counter += 1 logging.getLogger().info(f"Inferred {singleton_counter} singleton families") @@ -395,7 +402,11 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: str, infer_singleto gene_obj = local_dict.get(gene_id) if gene_obj is not None: nb_gene_with_fam += 1 - fam = pangenome.add_gene_family(fam_id) + try: + fam = pangenome.get_gene_family(fam_id) + except KeyError: # Family not found so create and add + fam = GeneFamily(pangenome.max_fam_id, fam_id) + pangenome.add_gene_family(fam) gene_obj.is_fragment = True if is_frag == "F" else False # F for Fragment fam.add_gene(gene_obj) if is_frag == "F": diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 9a2b7caa..fd6c84f0 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -44,7 +44,7 @@ def draw_tile_plot(pangenome: Pangenome, output: str, nocloud: bool = False, dis if nocloud: families = {fam for fam in pangenome.gene_families if not fam.partition.startswith("C")} else: - families = set(pangenome.gene_families) + families = pangenome.gene_families org_index = pangenome.get_org_index() index2org = {} for org, index in org_index.items(): diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index c4d88696..646e8a3e 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -15,6 +15,7 @@ # local libraries from ppanggolin.genome import Organism, Gene, RNA from ppanggolin.pangenome import Pangenome +from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Spot, Module from ppanggolin.metadata import Metadata @@ -308,7 +309,8 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): - fam = pangenome.add_gene_family(row["geneFam"].decode()) + fam = GeneFamily(family_id=pangenome.max_fam_id, name=row["geneFam"].decode()) + pangenome.add_gene_family(fam) if link: # linking if we have loaded the annotations gene_obj = pangenome.get_gene(row["gene"].decode()) else: # else, no @@ -328,7 +330,7 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: table = h5f.root.geneFamiliesInfo for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): - fam = pangenome.add_gene_family(row["name"].decode()) + fam = pangenome.get_gene_family(row["name"].decode()) fam.add_partition(row["partition"].decode()) fam.add_sequence(row["protein"].decode()) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index f8af1135..c26e3c43 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -54,7 +54,7 @@ def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0 nb_org = pangenome.number_of_organisms() if partition_filter == "all": - return set(pangenome.gene_families) + return pangenome.gene_families if partition_filter in ["persistent", "shell", "cloud"]: for fam in 
pangenome.gene_families: if fam.named_partition == partition_filter: diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 92836995..04c5694f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -148,26 +148,13 @@ def number_of_gene(self) -> int: return len(self._geneGetter) """Gene families methods""" - @property - def gene_families(self) -> List[GeneFamily]: + def gene_families(self) -> Set[GeneFamily]: """returns all the gene families in the pangenome :return: list of :class:`ppanggolin.geneFamily.GeneFamily` """ - return list(self._famGetter.values()) - - def _create_gene_family(self, name: str) -> GeneFamily: - """Creates a gene family object with the given `name` - - :param name: the name to give to the gene family. Must not exist already. - - :return: the created GeneFamily object - """ - new_fam = GeneFamily(family_id=self.max_fam_id, name=name) - self.max_fam_id += 1 - self._famGetter[new_fam.name] = new_fam - return new_fam + return set(self._famGetter.values()) def number_of_gene_families(self) -> int: """Returns the number of gene families present in the pangenome @@ -183,21 +170,32 @@ def get_gene_family(self, name: str) -> GeneFamily: :return: returns the gene family that has the name `name` """ - return self._famGetter[name] + try: + fam = self._famGetter[name] + except KeyError: + raise KeyError(f"Gene family with name={name} is not in pangenome") + except Exception as error: + raise error + else: + return fam - def add_gene_family(self, name: str): + def add_gene_family(self, family: GeneFamily): """ Get the :class:`ppanggolin.geneFamily.GeneFamily` object that has the given `name`. If it does not exist, creates it. - :param name: The gene family name to get if it exists, and create otherwise. + :param family: The gene family to add in pangenomes - :return: GeneFamily object. 
+ :raise KeyError: Exception if family with the same name already in pangenome """ - fam = self._famGetter.get(name) - if fam is None: - fam = self._create_gene_family(name) - return fam + try: + _ = self.get_gene_family(family.name) + except KeyError: + self._famGetter[family.name] = family + except Exception as error: + raise error + else: + raise KeyError("Gene Family already exist") """Graph methods""" @@ -238,12 +236,12 @@ def number_of_edge(self) -> int: """Organism methods""" @property - def organisms(self) -> List[Organism]: + def organisms(self) -> Set[Organism]: """returns all the organisms in the pangenome :return: list of :class:`ppanggolin.genome.Organism` """ - return list(self._orgGetter.values()) + return set(self._orgGetter.values()) def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome From 483a2bc0f4eb760e74d9db93c4f011df72eed18d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 3 Jul 2023 11:03:24 +0200 Subject: [PATCH 17/75] Add unit Test for Pangenome and Metadata --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 9 +- ppanggolin/annotate/annotate.py | 4 +- ppanggolin/cluster/cluster.py | 6 +- ppanggolin/figures/tile_plot.py | 10 +- ppanggolin/figures/ucurve.py | 16 +- ppanggolin/formats/readBinaries.py | 15 +- ppanggolin/formats/writeBinaries.py | 30 +- ppanggolin/formats/writeFlat.py | 18 +- ppanggolin/graph/makeGraph.py | 2 +- ppanggolin/meta/meta.py | 18 +- ppanggolin/metadata.py | 42 +- ppanggolin/nem/rarefaction.py | 6 +- ppanggolin/pangenome.py | 223 +++----- ppanggolin/region.py | 1 + ppanggolin/workflow/all.py | 4 +- tests/test_Pangenome.py | 830 ++++++++++++++++------------ tests/test_metadata.py | 126 +++++ 18 files changed, 793 insertions(+), 569 deletions(-) create mode 100644 tests/test_metadata.py diff --git a/VERSION b/VERSION index dcfe1b94..62352484 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.134 +1.2.135 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index f07116de..462e2045 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -5,6 +5,7 @@ import logging import argparse from pathlib import Path +from typing import Set # installed libraries from tqdm import tqdm @@ -30,7 +31,7 @@ def changes(self, score): self.score = score if score >= 0 else 0 -def extract_rgp(contig, node, rgp_id, naming): +def extract_rgp(contig, node, rgp_id, naming) -> Region: """ Extract the region from the given starting node """ @@ -148,7 +149,7 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, min_score: int = 4, - persistent: int = 3, continuity: int = 1, naming: str = "contig") -> set: + persistent: int = 3, continuity: int = 1, naming: str = "contig") -> Set[Region]: """ Processing matrix and 'emptying' it to get the regions. @@ -191,7 +192,7 @@ def max_index_node(lst): def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1, - min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> set: + min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]: org_regions = set() for contig in organism.contigs: if len(contig.genes) != 0: # some contigs have no coding genes... 
@@ -256,7 +257,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme)) - logging.getLogger("PPanGGOLiN").info(f"Predicted {len(pangenome.regions)} RGP") + logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp()} RGP") # save parameters and save status pangenome.parameters["RGP"] = {} diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index fb20c5ce..5a84afae 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -497,9 +497,9 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): with read_compressed_or_not(elements[1]) as currFastaFile: fasta_dict[org], _ = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = len(pangenome.organisms) - len(set(pangenome.organisms) & set(fasta_dict.keys())) + missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " - f"{missing} are missing (out of {len(pangenome.organisms)}).") + f"{missing} are missing (out of {pangenome.number_of_organisms()}).") for org in pangenome.organisms: for contig in org.contigs: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 76d6ed2d..07786caa 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -259,7 +259,7 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F logging.getLogger("PPanGGOLiN").info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - if link and len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs + if link and len(gene_to_fam) != pangenome.number_of_genes(): # then maybe there are genes with identical IDs raise Exception("Something unexpected happened during clustering (have less genes clustered than genes " "in the pangenome). A probable reason is that two genes in two different organisms have " "the same IDs; If you are sure that all of your genes have non identical IDs, please post an " @@ -416,7 +416,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet raise Exception(f"line {line_counter} of the file '{families_tsv_file.name}' raised an error.") bar.close() families_tsv_file.close() - if nb_gene_with_fam < len(pangenome.genes): # not all genes have an associated cluster + if nb_gene_with_fam < pangenome.number_of_genes(): # not all genes have an associated cluster if nb_gene_with_fam == 0: raise Exception("No gene ID in the cluster file matched any gene ID from the annotation step." " Please ensure that the annotations that you loaded previously and the clustering results " @@ -426,7 +426,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet if infer_singleton: infer_singletons(pangenome) else: - raise Exception(f"Some genes ({len(pangenome.genes) - nb_gene_with_fam}) did not have an associated " + raise Exception(f"Some genes ({pangenome.number_of_genes() - nb_gene_with_fam}) did not have an associated " f"cluster. 
Either change your cluster file so that each gene has a cluster, " f"or use the --infer_singletons option to infer a cluster for each non-clustered gene.") pangenome.status["genesClustered"] = "Computed" diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index f90f3ccb..619845e1 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -34,7 +34,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") - if len(pangenome.organisms) > 500 and nocloud is False: + if pangenome.number_of_organisms() > 500 and nocloud is False: logging.getLogger("PPanGGOLiN").warning("You asked to draw a tile plot for a lot of organisms (>500). " "Your browser will probably not be able to open it.") logging.getLogger("PPanGGOLiN").info("Drawing the tile plot...") @@ -46,7 +46,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di if nocloud: families = {fam for fam in pangenome.gene_families if not fam.partition.startswith("C")} else: - families = pangenome.gene_families + families = set(pangenome.gene_families) org_index = pangenome.get_org_index() index2org = {} for org, index in org_index.items(): @@ -65,7 +65,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di index2fam[row] = fam.name fam2index[fam.name] = row - mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(families), len(pangenome.organisms)), + mat_p_a = csc_matrix((data, (all_indexes, all_columns)), shape=(len(families), pangenome.number_of_organisms()), dtype='float') dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense()) hc = linkage(dist, 'single') @@ -153,9 +153,9 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di else: color = colors["shell"] shapes.append(dict(type='line', x0=-1, x1=-1, y0=sep_prec, y1=sep, line=dict(dict(width=10, color=color)))) - shapes.append(dict(type='line', x0=len(pangenome.organisms), x1=len(pangenome.organisms), y0=sep_prec, y1=sep, + shapes.append(dict(type='line', x0=pangenome.number_of_organisms(), x1=pangenome.number_of_organisms(), y0=sep_prec, y1=sep, line=dict(dict(width=10, color=color)))) - shapes.append(dict(type='line', x0=-1, x1=len(pangenome.organisms), y0=sep, y1=sep, + shapes.append(dict(type='line', x0=-1, x1=pangenome.number_of_organisms(), y0=sep, y1=sep, line=dict(dict(width=1, color=color)))) sep_prec = sep diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index 4d31f2d4..497be19c 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -39,7 +39,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di data_plot = [] chao = "NA" if count[1]["pangenome"] > 0: - chao = round(len(pangenome.gene_families) + ((count[0]["pangenome"] ^ 2) / (count[1]["pangenome"] * 2)), 2) + chao = round(pangenome.number_of_gene_families() + ((count[0]["pangenome"] ^ 2) / (count[1]["pangenome"] * 2)), 2) colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} @@ -48,24 +48,24 @@ def draw_ucurve(pangenome: Pangenome, output: Path, 
soft_core: float = 0.95, di persistent_values = [] shell_values = [] cloud_values = [] - for nb_org in range(1, len(pangenome.organisms) + 1): + for nb_org in range(1, pangenome.number_of_organisms() + 1): persistent_values.append(count[nb_org]["persistent"]) shell_values.append(count[nb_org]["shell"]) cloud_values.append(count[nb_org]["cloud"]) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=persistent_values, name='persistent', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms() + 1)), y=persistent_values, name='persistent', marker=dict(color=colors["persistent"]))) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=shell_values, name='shell', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms() + 1)), y=shell_values, name='shell', marker=dict(color=colors["shell"]))) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=cloud_values, name='cloud', + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms() + 1)), y=cloud_values, name='cloud', marker=dict(color=colors["cloud"]))) else: text = 'undefined' if has_undefined else "pangenome" undefined_values = [] - for nb_org in range(1, len(pangenome.organisms) + 1): + for nb_org in range(1, pangenome.number_of_organisms() + 1): undefined_values.append(count[nb_org][text]) - data_plot.append(go.Bar(x=list(range(1, len(pangenome.organisms) + 1)), y=undefined_values, name=text, + data_plot.append(go.Bar(x=list(range(1, pangenome.number_of_organisms() + 1)), y=undefined_values, name=text, marker=dict(color=colors[text]))) - x = len(pangenome.organisms) * soft_core + x = pangenome.number_of_organisms() * soft_core layout = go.Layout(title="Gene families frequency distribution (U shape), chao=" + str(chao), xaxis=dict(title='Occurring in x genomes'), yaxis=dict(title='# of gene families (F)'), diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 92aaf91f..2f025918 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -16,7 +16,7 @@ from ppanggolin.genome import Organism, Gene, RNA from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily -from ppanggolin.region import Spot, Module +from ppanggolin.region import Region, Spot, Module from ppanggolin.metadata import Metadata @@ -309,8 +309,11 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): - fam = GeneFamily(family_id=pangenome.max_fam_id, name=row["geneFam"].decode()) - pangenome.add_gene_family(fam) + try: + fam = pangenome.get_gene_family(name=row["geneFam"].decode()) + except KeyError: + fam = GeneFamily(family_id=pangenome.max_fam_id, name=row["geneFam"].decode()) + pangenome.add_gene_family(fam) if link: # linking if we have loaded the annotations gene_obj = pangenome.get_gene(row["gene"].decode()) else: # else, no @@ -374,7 +377,11 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): table = h5f.root.RGP for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="region", disable=disable_bar): - region = pangenome.get_region(row["RGP"].decode()) + try: + region = pangenome.get_region(row["RGP"].decode()) + except KeyError: + region = Region(row["RGP"].decode()) + 
pangenome.add_region(region) region.append(pangenome.get_gene(row["gene"].decode())) # order the genes properly in the regions for region in pangenome.regions: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index b9b7c7e2..cf4277d7 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -157,9 +157,9 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=len(pangenome.genes)) + expectedrows=pangenome.number_of_genes()) - logging.getLogger("PPanGGOLiN").debug(f"Writing {len(pangenome.genes)} genes") + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes()} genes") genedata2gene = {} genedata_counter = 0 @@ -267,12 +267,12 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo :param disable_bar: Disable progress bar """ gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=len(pangenome.genes)) + expectedrows=pangenome.number_of_genes()) # process sequences to save them only once seq2seqid = {} id_counter = 0 gene_row = gene_seq.row - for gene in tqdm(pangenome.genes, total=pangenome.number_of_gene(), unit="gene", disable=disable_bar): + for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes(), unit="gene", disable=disable_bar): curr_seq_id = seq2seqid.get(gene.dna) if curr_seq_id is None: curr_seq_id = id_counter @@ -346,7 +346,7 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed gene family representative sequences...") h5f.remove_node('/', 'geneFamiliesInfo') # erasing the table, and rewriting a new one. 
gene_fam_seq = h5f.create_table("/", "geneFamiliesInfo", gene_fam_desc(*get_gene_fam_len(pangenome)), - expectedrows=len(pangenome.gene_families)) + expectedrows=pangenome.number_of_gene_families()) row = gene_fam_seq.row for fam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), @@ -460,9 +460,9 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis logging.getLogger("PPanGGOLiN").info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), - expectedrows=len(pangenome.edges)) + expectedrows=pangenome.number_of_edges()) edge_row = edge_table.row - for edge in tqdm(pangenome.edges, total=pangenome.number_of_edge(), unit="edge", disable=disable_bar): + for edge in tqdm(pangenome.edges, total=pangenome.number_of_edges(), unit="edge", disable=disable_bar): for gene_pairs in edge.organisms.values(): for gene1, gene2 in gene_pairs: edge_row["geneTarget"] = gene1.ID @@ -721,12 +721,12 @@ def getmin(arg: iter) -> float: else: info_group = h5f.create_group("/", "info", "Informations about the pangenome content") if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfGenes = len(pangenome.genes) - info_group._v_attrs.numberOfOrganisms = len(pangenome.organisms) + info_group._v_attrs.numberOfGenes = pangenome.number_of_genes() + info_group._v_attrs.numberOfOrganisms = pangenome.number_of_organisms() if pangenome.status["genesClustered"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfClusters = len(pangenome.gene_families) + info_group._v_attrs.numberOfClusters = pangenome.number_of_gene_families() if pangenome.status["neighborsGraph"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfEdges = len(pangenome.edges) + info_group._v_attrs.numberOfEdges = pangenome.number_of_edges() if pangenome.status["partitioned"] in ["Computed", "Loaded"]: named_part_counter = Counter() subpart_counter = Counter() @@ -734,7 +734,7 @@ def getmin(arg: iter) -> float: part_set = set() for fam in pangenome.gene_families: named_part_counter[fam.named_partition] += 1 - part_distribs[fam.named_partition].append(len(fam.organisms) / len(pangenome.organisms)) + part_distribs[fam.named_partition].append(len(fam.organisms) / pangenome.number_of_organisms()) if fam.named_partition == "shell": subpart_counter[fam.partition] += 1 if fam.partition != "S_": @@ -756,11 +756,11 @@ def getmin(arg: iter) -> float: info_group._v_attrs.numberOfPartitions = len(part_set) info_group._v_attrs.numberOfSubpartitions = subpart_counter if pangenome.status["predictedRGP"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfRGP = len(pangenome.regions) + info_group._v_attrs.numberOfRGP = pangenome.number_of_rgp() if pangenome.status["spots"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfSpots = len(pangenome.spots) + info_group._v_attrs.numberOfSpots = pangenome.number_of_spots() if pangenome.status["modules"] in ["Computed", "Loaded"]: - info_group._v_attrs.numberOfModules = len(pangenome.modules) + info_group._v_attrs.numberOfModules = pangenome.number_of_modules() info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod.families) for mod in pangenome.modules]) info_group._v_attrs.parameters = pangenome.parameters # saving the pangenome parameters diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 368ce5eb..2e5cc34b 100644 --- a/ppanggolin/formats/writeFlat.py +++ 
b/ppanggolin/formats/writeFlat.py @@ -141,9 +141,9 @@ def write_json_edges(json): :param json: file-like object, compressed or not """ json.write(', "links": [') - edgelist = pan.edges - write_json_edge(edgelist[0], json) - for edge in edgelist[1:]: + edge_list = list(pan.edges) + write_json_edge(edge_list[0], json) + for edge in edge_list[1:]: json.write(", ") write_json_edge(edge, json) json.write(']') @@ -241,9 +241,9 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') + f'{"exact_accessory" if len(fam.organisms) != pan.number_of_organisms() else "exact_core"}" />\n') gexf.write(f' = (len(pan.organisms) * soft_core) else "soft_accessory"}"' + f'{"soft_core" if len(fam.organisms) >= (pan.number_of_organisms() * soft_core) else "soft_accessory"}"' f' />\n') gexf.write(f' \n') gexf.write(f' \n') @@ -349,7 +349,7 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool '"Max group size nuc"', # 13 '"Avg group size nuc"'] # 14 + ['"' + str(org) + '"' for org in pan.organisms]) + "\n") # 15 - default_genes = ['""'] * len(pan.organisms) if gene_names else ["0"] * len(pan.organisms) + default_genes = ['""'] * pan.number_of_organisms() if gene_names else ["0"] * pan.number_of_organisms() org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() @@ -406,7 +406,7 @@ def write_gene_presence_absence(output: Path, compress: bool = False): matrix.write('\t'.join(['Gene'] + # 14 [str(org) for org in pan.organisms]) + "\n") # 15 - default_genes = ["0"] * len(pan.organisms) + default_genes = ["0"] * pan.number_of_organisms() org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() @@ -606,9 +606,9 @@ def write_parts(output: Path, soft_core: float = 0.95): part_sets[fam.named_partition].add(fam.name) if fam.partition.startswith("S"): part_sets[fam.partition].add(fam.name) - if len(fam.organisms) >= len(pan.organisms) * soft_core: + if len(fam.organisms) >= pan.number_of_organisms() * soft_core: part_sets["soft_core"].add(fam.name) - if len(fam.organisms) == len(pan.organisms): + if len(fam.organisms) == pan.number_of_organisms(): part_sets["exact_core"].add(fam.name) else: part_sets["exact_accessory"].add(fam.name) diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 10e8bdfc..e05f260b 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -86,7 +86,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, remove_high_copy_number(pangenome, remove_copy_number) logging.getLogger("PPanGGOLiN").info("Computing the neighbors graph...") - bar = tqdm(pangenome.organisms, total=len(pangenome.organisms), unit="organism", disable=disable_bar) + bar = tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="organism", disable=disable_bar) for org in bar: bar.set_description(f"Processing {org.name}") bar.refresh() diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 06bbbe7f..b07ea8c8 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -65,7 +65,6 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') metadata_df = pd.read_csv(metadata, sep="\t", header=0, 
quoting=csv.QUOTE_NONE, dtype={metatype: str}) @@ -85,14 +84,13 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str, metatype: str, omit: bool = False, disable_bar: bool = False): - """ Add to pangenome element a metadata - - :param metadata_df: Dataframe with for each family a metadata - :param pangenome: Pangenome with gene families - :param source: source of the metadata - :param metatype: select to which pangenome element metadata will be added - :param omit: allow to omit a row in dataframe if the element name is not find in pangenomes - :param disable_bar: Disable progress bar + """function assigns metadata to elements in a pangenome based on a metadata dataframe. + :param metadata_df: A pandas dataframe containing metadata to be assigned to elements in the pangenome. + :param pangenome: A Pangenome object representing the pangenome to which metadata will be assigned. + :param source: A string representing the source of the metadata. + :param metatype: A string representing the type of element to which metadata will be assigned. + :param omit: A boolean indicating whether to raise an error if metadata cannot be assigned to an element. If True, metadata will not be assigned to elements that do not exist in the pangenome. If False, an error will be raised. Default is False. + :param disable_bar: A boolean indicating whether to disable the progress bar. Default is False. :raise KeyError: element name is not find in pangenome :raise AssertionError: Metatype is not recognized @@ -165,7 +163,7 @@ def parser_meta(parser: argparse.ArgumentParser): """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") - required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome .h5 file") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") required.add_argument('-m', '--metadata', required=False, type=Path, nargs='?', help='Metadata in TSV file. See our github for more detail about format') required.add_argument("-s", "--source", required=False, type=str, nargs="?", diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 4866a8ad..e1386d4c 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -9,34 +9,46 @@ class Metadata: + """The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. It allows the creation of metadata objects with different attributes and values, and provides methods to access and manipulate these attributes. The class has a constructor method that initializes the Metadata object with a source and a dictionary of attributes and values. The class also has methods to get the value of a specific attribute, return a list of all the attributes, and join a list of strings into a single string separated by commas. The class has two fields: source, which represents the source of the metadata, and **kwargs, which is a dictionary of attributes and values representing the metadata. + + Methods: + - __init__(self, source: str, **kwargs): Constructor method that initializes the Metadata object with a source and a dictionary of attributes and values. + - number_of_attribute(self): Returns the number of attributes in the Metadata object. 
+ - get(self, name: str, skip_error: bool = False): Returns the value of a specific attribute in the Metadata object, or None if the attribute does not exist. If skip_error is True, it does not raise an AttributeError if the attribute does not exist. + - fields(self) -> List[str]: Returns a list of all the attributes in the Metadata object. + - _join_list(attr_list: Union[str, List[str]]): Joins a list of strings into a single string separated by commas. + + Fields: + - source: A string representing the source of the metadata. + - **kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. """ - This represents a metadata link to genes, gene families, organisms, regions, spot or modules - - :param source: source of the metadata - :param kwargs: all metadata name with there value - """ - def __init__(self, source: str, **kwargs): - """Constructor Method + """ + The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. + It allows the creation of metadata objects with different attributes and values, and provides methods to access + and manipulate these attributes. Add attributes and values representing the metadata as mush as you want. + The attributes can be any string, and the values can be any type except None or NaN. + + :param source: A string representing the source of the metadata. + :param kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. """ self.source = source + if len(kwargs) == 0: + raise Exception(f"No metadata given for source: {source}") for attr, value in kwargs.items(): + if isinstance(value, list): + value = self._join_list(value) if value is not None and not isna(value): - if isinstance(value, list): - value = self._join_list(value) setattr(self, attr, value) - def __len__(self): + def number_of_attribute(self): return len(self.__dict__.keys()) - def get(self, name: str, skip_error: bool = False): + def get(self, name: str): try: value = self.__getattribute__(name) except AttributeError as attr_error: - if skip_error: - return None - else: - raise attr_error + raise AttributeError(attr_error) else: return value diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index d13ea370..8b0de528 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -371,8 +371,8 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No tmpdir_obj = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(tmpdir_obj.name) - if float(len(pangenome.organisms)) < max_sampling: - max_sampling = len(pangenome.organisms) + if float(pangenome.number_of_organisms()) < max_sampling: + max_sampling = pangenome.number_of_organisms() else: max_sampling = int(max_sampling) @@ -399,7 +399,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No index_org = pangenome.compute_family_bitarrays() logging.getLogger("PPanGGOLiN").info("Done computing bitarrays. Comparing them to get exact and soft core stats " f"for {len(all_samples)} samples...") - bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) + bar = tqdm(range(len(all_samples) * pangenome.number_of_gene_families()), unit="gene family", disable=disable_bar) for samp in all_samples: # make the sample's organism bitarray. 
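The rewritten Metadata constructor a few hunks above (list values joined with commas, None/NaN values dropped, an empty set of keyword arguments rejected) can be exercised as follows. This is a usage sketch based on the docstring and on tests/test_metadata.py further down; attribute names such as product and ec are illustrative:

from ppanggolin.metadata import Metadata

meta = Metadata("source1", product="hydrolase", ec=["3.2.1.1", "3.2.1.2"])
assert meta.get("product") == "hydrolase"
assert meta.get("ec") == "3.2.1.1,3.2.1.2"   # list values are joined with commas
assert meta.number_of_attribute() == 3       # 'source' itself counts as an attribute
# Metadata("source1") with no extra keyword arguments now raises an Exception,
# and meta.get("missing") raises AttributeError instead of returning None.
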
samp_bitarray = gmpy2.xmpz() # pylint: disable=no-member diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 167cbd9a..f7007de9 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -77,23 +77,11 @@ def add_file(self, pangenome_file: Path): self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" - @property - def genes(self) -> list: - """Creates the geneGetter if it does not exist, and returns all the genes of all organisms in the pangenome. + def genes(self) -> Generator[Gene, None, None]: + """Generator of genes in the pangenome. - :return: list of :class:`ppanggolin.genome.Gene` - """ - try: - return list(self._geneGetter.values()) - except AttributeError: # in that case the gene getter has not been computed - self._mk_gene_getter() # make it - return self.genes # return what was expected - - def _yield_genes(self) -> Iterator[Gene]: - """ Use a generator to get all the genes of a pangenome - - :return: an iterator of Gene + :return: gene generator """ if self.number_of_organisms() > 0: # if we have organisms, they're supposed to have genes for org in self.organisms: @@ -116,7 +104,7 @@ def _mk_gene_getter(self): The assumption behind this is that the pangenome has been filled and no more gene will be added. """ self._geneGetter = {} - for gene in self._yield_genes(): + for gene in self.genes: self._geneGetter[gene.ID] = gene def get_gene(self, gene_id: str) -> Gene: @@ -137,7 +125,7 @@ def get_gene(self, gene_id: str) -> Gene: except KeyError: raise KeyError(f"{gene_id} does not exist in the pangenome.") - def number_of_gene(self) -> int: + def number_of_genes(self) -> int: """Returns the number of gene present in the pangenome :return: the number of gene families @@ -149,14 +137,14 @@ def number_of_gene(self) -> int: return len(self._geneGetter) """Gene families methods""" - @property - def gene_families(self) -> Set[GeneFamily]: + def gene_families(self) -> Generator[GeneFamily, None, None]: """returns all the gene families in the pangenome :return: list of :class:`ppanggolin.geneFamily.GeneFamily` """ - return set(self._famGetter.values()) + for family in self._famGetter.values(): + yield family def number_of_gene_families(self) -> int: """Returns the number of gene families present in the pangenome @@ -172,12 +160,13 @@ def get_gene_family(self, name: str) -> GeneFamily: :return: returns the gene family that has the name `name` """ + assert isinstance(name, str) try: fam = self._famGetter[name] except KeyError: raise KeyError(f"Gene family with name={name} is not in pangenome") except Exception as error: - raise error + raise Exception(error) else: return fam @@ -194,22 +183,25 @@ def add_gene_family(self, family: GeneFamily): _ = self.get_gene_family(family.name) except KeyError: self._famGetter[family.name] = family + self.max_fam_id += 1 except Exception as error: - raise error + raise Exception(error) else: raise KeyError("Gene Family already exist") """Graph methods""" @property - def edges(self) -> list: + def edges(self) -> Generator[Edge, None, None]: """returns all the edges in the pangenome graph - :return: list of :class:`ppanggolin.pangenome.Edge` + :return: Generator of edge """ - return list(self._edgeGetter.values()) + for edge in self._edgeGetter.values(): + yield edge def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: + # TODO add_edge should not create edge but add one to pangenome like other add methods """ Adds an edge between the two gene families that the two given genes belong to. 
Genes object are expected, and they are also expected to have a family assigned @@ -228,7 +220,7 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: edge.add_genes(gene1, gene2) return edge - def number_of_edge(self) -> int: + def number_of_edges(self) -> int: """Returns the number of edge present in the pangenome :return: the number of gene families @@ -236,14 +228,14 @@ def number_of_edge(self) -> int: return len(self._edgeGetter) """Organism methods""" - @property - def organisms(self) -> Set[Organism]: + def organisms(self) -> Generator[Organism, None, None]: """returns all the organisms in the pangenome :return: list of :class:`ppanggolin.genome.Organism` """ - return set(self._orgGetter.values()) + for organism in self._orgGetter.values(): + yield organism def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome @@ -268,34 +260,24 @@ def get_organism(self, org_name: str) -> Organism: except KeyError: raise KeyError(f"{org_name} does not seem to be in your pangenome") - def add_organism(self, new_org: Union[Organism, str]) -> Organism: + def add_organism(self, organism: Organism): """ adds an organism that did not exist previously in the pangenome if an Organism object is provided. If an organism with the same name exists it will raise an error. If a str object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist. - :param new_org: Organism to add to the pangenome - - :return: The created organism + :param organism: Organism to add to the pangenome - :raises TypeError: if the provided `newOrg` is neither a str nor a :class:`ppanggolin.genome.Organism` + :raises KeyError: if the provided organism is already in pangenome """ - if isinstance(new_org, Organism): - old_len = len(self._orgGetter) - self._orgGetter[new_org.name] = new_org - if len(self._orgGetter) == old_len: - raise KeyError(f"Redondant organism name was found ({new_org.name})." - f"All of your organisms must have unique names.") - elif isinstance(new_org, str): - org = self._orgGetter.get(new_org) - if org is None: - org = Organism(new_org) - self._orgGetter[org.name] = org - new_org = org + try: + self.get_organism(organism.name) + except KeyError: + self._orgGetter[organism.name] = organism else: - raise TypeError("Provide an Organism object or a str that will serve as organism name") - return new_org + raise KeyError(f"Redondant organism name was found ({organism.name})." + f"All of your organisms must have unique names.") def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if it exists already """Creates an index for Organisms (each organism is assigned an Integer). @@ -360,12 +342,13 @@ def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: """RGP methods""" @property - def regions(self) -> list: + def regions(self) -> Generator[Region, None, None]: """returns all the regions (RGP) in the pangenome :return: list of RGP """ - return list(self._regionGetter.values()) + for region in self._regionGetter.values(): + yield region def get_region(self, region_name: str) -> Region: """Returns a region with the given region_name. Creates it if it does not exist. @@ -375,11 +358,11 @@ def get_region(self, region_name: str) -> Region: :return: The region """ try: - return self._regionGetter[region_name] + rgp = self._regionGetter[region_name] except KeyError: # then the region is not stored in this pangenome. 
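The contract introduced in these pangenome.py hunks, and repeated below for regions, spots and modules, is symmetric: add_*() raises KeyError when the element already exists, and get_*() raises KeyError when it does not, so lookups no longer create missing elements on the fly. A short sketch assuming the refactored API shown in this patch, with a hypothetical organism name:

from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism

pangenome = Pangenome()
org = Organism("org_1")            # hypothetical organism name
pangenome.add_organism(org)
assert pangenome.get_organism("org_1") is org

try:
    pangenome.add_organism(Organism("org_1"))  # same name again
except KeyError:
    pass                                       # duplicate names are rejected

try:
    pangenome.get_organism("unknown")
except KeyError:
    pass                                       # missing elements are not created anymore
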
- new_region = Region(region_name) - self._regionGetter[region_name] = new_region - return new_region + raise KeyError(f"There is no RGP with name={region_name}") + else: + return rgp def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[GeneFamily]: """ @@ -402,24 +385,32 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen # (duplicated in more than {dup_margin} of the genomes)") return multigenics - def add_regions(self, region_group: Union[Region, Iterable[Region]]): - """Takes an Iterable or a Region object and adds it to the pangenome + def add_region(self, region: Region): + """Add a region to the pangenome - :param region_group: a region or an Iterable of regions to add to the pangenome + :param region: Region to add in pangenome - :raises TypeError: if regionGroup is neither a Region nor an Iterable[Region] + :raise KeyError: Error if another Region exist in pangenome with the same name """ - old_len = len(self._regionGetter) - if isinstance(region_group, Iterable): - for region in region_group: - self._regionGetter[region.name] = region - if len(self._regionGetter) != len(region_group) + old_len: - raise Exception("Two regions had an identical name, which was unexpected.") - elif isinstance(region_group, Region): - self._regionGetter[region_group.name] = region_group + try: + self.get_region(region.name) + except KeyError: + self._regionGetter[region.name] = region else: - raise TypeError(f"An iterable or a 'Region' type object were expected, " - f"but you provided a {type(region_group)} type object") + raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") + + def add_regions(self, regions: Iterable[Region]): + #TODO remove this function + """Takes an Iterable of Region and adds it to the pangenome + + :param regions: An Iterable of regions to add to the pangenome + + :raises AssertionError: if regions is not an Iterable[Region] + """ + assert isinstance(regions, Iterable), f"An iterable was expected, but you provided a {type(regions)}" + + for region in regions: + self.add_region(region) def number_of_rgp(self) -> int: """Returns the number of gene families present in the pangenome @@ -431,11 +422,11 @@ def number_of_rgp(self) -> int: """Spot methods""" @property def spots(self) -> Generator[Spot, None, None]: - # TODO made as generator for spot in self._spotGetter.values(): yield spot def get_spot(self, spot_id: Union[int, str]) -> Spot: + # TODO Change for only str or only int """ Returns the spot that has the given spot ID. 
@@ -450,7 +441,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: try: spot_id = int(spot_id) except ValueError: - result = re.search("^spot_(\d+)$", spot_id) + result = re.search(r"^spot_(\d+)$", spot_id) if result: spot_id = int(result.group(1)) else: @@ -463,7 +454,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: else: return spot - def add_spots(self, spots: Iterable[spots]): + def add_spots(self, spots: Iterable[Spot]): #TODO remove this function for spot in spots: self.add_spot(spot) @@ -474,11 +465,11 @@ def add_spot(self, spot: Spot): :param spot: spot which should be added """ try: - _ = self.get_spot(spot.ID) + self.get_spot(spot.ID) except KeyError: self._spotGetter[spot.ID] = spot except Exception as error: - raise error + raise Exception(error) else: raise KeyError("Spot already exist") @@ -510,7 +501,7 @@ def get_module(self, module_id: Union[int, str]) -> Module: try: module_id = int(module_id) except ValueError: - result = re.search("^module_(\d+)$", module_id) + result = re.search(r"^module_(\d+)$", module_id) if result: module_id = int(result.group(1)) else: @@ -539,11 +530,11 @@ def add_module(self, module: Module): :param module: module to add in pangenome """ try: - _ = self.get_module(module.ID) + self.get_module(module.ID) except KeyError: self._moduleGetter[module.ID] = module except Exception as error: - raise error + raise Exception(error) else: raise KeyError("Module already exist") @@ -571,9 +562,23 @@ def number_of_modules(self) -> int: :return: the number of modules """ - return len(self.modules) + return len(self._moduleGetter) """Metadata""" + def select_elem(self, metatype: str): + if metatype == "families": + return self.gene_families + elif metatype == "genomes": + return self.organisms + elif metatype == "genes": + return self.genes + elif metatype == "RGPs": + return self.regions + elif metatype == "spots": + return self.spots + else: # metatype == "modules": + return self.modules + def metadata_sources(self, metatype: str) -> Set[str]: """returns all the metadata source in the pangenomes @@ -583,19 +588,7 @@ def metadata_sources(self, metatype: str) -> Set[str]: """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] source_set = set() - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for elem in elements: + for elem in self.select_elem(metatype): for source_metadata in elem.sources: source_set.add(source_metadata) return source_set @@ -606,39 +599,15 @@ def metadata(self, metatype: str) -> Generator[Metadata, None, None]: :return: set of metadata source """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for elem in elements: + for elem in self.select_elem(metatype): yield elem.metadata def get_elem_by_metadata(self, metatype: str, **kargs) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: assert metatype in ["families", 
"genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for element in elements: - if len(list(element.get_metadata(**kargs))) > 0: - yield element + for elem in self.select_elem(metatype): + if len(list(elem.get_metadata(**kargs))) > 0: + yield elem def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: @@ -649,18 +618,6 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ :return: Gene families with the source """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] - if metatype == "families": - elements = self.gene_families - elif metatype == "genomes": - elements = self.organisms - elif metatype == "genes": - elements = self.genes - elif metatype == "RGPs": - elements = self.regions - elif metatype == "spots": - elements = self.spots - else: # metatype == "modules": - elements = self.modules - for element in elements: - if element.get_source(source) is not None: - yield element + for elem in self.select_elem(metatype): + if elem.get_source(source) is not None: + yield elem diff --git a/ppanggolin/region.py b/ppanggolin/region.py index e9515fe9..8a08e92f 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -56,6 +56,7 @@ def __getitem__(self, index): return self.genes[index] def append(self, gene: Gene): + # TODO change name foir add_gene """allowing only gene-class objects in a region :param gene: gene which will be added diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 6fa51524..88696f9c 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -185,8 +185,8 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, spot_time += time.time() - start_spot_drawing if args.draw.tile_plot: - if 1 < len(pangenome.organisms) < 5000: - nocloud = args.draw.nocloud if len(pangenome.organisms) < 500 else True + if 1 < pangenome.number_of_organisms() < 5000: + nocloud = args.draw.nocloud if pangenome.number_of_organisms() < 500 else True draw_tile_plot(pangenome, args.output, nocloud=nocloud, disable_bar=args.disable_prog_bar) else: logging.getLogger("PPanGGOLiN").warning( diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py index eb0856d0..1900127b 100644 --- a/tests/test_Pangenome.py +++ b/tests/test_Pangenome.py @@ -2,359 +2,481 @@ import pytest from random import choices, randint, sample +from typing import Generator, Set +from pathlib import Path -from ppanggolin.genome import Gene, Organism -from ppanggolin.pangenome import Edge, Pangenome +from ppanggolin.genome import Gene, Organism, Contig +from ppanggolin.pangenome import Pangenome +from ppanggolin.edge import Edge from ppanggolin.geneFamily import GeneFamily - - -def test_cstr(): - o_pang = Pangenome() - assert isinstance(o_pang, Pangenome) - - for attr in "max_fam_id", "parameters", "status": - assert hasattr(o_pang, attr) - assert o_pang.max_fam_id == 0 - assert o_pang.parameters == {} - assert o_pang.status == {'genomesAnnotated': "No", - 'geneSequences': "No", - 'genesClustered': "No", - 'defragmented': "No", - 'geneFamilySequences': "No", - 'neighborsGraph': "No", - 'partitioned': "No", 
- 'predictedRGP': "No", - 'spots': "No", - 'modules': 'No', - "metadata": {"families": 'No', - "genes": 'No', - "genomes": 'No', - "RGPs": 'No', - "spots": 'No', - "modules": 'No'}, - "metasources": {"families": [], - "genes": [], - "genomes": [], - "RGPs": [], - "spots": [], - "modules": []} - } - - -@pytest.fixture -def o_pang(): - return Pangenome() - - -# @pytest.mark.xfail(reason="not implemented !") -# def test_add_file(o_pang): -# assert False # need to generate a valid file several time - -@pytest.fixture -def l_orgs(): - l_orgs = [] - for i_org in range(randint(5, 20)): - o_org = Organism(str(i_org)) - l_orgs.append(o_org) - - return l_orgs - - -def test_organisms(o_pang, l_orgs): - # 'set' because order is not guaranted - # and org should be unique - assert set(o_pang.organisms) == set() - - # add Org from Org - for o_org in l_orgs: - o_pang.add_organism(o_org) - - # add Org from string - for i_org in range(randint(5, 20)): - o_org = o_pang.add_organism(str(i_org)) - l_orgs.append(o_org) - - assert set(o_pang.organisms) == set(l_orgs) - - -def test_add_organism_str(o_pang): - o_org = o_pang.add_organism("org1") - assert o_org in o_pang.organisms - assert isinstance(o_org, Organism) - assert set(o_pang.organisms) == {o_org} - - -def test_add_organism(o_pang): - o_org = Organism("org") - assert o_pang.add_organism(o_org) == o_org - assert set(o_pang.organisms) == {o_org} - - -def test_number_of_organism(o_pang, l_orgs): - assert o_pang.number_of_organisms() == 0 - - for o_org in l_orgs: - o_pang.add_organism(o_org) - - assert o_pang.number_of_organisms() == len(l_orgs) - - -def test_add_gene_family_one(o_pang): - name = "fam1" - o_fam1 = o_pang.add_gene_family(name) - assert isinstance(o_fam1, GeneFamily) - assert 1 == o_pang.max_fam_id - - -def test_add_gene_family_same(o_pang): - name = "fam1" - o_fam1 = o_pang.add_gene_family(name) - o_fam2 = o_pang.add_gene_family(name) - assert o_fam1 == o_fam2 - - -def test_add_gene_family_many(o_pang): - n_fams = randint(5, 20) - for i_fam in range(n_fams): - o_pang.add_gene_family(str(i_fam)) - assert n_fams == o_pang.max_fam_id - - -def test_get_gene_family(o_pang): - name = "fam1" - o_fam = o_pang.add_gene_family(name) - assert o_pang.get_gene_family(name) == o_fam - - for i_fam in range(randint(5, 20)): - o_pang.add_gene_family(str(i_fam)) - # still true after many insert - assert o_pang.get_gene_family(name) == o_fam - - -def test_number_of_gene_families_empty(o_pang): - assert o_pang.number_of_gene_families() == 0 - - -def test_number_of_gene_families(o_pang): - n_fams = randint(5, 10) - - for i_fam in sample(range(20), k=n_fams): - o_pang.add_gene_family(str(i_fam)) - assert o_pang.number_of_gene_families() == n_fams - - -def test_gene_families_empty(o_pang): - # 'set' because order is not guaranted - assert set(o_pang.gene_families) == set() - - -def test_gene_families(o_pang): - l_ints = choices(range(20), k=10) - s_fams = set() - for i_fam in l_ints: - o_fam = o_pang.add_gene_family(str(i_fam)) - s_fams.add(o_fam) - - assert set(o_pang.gene_families) == s_fams - - -def test_genes_empty(o_pang): - assert list(o_pang.genes) == [] - - -# code copy-pasted from test_Edge.py -@pytest.fixture() -def make_gene_pair(): - def _make_gene_pair(org, gene_id1, gene_id2): - """create a pair of genes that belong to the same organism.""" - lo_genes = [] - for k in gene_id1, gene_id2: - o_gene = Gene(k) - o_gene.fill_parents(org, None) - - lo_genes.append(o_gene) - - o_family = GeneFamily(k, k) - o_family.add_gene(o_gene) - return tuple(lo_genes) - 
- return _make_gene_pair - - -@pytest.fixture() -def make_org_with_genes(): - def _make_org_with_genes(org): - """make an organism, add from 2 to 10 contigs - with 2 to 10 genes each.""" - l_genes = [] - o_org = Organism(org) - for i in range(randint(2, 10)): - o_ctg = o_org.get_contig("k_{}".format(i)) - for j in range(randint(2, 10)): - name = "{}.{}.{}".format(org, o_ctg.name, j) - o_gene = Gene(name) - o_gene.position = j - o_gene.start = j - o_ctg.add_gene(o_gene) - l_genes.append(o_gene) - return o_org, l_genes - - return _make_org_with_genes - - -@pytest.fixture() -def fill_fam_with_genes(): - def _fill_fam_with_genes(o_fam): - """add genes with names from 2 to 10 to a geneFamily object.""" - l_genes = [] - for i in range(2, 10): - name = "{}_{}".format(o_fam.name, i) - o_gene = Gene(name) - o_fam.add_gene(o_gene) - l_genes.append(o_gene) - return l_genes - - return _fill_fam_with_genes - - -def test_genes_organism_debug(o_pang, make_org_with_genes): - # orgs with genes. - o_org, l_genes = make_org_with_genes("org1") - o_pang.add_organism(o_org) - l_expected = sorted(l_genes, key=lambda g: g.ID) - l_observed = sorted(o_pang.genes, key=lambda g: g.ID) - assert l_observed == l_expected - - -def test_genes_genefamilies(o_pang, fill_fam_with_genes): - """Genes are added in pangenome through their family.""" - # geneFamily with genes. - o_fam = o_pang.add_gene_family("fam1") - l_genes = fill_fam_with_genes(o_fam) # the list of genes, and the geneFam are supposed to be the same - l_expected = sorted(l_genes, key=lambda g: g.ID) - l_observed = sorted(o_pang.genes, key=lambda g: g.ID) - print(o_pang.genes) - assert l_observed == l_expected - - -def test_edges_empty(o_pang): - assert list(o_pang.edges) == [] - - -def test_add_edge(o_pang, make_gene_pair): - name = "gene_fam" # gene/fam name - to_genes = make_gene_pair("org", name, name) - - o_edge1 = o_pang.add_edge(*to_genes) - assert isinstance(o_edge1, Edge) - - # addEdge doesn't act the same when the edge already exists. - o_edge2 = o_pang.add_edge(*to_genes) - assert o_edge2 == o_edge1 - - -def test_edges_one(o_pang, make_gene_pair): - name = "gene_fam" # gene/fam name - to_genes = make_gene_pair("org", name, name) - - lo_edges = [] - n = randint(1, 5) - for _ in range(n): - lo_edges.append(o_pang.add_edge(*to_genes)) - - # always the same family couple - # = one edge, with several couple of genes - # I use set because edges are uniques, it is not a multigraph. - assert set(o_pang.edges) == set(lo_edges) - assert len(o_pang.edges) == 1 - - o_edge = list(o_pang.edges).pop() - assert o_edge.gene_pairs == [to_genes for _ in range(n)] - - -def test_edges_many_rand(o_pang, make_gene_pair): - lo_edges = [] - n = randint(1, 5) - for i in range(n): - name1 = "gene_" + str(i) # gene/fam name - name2 = str(i) + "_gene" # gene/fam name - to_genes = make_gene_pair("org", name1, name2) - lo_edges.append(o_pang.add_edge(*to_genes)) - # I use set because edges are uniques, it is not a supergraph. 
- assert set(o_pang.edges) == set(lo_edges) - - -def test_edges_several(o_pang, make_gene_pair): - # little more sophisticated - to_genes = make_gene_pair("org", "g1", "g2") - o_fam2 = to_genes[1].family - o_pang.add_edge(*to_genes) - - to_genes = make_gene_pair("org", "g1", "g3") - o_fam3 = to_genes[1].family - o_pang.add_edge(*to_genes) - # g3 -- g1 -- g2 - - to_genes = make_gene_pair("org", "g22", "g33") - o_fam2.add_gene(to_genes[0]) - o_fam3.add_gene(to_genes[1]) - o_pang.add_edge(*to_genes) - # g2 -- g3 - - assert len(o_pang.edges) == 3 - - -def test_get_index(o_pang, l_orgs): - for o_org in l_orgs: - o_pang.add_organism(o_org) - idx = o_pang.get_org_index() - - # after the method, the index exist - assert o_pang.get_org_index() is idx - - # all orgs are in the index - l_observed = sorted(idx.keys(), key=lambda x: x.name) - l_orgs.sort(key=lambda x: x.name) - assert l_observed == l_orgs - - -def test_compute_family_bitarrays(o_pang, l_orgs): - for o_org in l_orgs: - o_pang.add_organism(o_org) - idx = o_pang.get_org_index() - assert o_pang.compute_family_bitarrays() is idx - - -def test_family_have_bitarrays(o_pang, l_orgs): - """test that after the method all the families have a bitarray.""" - n_fams = randint(5, 10) - - l_fams = [] - for i_fam in sample(range(20), k=n_fams): - l_fams.append(o_pang.add_gene_family(str(i_fam))) - o_pang.compute_family_bitarrays() - for o_fam in l_fams: - assert hasattr(o_fam, 'bitarray') - - -def test_get_gene_empty(o_pang): - with pytest.raises(KeyError): - o_pang.get_gene(33) - - -def test_get_gene_org(o_pang, make_org_with_genes): - # orgs with genes. - o_org, l_genes = make_org_with_genes("org") - o_pang.add_organism(o_org) - - n = len(l_genes) - for o_gene in sample(l_genes, randint(4, n)): - assert o_pang.get_gene(o_gene.ID) == o_gene - - -def test_get_gene_fam(o_pang, fill_fam_with_genes): - o_fam = o_pang.add_gene_family("fam") - l_genes = fill_fam_with_genes(o_fam) - - for o_gene in l_genes: - assert o_pang.get_gene(o_gene.ID) == o_gene +from ppanggolin.region import Region, Spot, Module +from ppanggolin.metadata import Metadata + + +class TestPangenome: + @pytest.fixture + def pangenome(self) -> Generator[Pangenome, None, None]: + """Create a pangenomes object for test + + :return: Generator with pangenomes object + """ + pangenome = Pangenome() + yield pangenome + + def test_cstr(self, pangenome): + pangenome_attr_type = { + "file": type(None), + "_famGetter": dict, + "_org_index": type(None), + "_fam_index": type(None), + "max_fam_id": int, + "_orgGetter": dict, + "_edgeGetter": dict, + "_regionGetter": dict, + "_spotGetter": dict, + "_moduleGetter": dict, + "status": dict, + "parameters": dict + } + status_keys = [ + 'genomesAnnotated', + 'geneSequences', + 'genesClustered', + 'defragmented', + 'geneFamilySequences', + 'neighborsGraph', + 'partitioned', + 'predictedRGP', + 'spots', + 'modules', + "metadata", + "metasources" + ] + metadata_keys = [ + "families", + "genes", + "genomes", + "RGPs", + "spots", + "modules" + ] + for attr, attr_type in pangenome_attr_type.items(): + assert hasattr(pangenome, attr) + assert isinstance(pangenome.__getattribute__(attr), attr_type) + if attr_type == dict: + if attr == "status": + assert len(pangenome.status) == len(status_keys) + else: + assert len(pangenome.__getattribute__(attr)) == 0 + + for status_key in status_keys: + assert status_key in pangenome.status + if status_key not in ["metadata", "metasources"]: + assert pangenome.status[status_key] == "No" + else: + assert_res = "No" if status_key 
== "metadata" else [] + for metadata_key in metadata_keys: + assert metadata_key in pangenome.status[status_key] + assert pangenome.status[status_key][metadata_key] == assert_res + assert pangenome.max_fam_id == 0 + + def test_is_instance_pangenome(self, pangenome): + assert isinstance(pangenome, Pangenome) + + +class TestPangenomeOrganism(TestPangenome): + """Organism test""" + def test_add_organism(self, pangenome): + org = Organism("org") + pangenome.add_organism(org) + assert set(pangenome.organisms) == {org} + + def test_get_organism(self, pangenome): + org = Organism("org") + pangenome.add_organism(org) + get_org = pangenome.get_organism("org") + assert isinstance(get_org, Organism) + assert org == get_org + with pytest.raises(KeyError): + pangenome.get_organism('organism') + + @pytest.fixture + def orgs(self) -> Generator[Set[Organism], None, None]: + """Create a list of organism object for test + + :return: Generator with list of organism object + """ + orgs = set() + for i in range(randint(5, 20)): + org = Organism(str(i)) + orgs.add(org) + yield orgs + + @pytest.fixture + def add_organisms(self, pangenome, orgs): + for org in orgs: + pangenome.add_organism(org) + + def test_number_of_organisms(self, add_organisms, pangenome, orgs): + assert isinstance(pangenome.number_of_organisms(), int) + assert pangenome.number_of_organisms() == len(orgs) + + def test_add_organisms(self, add_organisms, pangenome, orgs): + # 'set' because order is not guaranted + # and org should be unique + assert set(pangenome.organisms) == set(orgs) + + +class TestPangenomeGeneFamilies(TestPangenome): + def test_max_fam_id_is_instance_int_and_egal_zero(self, pangenome): + assert isinstance(pangenome.max_fam_id, int) + assert pangenome.max_fam_id == 0 + + def test_add_gene_family(self, pangenome): + family = GeneFamily(pangenome.max_fam_id, "family") + pangenome.add_gene_family(family) + assert 1 == pangenome.max_fam_id + with pytest.raises(KeyError): + pangenome.add_gene_family(family) + + def test_get_gene_family(self, pangenome): + family = GeneFamily(pangenome.max_fam_id, "family") + pangenome.add_gene_family(family) + assert isinstance(pangenome.get_gene_family("family"), GeneFamily) + assert pangenome.get_gene_family("family") == family + + @pytest.fixture + def families(self) -> Generator[Set[GeneFamily], None, None]: + """Create a list of organism object for test + + :return: Generator with list of organism object + """ + families = set() + for i in range(randint(5, 20)): + family = GeneFamily(family_id=i, name=f'family{i}') + families.add(family) + yield families + + @pytest.fixture + def add_families(self, pangenome, families): + for family in families: + pangenome.add_gene_family(family) + + def test_number_of_gene_families_empty(self, add_families, pangenome, families): + assert pangenome.number_of_gene_families() == len(families) + + +class TestPangenomeGene(TestPangenome): + @pytest.fixture + def genes(self): + genes = set() + for i in range(randint(5, 20)): + gene = Gene(gene_id=i) + genes.add(gene) + yield genes + + def test_get_gene_empty(self, pangenome): + with pytest.raises(KeyError): + pangenome.get_gene(33) + + @pytest.fixture(name="organism_genes") + def fill_org_with_genes(self): + genes = set() + organism = Organism(name="organism") + for contig_id in range(randint(2, 10)): + contig = organism.get_contig("k_{}".format(contig_id)) + for gene_idx in range(randint(2, 10)): + gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") + gene.position = gene_idx + gene.start = gene_idx 
+ contig.add_gene(gene) + genes.add(gene) + yield organism, genes + + @pytest.fixture(name="family_genes") + def fill_family_with_genes(self, pangenome): + genes = set() + family = GeneFamily(family_id=pangenome.max_fam_id, name="family") + for gene_idx in range(randint(2, 10)): + gene = Gene(gene_id=f"{family.name}_{gene_idx}") + gene.position = gene_idx + gene.start = gene_idx + family.add_gene(gene) + genes.add(gene) + yield family, genes + + def test_genes_organism_generator(self, pangenome, organism_genes): + # orgs with genes. + organism, genes = organism_genes + pangenome.add_organism(organism) + assert len(genes.difference(set(pangenome.genes))) == 0 + + def test_get_gene_with_organism(self, pangenome, organism_genes): + organism, genes = organism_genes + pangenome.add_organism(organism) + for gene in genes: + assert pangenome.get_gene(gene.ID) == gene + + def test_genes_gene_families(self, family_genes, pangenome): + """Genes are added in pan through their family.""" + family, genes = family_genes + pangenome.add_gene_family(family) + assert len(genes.difference(set(pangenome.genes))) == 0 + + def test_get_with_gene_family(self, pangenome, family_genes): + family, genes = family_genes + pangenome.add_gene_family(family) + for gene in genes: + assert pangenome.get_gene(gene.ID) == gene + + def test_number_of_gene(self, pangenome, organism_genes): + # orgs with genes. + organism, genes = organism_genes + pangenome.add_organism(organism) + assert isinstance(pangenome.number_of_genes(), int) + assert pangenome.number_of_genes() == len(genes) + + def test_get_multigenic(self, pangenome): + # TODO make a better test + multigenic = pangenome.get_multigenics(0.5) + assert isinstance(multigenic, set) + + +class TestPangenomeEdge(TestPangenome): + @staticmethod + def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2): + """create a pair of genes that belong to the same organism in 2 different families.""" + gene1 = Gene(gene_id=f"gene_{gene_id_1}") + gene2 = Gene(gene_id=f"gene_{gene_id_2}") + fam1 = GeneFamily(family_id=1, name=f"fam_{gene_id_1}") + fam2 = GeneFamily(family_id=2, name=f"fam_{gene_id_2}") + ctg1 = Contig(name=f"ctg_{gene_id_1}") + ctg2 = Contig(name=f"ctg_{gene_id_2}") + fam1.add_gene(gene1) + fam2.add_gene(gene2) + organism = Organism(name=f"org_{choices([gene_id_1, gene_id_2], k=1)}") + gene1.fill_parents(organism, ctg1) + gene2.fill_parents(organism, ctg2) + return gene1, gene2 + + @pytest.fixture + def gene_pair(self): + return self.make_gene_pair() + + def test_add_edge(self, pangenome, gene_pair): + gene1, gene2 = gene_pair + edge = pangenome.add_edge(gene1, gene2) + assert isinstance(edge, Edge) + # addEdge doesn't act the same when the edge already exists. + assert pangenome.add_edge(gene1, gene2) == edge + + def test_number_of_edges(self, pangenome, gene_pair): + gene1, gene2 = gene_pair + pangenome.add_edge(gene1, gene2) + assert isinstance(pangenome.number_of_edges(), int) + assert pangenome.number_of_edges() == 1 + + def test_edges_one(self, pangenome, gene_pair): + gene_1, gene_2 = gene_pair + + edges = [] + for _ in range(randint(2, 5)): + edges.append(pangenome.add_edge(gene_1, gene_2)) + + # always the same family couple + # = one edge, with several couple of genes + # I use set because edges are uniques, it is not a multigraph. 
+ assert set(pangenome.edges) == set(edges) + assert pangenome.number_of_edges() == 1 + + edge = list(pangenome.edges).pop() + assert edge.gene_pairs[0] == (gene_1, gene_2) + + @pytest.fixture + def gene_pairs(self): + gene_pairs = set() + for _ in range(randint(5, 20)): + gene_id_1, gene_id_2 = choices(range(randint(2, 10)), k=2) + gene1, gene2 = self.make_gene_pair(gene_id_1, gene_id_2) + gene_pairs.add((gene1, gene2)) + yield gene_pairs + + def test_edges_many_rand(self, pangenome, gene_pairs): + edges = set() + for gene_pair in gene_pairs: + edges.add(pangenome.add_edge(*gene_pair)) + # I use set because edges are uniques, it is not a supergraph. + assert set(pangenome.edges) == edges + + +class TestPangenomeBinary(TestPangenomeOrganism, TestPangenomeGeneFamilies): + def test_get_org_index(self, add_organisms, pangenome, orgs): + orgs_index = pangenome.get_org_index() + assert isinstance(orgs_index, dict) + index_know = set() + for org, index in orgs_index.items(): + assert isinstance(org, Organism) + assert isinstance(index, int) + assert index not in index_know + index_know.add(index) + + def test_compute_family_bitarrays_without_index_already_computed(self, add_families, pangenome): + pangenome.compute_family_bitarrays() + for family in pangenome.gene_families: + assert family.bitarray is not None + + def test_compute_family_bitarrays_with_index_already_computed(self, add_families, pangenome): + org_idx = pangenome.get_org_index() + assert pangenome.compute_family_bitarrays() == org_idx + + +class TestPangenomeRGP(TestPangenome): + def test_add_region(self, pangenome): + rgp = Region(region_id="rgp") + pangenome.add_region(rgp) + assert len(pangenome._regionGetter) == 1 + assert pangenome._regionGetter["rgp"] == rgp + + def test_add_region_already_in_pangenome(self, pangenome): + rgp = Region(region_id="rgp") + pangenome.add_region(rgp) + with pytest.raises(KeyError): + pangenome.add_region(rgp) + + def test_get_region(self, pangenome): + rgp = Region(region_id="rgp") + pangenome.add_region(rgp) + assert pangenome.get_region("rgp") == rgp + + def test_get_region_not_in_pangenome(self, pangenome): + with pytest.raises(KeyError): + pangenome.get_region("rgp") + + def test_number_of_rgp(self, pangenome): + rgp = Region(region_id="rgp") + pangenome.add_region(rgp) + assert isinstance(pangenome.number_of_rgp(), int) + assert pangenome.number_of_rgp() == 1 + + +class TestPangenomeSpot(TestPangenome): + def test_add_spot(self, pangenome): + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert len(pangenome._spotGetter) == 1 + assert pangenome._spotGetter[0] == spot + + def test_add_spot_already_in_pangenome(self, pangenome): + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + with pytest.raises(KeyError): + pangenome.add_spot(spot) + + def test_get_spot_with_int(self, pangenome): + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert pangenome.get_spot(0) == spot + + def test_get_spot_with_str(self, pangenome): + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert pangenome.get_spot("spot_0") == spot + + def test_get_spot_not_in_pangenome(self, pangenome): + with pytest.raises(KeyError): + pangenome.get_spot(0) + + def test_number_of_spots(self, pangenome): + spot = Spot(spot_id=0) + pangenome.add_spot(spot) + assert isinstance(pangenome.number_of_spots(), int) + assert pangenome.number_of_spots() == 1 + + +class TestPangenomeModule(TestPangenome): + def test_add_module(self, pangenome): + module = Module(module_id=0) + pangenome.add_module(module) + assert 
len(pangenome._moduleGetter) == 1 + assert pangenome._moduleGetter[0] == module + + def test_add_module_already_in_pangenome(self, pangenome): + module = Module(module_id=0) + pangenome.add_module(module) + with pytest.raises(KeyError): + pangenome.add_module(module) + + def test_get_module_with_int(self, pangenome): + module = Module(module_id=0) + pangenome.add_module(module) + assert pangenome.get_module(0) == module + + def test_get_module_with_str(self, pangenome): + module = Module(module_id=0) + pangenome.add_module(module) + assert pangenome.get_module("module_0") == module + + def test_get_module_not_in_pangenome(self, pangenome): + with pytest.raises(KeyError): + pangenome.get_module(0) + + def test_number_of_modules(self, pangenome): + module = Module(module_id=0) + pangenome.add_module(module) + assert isinstance(pangenome.number_of_modules(), int) + assert pangenome.number_of_modules() == 1 + +class TestPangenomeMetadata(TestPangenome): + @pytest.fixture + def add_element_to_pangenome(self, pangenome): + metadata = Metadata(source="source", attribute="attr") + family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") + family.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_gene_family(family) + org = Organism("Org") + org.add_metadata(source=metadata.source, metadata=metadata) + ctg = org.get_contig("Ctg") + gene = Gene("Gene") + gene.position, gene.start = (0, 0) + gene.add_metadata(source=metadata.source, metadata=metadata) + ctg.add_gene(gene) + pangenome.add_organism(org) + rgp = Region("RGP") + rgp.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_region(rgp) + spot = Spot(0) + spot.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_spot(spot) + module = Module(0) + module.add_metadata(source=metadata.source, metadata=metadata) + pangenome.add_module(module) + + def test_metadata_sources(self, add_element_to_pangenome, pangenome): + for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: + assert isinstance(pangenome.metadata_sources(metatype), set) + assert pangenome.metadata_sources(metatype) == {'source'} + + def test_metadata(self, add_element_to_pangenome, pangenome): + for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: + for metadata_gen in pangenome.metadata(metatype): + for metadata in metadata_gen: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' + + def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): + for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, + "spots": Spot, "modules": Module}.items(): + for elem in pangenome.get_elem_by_metadata(metatype, attribute="attr"): + assert isinstance(elem, expected_type) + for metadata in elem.metadata: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' + + def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): + for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, + "spots": Spot, "modules": Module}.items(): + for elem in pangenome.get_elem_by_sources(source='source', metatype=metatype): + assert isinstance(elem, expected_type) + for metadata in elem.metadata: + assert isinstance(metadata, Metadata) + assert metadata.source == 'source' diff --git a/tests/test_metadata.py b/tests/test_metadata.py new file mode 100644 index 00000000..b95a1833 --- /dev/null +++ b/tests/test_metadata.py @@ -0,0 +1,126 @@ 
+#! /usr/bin/env python3 + +import pytest +from random import choices, randint, sample +from typing import Generator, Set + +from ppanggolin.metadata import Metadata, MetaFeatures + + +class TestMetadata: + def test_create_metadata_with_attributes(self): + metadata = Metadata('source', attribute='value') + assert metadata.__getattribute__("attribute") == 'value' + assert metadata.source == 'source' + + def test_create_metadata_with_no_attributes(self): + with pytest.raises(Exception): + Metadata(source='source') + + def test_get_existing_attribute_value(self): + metadata = Metadata('source', attribute='value') + assert metadata.get('attribute') == 'value' + + def test_get_non_existing_attribute_value(self): + metadata = Metadata('source', attribute='value') + with pytest.raises(AttributeError): + metadata.get('non_existing_attribute') + + def test_get_all_attributes(self): + metadata = Metadata('source', attribute='value', another_attribute='another_value') + assert metadata.fields == ['source', 'attribute', 'another_attribute'] + + def test_join_list_attribute(self): + metadata = Metadata('source', attribute=['value1', 'value2']) + assert metadata.get("attribute") == 'value1,value2' + + def test_metadata_number_of_attributes(self): + metadata = Metadata('source', attribute='value', another_attribute='another_value') + assert metadata.number_of_attribute() == 3 + + +class TestMetaFeatures: + # Tests that metadata can be added to MetaFeatures and checking if it was added successfully + def test_add_metadata(self): + meta_features = MetaFeatures() + metadata = Metadata('source1', attribute1='value1') + meta_features.add_metadata('source1', metadata) + assert meta_features._metadataGetter['source1'] == [metadata] + + # Tests that metadata can be gotten from MetaFeatures by source and checking if it returns the correct metadata + def test_get_metadata_by_source(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='value1') + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source2', metadata2) + assert meta_features.get_source('source1') == [metadata1] + assert meta_features.get_source('source2') == [metadata2] + + # Tests that metadata can be gotten from MetaFeatures by attribute and checking if it returns the correct metadata + def test_get_metadata_by_attribute(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='value1') + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source2', metadata2) + assert list(meta_features.get_metadata(attribute1='value1')) == [metadata1] + + # Tests that all metadata can be gotten from MetaFeatures and checking if it returns all metadata + def test_get_all_metadata(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='value1') + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source2', metadata2) + assert list(meta_features.metadata) == [metadata1, metadata2] + + # Tests that all metadata sources can be gotten from MetaFeatures and checking if it returns all sources + def test_get_all_metadata_sources(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='value1') + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + 
meta_features.add_metadata('source2', metadata2) + assert meta_features.sources == ['source1', 'source2'] + + # Tests that the source with the maximum number of metadata can be gotten from MetaFeatures and checking if it returns the correct source and number + def test_get_source_with_maximum_metadata(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='value1') + metadata2 = Metadata('source1', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source1', metadata2) + assert meta_features.max_metadata_by_source() == ('source1', 2) + + # Tests that getting metadata from MetaFeatures with non-existent source returns None + def test_get_metadata_with_non_existent_source_returns_none(self): + meta_features = MetaFeatures() + metadata = Metadata('source1', attribute1='value1') + meta_features.add_metadata('source1', metadata) + assert meta_features.get_source('source2') == None + + # Tests that getting metadata from MetaFeatures with non-existent attribute returns None + def test_get_metadata_with_non_existent_attribute_returns_none(self): + meta_features = MetaFeatures() + metadata = Metadata('source1', attribute1='value1') + meta_features.add_metadata('source1', metadata) + assert list(meta_features.get_metadata(attribute2='value2')) == [] + + # Tests that getting metadata from MetaFeatures with empty attribute value returns the correct metadata + def test_get_metadata_with_empty_attribute_value_returns_correct_metadata(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1='') + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source2', metadata2) + assert list(meta_features.get_metadata(attribute1='')) == [metadata1] + + # Tests that getting metadata from MetaFeatures with list attribute value returns the correct metadata + def test_get_metadata_with_list_attribute_value_returns_correct_metadata(self): + meta_features = MetaFeatures() + metadata1 = Metadata('source1', attribute1=['value1', 'value2']) + metadata2 = Metadata('source2', attribute2='value2') + meta_features.add_metadata('source1', metadata1) + meta_features.add_metadata('source2', metadata2) + assert list(meta_features.get_metadata(attribute1='value1,value2')) == [metadata1] From 7da73459a01d7d03c32cf1219ef0251463752f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 18 Jul 2023 09:57:44 +0200 Subject: [PATCH 18/75] Last refactor pangenome class --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 5 +- ppanggolin/RGP/spot.py | 9 +- ppanggolin/formats/readBinaries.py | 6 +- ppanggolin/geneFamily.py | 5 + ppanggolin/mod/module.py | 3 +- ppanggolin/pangenome.py | 160 ++++++++++++++++++----------- tests/test_Pangenome.py | 12 +++ 8 files changed, 133 insertions(+), 69 deletions(-) diff --git a/VERSION b/VERSION index 62352484..20992e3f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.135 +1.2.136 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 462e2045..d2ff6098 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -255,8 +255,9 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", 
                          disable=disable_bar):
-        pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
-                                              min_score, naming=name_scheme))
+        for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
+                                      min_score, naming=name_scheme):
+            pangenome.add_region(region)
     logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp()} RGP")
     # save parameters and save status
diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py
index c42ff920..d27f3d4a 100644
--- a/ppanggolin/RGP/spot.py
+++ b/ppanggolin/RGP/spot.py
@@ -202,8 +202,8 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals
         logging.getLogger("PPanGGOLiN").warning("No spots were detected.")
     else:
         logging.getLogger("PPanGGOLiN").info(f"{len(spots)} spots were detected")
-
-    pangenome.add_spots(spots)
+    for spot in spots:
+        pangenome.add_spot(spot)
     pangenome.status["spots"] = "Computed"
     pangenome.parameters["spots"] = {}
     pangenome.parameters["spots"]["set_size"] = set_size
@@ -251,8 +251,9 @@ def parser_spot(parser: argparse.ArgumentParser):
     required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")
     optional = parser.add_argument_group(title="Optional arguments")
     optional.add_argument('-o', '--output', required=False, type=Path,
-                          default=Path(f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}"
-                                       f"_PID{str(os.getpid())}"),
+                          default=Path(
+                              f"ppanggolin_output{time.strftime('DATE%Y-%m-%d_HOUR%H.%M.%S', time.localtime())}"
+                              f"_PID{str(os.getpid())}"),
                           help="Output directory")
     optional.add_argument("--spot_graph", required=False, action="store_true",
                           help="Writes a graph in .gexf format of pairs of blocks of single copy markers flanking RGPs,"
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 2f025918..9dbf4083 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -406,7 +406,8 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False
             spots[row["spot"]] = curr_spot
         curr_spot.add_region(pangenome.get_region(row["RGP"].decode()))
         curr_spot.spot_2_families()
-    pangenome.add_spots(spots.values())
+    for spot in spots.values():
+        pangenome.add_spot(spot)
     pangenome.status["spots"] = "Loaded"


@@ -428,7 +429,8 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal
             curr_module = Module(row['module'])
             modules[row["module"]] = curr_module
         curr_module.add_family(pangenome.get_gene_family(row['geneFam'].decode()))
-    pangenome.add_modules(modules.values())
+    for module in modules.values():
+        pangenome.add_module(module)
     pangenome.status["modules"] = "Loaded"


diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py
index 4b33c622..136b70e4 100644
--- a/ppanggolin/geneFamily.py
+++ b/ppanggolin/geneFamily.py
@@ -28,6 +28,10 @@ class GeneFamily(MetaFeatures):
     """

     def __init__(self, family_id: int, name: str):
+        assert isinstance(family_id, int), "GeneFamily object id should be an integer"
+        assert isinstance(name, str), "GeneFamily object name should be a string"
+        assert name != '', "GeneFamily object cannot be created with an empty name"
+
         super().__init__()
         self.name = str(name)
         self.ID = family_id
@@ -46,6 +50,7 @@ def add_sequence(self, seq: str):

         :param seq: the sequence to add to the gene family
         """
+        assert isinstance(seq, str) and seq != "", "Sequence must be a string and not empty"
         self.sequence = seq

     def add_partition(self,
partition: str): diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index b8b09c07..baf4955f 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -123,12 +123,11 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = fams = set() for mod in modules: fams |= mod.families + pangenome.add_module(mod) logging.getLogger("PPanGGOLiN").info(f"There are {len(fams)} families among {len(modules)} modules") logging.getLogger("PPanGGOLiN").info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") - pangenome.add_modules(modules) - pangenome.status["modules"] = "Computed" pangenome.parameters["modules"] = {} pangenome.parameters["modules"]["size"] = size diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index f7007de9..03de7e0b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -2,6 +2,7 @@ # coding: utf8 # default libraries +import logging import re from typing import Iterator, List, Union, Dict, Set, Iterable, Generator from pathlib import Path @@ -31,7 +32,7 @@ def __init__(self): self._famGetter = {} self._org_index = None self._fam_index = None - self.max_fam_id = 0 + self._max_fam_id = 0 self._orgGetter = {} self._edgeGetter = {} self._regionGetter = {} @@ -70,7 +71,10 @@ def add_file(self, pangenome_file: Path): :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. :param pangenome_file: A string representing filepath to hdf5 pangenome file to be either used or created + + :raises AssertionError: If the `pangenome_file` is not a Path """ + assert isinstance(pangenome_file, Path), "pangenome file should be a Path object type" from ppanggolin.formats.readBinaries import get_status # importing on call instead of importing on top to avoid cross-reference problems. 
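(Editorial aside, not part of the patch: with the assertion added above, Pangenome.add_file() now refuses plain strings and expects a pathlib.Path. A minimal sketch of the intended call pattern, assuming an existing or to-be-created pangenome HDF5 file:)

    from pathlib import Path
    from ppanggolin.pangenome import Pangenome

    pangenome = Pangenome()
    pangenome.add_file(Path("pangenome.h5"))   # accepted: a Path object
    # pangenome.add_file("pangenome.h5")       # would now fail the isinstance(..., Path) assertion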
        get_status(self, pangenome_file)
@@ -93,6 +97,8 @@ def genes(self) -> Generator[Gene, None, None]:
         for gene_fam in self.gene_families:
             for gene in gene_fam.genes:
                 yield gene
+        else:
+            logging.getLogger("PPanGGOLiN").warning("There is no gene in your pangenome")

     def _mk_gene_getter(self):
         """
@@ -108,14 +114,17 @@ def _mk_gene_getter(self):
             self._geneGetter[gene.ID] = gene

     def get_gene(self, gene_id: str) -> Gene:
-        """returns the gene that has the given geneID
+        """returns the gene that has the given gene ID

         :param gene_id: The gene ID to look for

-        :return: returns the gene that has the ID `geneID`
+        :return: returns the gene that has the ID `gene_id`

-        :raises KeyError: If the `geneID` is not in the pangenome
+        :raises AssertionError: If the `gene_id` is not a string
+        :raises KeyError: If the `gene_id` is not in the pangenome
         """
+        assert isinstance(gene_id, str), "Gene id should be a string"
+
         try:
             return self._geneGetter[gene_id]
         except AttributeError:
@@ -128,7 +137,7 @@ def get_gene(self, gene_id: str) -> Gene:
     def number_of_genes(self) -> int:
         """Returns the number of gene present in the pangenome

-        :return: the number of gene families
+        :return: the number of genes
         """
         try:
             return len(self._geneGetter)
@@ -137,11 +146,19 @@ def number_of_genes(self) -> int:
             return len(self._geneGetter)

     """Gene families methods"""
+    @property
+    def max_fam_id(self):
+        return self._max_fam_id
+
+    @max_fam_id.setter
+    def max_fam_id(self, value):
+        self._max_fam_id = value
+
     @property
     def gene_families(self) -> Generator[GeneFamily, None, None]:
         """returns all the gene families in the pangenome

-        :return: list of :class:`ppanggolin.geneFamily.GeneFamily`
+        :return: Generator of gene families
         """
         for family in self._famGetter.values():
             yield family
@@ -159,8 +176,11 @@ def get_gene_family(self, name: str) -> GeneFamily:
         :param name: The gene family name to look for

         :return: returns the gene family that has the name `name`
+
+        :raises AssertionError: If the `name` is not a string
+        :raises KeyError: If the `name` does not correspond to any family in the pangenome
         """
-        assert isinstance(name, str)
+        assert isinstance(name, str), "Name of gene family should be a string"
         try:
             fam = self._famGetter[name]
         except KeyError:
@@ -177,7 +197,8 @@ def add_gene_family(self, family: GeneFamily):

         :param family: The gene family to add in pangenomes

-        :raise KeyError: Exception if family with the same name already in pangenome
+        :raises KeyError: If a family with the same name is already in the pangenome
+        :raises Exception: Unexpected exception
         """
         try:
             _ = self.get_gene_family(family.name)
@@ -190,7 +211,6 @@ def add_gene_family(self, family: GeneFamily):
             raise KeyError("Gene Family already exist")

     """Graph methods"""
-
     @property
     def edges(self) -> Generator[Edge, None, None]:
         """returns all the edges in the pangenome graph
@@ -201,17 +221,24 @@ def edges(self) -> Generator[Edge, None, None]:
             yield edge

     def add_edge(self, gene1: Gene, gene2: Gene) -> Edge:
-        # TODO add_edge should not create edge but add one to pangenome like other add methods
         """
         Adds an edge between the two gene families that the two given genes belong to.
- Genes object are expected, and they are also expected to have a family assigned :param gene1: The first gene :param gene2: The second gene :return: the created Edge + + :raises AssertionError: Genes object are expected + :raises AttributeError: Genes are not associated to any families """ - key = frozenset([gene1.family, gene2.family]) + assert isinstance(gene1, Gene) and isinstance(gene2, Gene), "Gene object are expected" + try: + family_1, family_2 = gene1.family, gene2.family + except AttributeError: + raise AttributeError("Genes are not linked to families. Check that you compute the gene families and post an" + " issue on our GitHub") + key = frozenset([family_1, family_2 ]) edge = self._edgeGetter.get(key) if edge is None: edge = Edge(gene1, gene2) @@ -232,7 +259,7 @@ def number_of_edges(self) -> int: def organisms(self) -> Generator[Organism, None, None]: """returns all the organisms in the pangenome - :return: list of :class:`ppanggolin.genome.Organism` + :return: Generator :class:`ppanggolin.genome.Organism` """ for organism in self._orgGetter.values(): yield organism @@ -244,21 +271,23 @@ def number_of_organisms(self) -> int: """ return len(self._orgGetter) - def get_organism(self, org_name: str) -> Organism: + def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. Raises an error if the organism does not exist. - :param org_name: Name of the Organism to get + :param name: Name of the Organism to get :return: The related Organism object - :raises KeyError: If the provided name is not in the pangenome + :raise AssertionError: If the organism name is not a string + :raises KeyError: If the provided name is not an organism in the pangenome """ + assert isinstance(name, str), "Organism name should be a string" try: - return self._orgGetter[org_name] + return self._orgGetter[name] except KeyError: - raise KeyError(f"{org_name} does not seem to be in your pangenome") + raise KeyError(f"{name} does not seem to be in your pangenome") def add_organism(self, organism: Organism): """ @@ -269,8 +298,10 @@ def add_organism(self, organism: Organism): :param organism: Organism to add to the pangenome + :raise AssertionError: If the organism name is not a string :raises KeyError: if the provided organism is already in pangenome """ + assert isinstance(organism, Organism), "An organism object is expected to be add to pangenome" try: self.get_organism(organism.name) except KeyError: @@ -340,7 +371,6 @@ def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: return self._fam_index """RGP methods""" - @property def regions(self) -> Generator[Region, None, None]: """returns all the regions (RGP) in the pangenome @@ -350,17 +380,22 @@ def regions(self) -> Generator[Region, None, None]: for region in self._regionGetter.values(): yield region - def get_region(self, region_name: str) -> Region: + def get_region(self, name: str) -> Region: """Returns a region with the given region_name. Creates it if it does not exist. - :param region_name: The name of the region to return + :param name: The name of the region to return :return: The region + + :raise AssertionError: If the RGP name is not a string + :raises KeyError: If the provided name is not a RGP in the pangenome """ + assert isinstance(name, str), "RGP name should be a string" + try: - rgp = self._regionGetter[region_name] + rgp = self._regionGetter[name] except KeyError: # then the region is not stored in this pangenome. 
- raise KeyError(f"There is no RGP with name={region_name}") + raise KeyError(f"There is no RGP with name={name}") else: return rgp @@ -374,6 +409,9 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen :return: set of gene families considered multigenic """ + assert isinstance(dup_margin, float), "Dup margin should be a float" + assert isinstance(persistent, bool), "persistent should be a boolean" + multigenics = set() for fam in self.gene_families: if fam.named_partition == "persistent" or not persistent: @@ -381,8 +419,6 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen len([gene for gene in genes if not gene.is_fragment]) > 1]) if (dup / len(fam.organisms)) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) - # logging.getLogger("PPanGGOLiN").info(f"{len(multigenics)} gene families are defined as being multigenic. - # (duplicated in more than {dup_margin} of the genomes)") return multigenics def add_region(self, region: Region): @@ -390,8 +426,11 @@ def add_region(self, region: Region): :param region: Region to add in pangenome + :raise AssertionError: Error if region is not a Region object :raise KeyError: Error if another Region exist in pangenome with the same name """ + assert isinstance(region, Region), "A Region object is expected" + try: self.get_region(region.name) except KeyError: @@ -399,19 +438,6 @@ def add_region(self, region: Region): else: raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") - def add_regions(self, regions: Iterable[Region]): - #TODO remove this function - """Takes an Iterable of Region and adds it to the pangenome - - :param regions: An Iterable of regions to add to the pangenome - - :raises AssertionError: if regions is not an Iterable[Region] - """ - assert isinstance(regions, Iterable), f"An iterable was expected, but you provided a {type(regions)}" - - for region in regions: - self.add_region(region) - def number_of_rgp(self) -> int: """Returns the number of gene families present in the pangenome @@ -437,7 +463,6 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: :raises KeyError: If the spot ID does not exist in the pangenome. :raises ValueError: If the provided spot ID does not have the expected format. """ - try: spot_id = int(spot_id) except ValueError: @@ -454,16 +479,15 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: else: return spot - def add_spots(self, spots: Iterable[Spot]): - #TODO remove this function - for spot in spots: - self.add_spot(spot) - def add_spot(self, spot: Spot): """Adds the given iterable of spots to the pangenome. :param spot: spot which should be added + + :raise AssertionError: Error if spot is not a Spot object + :raise KeyError: Error if another Spot exist in pangenome with the same identifier """ + assert isinstance(spot, Spot), "Spot object is expected" try: self.get_spot(spot.ID) except KeyError: @@ -487,6 +511,7 @@ def modules(self) -> Module: yield module def get_module(self, module_id: Union[int, str]) -> Module: + # TODO Change for only str or only int """ Returns the module that has the given module ID. 
@@ -515,20 +540,15 @@ def get_module(self, module_id: Union[int, str]) -> Module: else: return module - def add_modules(self, modules: Iterable[Module]): - # TODO remove - """Adds the given iterable of modules to the pangenome - - :param modules: an iterable of :class:`ppanggolin.module.Module` - """ - for module in modules: - self.add_module(module) - def add_module(self, module: Module): - """Adds the given module to the pangenome + """Add the given module to the pangenome :param module: module to add in pangenome + + :raise AssertionError: Error if module is not a Module object + :raise KeyError: Error if another module exist in pangenome with the same name """ + assert isinstance(module, Module), "Module object is expected" try: self.get_module(module.ID) except KeyError: @@ -566,6 +586,17 @@ def number_of_modules(self) -> int: """Metadata""" def select_elem(self, metatype: str): + """Get all the element for the given metatype + + :param metatype: name of pangenome component that will be get + + :return: All elements from pangenome for the metatype + + :raise AssertionError: Error if metatype is not a string + :raise KeyError: Error if metatype is not recognized + """ + assert isinstance(metatype, str), "Metatype name should be a string" + if metatype == "families": return self.gene_families elif metatype == "genomes": @@ -576,8 +607,10 @@ def select_elem(self, metatype: str): return self.regions elif metatype == "spots": return self.spots - else: # metatype == "modules": + elif metatype == "modules": return self.modules + else: + raise KeyError("Given metatype is not allowed") def metadata_sources(self, metatype: str) -> Set[str]: """returns all the metadata source in the pangenomes @@ -585,8 +618,10 @@ def metadata_sources(self, metatype: str) -> Set[str]: :param metatype: select to which pangenome element metadata should be searched :return: set of metadata source + + :raise AssertionError: Error if metatype is not a string """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] + assert isinstance(metatype, str), "Metatype name should be a string" source_set = set() for elem in self.select_elem(metatype): for source_metadata in elem.sources: @@ -596,17 +631,26 @@ def metadata_sources(self, metatype: str) -> Set[str]: def metadata(self, metatype: str) -> Generator[Metadata, None, None]: """Create a generator with all metadatas in the pangenome + :param metatype: select to which pangenome element metadata should be generate + :return: set of metadata source """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): yield elem.metadata - def get_elem_by_metadata(self, metatype: str, **kargs) -> Generator[ + def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: + """Get element in pangenome with metadata attribute expected + + :param metatype: select to which pangenome element metadata + :param kwargs: attributes to identify metadata + + :return: metadata element + """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): - if len(list(elem.get_metadata(**kargs))) > 0: + if len(list(elem.get_metadata(**kwargs))) > 0: yield elem def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py index 1900127b..b8827a18 100644 --- a/tests/test_Pangenome.py +++ 
b/tests/test_Pangenome.py @@ -83,6 +83,10 @@ def test_cstr(self, pangenome): def test_is_instance_pangenome(self, pangenome): assert isinstance(pangenome, Pangenome) + def test_add_file_is_not_path(self, pangenome): + with pytest.raises(AssertionError): + pangenome.add_file("pangenome.h5") + class TestPangenomeOrganism(TestPangenome): """Organism test""" @@ -145,6 +149,10 @@ def test_get_gene_family(self, pangenome): assert isinstance(pangenome.get_gene_family("family"), GeneFamily) assert pangenome.get_gene_family("family") == family + def test_get_gene_family_with_name_no_str(self, pangenome): + with pytest.raises(AssertionError): + pangenome.get_gene_family(3) + @pytest.fixture def families(self) -> Generator[Set[GeneFamily], None, None]: """Create a list of organism object for test @@ -229,6 +237,10 @@ def test_get_with_gene_family(self, pangenome, family_genes): for gene in genes: assert pangenome.get_gene(gene.ID) == gene + def test_get_gene_with_id_not_int(self, pangenome): + with pytest.raises(AssertionError): + pangenome.get_gene(gene_id="id") + def test_number_of_gene(self, pangenome, organism_genes): # orgs with genes. organism, genes = organism_genes From 76e5166b1605519785fd58673645c3001f4bbeed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 19 Jul 2023 11:49:23 +0200 Subject: [PATCH 19/75] Write 102 tests for Pangenome Class --- VERSION | 2 +- ppanggolin/pangenome.py | 5 +- tests/test_Pangenome.py | 593 +++++++++++++++++++++++++++++++++------- 3 files changed, 491 insertions(+), 109 deletions(-) diff --git a/VERSION b/VERSION index 20992e3f..2578cfec 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.136 +1.2.137 diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 03de7e0b..53040b49 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -72,7 +72,7 @@ def add_file(self, pangenome_file: Path): :param pangenome_file: A string representing filepath to hdf5 pangenome file to be either used or created - :raises AssertionError: If the `pangenome_file` is not a Path + :raises AssertionError: If the `pangenome_file` is not an instance of the Path class """ assert isinstance(pangenome_file, Path), "pangenome file should be a Path object type" from ppanggolin.formats.readBinaries import get_status @@ -621,7 +621,6 @@ def metadata_sources(self, metatype: str) -> Set[str]: :raise AssertionError: Error if metatype is not a string """ - assert isinstance(metatype, str), "Metatype name should be a string" source_set = set() for elem in self.select_elem(metatype): for source_metadata in elem.sources: @@ -635,7 +634,6 @@ def metadata(self, metatype: str) -> Generator[Metadata, None, None]: :return: set of metadata source """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): yield elem.metadata @@ -648,7 +646,6 @@ def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ :return: metadata element """ - assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): if len(list(elem.get_metadata(**kwargs))) > 0: yield elem diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py index b8827a18..b21fb71b 100644 --- a/tests/test_Pangenome.py +++ b/tests/test_Pangenome.py @@ -2,7 +2,7 @@ import pytest from random import choices, randint, sample -from typing import Generator, Set +from typing import Generator, Set, Tuple, Union from pathlib import Path from ppanggolin.genome import Gene, 
Organism, Contig @@ -14,6 +14,11 @@ class TestPangenome: + """This class tests methods in pangenome class associated to pangenome direclty. + For pangenome components, there are subclass to test each component. + This class also generate a pangenome for all the test + """ + @pytest.fixture def pangenome(self) -> Generator[Pangenome, None, None]: """Create a pangenomes object for test @@ -24,20 +29,28 @@ def pangenome(self) -> Generator[Pangenome, None, None]: yield pangenome def test_cstr(self, pangenome): + """ + Tests the constructor method of the Pangenome class. + It checks that all attributes are present and have the correct type and or value. + + :param pangenome: Test the function + + :return: A pangenome object + """ pangenome_attr_type = { - "file": type(None), - "_famGetter": dict, - "_org_index": type(None), - "_fam_index": type(None), - "max_fam_id": int, - "_orgGetter": dict, - "_edgeGetter": dict, - "_regionGetter": dict, - "_spotGetter": dict, - "_moduleGetter": dict, - "status": dict, - "parameters": dict - } + "file": type(None), + "_famGetter": dict, + "_org_index": type(None), + "_fam_index": type(None), + "_max_fam_id": int, + "_orgGetter": dict, + "_edgeGetter": dict, + "_regionGetter": dict, + "_spotGetter": dict, + "_moduleGetter": dict, + "status": dict, + "parameters": dict + } status_keys = [ 'genomesAnnotated', 'geneSequences', @@ -81,34 +94,93 @@ def test_cstr(self, pangenome): assert pangenome.max_fam_id == 0 def test_is_instance_pangenome(self, pangenome): + """Tests whether the pangenome object is an instance of the Pangenome class. + This test is important because it ensures that the class name does not change and that we are working + with a Pangenome object, and not some other type of object. + + :param pangenome: object to test if is an instance of the pangenome class + + :raise AssertionError: If pangenome is not an instance of the pangenome class + """ assert isinstance(pangenome, Pangenome) def test_add_file_is_not_path(self, pangenome): + """Tests that the add_file method raises an AssertionError if a file is not an instance of the Path class + + :param pangenome: pangenome object to test method + """ with pytest.raises(AssertionError): pangenome.add_file("pangenome.h5") class TestPangenomeOrganism(TestPangenome): - """Organism test""" - def test_add_organism(self, pangenome): - org = Organism("org") - pangenome.add_organism(org) - assert set(pangenome.organisms) == {org} + """This class tests methods in pangenome class associated to organisms. + """ - def test_get_organism(self, pangenome): - org = Organism("org") - pangenome.add_organism(org) - get_org = pangenome.get_organism("org") + @pytest.fixture + def organism(self) -> Generator[Organism, None, None]: + organism = Organism(name="organism") + yield organism + + def test_add_organism(self, pangenome, organism): + """Tests the add_organism method of the Pangenome class. + + :param pangenome: pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + assert set(pangenome.organisms) == {organism} + + def test_add_organism_already_in_pangenome(self, pangenome, organism): + """Tests that adding organism that already exist return a KeyError. 
+ + :param pangenome: pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + with pytest.raises(KeyError): + pangenome.add_organism(organism) + + def test_add_organism_not_instance_organism(self, pangenome): + """Ensure that it raises an AssertionError when a non-Organism object is passed as an argument. + + :param pangenome: pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.add_organism("org") + + def test_get_organism(self, pangenome, organism): + """Tests the get_organism method of the Pangenome class. + + :param pangenome: pangenome object to test method + :param organism: organism object to test method + """ + pangenome.add_organism(organism) + get_org = pangenome.get_organism("organism") assert isinstance(get_org, Organism) - assert org == get_org + assert organism == get_org + + def test_get_organism_not_in_pangenome(self, pangenome): + """Ensure that it raises a KeyError when an Organism is not in the pangenome. + + :param pangenome: pangenome object to test method + """ with pytest.raises(KeyError): - pangenome.get_organism('organism') + pangenome.get_organism('org') + + def test_get_organism_with_name_not_instance_string(self, pangenome): + """Ensure that it raises an AssertionError when a non-string name is passed as organism name. + + :param pangenome: pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.get_organism(33) @pytest.fixture - def orgs(self) -> Generator[Set[Organism], None, None]: - """Create a list of organism object for test + def organisms(self) -> Generator[Set[Organism], None, None]: + """Create a set of organism object for test - :return: Generator with list of organism object + :return: Generator with set of organism object """ orgs = set() for i in range(randint(5, 20)): @@ -117,47 +189,98 @@ def orgs(self) -> Generator[Set[Organism], None, None]: yield orgs @pytest.fixture - def add_organisms(self, pangenome, orgs): - for org in orgs: + def add_organisms(self, pangenome, organisms): + """Add set of organims to pangenome + + :param pangenome: pangenome object to test method + :param orgs: set of organisms to add to pangenome + """ + for org in organisms: pangenome.add_organism(org) - def test_number_of_organisms(self, add_organisms, pangenome, orgs): - assert isinstance(pangenome.number_of_organisms(), int) - assert pangenome.number_of_organisms() == len(orgs) + def test_number_of_organisms(self, add_organisms, pangenome, organisms): + """Tests the number_of_organisms method of the pangenome class. - def test_add_organisms(self, add_organisms, pangenome, orgs): - # 'set' because order is not guaranted - # and org should be unique - assert set(pangenome.organisms) == set(orgs) + :param add_organisms: method to add organisms to pangenome + :param pangenome: pangenome object to test method + :param orgs: set of organisms to add to pangenome + """ + assert isinstance(pangenome.number_of_organisms(), int) + assert pangenome.number_of_organisms() == len(organisms) class TestPangenomeGeneFamilies(TestPangenome): + """This class tests methods in pangenome class associated to gene families. 
+ """ + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a Gene Family object + + :return: Generator with a Gene Family object + """ + family = GeneFamily(0, "family") + yield family + def test_max_fam_id_is_instance_int_and_egal_zero(self, pangenome): + """Tests that the max_fam_id attribute is corretly set + + :param pangenome: pangenome object to test method + """ assert isinstance(pangenome.max_fam_id, int) assert pangenome.max_fam_id == 0 - def test_add_gene_family(self, pangenome): - family = GeneFamily(pangenome.max_fam_id, "family") + def test_add_gene_family(self, pangenome, family): + """Tests the add_gene_family method of the Pangenome class. + + :param pangenome: pangenome object to test method + :param family: gene family object to test method + """ pangenome.add_gene_family(family) assert 1 == pangenome.max_fam_id + assert set(pangenome.gene_families) == {family} + + def test_add_gene_family_already_in_pangenome(self, pangenome, family): + """Tests that adding gene family that already exist return a KeyError. + + :param pangenome: pangenome object to test method + :param family: gene family object to test method + """ + pangenome.add_gene_family(family) with pytest.raises(KeyError): pangenome.add_gene_family(family) - def test_get_gene_family(self, pangenome): - family = GeneFamily(pangenome.max_fam_id, "family") + def test_get_gene_family(self, pangenome, family): + """Tests that get_gene_family return a gene family object corresponding to the requested gene family + + :param pangenome: pangenome object to test method + :param family: gene family object to test method + """ pangenome.add_gene_family(family) assert isinstance(pangenome.get_gene_family("family"), GeneFamily) assert pangenome.get_gene_family("family") == family - def test_get_gene_family_with_name_no_str(self, pangenome): + def test_get_gene_family_not_in_pangenome(self, pangenome, family): + """Tests that return a KeyError if family does not exist in pangenome + + :param pangenome: pangenome object to test method + :param family: gene family object to test method + """ + with pytest.raises(KeyError): + pangenome.get_gene_family("fam") + + def test_get_gene_family_with_name_not_isinstance_string(self, pangenome): + """Tests that return an AssertionError if family name used to get family is not string + + :param pangenome: pangenome object to test method + """ with pytest.raises(AssertionError): pangenome.get_gene_family(3) @pytest.fixture def families(self) -> Generator[Set[GeneFamily], None, None]: - """Create a list of organism object for test + """Create a set of Gene Family object for test - :return: Generator with list of organism object + :return: Generator with set of organism object """ families = set() for i in range(randint(5, 20)): @@ -167,28 +290,46 @@ def families(self) -> Generator[Set[GeneFamily], None, None]: @pytest.fixture def add_families(self, pangenome, families): + """Add set of gene families to pangenome + + :param pangenome: pangenome object to test method + :param orgs: set of gene families to add to pangenome + """ for family in families: pangenome.add_gene_family(family) def test_number_of_gene_families_empty(self, add_families, pangenome, families): + """Tests the number_of_gene_families method of the pangenome class. 
+ + :param add_organisms: method to add gene families to pangenome + :param pangenome: pangenome object to test method + :param families: set of families to add to pangenome + """ + assert isinstance(pangenome.number_of_gene_families(), int) assert pangenome.number_of_gene_families() == len(families) class TestPangenomeGene(TestPangenome): + """This class tests methods in pangenome class associated to Gene. + """ @pytest.fixture - def genes(self): + def genes(self)-> Generator[Set[Gene], None, None]: + """Create a set of Gene object for test + + :return: Generator with set of organism object + """ genes = set() for i in range(randint(5, 20)): gene = Gene(gene_id=i) genes.add(gene) yield genes - def test_get_gene_empty(self, pangenome): - with pytest.raises(KeyError): - pangenome.get_gene(33) - @pytest.fixture(name="organism_genes") - def fill_org_with_genes(self): + def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, None]: + """Fill an organism with a random set of gene + + :return: Organism with genes + """ genes = set() organism = Organism(name="organism") for contig_id in range(randint(2, 10)): @@ -203,6 +344,10 @@ def fill_org_with_genes(self): @pytest.fixture(name="family_genes") def fill_family_with_genes(self, pangenome): + """Fill a gene family with a random set of gene + + :return: Gene family with genes + """ genes = set() family = GeneFamily(family_id=pangenome.max_fam_id, name="family") for gene_idx in range(randint(2, 10)): @@ -213,36 +358,70 @@ def fill_family_with_genes(self, pangenome): genes.add(gene) yield family, genes - def test_genes_organism_generator(self, pangenome, organism_genes): - # orgs with genes. + def test_genes_generator_from_organism(self, pangenome, organism_genes): + """Tests genes generator from organism in pangenome object + + :param pangenome: pangenome object to test method + :param organism_genes: method to get an organism object fill with genes + """ organism, genes = organism_genes pangenome.add_organism(organism) - assert len(genes.difference(set(pangenome.genes))) == 0 + assert genes == set(pangenome.genes) def test_get_gene_with_organism(self, pangenome, organism_genes): + """Tests get genes from organism in pangenome object + + :param pangenome: pangenome object to test method + :param organism_genes: method to get an organism object fill with genes + """ organism, genes = organism_genes pangenome.add_organism(organism) for gene in genes: assert pangenome.get_gene(gene.ID) == gene - def test_genes_gene_families(self, family_genes, pangenome): - """Genes are added in pan through their family.""" + def test_genes_generator_from_gene_families(self, family_genes, pangenome): + """Tests genes generator from gene families in pangenome object + + :param pangenome: pangenome object to test method + :param family_genes: method to get a gene family object fill with genes + """ family, genes = family_genes pangenome.add_gene_family(family) - assert len(genes.difference(set(pangenome.genes))) == 0 + assert genes == set(pangenome.genes) def test_get_with_gene_family(self, pangenome, family_genes): + """Tests genes generator from gene families in pangenome object + + :param pangenome: pangenome object to test method + :param family_genes: method to get a gene family object fill with genes + """ family, genes = family_genes pangenome.add_gene_family(family) for gene in genes: assert pangenome.get_gene(gene.ID) == gene - def test_get_gene_with_id_not_int(self, pangenome): + def test_get_gene_not_in_pangenome(self, pangenome): + 
"""Tests that return a KeyError if gene does not exist in pangenome + + :param pangenome: pangenome object to test method + """ + with pytest.raises(KeyError): + pangenome.get_gene("12151405613024") + + def test_get_gene_with_id_not_string(self, pangenome): + """Tests that return an AssertionError if gene identifier is not a string + + :param pangenome: pangenome object to test method + """ with pytest.raises(AssertionError): - pangenome.get_gene(gene_id="id") + pangenome.get_gene(gene_id=4) + + def test_number_of_genes(self, pangenome, organism_genes): + """Tests get number of genes in pangenome object - def test_number_of_gene(self, pangenome, organism_genes): - # orgs with genes. + :param pangenome: pangenome object to test method + :param organism_genes: method to get a organism object fill with genes + """ organism, genes = organism_genes pangenome.add_organism(organism) assert isinstance(pangenome.number_of_genes(), int) @@ -250,14 +429,23 @@ def test_number_of_gene(self, pangenome, organism_genes): def test_get_multigenic(self, pangenome): # TODO make a better test + """Tests get multigenic genes in pangenome object + + :param pangenome: pangenome object to test method + """ multigenic = pangenome.get_multigenics(0.5) assert isinstance(multigenic, set) class TestPangenomeEdge(TestPangenome): + """This class tests methods in pangenome class associated to Edge. + """ @staticmethod - def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2): - """create a pair of genes that belong to the same organism in 2 different families.""" + def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: + """create a pair of genes that belong to the same organism in 2 different families + + :return: Two genes linked to contigs, organism and gene families + """ gene1 = Gene(gene_id=f"gene_{gene_id_1}") gene2 = Gene(gene_id=f"gene_{gene_id_2}") fam1 = GeneFamily(family_id=1, name=f"fam_{gene_id_1}") @@ -272,57 +460,63 @@ def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2): return gene1, gene2 @pytest.fixture - def gene_pair(self): - return self.make_gene_pair() + def gene_pair(self) -> Generator[Tuple[Gene, Gene], None, None]: + """Call method to create a pair of genes that belong to the same organism in 2 different families + + :return: Two genes linked to contigs, organism and gene families + """ + yield self.make_gene_pair() def test_add_edge(self, pangenome, gene_pair): + """Tests the add_edge method of the Pangenome class. + + :param pangenome: pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ gene1, gene2 = gene_pair edge = pangenome.add_edge(gene1, gene2) assert isinstance(edge, Edge) - # addEdge doesn't act the same when the edge already exists. - assert pangenome.add_edge(gene1, gene2) == edge + assert set(pangenome.edges) == {edge} - def test_number_of_edges(self, pangenome, gene_pair): - gene1, gene2 = gene_pair - pangenome.add_edge(gene1, gene2) - assert isinstance(pangenome.number_of_edges(), int) - assert pangenome.number_of_edges() == 1 + def test_add_edge_already_in_pangenome(self, pangenome, gene_pair): + """Tests that adding the same pair of gene as edge return the edge. 
- def test_edges_one(self, pangenome, gene_pair): - gene_1, gene_2 = gene_pair + :param pangenome: pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ + gene1, gene2 = gene_pair + edge = pangenome.add_edge(gene1, gene2) + assert pangenome.add_edge(gene1, gene2) == edge - edges = [] - for _ in range(randint(2, 5)): - edges.append(pangenome.add_edge(gene_1, gene_2)) + def test_add_edge_with_gene_not_isinstance_gene(self, pangenome): + """Tests that return an AssertionError if genes are not Gene objects - # always the same family couple - # = one edge, with several couple of genes - # I use set because edges are uniques, it is not a multigraph. - assert set(pangenome.edges) == set(edges) - assert pangenome.number_of_edges() == 1 + :param pangenome: pangenome object to test method + """ + with pytest.raises(AssertionError): + pangenome.add_edge("gene1", "gene2") - edge = list(pangenome.edges).pop() - assert edge.gene_pairs[0] == (gene_1, gene_2) + def test_number_of_edges(self, pangenome, gene_pair): + """Tests the number_of_edges method of the Pangenome class. - @pytest.fixture - def gene_pairs(self): - gene_pairs = set() - for _ in range(randint(5, 20)): - gene_id_1, gene_id_2 = choices(range(randint(2, 10)), k=2) - gene1, gene2 = self.make_gene_pair(gene_id_1, gene_id_2) - gene_pairs.add((gene1, gene2)) - yield gene_pairs - - def test_edges_many_rand(self, pangenome, gene_pairs): - edges = set() - for gene_pair in gene_pairs: - edges.add(pangenome.add_edge(*gene_pair)) - # I use set because edges are uniques, it is not a supergraph. - assert set(pangenome.edges) == edges + :param pangenome: pangenome object to test method + :param gene_pair: Pair of gene coding for the edge + """ + pangenome.add_edge(*gene_pair) + assert isinstance(pangenome.number_of_edges(), int) + assert pangenome.number_of_edges() == 1 class TestPangenomeBinary(TestPangenomeOrganism, TestPangenomeGeneFamilies): - def test_get_org_index(self, add_organisms, pangenome, orgs): + """This class tests methods in pangenome class associated to binary methods. + """ + #TODO Better test for this part + def test_get_org_index(self, add_organisms, pangenome): + """Tests the get_org_index function in pangenome class + + :param add_organisms: Add organisms to the pangenome + :param pangenome: Pass the pangenome object + """ orgs_index = pangenome.get_org_index() assert isinstance(orgs_index, dict) index_know = set() @@ -332,39 +526,121 @@ def test_get_org_index(self, add_organisms, pangenome, orgs): assert index not in index_know index_know.add(index) - def test_compute_family_bitarrays_without_index_already_computed(self, add_families, pangenome): + def test_compute_family_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function in Pangenome class + + :param add_families: Add families to the pangenome object + :param pangenome: Access the pangenome object + """ + org_idx = pangenome.get_org_index() + assert pangenome.compute_family_bitarrays() == org_idx + + def test_compute_family_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function of the Pangenome class. 
+ + :param add_families: Add families to the pangenome + :param pangenome: Test the compute_family_bitarrays function + """ pangenome.compute_family_bitarrays() for family in pangenome.gene_families: assert family.bitarray is not None - def test_compute_family_bitarrays_with_index_already_computed(self, add_families, pangenome): - org_idx = pangenome.get_org_index() - assert pangenome.compute_family_bitarrays() == org_idx + def test_get_fam_index(self, add_families, pangenome): + """Tests the get_org_index function in pangenome class + + :param add_families: Add families to the pangenome + :param pangenome: Pass the pangenome object + """ + fams_index = pangenome.get_fam_index() + assert isinstance(fams_index, dict) + index_know = set() + for fam, index in fams_index.items(): + assert isinstance(fam, GeneFamily) + assert isinstance(index, int) + assert index not in index_know + index_know.add(index) + + def test_compute_org_bitarrays_with_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function in Pangenome class + + :param add_families: Add families to the pangenome object + :param pangenome: Access the pangenome object + """ + fams_index = pangenome.get_fam_index() + assert pangenome.compute_org_bitarrays() == fams_index + + def test_compute_org_bitarrays_without_index_already_computed(self, add_organisms, add_families, pangenome): + """Tests the compute_family_bitarrays function of the Pangenome class. + + :param add_families: Add families to the pangenome + :param pangenome: Test the compute_family_bitarrays function + """ + pangenome.compute_org_bitarrays() + for organism in pangenome.organisms: + assert organism.bitarray is not None class TestPangenomeRGP(TestPangenome): + """This class tests methods in pangenome class associated to Region + """ def test_add_region(self, pangenome): + """Tests the add_region method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ rgp = Region(region_id="rgp") pangenome.add_region(rgp) assert len(pangenome._regionGetter) == 1 assert pangenome._regionGetter["rgp"] == rgp def test_add_region_already_in_pangenome(self, pangenome): + """Tests that adding region already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ rgp = Region(region_id="rgp") pangenome.add_region(rgp) with pytest.raises(KeyError): pangenome.add_region(rgp) + def test_add_region_with_isinstance_not_region(self, pangenome): + """Tests that adding an object with not Region type return an AssertionError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_region("rgp") + def test_get_region(self, pangenome): + """Tests the get_region method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ rgp = Region(region_id="rgp") pangenome.add_region(rgp) assert pangenome.get_region("rgp") == rgp def test_get_region_not_in_pangenome(self, pangenome): + """Tests get region not in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ with pytest.raises(KeyError): pangenome.get_region("rgp") + def test_get_region_with_isinstance_not_string(self, pangenome): + """Tests that getting a region with not string as identifier return an AssertionError. 
+ + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.get_region(15646) + def test_number_of_rgp(self, pangenome): + """Tests the number_of_rgp method in the Pangenome class. + + :param pangenome: Pass the pangenome object to the function + """ rgp = Region(region_id="rgp") pangenome.add_region(rgp) assert isinstance(pangenome.number_of_rgp(), int) @@ -372,33 +648,67 @@ def test_number_of_rgp(self, pangenome): class TestPangenomeSpot(TestPangenome): + """This class tests methods in pangenome class associated to Spot. + """ def test_add_spot(self, pangenome): + """Tests the add_spot method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ spot = Spot(spot_id=0) pangenome.add_spot(spot) assert len(pangenome._spotGetter) == 1 assert pangenome._spotGetter[0] == spot def test_add_spot_already_in_pangenome(self, pangenome): + """Tests that adding spot already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ spot = Spot(spot_id=0) pangenome.add_spot(spot) with pytest.raises(KeyError): pangenome.add_spot(spot) + def test_add_spot_with_isinstance_not_spot(self, pangenome): + """Tests that adding an object with not Spot type return an AssertionError. + + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_spot(4564) + def test_get_spot_with_int(self, pangenome): + """Tests get_spot method with integer in pangenome class + + :param pangenome: Access the pangenome object + """ spot = Spot(spot_id=0) pangenome.add_spot(spot) assert pangenome.get_spot(0) == spot def test_get_spot_with_str(self, pangenome): + """Tests get_spot method with string in pangenome class + + :param pangenome: Access the pangenome object + """ spot = Spot(spot_id=0) pangenome.add_spot(spot) assert pangenome.get_spot("spot_0") == spot def test_get_spot_not_in_pangenome(self, pangenome): + """Tests that getting spot not in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ with pytest.raises(KeyError): - pangenome.get_spot(0) + pangenome.get_spot(544654) def test_number_of_spots(self, pangenome): + """Tests number_of_spots methods in Pangenome class + + :param pangenome: Access the pangenome object + """ spot = Spot(spot_id=0) pangenome.add_spot(spot) assert isinstance(pangenome.number_of_spots(), int) @@ -406,41 +716,81 @@ def test_number_of_spots(self, pangenome): class TestPangenomeModule(TestPangenome): + """This class tests methods in pangenome class associated to Modules. + """ def test_add_module(self, pangenome): + """Tests the add_module method in the Pangenome class. + + :param pangenome: Access the pangenome object + """ module = Module(module_id=0) pangenome.add_module(module) assert len(pangenome._moduleGetter) == 1 assert pangenome._moduleGetter[0] == module def test_add_module_already_in_pangenome(self, pangenome): + """Tests that adding module already in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ module = Module(module_id=0) pangenome.add_module(module) with pytest.raises(KeyError): pangenome.add_module(module) + def test_add_module_with_isinstance_not_region(self, pangenome): + """Tests that adding an object with not Module type return an AssertionError. 
+ + :param pangenome: Access the pangenome object + """ + with pytest.raises(AssertionError): + pangenome.add_module("module") + def test_get_module_with_int(self, pangenome): + """Tests get_module method with integer in pangenome class + + :param pangenome: Access the pangenome object + """ module = Module(module_id=0) pangenome.add_module(module) assert pangenome.get_module(0) == module def test_get_module_with_str(self, pangenome): + """Tests get_module method with string in pangenome class + + :param pangenome: Access the pangenome object + """ module = Module(module_id=0) pangenome.add_module(module) assert pangenome.get_module("module_0") == module def test_get_module_not_in_pangenome(self, pangenome): + """Tests that getting module not in pangenome return a KeyError. + + :param pangenome: Access the pangenome object + """ with pytest.raises(KeyError): pangenome.get_module(0) def test_number_of_modules(self, pangenome): + """Tests number_of_modules methods in Pangenome class + + :param pangenome: Access the pangenome object + """ module = Module(module_id=0) pangenome.add_module(module) assert isinstance(pangenome.number_of_modules(), int) assert pangenome.number_of_modules() == 1 class TestPangenomeMetadata(TestPangenome): + """This class tests methods in pangenome class associated to Metadata. + """ @pytest.fixture def add_element_to_pangenome(self, pangenome): + """Adds a metadata element to each elements of pangenome + + :param pangenome: Access the pangenome object + """ metadata = Metadata(source="source", attribute="attr") family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") family.add_metadata(source=metadata.source, metadata=metadata) @@ -463,12 +813,37 @@ def add_element_to_pangenome(self, pangenome): module.add_metadata(source=metadata.source, metadata=metadata) pangenome.add_module(module) + def test_select_elem(self, add_element_to_pangenome, pangenome): + """Tests the select_elem method of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ + assert all(isinstance(elem, GeneFamily) for elem in set(pangenome.select_elem("families"))) + assert all(isinstance(elem, Organism) for elem in set(pangenome.select_elem("genomes"))) + assert all(isinstance(elem, Gene) for elem in set(pangenome.select_elem("genes"))) + assert all(isinstance(elem, Region) for elem in set(pangenome.select_elem("RGPs"))) + assert all(isinstance(elem, Spot) for elem in set(pangenome.select_elem("spots"))) + assert all(isinstance(elem, Module) for elem in set(pangenome.select_elem("modules"))) + with pytest.raises(KeyError): + pangenome.select_elem("error") + def test_metadata_sources(self, add_element_to_pangenome, pangenome): + """Tests the metadata_sources method of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: assert isinstance(pangenome.metadata_sources(metatype), set) assert pangenome.metadata_sources(metatype) == {'source'} def test_metadata(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator of the Pangenome class. 
+ + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: for metadata_gen in pangenome.metadata(metatype): for metadata in metadata_gen: @@ -476,6 +851,11 @@ def test_metadata(self, add_element_to_pangenome, pangenome): assert metadata.source == 'source' def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator filtered by metadata attribute of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, "spots": Spot, "modules": Module}.items(): for elem in pangenome.get_elem_by_metadata(metatype, attribute="attr"): @@ -485,6 +865,11 @@ def test_get_elem_by_metadata(self, add_element_to_pangenome, pangenome): assert metadata.source == 'source' def test_get_elem_by_sources(self, add_element_to_pangenome, pangenome): + """Tests the metadata generator filtered by source of the Pangenome class. + + :param add_element_to_pangenome: Add elements to the pangenome + :param pangenome: Access the pangenome object + """ for metatype, expected_type in {"families": GeneFamily, "genomes": Organism, "genes": Gene, "RGPs": Region, "spots": Spot, "modules": Module}.items(): for elem in pangenome.get_elem_by_sources(source='source', metatype=metatype): From eb7f95f9f8af705bd73d3f096fce585e9d962374 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 20 Jul 2023 08:48:57 +0200 Subject: [PATCH 20/75] Test for Features and Gene --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 17 ++- ppanggolin/annotate/synta.py | 7 +- ppanggolin/genome.py | 153 ++++++++++++++++++++---- tests/tests_genome.py | 201 ++++++++++++++++++++++++++++++++ 5 files changed, 339 insertions(+), 41 deletions(-) create mode 100644 tests/tests_genome.py diff --git a/VERSION b/VERSION index 2578cfec..ab749c23 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.137 +1.2.138 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 5a84afae..baa9b27a 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -143,14 +143,14 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p genetic_code = "" useful_info = False start = None - end = None + stop = None strand = None line = lines.pop() while not line.startswith("ORIGIN"): curr_type = line[5:21].strip() if curr_type != "": if useful_info: - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, end, strand, obj_type, + create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, len(contig.genes), gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 @@ -166,18 +166,17 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p useful_info = True if line[21:].startswith('complement('): strand = "-" - start, end = line[32:].strip().replace( - ')', '').split("..") + start, stop = line[32:].strip().replace(')', '').split("..") else: strand = "+" - start, end = line[21:].strip().split('..') - if '>' in start or '<' in start or '>' in end or '<' in end: + start, stop = line[21:].strip().split('..') + if '>' in start or '<' in start or '>' in stop or '<' in stop: if not pseudo: # pseudogene likely 
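(Editorial aside, not part of the patch: the hunk just above now casts GBFF coordinates to int after trimming the '<'/'>' partial-feature markers. A rough standalone paraphrase of that location handling, using a hypothetical helper name and assuming plain Python:)

    def parse_gbff_span(span: str, use_pseudo: bool = False):
        """Split a 'start..stop' span, trimming '<'/'>' only when pseudogenes are kept."""
        start, stop = span.split("..")
        if any(char in start + stop for char in "<>"):
            if not use_pseudo:
                return None  # likely a pseudogene: skipped, as in the code above
            start, stop = start.strip("<>"), stop.strip("<>")
        return int(start), int(stop)

    # parse_gbff_span("<1..>500", use_pseudo=True) -> (1, 500)
    # parse_gbff_span("<1..>500")                  -> None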
useful_info = False else: - start = start.replace('>', '').replace('<', '') - end = end.replace('>', '').replace('<', '') + start = int(start.replace('>', '').replace('<', '')) + stop = int(stop.replace('>', '').replace('<', '')) except ValueError: pass # don't know what to do with that, ignoring for now. @@ -210,7 +209,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # end of contig if useful_info: # saving the last element... - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, end, strand, obj_type, + create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, len(contig.genes), gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 42953743..4c504104 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -143,16 +143,11 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " c += 1 line_data = line.split() strand = line_data[9] - if strand == "-": - start = line_data[8] - stop = line_data[7] - else: - start, stop = map(int, (line_data[7], line_data[8])) + start, stop = map(int, (line_data[8], line_data[7]) if strand == "-" else (line_data[7], line_data[8])) gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(3)) gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA", product=" ".join(line_data[17:])) gene_objs[line_data[2]].add(gene) - return gene_objs diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 717bf515..c55f315b 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -16,10 +16,34 @@ class Feature(MetaFeatures): """This is a general class representation of Gene, RNA - :param identifier: Identifier of the feature given by PPanGGOLiN + Methods: + - fill_annotations(): fills general annotation for child classes. + - fill_parents(): associates the object to an organism and a contig. + - add_dna(): adds DNA sequence to the feature. + + Fields: + - ID: Identifier of the feature given by PPanGGOLiN. + - is_fragment: Boolean value indicating whether the feature is a fragment or not. + - type: Type of the feature. + - start: Start position of the feature. + - stop: Stop position of the feature. + - strand: Strand associated with the feature. + - product: Associated product of the feature. + - name: Name of the feature. + - local_identifier: Identifier provided by the original file. + - organism: Parent organism of the feature. + - contig: Parent contig of the feature. + - dna: DNA sequence of the feature. """ def __init__(self, identifier: str): + """Constructor Method + + :param identifier: identifier of the feature + """ + assert isinstance(identifier, str), "Expected identifier should be a string" + if identifier == '': + raise ValueError("Identifier should not be empty") super().__init__() self.ID = identifier self.is_fragment = False @@ -30,17 +54,51 @@ def __init__(self, identifier: str): self.product = None self.name = None self.local_identifier = None - self.organism = None - self.contig = None + self._organism = None + self._contig = None self.dna = None + @property + def organism(self) -> Organism: + """Return organism that Feature belongs to. 
+ + :return: Organism of the feature + """ + return self._organism + + @organism.setter + def organism(self, organism: Organism): + if not isinstance(organism, Organism): + raise TypeError(f'Expected type Organism, got {type(organism)}') + self._organism = organism + + @property + def contig(self) -> Contig: + """Return contig that Feature belongs to. + + :return: Contig of the feature + """ + return self._contig + + @contig.setter + def contig(self, contig: Contig): + if not isinstance(contig, Contig): + raise TypeError(f'Expected type Contig, got {type(contig)}') + self._contig = contig + @property def length(self) -> int: """Return gene length :return: gene length """ - return self.stop - self.start + if self.start is not None: + if self.stop is not None: + return self.stop - self.start + else: + raise ValueError("Stop is not known") + else: + raise ValueError("Start is not known") def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", product: str = "", local_identifier: str = ""): @@ -54,16 +112,35 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = :param name: Name of the feature :param product: Associated product :param local_identifier: Identifier provided by the original file + + :raises TypeError: If an attribute value does not correspond to the expected type + :raises ValueError: If strand is not '+' or '-' """ - self.start = start if isinstance(start, int) else int(start) - self.stop = stop if isinstance(stop, int) else int(stop) - self.type = gene_type + if not isinstance(start, int): + raise TypeError("Start should be int") + if not isinstance(stop, int): + raise TypeError("Stop should be int") + if not isinstance(strand, str): + raise TypeError("Strand should be str") + if not isinstance(gene_type, str): + raise TypeError("Gene type should be str") + if not isinstance(name, str): + raise TypeError("Name should be str") + if not isinstance(product, str): + raise TypeError("Product should be str") + if not isinstance(local_identifier, str): + raise TypeError("Local identifier should be str") + if strand not in ["+", "-"]: + raise ValueError("Strand should be + or -") + self.start = start + self.stop = stop self.strand = strand + self.type = gene_type self.product = product self.name = name self.local_identifier = local_identifier - def fill_parents(self, organism: Organism, contig: Contig): + def fill_parents(self, organism: Organism = None, contig: Contig = None): """ Associate object to an organism and a contig :param organism: Parent organism :param contig: Parent contig @@ -79,8 +156,7 @@ def add_dna(self, dna): :raise TypeError: DNA sequence must be a string """ - if not isinstance(dna, str): - raise TypeError(f"'str' type was expected but you provided a '{type(dna)}' type object") + assert isinstance(dna, str), f"'str' type was expected but you provided a '{type(dna)}' type object" self.dna = dna @@ -97,41 +173,66 @@ def __init__(self, rna_id: str): class Gene(Feature): """Save gene from genome as an Object with some information for Pangenome - :param gene_id: Identifier of the gene + Methods: + - fill_annotations(): fills general annotation for the gene object and adds additional attributes such as + position and genetic code. + - add_protein(): adds the protein sequence corresponding to the translated gene to the object. + + Fields: + - position: the position of the gene in the genome. + - family: the family that the gene belongs to. + - RGP: a set of Regions of Genomic Plasticity (RGPs) associated with the gene. 
+ - genetic_code: the genetic code associated with the gene. + - protein: the protein sequence corresponding to the translated gene. """ def __init__(self, gene_id: str): + """Constructor method + + :param gene_id: Identifier of the gene + """ super().__init__(gene_id) self.position = None - self.family = None - self.RGP = set() + self._family = None + self.RGP = set() # TODO check if a RGP is unique to a Gene. In that case change for setter/getter with none as default self.genetic_code = None self.protein = None def __str__(self) -> str: return str(self.ID) - def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", - product: str = "", local_identifier: str = "", position: int = None, genetic_code: int = 11): + @property + def family(self): + """Return GeneFamily that Gene belongs to. + + :return: Gene family of the gene """ - Fill Gene annotation provide by PPanGGOLiN dependencies + return self._family + + @family.setter + def family(self, family): + from ppanggolin.geneFamily import GeneFamily + if not isinstance(family, GeneFamily): + raise TypeError(f'Expected type GeneFamily, got {type(family)}') + self._family = family + + def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwargs): + """Fill Gene annotation provided by PPanGGOLiN dependencies - :param start: Start position - :param stop: Stop position - :param strand: associated strand - :param gene_type: Type of the gene - :param name: Gene name - :param product: Associated product - :param local_identifier: Identifier provided by the original file :param position: Gene localisation in genome :param genetic_code: Genetic code associated to gene + :param kwargs: look at Feature.fill_annotations methods """ - super().fill_annotations(start, stop, strand, gene_type, name, product, local_identifier) + super().fill_annotations(**kwargs) + if position is not None and not isinstance(position, int): + raise TypeError("position should be an integer") + if not isinstance(genetic_code, int): + raise TypeError("Genetic code should be an integer") self.position = position self.genetic_code = genetic_code def add_protein(self, protein: str): - """ Add protein sequence corresponding to translated gene + """Add protein sequence corresponding to translated gene :param protein: Protein sequence @@ -149,6 +250,7 @@ class Contig: """ Describe the contig content and some information :param name: Name of the contig :param is_circular: save if the contig is circular """ + def __init__(self, name: str, is_circular: bool = False): self.name = name self.is_circular = is_circular @@ -211,6 +313,7 @@ class Organism(MetaFeatures): :param name: Name of the genome """ + def __init__(self, name: str): super().__init__() self.name = name diff --git a/tests/tests_genome.py b/tests/tests_genome.py new file mode 100644 index 00000000..1fb105c9 --- /dev/null +++ b/tests/tests_genome.py @@ -0,0 +1,201 @@ +#!
/usr/bin/env python3 + +import pytest +from random import choices, randint, sample +from typing import Generator, Set, Tuple, Union +from pathlib import Path + +from ppanggolin.genome import Feature, Gene, RNA, Contig, Organism +from ppanggolin.geneFamily import GeneFamily + + +class TestFeature: + """Tests Feature class + """ + + def test_creation(self): + """Tests that 'Feature' object is created successfully with the given identifier + """ + feature = Feature('test_id') + assert feature.ID == 'test_id' + assert not feature.is_fragment + assert feature.type == '' + assert feature.start is None + assert feature.stop is None + assert feature.strand is None + assert feature.product is None + assert feature.name is None + assert feature.local_identifier is None + assert feature.organism is None + assert feature.contig is None + assert feature.dna is None + + def test_create_feature_assertion_error(self): + """Tests that a Feature object cannot be created with a non-string type identifier""" + with pytest.raises(AssertionError): + Feature(4) + + def test_create_feature_empty_identifier(self): + """Tests that a Feature object cannot be created with an empty identifier""" + with pytest.raises(ValueError): + Feature('') + + def test_fill_annotations(self): + """Tests that 'fill_annotations' method fills the attributes correctly + """ + feature = Feature('test_id') + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + assert feature.start == 1 + assert feature.stop == 10 + assert feature.type == 'gene_type' + assert feature.strand == '+' + assert feature.product == 'product' + assert feature.name == 'name' + assert feature.local_identifier == 'local_id' + + def test_set_organism_valid_type(self): + """Tests that organism setter sets organism with valid type + """ + feature = Feature('test') + organism = Organism('organism') + feature.organism = organism + assert feature.organism == organism + + def test_set_organism_invalid_type(self): + """Tests that organism setter return TypeError if sets organism with invalid type + """ + feature = Feature('test') + with pytest.raises(TypeError): + feature.organism = 4 + + def test_set_contig_valid_type(self): + """Tests that contig setter sets contig with valid type + """ + feature = Feature('test') + contig = Contig('contig') + feature.contig = contig + assert feature.contig == contig + + def test_set_contig_invalid_type(self): + """Tests that contig setter return TypeError if sets contig with invalid type + """ + feature = Feature('test') + with pytest.raises(TypeError): + feature.contig = 4 + + def test_fill_parents(self): + """Tests that 'fill_parents' method associates the object with the given organism and contig + """ + organism = Organism('org_id') + contig = Contig('contig_name') + feature = Feature('test_id') + feature.fill_parents(organism, contig) + assert feature.organism == organism + assert feature.contig == contig + + def test_add_dna(self): + """Tests that 'add_dna' method adds the DNA sequence to the object successfully + """ + feature = Feature('test_id') + feature.add_dna('ATCG') + assert feature.dna == 'ATCG' + + def test_fill_annotations_type_error(self): + """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with correct type + """ + feature = Feature('test_id') + with pytest.raises(TypeError): + feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, "10", '+', 'gene_type', 'name', 
'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, 4, 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, "+", 4, 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 4, 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 4, 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 4) + + def test_fill_annotations_value_error(self): + """Tests that 'fill_annotations' method raises a ValueError if strand is not '+' or '-' + """ + feature = Feature('test_id') + with pytest.raises(ValueError): + feature.fill_annotations(1, 10, '4', 'gene_type', 'name', 'product', 'local_id') + + def test_add_dna_type_error(self): + """Tests that 'add_dna' method raises an AssertionError if the DNA sequence is not a string + """ + feature = Feature('test_id') + with pytest.raises(AssertionError): + feature.add_dna(123) + + def test_length_start_or_stop_are_not_known(self): + """Tests that length property raises ValueError when start or stop is not known + """ + with pytest.raises(ValueError): + feature = Feature('test') + feature.stop = 10 + _ = feature.length + with pytest.raises(ValueError): + feature = Feature('test') + feature.start = 1 + _ = feature.length + + +class TestGene: + """Tests Gene class + """ + def test_create_gene_object(self): + """Tests that a Gene object can be created with a valid gene_id + """ + gene = Gene('gene1') + assert gene.ID == 'gene1' + + def test_fill_annotations(self): + """Tests that Gene annotations can be filled with valid parameters + """ + gene = Gene('gene1') + gene.fill_annotations(position=10, genetic_code=4) + assert gene.position == 10 + assert gene.genetic_code == 4 + + # Tests that Gene annotations cannot be filled with invalid parameters + def test_fill_annotations_invalid_parameters(self): + gene = Gene('gene1') + with pytest.raises(TypeError): + gene.fill_annotations(position='10', genetic_code=4) + with pytest.raises(TypeError): + gene.fill_annotations(position=10, genetic_code="4") + + def test_add_protein(self): + """Tests that a protein sequence can be added to a Gene object + """ + gene = Gene('gene1') + gene.add_protein('MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA') + assert gene.protein == 'MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA' + + def test_add_protein_non_string(self): + """Tests that a non-string protein sequence cannot be added to a Gene object + """ + gene = Gene('gene1') + with pytest.raises(TypeError): + gene.add_protein(123) + + def test_set_family_valid_type(self): + """Tests that family setter sets family with valid type + """ + gene = Gene('gene1') + family = GeneFamily(0, 'family') + gene.family = family + assert gene.family == family + + def test_set_family_invalid_type(self): + """Tests that family setter raises TypeError if set with an invalid type + """ + gene = Gene('gene1') + with pytest.raises(TypeError): + gene.family = 4 From 36ed9be2630fcce86c9455df6dced3ae7b233f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 20 Jul 2023 10:28:34 +0200 Subject: [PATCH 21/75] Change that RGP is unique and not set in gene --- VERSION | 2 +- ppanggolin/formats/writeFlat.py | 4 +--- ppanggolin/genome.py | 19 ++++++++++++++++++- ppanggolin/region.py | 2 +- tests/tests_genome.py | 22 
+++++++++++++++++++--- 5 files changed, 40 insertions(+), 9 deletions(-) diff --git a/VERSION b/VERSION index ab749c23..13735242 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.138 +1.2.139 diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 2e5cc34b..ad454d5f 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -556,9 +556,7 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): contig.name, gene.start, gene.stop, gene.strand, gene.family.name, len(gene.family.get_genes_per_org(org)), gene.family.named_partition, nb_pers, nb_shell, nb_cloud] - if needRegions: - if len(gene.RGP) > 0: - rgp = ','.join([str(region.name) for region in gene.RGP]) + if needRegions and gene.RGP is not None: row.append(rgp) if needSpots: if len(gene.family.spot) > 0: diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index c55f315b..9b08823e 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -194,7 +194,7 @@ def __init__(self, gene_id: str): super().__init__(gene_id) self.position = None self._family = None - self.RGP = set() # TODO check if a RGP is unique to a Gene. In that case change for setter/getter with none as default + self._RGP = None self.genetic_code = None self.protein = None @@ -206,6 +206,7 @@ def family(self): """Return GeneFamily that Gene belongs to. :return: Gene family of the gene + :rtype: GeneFamily """ return self._family @@ -216,6 +217,22 @@ def family(self, family): raise TypeError(f'Expected type GeneFamily, got {type(family)}') self._family = family + @property + def RGP(self): + """Return the RGP that gene belongs to + + :return: RGP of the Gene + :rtype: Region + """ + return self._RGP + + @RGP.setter + def RGP(self, RGP): + from ppanggolin.region import Region + if not isinstance(RGP, Region): + raise TypeError(f'Expected type Region, got {type(RGP)}') + self._RGP = RGP + def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwargs): """Fill Gene annotation provided by PPanGGOLiN dependencies diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 8a08e92f..25b9bbfb 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -66,7 +66,7 @@ def append(self, gene: Gene): if isinstance(gene, Gene): self.genes.append(gene) - gene.RGP.add(self) + gene.RGP = self else: raise TypeError(f"Unexpected class / type for {type(gene)} " f"when adding it to a region of genomic plasticity") diff --git a/tests/tests_genome.py b/tests/tests_genome.py index 1fb105c9..660197a9 100644 --- a/tests/tests_genome.py +++ b/tests/tests_genome.py @@ -7,6 +7,7 @@ from ppanggolin.genome import Feature, Gene, RNA, Contig, Organism from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region class TestFeature: @@ -159,7 +160,7 @@ def test_fill_annotations(self): """Tests that Gene annotations can be filled with valid parameters """ gene = Gene('gene1') - gene.fill_annotations(position=10, genetic_code=4) + gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code=4) assert gene.position == 10 assert gene.genetic_code == 4 @@ -167,9 +168,9 @@ def test_fill_annotations(self): def test_fill_annotations_invalid_parameters(self): gene = Gene('gene1') with pytest.raises(TypeError): - gene.fill_annotations(position='10', genetic_code=4) + gene.fill_annotations(start=1, stop=10, strand='+', position='10', genetic_code=4) with pytest.raises(TypeError): - gene.fill_annotations(position=10, genetic_code="4") + gene.fill_annotations(start=1, 
stop=10, strand='+', position=10, genetic_code="4") def test_add_protein(self): """Tests that a protein sequence can be added to a Gene object @@ -199,3 +200,18 @@ def test_set_family_invalid_type(self): gene = Gene('gene1') with pytest.raises(TypeError): gene.family = 4 + + def test_set_rgp_valid_type(self): + """Tests that RGP setter sets family with valid type + """ + gene = Gene('gene1') + region = Region(0) + gene.RGP = region + assert gene.RGP == region + + def test_set_rgp_invalid_type(self): + """Tests that family setter return TypeError if sets family with invalid type + """ + gene = Gene('gene1') + with pytest.raises(TypeError): + gene.RGP = 4 \ No newline at end of file From 4e16c8bdf09cdb0d25fddedf77a4ef46cbbf634d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 20 Jul 2023 10:29:21 +0200 Subject: [PATCH 22/75] Fix writeFlat after RGP gene modif --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 8 ++++---- ppanggolin/formats/writeFlat.py | 5 ++--- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/VERSION b/VERSION index 13735242..c83211c8 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.139 +1.2.140 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 9dbf4083..11366622 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -173,14 +173,14 @@ def read_genedata(h5f: tables.File) -> dict: table = h5f.root.annotations.genedata genedata_id2genedata = {} for row in read_chunks(table, chunk=20000): - genedata = Genedata(start=row["start"], - stop=row["stop"], + genedata = Genedata(start=int(row["start"]), + stop=int(row["stop"]), strand=row["strand"].decode(), gene_type=row["gene_type"].decode(), - position=row["position"], + position=int(row["position"]), name=row["name"].decode(), product=row["product"].decode(), - genetic_code=row["genetic_code"]) + genetic_code=int(row["genetic_code"])) genedata_id = row["genedata_id"] genedata_id2genedata[genedata_id] = genedata return genedata_id2genedata diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index ad454d5f..4c0b7e8e 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -543,7 +543,6 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): nb_shell = 0 nb_cloud = 0 modules = None - rgp = None spot = None for neighbor in gene.family.neighbors: if neighbor.named_partition == "persistent": @@ -556,8 +555,8 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): contig.name, gene.start, gene.stop, gene.strand, gene.family.name, len(gene.family.get_genes_per_org(org)), gene.family.named_partition, nb_pers, nb_shell, nb_cloud] - if needRegions and gene.RGP is not None: - row.append(rgp) + if needRegions: + row.append(gene.RGP.name if gene.RGP is not None else gene.RGP) if needSpots: if len(gene.family.spot) > 0: spot = ','.join([str(s.ID) for s in gene.family.spot]) From 104c71ad44b1890bbc3d964f0c0ca5004bd33efb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 20 Jul 2023 15:42:51 +0200 Subject: [PATCH 23/75] refactor contig class --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/annotate/annotate.py | 15 ++-- ppanggolin/annotate/synta.py | 2 +- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/figures/draw_spot.py | 4 +- ppanggolin/formats/readBinaries.py | 2 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/genome.py | 106 +++++++++++++++--------- 
ppanggolin/graph/makeGraph.py | 4 +- ppanggolin/mod/module.py | 6 +- ppanggolin/region.py | 20 ++--- 12 files changed, 97 insertions(+), 70 deletions(-) diff --git a/VERSION b/VERSION index c83211c8..d07e0381 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.140 +1.2.141 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index d2ff6098..eef66050 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -195,7 +195,7 @@ def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: in min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]: org_regions = set() for contig in organism.contigs: - if len(contig.genes) != 0: # some contigs have no coding genes... + if len(contig) != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty, diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index baa9b27a..374a2393 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -76,7 +76,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, position=position, product=product, local_identifier=gene_id, genetic_code=genetic_code) - contig.add_gene(new_gene) + contig[new_gene.start] = new_gene else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, @@ -151,7 +151,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p if curr_type != "": if useful_info: create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - len(contig.genes), gene_name, product, genetic_code, protein_id) + len(contig), gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -175,8 +175,9 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # pseudogene likely useful_info = False else: - start = int(start.replace('>', '').replace('<', '')) - stop = int(stop.replace('>', '').replace('<', '')) + start = start.replace('>', '').replace('<', '') + stop = stop.replace('>', '').replace('<', '') + start, stop = map(int, [start, stop]) except ValueError: pass # don't know what to do with that, ignoring for now. @@ -210,7 +211,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # end of contig if useful_info: # saving the last element... create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - len(contig.genes), gene_name, product, genetic_code, protein_id) + len(contig), gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -338,10 +339,10 @@ def get_id_attribute(attributes_dict: dict) -> str: # here contig is filled in order, so position is the number of genes already stored in the contig. 
gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, - position=len(contig.genes), product=product, local_identifier=gene_id, + position=len(contig), product=product, local_identifier=gene_id, genetic_code=genetic_code) gene.fill_parents(org, contig) - contig.add_gene(gene) + contig[gene.start] = gene gene_counter += 1 elif "RNA" in fields_gff[gff_type]: rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4)) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 4c504104..91b24c8d 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -328,7 +328,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): - contig.add_gene(gene) + contig[gene.start] = gene elif isinstance(gene, RNA): contig.add_rna(gene) return org diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index eb39ffe6..6a25fe13 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -104,7 +104,7 @@ def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = F g = nx.Graph() for family in tqdm(families.values(), unit="families", disable=disable_bar): for gene in family.genes: - contig = gene.contig.genes + contig = list(gene.contig.genes) pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t) if in_context_left or in_context_right: for env_gene in contig[pos_left:pos_right + 1]: diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index e0403bcb..3764c083 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -176,7 +176,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = else: minpos = rgp.start_gene.position maxpos = rgp.stop_gene.position - gene_list = rgp.contig.genes[minpos:maxpos + 1] + gene_list = rgp.contig.get_genes(minpos, maxpos + 1) prev = None for gene in gene_list: g.add_node(gene.family.name, partition=gene.family.named_partition) @@ -592,7 +592,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: borders = rgp.get_bordering_genes(set_size, multigenics) minpos = min([gene.position for border in borders for gene in border]) maxpos = max([gene.position for border in borders for gene in border]) - gene_list = rgp.contig.genes[minpos:maxpos + 1] + gene_list = rgp.contig.get_genes(minpos, maxpos + 1) minstart = min([gene.start for border in borders for gene in border]) maxstop = max([gene.stop for border in borders for gene in border]) rnas_toadd = set() diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 11366622..7757a1e9 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -267,7 +267,7 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul gene.is_fragment = row["is_fragment"] gene.fill_parents(org, contig) if gene_type == "CDS": - contig.add_gene(gene) + contig[gene.start] = gene elif "RNA" in gene_type: contig.add_rna(gene) else: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index cf4277d7..ea695459 100644 --- a/ppanggolin/formats/writeBinaries.py +++ 
b/ppanggolin/formats/writeBinaries.py @@ -167,7 +167,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_row = gene_table.row for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): for contig in org.contigs: - for gene in contig.genes + list(contig.RNAs): + for gene in list(contig.genes) + list(contig.RNAs): gene_row["organism"] = org.name gene_row["contig/name"] = contig.name gene_row["contig/is_circular"] = contig.is_circular diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 9b08823e..0d0deff1 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -5,7 +5,7 @@ # installed libraries import logging -from typing import Dict, Iterator +from typing import Dict, Iterator, Generator import gmpy2 @@ -35,7 +35,6 @@ class Feature(MetaFeatures): - contig: Parent contig of the feature. - dna: DNA sequence of the feature. """ - def __init__(self, identifier: str): """Constructor Method @@ -185,7 +184,6 @@ class Gene(Feature): - genetic_code: the genetic code associated with the gene. - protein: the protein sequence corresponding to the translated gene. """ - def __init__(self, gene_id: str): """Constructor method @@ -194,7 +192,7 @@ def __init__(self, gene_id: str): super().__init__(gene_id) self.position = None self._family = None - self._RGP = None + self._RGP = None self.genetic_code = None self.protein = None @@ -263,40 +261,82 @@ def add_protein(self, protein: str): class Contig: """ Describe the contig content and some information + Methods: + - genes(self) -> list: Returns a list of gene objects present in the contig. + - add_rna(self, rna: RNA): Adds an RNA object to the contig. + - add_gene(self, gene: Gene): Adds a gene object to the contig. - :param name: Name of the contig - :param is_circular: save if the contig is circular + Fields: + - name: Name of the contig. + - is_circular: Boolean value indicating whether the contig is circular or not. + - RNAs: Set of RNA annotations present in the contig. """ def __init__(self, name: str, is_circular: bool = False): + """Constructor method + :param name: Name of the contig + :param is_circular: save if the contig is circular + """ self.name = name self.is_circular = is_circular - self.RNAs = set() # saving the rna annotations. We're not using them in the vast majority of cases. - self._genes_start = {} + self._rnaGetter = {} # saving the rna annotations. We're not using them in the vast majority of cases. + self._genesGetter = {} self._genes_position = [] + def __str__(self) -> str: + return self.name + + def __len__(self): + return len(self._genes_position) + + def __setitem__(self, start: int, gene: Gene): + """ Set gene to Contig + + :param gene: Gene object to add + """ + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + if start in self._genesGetter: + raise ValueError(f"Gene with start position {start} already exists in the contig") + if gene.position is None: + raise TypeError("The gene object needs to have its position in the contig filled before adding it") + # adding empty values. They should be filled by the end of the parsing. + # Doing this because genes are not always met in order. 
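+ # A minimal usage sketch of the indexing API introduced by this refactor (genes are
+ # registered under their start coordinate and retrieved by their position on the
+ # contig), assuming a Gene whose start and position were already filled:
+ #
+ #   >>> contig = Contig("contig_A")
+ #   >>> gene = Gene("gene_A")
+ #   >>> gene.fill_annotations(start=1, stop=10, strand="+", position=0)
+ #   >>> contig[gene.start] = gene        # add the gene, keyed by its start coordinate
+ #   >>> contig[0]                        # get the gene back by its position
+ #   >>> contig.get_genes(0, 1)           # slice of genes between two positions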
+ self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1)) + self._genes_position[gene.position] = gene + self._genesGetter[gene.start] = gene + + # retrieve gene by start position + def __getitem__(self, index: int) -> Gene: + if not isinstance(index, int): + raise TypeError(f"Expected type is int, given type was '{type(index)}'") + return self._genes_position[index] + + def get_genes(self, begin: int, end: int): + if not isinstance(begin, int) or not isinstance(end, int): + raise TypeError(f"Expected type is int, given type was '{type(begin)}, {type(end)}'") + if end < begin: + raise ValueError("End position is lower than begin position") + else: + return self._genes_position[begin: end] + @property def genes(self) -> list: """ Give the gene content of the contig :return: list of gene in contig """ - return self._genes_position - - def __str__(self) -> str: - return self.name + for gene in self._genes_position: + yield gene - def __iter__(self): - return iter(self.genes) + @property + def RNAs(self) -> Generator[RNA, None, None]: + """Return all the RNA in the contig - # retrieve gene by start position - def __getitem__(self, index: int): - gene = self._genes_start.get(index) - if not gene: - if not isinstance(index, int): - raise TypeError(f"Expected type is int, given type was '{type(index)}'") - raise IndexError(f"No gene start at the given position {index}") - return gene + :return: Generator of RNA + """ + for rna in self._rnaGetter.values(): + yield rna def add_rna(self, rna: RNA): """ Add RNA to contig @@ -305,23 +345,9 @@ def add_rna(self, rna: RNA): """ if not isinstance(rna, RNA): raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") - self.RNAs.add(rna) - - def add_gene(self, gene: Gene): - """ Add gene to Contig - - :param gene: Gene object to add - """ - if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") - if gene.position is None: - raise TypeError("The gene object needs to have its position in the contig filled before adding it") - while len(self._genes_position) <= gene.position: - # adding empty values. They should be filled by the end of the parsing. - # Doing this because genes are not always met in order. - self._genes_position.append(None) - self._genes_position[gene.position] = gene - self._genes_start[gene.start] = gene + if rna.ID in self._rnaGetter: + raise KeyError(f"RNA with the id: {rna.ID} already exist in contig {self.name}") + self._rnaGetter[rna.ID] = rna class Organism(MetaFeatures): @@ -357,7 +383,7 @@ def number_of_genes(self) -> int: :return: Number of gene in organism """ - return sum([len(list(contig.genes)) for contig in self.contigs]) + return sum([len(contig) for contig in self.contigs]) @property def contigs(self) -> dict.values: diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index e05f260b..54b9a4f7 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -103,9 +103,9 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, raise AttributeError("a Gene does not have a GeneFamily object associated") except Exception: raise Exception("Unexpected error. 
Please report on our github.") - if prev is not None and contig.is_circular and len(contig.genes) > 0: + if prev is not None and contig.is_circular and len(contig) > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added - pangenome.add_edge(contig.genes[0], prev) + pangenome.add_edge(contig[0], prev) logging.getLogger("PPanGGOLiN").info("Done making the neighbors graph.") pangenome.status["neighborsGraph"] = "Computed" diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index baf4955f..2f6431b4 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -44,12 +44,12 @@ def compute_mod_graph(organisms: list, t: int = 1, disable_bar: bool = False): g = nx.Graph() for org in tqdm(organisms, unit="genome", disable=disable_bar): for contig in org.contigs: - if len(contig.genes) > 0: - start_gene = contig.genes[0] + if len(contig) > 0: + start_gene = contig[0] g.add_node(start_gene.family) add_gene(g.nodes[start_gene.family], start_gene, fam_split=False) for i, gene in enumerate(contig.genes): - for j, a_gene in enumerate(contig.genes[i + 1:i + t + 2], start=i + 1): + for j, a_gene in enumerate(contig.get_genes(i + 1, i + t + 2), start=i + 1): g.add_edge(gene.family, a_gene.family) edge = g[gene.family][a_gene.family] add_gene(edge, gene) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 25b9bbfb..7c91c5ed 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -133,7 +133,7 @@ def is_whole_contig(self) -> bool: :return: True if whole contig """ - if self.start_gene.position == 0 and self.stop_gene.position == len(self.contig.genes) - 1: + if self.start_gene.position == 0 and self.stop_gene.position == len(self.contig) - 1: return True return False @@ -146,7 +146,7 @@ def is_contig_border(self) -> bool: if len(self.genes) == 0: raise Exception("Your region has no genes. 
Something wrong happenned.") if (self.start_gene.position == 0 and not self.contig.is_circular) or \ - (self.stop_gene.position == len(self.contig.genes) - 1 and not self.contig.is_circular): + (self.stop_gene.position == len(self.contig) - 1 and not self.contig.is_circular): return True return False @@ -176,30 +176,30 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: curr_gene = None if pos == 0: if self.contig.is_circular: - curr_gene = self.contig.genes[-1] + curr_gene = self.contig[pos - 1] else: - curr_gene = self.contig.genes[pos - 1] + curr_gene = self.contig[pos - 1] if curr_gene is not None and curr_gene.family not in multigenics and \ curr_gene.family.named_partition == "persistent": border[0].append(curr_gene) pos -= 1 if pos == -1 and self.contig.is_circular: - pos = len(self.contig.genes) + pos = len(self.contig) if pos == init: break # looped around the contig pos = self.stop_gene.position init = pos - while len(border[1]) < n and (pos != len(self.contig.genes) - 1 or self.contig.is_circular): + while len(border[1]) < n and (pos != len(self.contig) - 1 or self.contig.is_circular): curr_gene = None - if pos == len(self.contig.genes) - 1: + if pos == len(self.contig) - 1: if self.contig.is_circular: - curr_gene = self.contig.genes[0] + curr_gene = self.contig[0] else: - curr_gene = self.contig.genes[pos + 1] + curr_gene = self.contig[pos + 1] if curr_gene is not None and curr_gene.family not in multigenics: border[1].append(curr_gene) pos += 1 - if pos == len(self.contig.genes) and self.contig.is_circular: + if pos == len(self.contig) and self.contig.is_circular: pos = -1 if pos == init: break # looped around the contig From 97547b219ec54b02545e80dccebaf4bc190d41e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 20 Jul 2023 16:20:05 +0200 Subject: [PATCH 24/75] Tests contig class --- VERSION | 2 +- ppanggolin/genome.py | 11 ++-- tests/tests_genome.py | 125 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 131 insertions(+), 7 deletions(-) diff --git a/VERSION b/VERSION index d07e0381..46dc9183 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.141 +1.2.142 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 0d0deff1..f73dbd6b 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -279,7 +279,7 @@ def __init__(self, name: str, is_circular: bool = False): """ self.name = name self.is_circular = is_circular - self._rnaGetter = {} # saving the rna annotations. We're not using them in the vast majority of cases. + self._rnaGetter = set() # saving the rna annotations. We're not using them in the vast majority of cases. self._genesGetter = {} self._genes_position = [] @@ -292,6 +292,7 @@ def __len__(self): def __setitem__(self, start: int, gene: Gene): """ Set gene to Contig + :param start: Start position of the gene :param gene: Gene object to add """ if not isinstance(gene, Gene): @@ -299,7 +300,7 @@ def __setitem__(self, start: int, gene: Gene): if start in self._genesGetter: raise ValueError(f"Gene with start position {start} already exists in the contig") if gene.position is None: - raise TypeError("The gene object needs to have its position in the contig filled before adding it") + raise AttributeError("The gene object needs to have its position in the contig filled before adding it") # adding empty values. They should be filled by the end of the parsing. # Doing this because genes are not always met in order. 
self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1)) @@ -335,7 +336,7 @@ def RNAs(self) -> Generator[RNA, None, None]: :return: Generator of RNA """ - for rna in self._rnaGetter.values(): + for rna in self._rnaGetter: yield rna def add_rna(self, rna: RNA): @@ -345,9 +346,9 @@ def add_rna(self, rna: RNA): """ if not isinstance(rna, RNA): raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") - if rna.ID in self._rnaGetter: + if rna in self._rnaGetter: raise KeyError(f"RNA with the id: {rna.ID} already exist in contig {self.name}") - self._rnaGetter[rna.ID] = rna + self._rnaGetter.add(rna) class Organism(MetaFeatures): diff --git a/tests/tests_genome.py b/tests/tests_genome.py index 660197a9..f505da98 100644 --- a/tests/tests_genome.py +++ b/tests/tests_genome.py @@ -214,4 +214,127 @@ def test_set_rgp_invalid_type(self): """ gene = Gene('gene1') with pytest.raises(TypeError): - gene.RGP = 4 \ No newline at end of file + gene.RGP = 4 + + +class TestContig: + """Tests Contig class + """ + @pytest.fixture + def gene(self) -> Generator[Gene, None, None]: + gene = Gene('test_gene') + gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + yield gene + + @pytest.fixture + def genes(self): + gene1 = Gene('test_gene1') + gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + gene2 = Gene('test_gene2') + gene2.fill_annotations(start=11, stop=20, strand='+', position=1, genetic_code=4) + gene3 = Gene('test_gene3') + gene3.fill_annotations(start=21, stop=30, strand='+', position=2, genetic_code=4) + yield {gene1, gene2, gene3} + + def test_add_gene(self, gene): + """Tests that a gene can be added to the contig + """ + contig = Contig('test_contig') + contig[gene.start] = gene + assert len(contig._genesGetter) == 1 + assert len(contig._genes_position) == 1 + assert contig._genesGetter[gene.start] == gene + assert contig._genes_position[0] == gene + + def test_add_rna(self): + """Tests that an RNA can be added to the contig + """ + contig = Contig('test_contig') + rna = RNA('test_rna') + contig.add_rna(rna) + assert list(contig.RNAs) == [rna] + + def test_get_length(self, gene): + """Tests that the length of the contig can be retrieved + """ + contig = Contig('test_contig') + contig[gene.start] = gene + assert len(contig) == 1 + + def test_get_gene(self, gene): + """Tests that a gene can be retrieved by its position + """ + contig = Contig('test_contig') + contig[gene.start] = gene + assert contig[0] == gene + + def test_get_genes(self, genes): + """Tests that a list of genes within a range can be retrieved + """ + contig = Contig('test_contig') + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + assert set(contig.get_genes(0, 3)) == genes + + def test_iterate_over_genes(self, genes): + """Tests that all genes in the contig can be iterated over + """ + contig = Contig('test_contig') + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position) + + def test_add_gene_with_existing_start_position(self, gene): + """Tests that a gene cannot be added with a start position that already exists + """ + contig = Contig('test_contig') + contig[gene.start] = gene + with pytest.raises(ValueError): + contig[gene.start] = gene + + def test_add_gene_without_position(self): + """Tests that a 
gene cannot be added without a position + """ + contig = Contig('test_contig') + gene = Gene('test_gene') + gene.fill_annotations(start=1, stop=10, strand='+', genetic_code=4) + with pytest.raises(AttributeError): + contig[gene.start] = gene + + def test_get_gene_with_non_integer_index(self, gene): + """Tests that a gene cannot be retrieved with an index that is not an integer + """ + contig = Contig('test_contig') + contig[gene.start] = gene + with pytest.raises(TypeError): + contig['a'] + + def test_get_genes_with_non_integer_begin_and_end_positions_edge_case(self, genes): + """Tests that genes cannot be retrieved with non-integer begin and end positions + """ + contig = Contig('test_contig') + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + with pytest.raises(TypeError): + contig.get_genes('a', 4) + with pytest.raises(TypeError): + contig.get_genes(5, 'b') + with pytest.raises(TypeError): + contig.get_genes('a', 'b') + + def test_get_genes_with_end_position_lower_than_begin_position_edge_case(self, genes): + """Tests that genes cannot be retrieved with end position lower than begin position + """ + contig = Contig('test_contig') + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + with pytest.raises(ValueError): + contig.get_genes(2, 0) From 2e8272c42256a79e6536d41117783e74f7444dc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 21 Jul 2023 17:13:40 +0200 Subject: [PATCH 25/75] Refactor and test of Genomes classes --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 43 ++- ppanggolin/annotate/synta.py | 18 +- ppanggolin/formats/readBinaries.py | 10 +- ppanggolin/formats/writeFlat.py | 16 +- ppanggolin/genome.py | 204 +++++++--- tests/genome/test_Feature.py | 70 ---- tests/tests_Genome.py | 575 +++++++++++++++++++++++++++++ tests/tests_genome.py | 340 ----------------- 9 files changed, 773 insertions(+), 505 deletions(-) delete mode 100644 tests/genome/test_Feature.py create mode 100644 tests/tests_Genome.py delete mode 100644 tests/tests_genome.py diff --git a/VERSION b/VERSION index 46dc9183..3bb592c1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.142 +1.2.143 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 374a2393..8c822bf6 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -121,18 +121,22 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p if line.startswith('VERSION'): contig_id = line[12:].strip() if contig_id != "": - if contig_id in circular_contigs: - is_circ = True - contig = org.get_contig(contig_id, is_circ) + try: + contig = org.get_contig(contig_id) + except KeyError: + contig = Contig(contig_id, True if contig_id in circular_contigs else False) + org.add_contig(contig) set_contig = True line = lines.pop() if not set_contig: # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. # Should be unique in a dataset, but if there's an update the contig ID # might still be the same even though it should not(?) - if contig_locus_id in circular_contigs: - is_circ = True - contig = org.get_contig(contig_locus_id, is_circ) + try: + contig = org.get_contig(contig_locus_id) + except KeyError: + contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) + org.add_contig(contig) # start of the feature object. 
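# The contig handling above follows a get-or-create pattern, sketched roughly below
# (assuming Organism.add_contig registers a new contig on the organism, as it is used
# throughout this patch):
#
#   >>> org = Organism("genome_1")
#   >>> try:
#   ...     contig = org.get_contig("contig_1")
#   ... except KeyError:                  # get_contig no longer creates missing contigs
#   ...     contig = Contig("contig_1", is_circular=False)
#   ...     org.add_contig(contig)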
dbxref = set() gene_name = "" @@ -226,7 +230,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # get each gene's sequence. for gene in contig.genes: - gene.add_dna(get_dna_sequence(sequence, gene)) + gene.add_sequence(get_dna_sequence(sequence, gene)) return org, True @@ -292,7 +296,12 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - contig = org.get_contig(fields[1], True if fields[1] in circular_contigs else False) + try: + contig = org.get_contig(fields[1]) + except KeyError: + contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + org.add_contig(contig) + continue elif line.startswith('#'): # comment lines to be ignores by parsers continue @@ -331,8 +340,12 @@ def get_id_attribute(attributes_dict: dict) -> str: genetic_code = 11 if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig - contig = org.get_contig(fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) + try: + contig = org.get_contig(fields_gff[gff_seqname]) + except KeyError: + contig = Contig(fields_gff[gff_seqname], + True if fields_gff[gff_seqname] in circular_contigs else False) + org.add_contig(contig) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) @@ -342,7 +355,6 @@ def get_id_attribute(attributes_dict: dict) -> str: position=len(contig), product=product, local_identifier=gene_id, genetic_code=genetic_code) gene.fill_parents(org, contig) - contig[gene.start] = gene gene_counter += 1 elif "RNA" in fields_gff[gff_type]: rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4)) @@ -350,7 +362,6 @@ def get_id_attribute(attributes_dict: dict) -> str: strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, product=product, local_identifier=gene_id) rna.fill_parents(org, contig) - contig.add_rna(rna) rna_counter += 1 # GET THE FASTA SEQUENCES OF THE GENES @@ -358,9 +369,9 @@ def get_id_attribute(attributes_dict: dict) -> str: contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: for gene in contig.genes: - gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) + gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) for rna in contig.RNAs: - rna.add_dna(get_dna_sequence(contig_sequences[contig.name], rna)) + rna.add_sequence(get_dna_sequence(contig_sequences[contig.name], rna)) return org, has_fasta @@ -505,9 +516,9 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): for contig in org.contigs: try: for gene in contig.genes: - gene.add_dna(get_dna_sequence(fasta_dict[org][contig.name], gene)) + gene.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], gene)) for rna in contig.RNAs: - rna.add_dna(get_dna_sequence(fasta_dict[org][contig.name], rna)) + rna.add_sequence(get_dna_sequence(fasta_dict[org][contig.name], rna)) except KeyError: msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \ f"that was read from the annotation file. 
" diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 91b24c8d..cd677e98 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -13,7 +13,7 @@ from pathlib import Path # local libraries -from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import is_compressed, read_compressed_or_not @@ -170,7 +170,11 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in contigs[contig.name] = contig_seq.upper() all_contig_len += len(contig_seq) contig_seq = "" - contig = org.get_contig(line.split()[0][1:]) + try: + contig = org.get_contig(line.split()[0][1:]) + except KeyError: + contig = Contig(line.split()[0][1:]) + org.add_contig(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig @@ -321,11 +325,13 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: genes = overlap_filter(genes, overlap) for contig_name, genes in genes.items(): - contig = org.get_contig(contig_name) - if contig.name in circular_contigs: - contig.is_circular = True + try: + contig = org.get_contig(contig_name) + except KeyError: + contig = Contig(contig_name, True if contig_name in circular_contigs else False) + org.add_contig(contig) for gene in genes: - gene.add_dna(get_dna_sequence(contig_sequences[contig.name], gene)) + gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): contig[gene.start] = gene diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 7757a1e9..7bbe971e 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -13,7 +13,7 @@ import tables # local libraries -from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module @@ -240,7 +240,11 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul org = Organism(org_name) gene, gene_type = (None, None) for contig_name, gene_list in contig_dict.items(): - contig = org.get_contig(contig_name, is_circular=circular_contigs[contig_name]) + try: + contig = org.get_contig(contig_name) + except KeyError: + contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) + org.add_contig(contig) for row in gene_list: if link: # if the gene families are already computed/loaded the gene exists. 
gene = pangenome.get_gene(row["ID"].decode()) @@ -358,7 +362,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): gene = pangenome.get_gene(row['gene'].decode()) - gene.add_dna(seqid2seq[row['seqid']]) + gene.add_sequence(seqid2seq[row['seqid']]) pangenome.status["geneSequences"] = "Loaded" diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 4c0b7e8e..40f7be81 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -469,11 +469,10 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, "nb_exact_core_genes", "nb_soft_core_genes", "completeness", "nb_single_copy_markers"]) + "\n") for org in pan.organisms: - fams = org.families nb_pers = 0 nb_shell = 0 nb_cloud = 0 - for fam in fams: + for fam in org.families: if fam.named_partition == "persistent": nb_pers += 1 elif fam.named_partition == "shell": @@ -499,14 +498,15 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core += 1 completeness = "NA" if len(single_copy_markers) > 0: - completeness = round((len(fams & single_copy_markers) / len(single_copy_markers)) * 100, 2) + completeness = round(((org.number_of_families() + len(single_copy_markers)) / + len(single_copy_markers)) * 100, 2) outfile.write("\t".join(map(str, [org.name, - len(fams), + org.number_of_families(), nb_pers, nb_shell, nb_cloud, - len(core & fams), - len(soft & fams), + len(core) + org.number_of_families(), + len(soft) + org.number_of_families(), org.number_of_genes(), nb_gene_pers, nb_gene_shell, @@ -514,7 +514,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, nb_gene_core, nb_gene_soft, completeness, - len(fams & single_copy_markers)])) + "\n") + org.number_of_families() + len(single_copy_markers)])) + "\n") logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") @@ -797,7 +797,7 @@ def write_org_modules(output: Path, compress: bool = False): for fam in mod.families: mod_orgs |= fam.organisms for org in mod_orgs: - completion = round(len(org.families & mod.families) / len(mod.families), 2) + completion = round((org.number_of_families() + len(mod.families)) / len(mod.families), 2) fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() logging.getLogger("PPanGGOLiN").info( diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index f73dbd6b..158d289b 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -57,6 +57,24 @@ def __init__(self, identifier: str): self._contig = None self.dna = None + def __str__(self) -> str: + return str(self.ID) + + def __len__(self) -> int: + """Return gene length + + :return: gene length + + :raises ValueError: If start or stop are not defined in gene + """ + if self.start is not None: + if self.stop is not None: + return self.stop - self.start + 1 + else: + raise ValueError("Stop is not known") + else: + raise ValueError("Start is not known") + @property def organism(self) -> Organism: """Return organism that Feature belongs to. 
@@ -85,20 +103,6 @@ def contig(self, contig: Contig): raise TypeError(f'Expected type Contig, got {type(contig)}') self._contig = contig - @property - def length(self) -> int: - """Return gene length - - :return: gene length - """ - if self.start is not None: - if self.stop is not None: - return self.stop - self.start - else: - raise ValueError("Stop is not known") - else: - raise ValueError("Start is not known") - def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", product: str = "", local_identifier: str = ""): """ @@ -145,18 +149,25 @@ def fill_parents(self, organism: Organism = None, contig: Contig = None): :param organism: Parent organism :param contig: Parent contig """ - self.organism = organism - self.contig = contig + if organism is not None: + self.organism = organism + if contig is not None: + self.contig = contig + else: + if contig is not None: + self.contig = contig + else: + raise AssertionError("You should provide at least organism or contig") - def add_dna(self, dna): + def add_sequence(self, sequence): """ Add DNA sequence to feature - :param dna: DNA sequence + :param sequence: DNA sequence :raise TypeError: DNA sequence must be a string """ - assert isinstance(dna, str), f"'str' type was expected but you provided a '{type(dna)}' type object" - self.dna = dna + assert isinstance(sequence, str), f"'str' type was expected but you provided a '{type(sequence)}' type object" + self.dna = sequence class RNA(Feature): @@ -196,9 +207,6 @@ def __init__(self, gene_id: str): self.genetic_code = None self.protein = None - def __str__(self) -> str: - return str(self.ID) - @property def family(self): """Return GeneFamily that Gene belongs to. @@ -279,9 +287,10 @@ def __init__(self, name: str, is_circular: bool = False): """ self.name = name self.is_circular = is_circular - self._rnaGetter = set() # saving the rna annotations. We're not using them in the vast majority of cases. - self._genesGetter = {} + self._rna_getter = set() # saving the rna annotations. We're not using them in the vast majority of cases. + self._genes_getter = {} self._genes_position = [] + self._organism = None def __str__(self) -> str: return self.name @@ -297,7 +306,7 @@ def __setitem__(self, start: int, gene: Gene): """ if not isinstance(gene, Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") - if start in self._genesGetter: + if start in self._genes_getter: raise ValueError(f"Gene with start position {start} already exists in the contig") if gene.position is None: raise AttributeError("The gene object needs to have its position in the contig filled before adding it") @@ -305,7 +314,7 @@ def __setitem__(self, start: int, gene: Gene): # Doing this because genes are not always met in order. 
self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1))
         self._genes_position[gene.position] = gene
-        self._genesGetter[gene.start] = gene
+        self._genes_getter[gene.start] = gene
 
     # retrieve gene by start position
     def __getitem__(self, index: int) -> Gene:
@@ -314,6 +323,10 @@ def __getitem__(self, index: int) -> Gene:
         return self._genes_position[index]
 
     def get_genes(self, begin: int, end: int):
+        """Gets a list of genes within a range
+        :param begin: Position of the first gene to retrieve
+        :param end: Position after the last gene to retrieve (end is exclusive)
+        """
         if not isinstance(begin, int) or not isinstance(end, int):
             raise TypeError(f"Expected type is int, given type was '{type(begin)}, {type(end)}'")
         if end < begin:
@@ -331,13 +344,18 @@ def genes(self) -> list:
             yield gene
 
     @property
-    def RNAs(self) -> Generator[RNA, None, None]:
-        """Return all the RNA in the contig
+    def organism(self) -> Organism:
+        """Return the organism to which the contig belongs.
 
-        :return: Generator of RNA
+        :return: Organism of the contig
         """
-        for rna in self._rnaGetter:
-            yield rna
+        return self._organism
+
+    @organism.setter
+    def organism(self, organism: Organism):
+        if not isinstance(organism, Organism):
+            raise TypeError(f'Expected type Organism, got {type(organism)}')
+        self._organism = organism
 
     def add_rna(self, rna: RNA):
         """ Add RNA to contig
@@ -346,35 +364,85 @@ def add_rna(self, rna: RNA):
         """
         if not isinstance(rna, RNA):
             raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object")
-        if rna in self._rnaGetter:
+        if rna in self._rna_getter:
             raise KeyError(f"RNA with the id: {rna.ID} already exist in contig {self.name}")
-        self._rnaGetter.add(rna)
+        self._rna_getter.add(rna)
+
+    @property
+    def RNAs(self) -> Generator[RNA, None, None]:
+        """Return all the RNA in the contig
+
+        :return: Generator of RNA
+        """
+        for rna in self._rna_getter:
+            yield rna
 
 
 class Organism(MetaFeatures):
     """
    Describe the Genome content and some information
 
-    :param name: Name of the genome
+    Methods:
+    - `families`: Returns a generator of the gene families present in the organism.
+    - `genes(self) -> Iterator[Gene]`: Returns a generator to get genes in the organism.
+    - `number_of_genes(self) -> int`: Returns the number of genes in the organism.
+    - `contigs`: Returns a generator of the contigs in the organism.
+    - `get_contig(self, name: str)`: Returns the contig with the given name, raising a KeyError if it does not exist in the organism.
+    - `add_contig(self, contig: Contig)`: Adds a contig to the organism, raising a KeyError if a contig with the same name is already present.
+    - `mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all')`: Produces a bitarray representing the presence/absence of gene families in the organism using the provided index.
+
+    Fields:
+    - `name`: Name of the organism.
+    - `bitarray`: Bitarray representing the presence/absence of gene families in the organism. 
""" def __init__(self, name: str): + """Constructor Method + :param name: Name of the genome + """ + assert isinstance(name, str), "Organism name should be a string" + assert name != "", "Organism name should not be empty" + super().__init__() self.name = name self._contigs_getter = {} + self._families = None self.bitarray = None + def __str__(self): + return self.name + + def _get_families(self) -> set: + """Get the set of gene families belonging to organism""" + self._families = {gene.family for gene in self.genes} + @property - def families(self) -> set: - """ returns the gene families present in the organism + def families(self): + """returns the gene families present in the organism - :return: set of gene families in organism + :return: Generator of gene families in organism + :rtype: Generator[GeneFamily, None, None] """ - return {gene.family for contig in self.contigs for gene in contig.genes} + if self._families is None: + self._get_families() + for fam in self._families: + yield fam + + def number_of_families(self) -> int: + """Return number of gene families in organism + + :return: Number of gene families in organism + """ + if self._families is None: + self._get_families() + return len(self._families) @property def genes(self) -> Iterator[Gene]: - """ Generator to get genes in organism """ + """ Generator to get genes in organism + + :return: Generator of genes in organism + """ for contig in self.contigs: for gene in contig.genes: yield gene @@ -382,39 +450,54 @@ def genes(self) -> Iterator[Gene]: def number_of_genes(self) -> int: """ Get number of genes in organism - :return: Number of gene in organism + :return: Number of genes in organism """ return sum([len(contig) for contig in self.contigs]) @property - def contigs(self) -> dict.values: + def contigs(self) -> Generator[Contig, None, None]: """ Get contigs in organism :return: values in contig dictionary from organism """ - return self._contigs_getter.values() + for contig in self._contigs_getter.values(): + yield contig - def __str__(self): - return self.name + def number_of_contigs(self) -> int: + """ Get number of contigs in organism - def get_contig(self, contig_id: str, is_circular: bool = False): + :return: Number of contigs in organism """ - Get contig with the given identifier in the organim, if it does not exist in organism,the contig is added + return len(self._contigs_getter) - :param contig_id: Contig idenitifier - :param is_circular: save if the contig is circular + def get_contig(self, name: str) -> Contig: + """ + Get contig with the given identifier in the organim + + :param name: Contig identifier :return: the contig with the given identifier """ - contig = self._contigs_getter.get(contig_id) - if contig is None: - contig = self._create_contig(contig_id, is_circular) - return contig + assert isinstance(name, str), f"To get a contig, name with string type is expected. 
Given type: {type(name)}" + try: + contig = self._contigs_getter[name] + except KeyError: + raise KeyError(f"Contig {name} does not belong to organism {self.name}") + else: + return contig - def _create_contig(self, contig_id: str, is_circular: bool = False): - new_contig = Contig(contig_id, is_circular) - self._contigs_getter[contig_id] = new_contig - return new_contig + def add_contig(self, contig: Contig): + """Add a contig to organism + :param: contig to add in organism + """ + assert isinstance(contig, Contig), f"Contig object is expected, given type was {type(contig)}" + try: + contig = self.get_contig(contig.name) + except KeyError: + self._contigs_getter[contig.name] = contig + contig.organism = self + else: + raise KeyError(f"Contig {contig.name} already in organism {self.name}") def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index @@ -423,19 +506,18 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): :param partition: Filter partition :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` """ - self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger("PPanGGOLiN").debug(f"all") + logging.getLogger("PPanGGOLiN").debug("all") for fam in self.families: self.bitarray[index[fam]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug("shell, cloud") for fam in self.families: if fam.named_partition == partition: self.bitarray[index[fam]] = 1 elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug("accessory") for fam in self.families: if fam.named_partition in ['shell', 'cloud']: self.bitarray[index[fam]] = 1 diff --git a/tests/genome/test_Feature.py b/tests/genome/test_Feature.py deleted file mode 100644 index 37bb93ef..00000000 --- a/tests/genome/test_Feature.py +++ /dev/null @@ -1,70 +0,0 @@ -#! /usr/bin/env python3 - -import pytest - -from ppanggolin.genome import Feature - - -def test_cstr(): - identifier = 4 - o_feature = Feature(identifier) - assert isinstance(o_feature, Feature) - for attr in "ID", "is_fragment", "type": - assert hasattr(o_feature, attr) - assert o_feature.ID == identifier - assert o_feature.is_fragment is False - assert o_feature.type == "" - - -@pytest.fixture() -def o_feature(): - return Feature(4) - - -def test_fill_annotations(o_feature): - start, stop = 1, 9 - strand = "plus" - o_feature.fill_annotations(start, stop, strand) - for attr in 'start', 'stop', 'strand', \ - 'type', 'product', 'name': - assert hasattr(o_feature, attr) - assert o_feature.start == start - assert o_feature.stop == stop - assert o_feature.strand == strand - assert o_feature.type == '' - assert o_feature.name == '' - assert o_feature.product == '' - - gene_type = "inconnu" - name = "Eugène" - product = "va savoir" - o_feature.fill_annotations(start, stop, strand, gene_type, name, product) - assert o_feature.type == gene_type - assert o_feature.name == name - assert o_feature.product == product - - # what if start or stop < 0 ? - # stop < start - # start/stop cannot int() ? 
- # position not int - - -def test_fill_parents(o_feature): - org = "toto" - ctg = 99 - o_feature.fill_parents(org, ctg) - for attr in 'organism', 'contig': - assert hasattr(o_feature, attr) - assert o_feature.organism == org - assert o_feature.contig == ctg - - -def test_add_dna(o_feature): - dna = "test adn" - o_feature.add_dna(dna) - assert hasattr(o_feature, 'dna') - o_feature.dna = dna - - dna = 123 - with pytest.raises(TypeError): - o_feature.add_dna(dna) diff --git a/tests/tests_Genome.py b/tests/tests_Genome.py new file mode 100644 index 00000000..528e97c4 --- /dev/null +++ b/tests/tests_Genome.py @@ -0,0 +1,575 @@ +#! /usr/bin/env python3 + +import pytest +from typing import Generator, Tuple +import gmpy2 + +from ppanggolin.genome import Feature, Gene, RNA, Contig, Organism +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import Region + + +class TestFeature: + """Tests Feature class + """ + @pytest.fixture + def feature(self) -> Generator[Feature, None, None]: + """Generate a basic feature for tests + """ + yield Feature('test_id') + + def test_creation(self, feature): + """Tests that 'Feature' object is created successfully with the given identifier + """ + assert feature.ID == 'test_id' + assert not feature.is_fragment + assert feature.type == '' + assert feature.start is None + assert feature.stop is None + assert feature.strand is None + assert feature.product is None + assert feature.name is None + assert feature.local_identifier is None + assert feature.organism is None + assert feature.contig is None + assert feature.dna is None + + def test_create_feature_with_identifier_not_instance_string(self): + """Tests that a Feature object cannot be created with a non-string type identifier + """ + with pytest.raises(AssertionError): + Feature(4) + + def test_create_feature_empty_identifier(self): + """Tests that a Feature object cannot be created with an empty identifier + """ + with pytest.raises(ValueError): + Feature('') + + def tests_write_organism(self, feature): + """Tests that write feature return feature name as string + """ + assert str(feature) == "test_id" + + def test_fill_annotations(self, feature): + """Tests that 'fill_annotations' method fills the attributes correctly + """ + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') + assert feature.start == 1 + assert feature.stop == 10 + assert feature.type == 'gene_type' + assert feature.strand == '+' + assert feature.product == 'product' + assert feature.name == 'name' + assert feature.local_identifier == 'local_id' + + def test_fill_annotations_type_error(self, feature): + """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with correct type + """ + with pytest.raises(TypeError): + feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, "10", '+', 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, 4, 'gene_type', 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, "+", 4, 'name', 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 4, 'product', 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 4, 'local_id') + with pytest.raises(TypeError): + feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 4) + + def 
test_fill_annotations_value_error(self, feature):
+        """Tests that 'fill_annotations' method raises a ValueError if strand is not '+' or '-'
+        """
+        with pytest.raises(ValueError):
+            feature.fill_annotations(1, 10, '4', 'gene_type', 'name', 'product', 'local_id')
+
+    def test_fill_parents(self, feature):
+        """Tests that 'fill_parents' method associates the object with the given organism and contig
+        """
+        organism = Organism('org_id')
+        contig = Contig('contig_name')
+        feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id')
+        feature.fill_parents(organism, contig)
+        assert feature.organism == organism
+        assert feature.contig == contig
+
+    def test_fill_parents_with_organism_or_contig_only(self, feature):
+        """Tests that a Feature can be filled with only an organism or a contig
+        """
+        organism = Organism('org')
+        contig = Contig("ctg")
+        feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id')
+        feature.fill_parents(organism=organism)
+        assert feature.organism == organism
+        feature.fill_parents(contig=contig)
+        assert feature.contig == contig
+
+    def test_fill_parents_with_nothing(self, feature):
+        """Tests that a Feature cannot be filled with neither an organism nor a contig
+        """
+        with pytest.raises(AssertionError):
+            feature.fill_parents()
+
+    def test_set_organism(self, feature):
+        """Tests that the organism setter sets an organism with a valid type
+        """
+        organism = Organism('organism')
+        feature.organism = organism
+        assert feature.organism == organism
+
+    def test_set_organism_not_isinstance_organism(self, feature):
+        """Tests that the organism setter raises a TypeError when given an invalid type
+        """
+        with pytest.raises(TypeError):
+            feature.organism = 4
+
+    def test_set_contig(self, feature):
+        """Tests that the contig setter sets a contig with a valid type
+        """
+        contig = Contig('contig')
+        feature.contig = contig
+        assert feature.contig == contig
+
+    def test_set_contig_not_isinstance_contig(self, feature):
+        """Tests that the contig setter raises a TypeError when given an invalid type
+        """
+        with pytest.raises(TypeError):
+            feature.contig = 4
+
+    def test_add_dna(self, feature):
+        """Tests that 'add_sequence' method adds the DNA sequence to the object successfully
+        """
+        feature.add_sequence('ATCG')
+        assert feature.dna == 'ATCG'
+
+    def test_add_dna_type_error(self, feature):
+        """Tests that 'add_sequence' method raises an AssertionError if the DNA sequence is not a string
+        """
+        with pytest.raises(AssertionError):
+            feature.add_sequence(123)
+
+    def test_length(self, feature):
+        """Tests the len method
+        """
+        feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id')
+        assert isinstance(len(feature), int)
+        assert len(feature) == 10
+
+    def test_length_start_or_stop_are_not_known(self):
+        """Tests that len raises a ValueError when start or stop is not known
+        """
+        with pytest.raises(ValueError):
+            feature = Feature('test')
+            feature.stop = 10
+            len(feature)
+        with pytest.raises(ValueError):
+            feature = Feature('test')
+            feature.start = 1
+            len(feature)
+
+
+class TestRNA:
+    """Tests RNA Class
+    """
+    @pytest.fixture
+    def rna(self) -> Generator[RNA, None, None]:
+        """Generate a basic RNA for tests
+        """
+        yield RNA('rna')
+
+    def test_create_rna_object(self, rna):
+        """Tests that an RNA object can be created with a valid identifier
+        """
+        assert rna.ID == 'rna'
+
+
+class TestGene:
+    """Tests Gene class
+    """
+    @pytest.fixture
+    def gene(self) -> Generator[Gene, None, None]:
+        """Generate a basic gene for tests
+        """
+        yield Gene('gene')
+
+    def 
test_create_gene_object(self, gene): + """Tests that a Gene object can be created with a valid gene_id + """ + assert gene.ID == 'gene' + assert gene.position is None + assert gene._family is None + assert gene._RGP is None + assert gene.genetic_code is None + assert gene.protein is None + + def test_fill_annotations(self, gene): + """Tests that Gene annotations can be filled with valid parameters + """ + gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code=4) + assert gene.position == 10 + assert gene.genetic_code == 4 + + def test_fill_annotations_type_error(self, gene): + """Tests that Gene annotations cannot be filled with invalid parameters + """ + with pytest.raises(TypeError): + gene.fill_annotations(start=1, stop=10, strand='+', position='10', genetic_code=4) + with pytest.raises(TypeError): + gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code="4") + + def test_add_protein(self, gene): + """Tests that a protein sequence can be added to a Gene object + """ + gene.add_protein('MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA') + assert gene.protein == 'MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA' + + def test_add_protein_non_string(self, gene): + """Tests that a non-string protein sequence cannot be added to a Gene object + """ + with pytest.raises(TypeError): + gene.add_protein(123) + + def test_set_family(self, gene): + """Tests that family setter sets family with valid type + """ + family = GeneFamily(0, 'family') + gene.family = family + assert gene.family == family + + def test_set_family_not_instance_gene_family(self, gene): + """Tests that family setter return TypeError if sets family is not instance GeneFamily + """ + with pytest.raises(TypeError): + gene.family = 4 + + def test_set_rgp(self, gene): + """Tests that RGP setter sets family with valid type + """ + region = Region(0) + gene.RGP = region + assert gene.RGP == region + + def test_set_rgp_not_instance_region(self, gene): + """Tests that family setter return TypeError if sets rgp is not instance Region + """ + with pytest.raises(TypeError): + gene.RGP = 4 + + +class TestContig: + """Tests Contig class + """ + @pytest.fixture + def contig(self) -> Generator[Contig, None, None]: + """Generate basic contig for tests + """ + yield Contig("contig") + + @pytest.fixture + def gene(self) -> Generator[Gene, None, None]: + """Generate basic gene for tests + """ + gene = Gene('test_gene') + gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + yield gene + + @pytest.fixture + def genes(self) -> Generator[Tuple[Gene, Gene, Gene], None, None]: + """Generate 3 basic genes for tests + """ + gene1 = Gene('test_gene1') + gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + gene2 = Gene('test_gene2') + gene2.fill_annotations(start=11, stop=20, strand='+', position=1, genetic_code=4) + gene3 = Gene('test_gene3') + gene3.fill_annotations(start=21, stop=30, strand='+', position=2, genetic_code=4) + yield gene1, gene2, gene3 + + def test_create_contig(self, contig): + """Tests that a contig is correclty created + """ + assert contig.name == "contig" + assert not contig.is_circular + assert contig._rna_getter == set() # saving the rna annotations. We're not using them in the vast majority of cases. 
+ assert contig._genes_getter == {} + assert contig._genes_position == [] + assert contig._organism is None + + def tests_write_contig(self, contig): + """Tests that write contig return contig name as string + """ + assert str(contig) == "contig" + + def test_add_gene(self, gene, contig): + """Tests that a gene can be added to the contig + """ + contig[gene.start] = gene + assert len(contig._genes_getter) == 1 + assert len(contig._genes_position) == 1 + assert contig._genes_getter[gene.start] == gene + assert contig._genes_position[0] == gene + + def test_add_gene_at_far_position(self, gene, contig): + """Tests that a gene can be added at each position and between position are fill with None + """ + contig[gene.start] = gene + new_gene = Gene("Gene2") + new_gene.fill_annotations(start=50, stop=72, strand='+', position=6, genetic_code=4) + contig[new_gene.start] = new_gene + assert len(contig._genes_position) == 7 + assert contig._genes_position[1:6] == [None]*5 + + def test_add_gene_not_instance_gene(self, contig): + """Tests that the contig cannot be fill with a non gene object + """ + with pytest.raises(TypeError): + contig[1] = "4" + + def test_add_gene_with_start_already_taken(self, contig, gene): + """Tests that the contig cannot be fill with a non gene object + """ + contig[gene.start] = gene + with pytest.raises(ValueError): + new_gene = Gene('test_gene') + new_gene.fill_annotations(start=1, stop=12, strand='+', position=2, genetic_code=4) + contig[new_gene.start] = new_gene + + def test_add_gene_without_position(self, contig): + with pytest.raises(AttributeError): + gene = Gene('test_gene') + contig[gene.start] = gene + + def test_get_len(self, genes, contig): + """Tests len method + """ + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + assert isinstance(len(contig), int) + assert len(contig) == 3 + + def test_get_gene(self, gene, contig): + """Tests that a gene can be retrieved by its position + """ + contig[gene.start] = gene + assert contig[0] == gene + + def test_get_genes(self, genes, contig): + """Tests that a list of genes within a range can be retrieved + """ + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + assert set(contig.get_genes(0, 3)) == set(genes) + + def test_get_gene_with_non_integer_index(self, contig): + """Tests that a gene cannot be retrieved with an index that is not an integer + """ + with pytest.raises(TypeError): + _ = contig['a'] + + def test_get_genes_with_non_integer_begin_and_end_positions(self, genes, contig): + """Tests that genes cannot be retrieved with non-integer begin and end positions + """ + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + with pytest.raises(TypeError): + contig.get_genes('a', 4) + with pytest.raises(TypeError): + contig.get_genes(5, 'b') + with pytest.raises(TypeError): + contig.get_genes('a', 'b') + + def test_get_genes_with_end_position_lower_than_begin_position(self, genes, contig): + """Tests that genes cannot be retrieved with end position lower than begin position + """ + gene1, gene2, gene3 = genes + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + contig[gene3.start] = gene3 + with pytest.raises(ValueError): + contig.get_genes(2, 0) + + def test_iterate_over_genes(self, genes, contig): + """Tests that all genes in the contig can be iterated over + """ + gene1, gene2, gene3 = genes + 
contig[gene1.start] = gene1
+        contig[gene2.start] = gene2
+        contig[gene3.start] = gene3
+        assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position)
+
+    def test_add_rna(self, contig):
+        """Tests that an RNA can be added to the contig
+        """
+        rna = RNA('test_rna')
+        contig.add_rna(rna)
+        assert list(contig.RNAs) == [rna]
+
+    def test_set_organism(self, contig):
+        """Tests that an organism can be set on the contig
+        """
+        organism = Organism("organism")
+        contig.organism = organism
+        assert contig.organism == organism
+
+    def test_set_organism_with_not_instance_organism(self, contig):
+        """Tests that the contig organism setter raises a TypeError with a non-Organism object
+        """
+        with pytest.raises(TypeError):
+            contig.organism = 4
+
+
+class TestOrganism:
+    """Tests Organism class
+    """
+    @pytest.fixture
+    def organism(self) -> Generator[Organism, None, None]:
+        """Generate a basic organism for test
+        """
+        yield Organism('organism')
+
+    @pytest.fixture
+    def contig(self) -> Generator[Contig, None, None]:
+        """Generate a basic contig for test
+        """
+        yield Contig("contig")
+
+    @pytest.fixture
+    def gene(self) -> Generator[Gene, None, None]:
+        """Generate a basic gene for test
+        """
+        gene = Gene('test_gene')
+        gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4)
+        yield gene
+
+    def test_create_organism(self, organism):
+        """Tests that an Organism instance can be created with a valid name
+        """
+        assert organism.name == 'organism'
+        assert organism._contigs_getter == {}
+        assert organism._families is None
+        assert organism.bitarray is None
+
+    def test_create_organism_empty_name(self):
+        """Tests that an Organism instance cannot be created with an empty name
+        """
+        with pytest.raises(AssertionError):
+            Organism('')
+
+    def test_create_organism_with_name_not_string(self):
+        """Tests that an Organism instance cannot be created with a name that is not a string
+        """
+        with pytest.raises(AssertionError):
+            Organism(4)
+
+    def tests_write_organism(self, organism):
+        """Tests that writing the organism returns its name as a string
+        """
+        assert str(organism) == "organism"
+
+    def test_add_contig(self, organism, contig):
+        """Tests that a contig can be added to an Organism instance
+        """
+        organism.add_contig(contig)
+        assert organism._contigs_getter['contig'] == contig
+
+    def test_add_contig_not_instance_contig(self, organism):
+        """Tests that a non-Contig object cannot be added to an Organism instance
+        """
+        with pytest.raises(AssertionError):
+            organism.add_contig(4)
+
+    def test_add_contig_existing_name(self, organism, contig):
+        """Tests that a contig with an existing name cannot be added to an Organism instance
+        """
+        organism.add_contig(contig)
+        with pytest.raises(KeyError):
+            organism.add_contig(Contig('contig'))
+
+    def test_get_contig(self, organism, contig):
+        """Tests that a contig can be retrieved from an Organism instance
+        """
+        organism.add_contig(contig)
+        assert organism.get_contig('contig') == contig
+
+    def test_get_contig_not_instance_string(self, organism):
+        """Tests that a contig cannot be retrieved with a non-string name
+        """
+        with pytest.raises(AssertionError):
+            organism.get_contig(4)
+
+    def test_get_nonexistent_contig(self, organism):
+        """Tests that a non-existent contig cannot be retrieved from an Organism instance
+        """
+        with pytest.raises(KeyError):
+            organism.get_contig('contig1')
+
+    def test_number_of_contigs(self, organism):
+        """Tests that the number of contigs in an organism instance can be retrieved
+        """
+ 
organism.add_contig(Contig('contig1')) + organism.add_contig(Contig('contig2')) + assert organism.number_of_contigs() == 2 + + def test_get_families(self, organism, contig, gene): + """Tests that gene families in an organism can be retrieved + """ + family = GeneFamily(0, "fam") + family.add_gene(gene) + gene.fill_parents(organism, contig) + organism.add_contig(contig) + contig[gene.start] = gene + assert set(organism.families) == {family} + + def test_number_of_families(self, organism, contig, gene): + """Tests that the number of gene families in an organism instance can be retrieved + """ + family = GeneFamily(0, "fam") + family.add_gene(gene) + gene.fill_parents(organism, contig) + organism.add_contig(contig) + contig[gene.start] = gene + assert organism.number_of_families() == 1 + + def tests_get_genes(self, organism, contig, gene): + """Tests that genes in an organism can be retrieved + """ + gene.fill_parents(organism, contig) + organism.add_contig(contig) + contig[gene.start] = gene + assert set(organism.genes) == {gene} + + def test_number_of_genes(self, organism, contig, gene): + """Tests that the number of genes in an organism instance can be retrieved + """ + gene.fill_parents(organism, contig) + organism.add_contig(contig) + contig[gene.start] = gene + assert organism.number_of_genes() == 1 + + def test_mk_bitarray(self, organism, contig): + """Tests that a bitarray can be created for an Organism instance + """ + fam1 = GeneFamily(1, 'fam1') + fam2 = GeneFamily(2, 'fam2') + gene1 = Gene('gene1') + gene2 = Gene('gene2') + gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) + gene2.fill_annotations(start=11, stop=19, strand='+', position=1, genetic_code=4) + fam1.add_gene(gene1) + fam2.add_gene(gene2) + contig[gene1.start] = gene1 + contig[gene2.start] = gene2 + organism.add_contig(contig) + index = {fam1: 1, fam2: 2} + organism.mk_bitarray(index) + assert organism.bitarray == gmpy2.xmpz(6) diff --git a/tests/tests_genome.py b/tests/tests_genome.py deleted file mode 100644 index f505da98..00000000 --- a/tests/tests_genome.py +++ /dev/null @@ -1,340 +0,0 @@ -#! 
/usr/bin/env python3 - -import pytest -from random import choices, randint, sample -from typing import Generator, Set, Tuple, Union -from pathlib import Path - -from ppanggolin.genome import Feature, Gene, RNA, Contig, Organism -from ppanggolin.geneFamily import GeneFamily -from ppanggolin.region import Region - - -class TestFeature: - """Tests Feature class - """ - - def test_creation(self): - """Tests that 'Feature' object is created successfully with the given identifier - """ - feature = Feature('test_id') - assert feature.ID == 'test_id' - assert not feature.is_fragment - assert feature.type == '' - assert feature.start is None - assert feature.stop is None - assert feature.strand is None - assert feature.product is None - assert feature.name is None - assert feature.local_identifier is None - assert feature.organism is None - assert feature.contig is None - assert feature.dna is None - - def test_create_feature_assertion_error(self): - """Tests that a Feature object cannot be created with a non-string type identifier""" - with pytest.raises(AssertionError): - Feature(4) - - def test_create_feature_empty_identifier(self): - """Tests that a Feature object cannot be created with an empty identifier""" - with pytest.raises(ValueError): - Feature('') - - def test_fill_annotations(self): - """Tests that 'fill_annotations' method fills the attributes correctly - """ - feature = Feature('test_id') - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id') - assert feature.start == 1 - assert feature.stop == 10 - assert feature.type == 'gene_type' - assert feature.strand == '+' - assert feature.product == 'product' - assert feature.name == 'name' - assert feature.local_identifier == 'local_id' - - def test_set_organism_valid_type(self): - """Tests that organism setter sets organism with valid type - """ - feature = Feature('test') - organism = Organism('organism') - feature.organism = organism - assert feature.organism == organism - - def test_set_organism_invalid_type(self): - """Tests that organism setter return TypeError if sets organism with invalid type - """ - feature = Feature('test') - with pytest.raises(TypeError): - feature.organism = 4 - - def test_set_contig_valid_type(self): - """Tests that contig setter sets contig with valid type - """ - feature = Feature('test') - contig = Contig('contig') - feature.contig = contig - assert feature.contig == contig - - def test_set_contig_invalid_type(self): - """Tests that contig setter return TypeError if sets contig with invalid type - """ - feature = Feature('test') - with pytest.raises(TypeError): - feature.contig = 4 - - def test_fill_parents(self): - """Tests that 'fill_parents' method associates the object with the given organism and contig - """ - organism = Organism('org_id') - contig = Contig('contig_name') - feature = Feature('test_id') - feature.fill_parents(organism, contig) - assert feature.organism == organism - assert feature.contig == contig - - def test_add_dna(self): - """Tests that 'add_dna' method adds the DNA sequence to the object successfully - """ - feature = Feature('test_id') - feature.add_dna('ATCG') - assert feature.dna == 'ATCG' - - def test_fill_annotations_type_error(self): - """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with correct type - """ - feature = Feature('test_id') - with pytest.raises(TypeError): - feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') - with pytest.raises(TypeError): - 
feature.fill_annotations(1, "10", '+', 'gene_type', 'name', 'product', 'local_id') - with pytest.raises(TypeError): - feature.fill_annotations(1, 10, 4, 'gene_type', 'name', 'product', 'local_id') - with pytest.raises(TypeError): - feature.fill_annotations(1, 10, "+", 4, 'name', 'product', 'local_id') - with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 4, 'product', 'local_id') - with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 4, 'local_id') - with pytest.raises(TypeError): - feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 4) - - def test_fill_annotations_value_error(self): - """Tests that 'fill_annotations' method raises a TypeError if strand is not '+' or '-' - """ - feature = Feature('test_id') - with pytest.raises(ValueError): - feature.fill_annotations(1, 10, '4', 'gene_type', 'name', 'product', 'local_id') - - def test_add_dna_type_error(self): - """Tests that 'add_dna' method raises a TypeError if the DNA sequence is not a string - """ - feature = Feature('test_id') - with pytest.raises(AssertionError): - feature.add_dna(123) - - def test_length_start_or_stop_are_not_known(self): - """Tests that length property raises ValueError when start is not known - """ - with pytest.raises(ValueError): - feature = Feature('test') - feature.stop = 10 - _ = feature.length - with pytest.raises(ValueError): - feature = Feature('test') - feature.start = 1 - _ = feature.length - - -class TestGene: - """Tests Gene class - """ - def test_create_gene_object(self): - """Tests that a Gene object can be created with a valid gene_id - """ - gene = Gene('gene1') - assert gene.ID == 'gene1' - - def test_fill_annotations(self): - """Tests that Gene annotations can be filled with valid parameters - """ - gene = Gene('gene1') - gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code=4) - assert gene.position == 10 - assert gene.genetic_code == 4 - - # Tests that Gene annotations cannot be filled with invalid parameters - def test_fill_annotations_invalid_parameters(self): - gene = Gene('gene1') - with pytest.raises(TypeError): - gene.fill_annotations(start=1, stop=10, strand='+', position='10', genetic_code=4) - with pytest.raises(TypeError): - gene.fill_annotations(start=1, stop=10, strand='+', position=10, genetic_code="4") - - def test_add_protein(self): - """Tests that a protein sequence can be added to a Gene object - """ - gene = Gene('gene1') - gene.add_protein('MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA') - assert gene.protein == 'MVKLAVLALALAVLALALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALAVLALALA' - - def test_add_protein_non_string(self): - """Tests that a non-string protein sequence cannot be added to a Gene object - """ - gene = Gene('gene1') - with pytest.raises(TypeError): - gene.add_protein(123) - - def test_set_family_valid_type(self): - """Tests that family setter sets family with valid type - """ - gene = Gene('gene1') - family = GeneFamily(0, 'family') - gene.family = family - assert gene.family == family - - def test_set_family_invalid_type(self): - """Tests that family setter return TypeError if sets family with invalid type - """ - gene = Gene('gene1') - with pytest.raises(TypeError): - gene.family = 4 - - def test_set_rgp_valid_type(self): - """Tests that RGP setter sets family with valid type - """ - gene = Gene('gene1') - region = Region(0) - gene.RGP = region - assert gene.RGP == region - - def test_set_rgp_invalid_type(self): - """Tests 
that family setter return TypeError if sets family with invalid type - """ - gene = Gene('gene1') - with pytest.raises(TypeError): - gene.RGP = 4 - - -class TestContig: - """Tests Contig class - """ - @pytest.fixture - def gene(self) -> Generator[Gene, None, None]: - gene = Gene('test_gene') - gene.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) - yield gene - - @pytest.fixture - def genes(self): - gene1 = Gene('test_gene1') - gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) - gene2 = Gene('test_gene2') - gene2.fill_annotations(start=11, stop=20, strand='+', position=1, genetic_code=4) - gene3 = Gene('test_gene3') - gene3.fill_annotations(start=21, stop=30, strand='+', position=2, genetic_code=4) - yield {gene1, gene2, gene3} - - def test_add_gene(self, gene): - """Tests that a gene can be added to the contig - """ - contig = Contig('test_contig') - contig[gene.start] = gene - assert len(contig._genesGetter) == 1 - assert len(contig._genes_position) == 1 - assert contig._genesGetter[gene.start] == gene - assert contig._genes_position[0] == gene - - def test_add_rna(self): - """Tests that an RNA can be added to the contig - """ - contig = Contig('test_contig') - rna = RNA('test_rna') - contig.add_rna(rna) - assert list(contig.RNAs) == [rna] - - def test_get_length(self, gene): - """Tests that the length of the contig can be retrieved - """ - contig = Contig('test_contig') - contig[gene.start] = gene - assert len(contig) == 1 - - def test_get_gene(self, gene): - """Tests that a gene can be retrieved by its position - """ - contig = Contig('test_contig') - contig[gene.start] = gene - assert contig[0] == gene - - def test_get_genes(self, genes): - """Tests that a list of genes within a range can be retrieved - """ - contig = Contig('test_contig') - gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 - assert set(contig.get_genes(0, 3)) == genes - - def test_iterate_over_genes(self, genes): - """Tests that all genes in the contig can be iterated over - """ - contig = Contig('test_contig') - gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 - assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position) - - def test_add_gene_with_existing_start_position(self, gene): - """Tests that a gene cannot be added with a start position that already exists - """ - contig = Contig('test_contig') - contig[gene.start] = gene - with pytest.raises(ValueError): - contig[gene.start] = gene - - def test_add_gene_without_position(self): - """Tests that a gene cannot be added without a position - """ - contig = Contig('test_contig') - gene = Gene('test_gene') - gene.fill_annotations(start=1, stop=10, strand='+', genetic_code=4) - with pytest.raises(AttributeError): - contig[gene.start] = gene - - def test_get_gene_with_non_integer_index(self, gene): - """Tests that a gene cannot be retrieved with an index that is not an integer - """ - contig = Contig('test_contig') - contig[gene.start] = gene - with pytest.raises(TypeError): - contig['a'] - - def test_get_genes_with_non_integer_begin_and_end_positions_edge_case(self, genes): - """Tests that genes cannot be retrieved with non-integer begin and end positions - """ - contig = Contig('test_contig') - gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 - with pytest.raises(TypeError): - 
contig.get_genes('a', 4) - with pytest.raises(TypeError): - contig.get_genes(5, 'b') - with pytest.raises(TypeError): - contig.get_genes('a', 'b') - - def test_get_genes_with_end_position_lower_than_begin_position_edge_case(self, genes): - """Tests that genes cannot be retrieved with end position lower than begin position - """ - contig = Contig('test_contig') - gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 - with pytest.raises(ValueError): - contig.get_genes(2, 0) From 03f46d0e7762572093b201cc7594aa5bd78e606f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 1 Aug 2023 10:03:10 +0200 Subject: [PATCH 26/75] Refactor GenFamily class --- VERSION | 2 +- ppanggolin/context/searchGeneContext.py | 8 +- ppanggolin/edge.py | 4 +- ppanggolin/figures/tile_plot.py | 10 +- ppanggolin/figures/ucurve.py | 2 +- ppanggolin/formats/readBinaries.py | 2 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/formats/writeFlat.py | 40 +++--- ppanggolin/formats/writeMSA.py | 8 +- ppanggolin/formats/writeSequences.py | 4 +- ppanggolin/geneFamily.py | 158 ++++++++++++++++-------- ppanggolin/mod/module.py | 10 +- ppanggolin/nem/partition.py | 5 +- ppanggolin/nem/rarefaction.py | 2 +- ppanggolin/pangenome.py | 2 +- ppanggolin/region.py | 4 +- 16 files changed, 159 insertions(+), 104 deletions(-) diff --git a/VERSION b/VERSION index 3bb592c1..5d6aa2b6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.143 +1.2.144 diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 6a25fe13..29e0e04f 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -7,6 +7,7 @@ import tempfile import time from pathlib import Path +from typing import Set # installed libraries from tqdm import tqdm @@ -16,6 +17,7 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig +from ppanggolin.geneFamily import GeneFamily from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import get_seq2pang, project_partition @@ -201,7 +203,7 @@ def fam2seq(seq_to_pan: dict) -> dict: return fam_2_seq -def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): +def export_to_dataframe(families: Set[GeneFamily], gene_contexts: Set[GeneContext], fam_to_seq: dict, output: str): """ Export the results into dataFrame :param families: Families related to the connected components @@ -217,10 +219,10 @@ def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, out for family in gene_context.families: line = [gene_context.ID] if fam_to_seq is None or fam_to_seq.get(family.ID) is None: - line += [family.name, None, len(family.organisms), family.named_partition] + line += [family.name, None, family.number_of_organisms(), family.named_partition] else: line += [family.name, ','.join(fam_to_seq.get(family.ID)), - len(family.organisms), family.named_partition] + family.number_of_organisms(), family.named_partition] lines.append(line) df = pd.DataFrame(lines, columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"] diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index eea36c73..2b213f7d 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -25,8 +25,8 @@ def __init__(self, source_gene: Gene, target_gene: Gene): f"gene 
{target_gene.ID} did not have a gene family.") self.source = source_gene.family self.target = target_gene.family - self.source._edges[self.target] = self - self.target._edges[self.source] = self + self.source.set_edge(self.target, self) + self.target.set_edge(self.source, self) self.organisms = defaultdict(list) self.add_genes(source_gene, target_gene) diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 619845e1..21ef9ff8 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -84,13 +84,13 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di partitions_dict[fam.partition].append(fam) if fam.partition.startswith("S"): shell_subs.add(fam.partition) # number of elements will tell the number of subpartitions - ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: len(n.organisms), reverse=True) - ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: n.number_of_organisms(), reverse=True) + ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: n.number_of_organisms(), reverse=True) sep_p = len(ordered_nodes_p) - 0.5 separators = [sep_p] shell_na = None if len(shell_subs) == 1: - ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: n.number_of_organisms(), reverse=True) ordered_nodes = ordered_nodes_p + ordered_nodes_s + ordered_nodes_c separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) separators.append(separators[len(separators) - 1] + len(ordered_nodes_c)) @@ -99,7 +99,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di for subpartition in sorted(shell_subs): if subpartition == "S_": shell_na = len(separators) - 1 - ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: len(n.organisms), reverse=True) + ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: n.number_of_organisms(), reverse=True) ordered_nodes += ordered_nodes_s separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) ordered_nodes += ordered_nodes_c @@ -109,7 +109,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di for node in ordered_nodes: fam_order.append('\u200c' + node.name) data = node.organisms - binary_data.append([len(node.get_genes_per_org(org)) if org in data else numpy.nan for org in order_organisms]) + binary_data.append([len(list(node.get_genes_per_org(org))) if org in data else numpy.nan for org in order_organisms]) text_data.append([("\n".join(map(str, node.get_genes_per_org(org)))) if org in data else numpy.nan for org in order_organisms]) diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index 497be19c..e76f51ec 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -28,7 +28,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di is_partitioned = False has_undefined = False for fam in pangenome.gene_families: - nb_org = len(fam.organisms) + nb_org = fam.number_of_organisms() if fam.partition != "": is_partitioned = True if fam.partition == "U": diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 7bbe971e..820c86f2 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -338,7 +338,7 @@ def 
read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): fam = pangenome.get_gene_family(row["name"].decode()) - fam.add_partition(row["partition"].decode()) + fam.partition = row["partition"].decode() fam.add_sequence(row["protein"].decode()) if h5f.root.status._v_attrs.Partitioned: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index ea695459..f31173c0 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -734,7 +734,7 @@ def getmin(arg: iter) -> float: part_set = set() for fam in pangenome.gene_families: named_part_counter[fam.named_partition] += 1 - part_distribs[fam.named_partition].append(len(fam.organisms) / pangenome.number_of_organisms()) + part_distribs[fam.named_partition].append(fam.number_of_organisms() / pangenome.number_of_organisms()) if fam.named_partition == "shell": subpart_counter[fam.partition] += 1 if fam.partition != "S_": diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 40f7be81..ce8a7ab5 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -60,7 +60,7 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): :param gene_fam: file-like object, compressed or not :param json: file-like object, compressed or not """ - json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam.genes)}, ' + json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {gene_fam.number_of_genes()}, ' f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + '}') org_dict = {} name_counts = Counter() @@ -232,22 +232,22 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') + f'{"exact_accessory" if fam.number_of_organisms() != pan.number_of_organisms() else "exact_core"}" />\n') gexf.write(f' = (pan.number_of_organisms() * soft_core) else "soft_accessory"}"' + f'{"soft_core" if fam.number_of_organisms() >= (pan.number_of_organisms() * soft_core) else "soft_accessory"}"' f' />\n') gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f' \n') if not light: for org, genes in fam.get_org_dict().items(): gexf.write( @@ -373,9 +373,9 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool matrix.write(sep.join(['"' + fam.name + '"', # 1 '"' + alt + '"', # 2 '"' + str(product.most_common(1)[0][0]) + '"', # 3 - '"' + str(len(fam.organisms)) + '"', # 4 - '"' + str(len(fam.genes)) + '"', # 5 - '"' + str(round(len(fam.genes) / len(fam.organisms), 2)) + '"', # 6 + '"' + str(fam.number_of_organisms()) + '"', # 4 + '"' + str(fam.number_of_genes()) + '"', # 5 + '"' + str(round(fam.number_of_genes() / fam.number_of_organisms(), 2)) + '"', # 6 '"NA"', # 7 '"NA"', # 8 '""', # 9 @@ -436,12 +436,12 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, "\n") for fam in pan.gene_families: if fam.named_partition == "persistent": - mean_pres = len(fam.genes) / len(fam.organisms) + mean_pres = fam.number_of_genes() / fam.number_of_organisms() nb_multi = 0 for gene_list in fam.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = 
nb_multi / len(fam.organisms) + dup_ratio = nb_multi / fam.number_of_organisms() is_scm = False if dup_ratio < dup_margin: is_scm = True @@ -455,9 +455,9 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: - if len(fam.organisms) >= pan.number_of_organisms() * soft_core: + if fam.number_of_organisms() >= pan.number_of_organisms() * soft_core: soft.add(fam) - if len(fam.organisms) == pan.number_of_organisms(): + if fam.number_of_organisms() == pan.number_of_organisms(): core.add(fam) with write_compressed_or_not(output / "organisms_statistics.tsv", compress) as outfile: @@ -553,16 +553,16 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): nb_cloud += 1 row = [gene.ID if gene.local_identifier == "" else gene.local_identifier, contig.name, gene.start, gene.stop, gene.strand, gene.family.name, - len(gene.family.get_genes_per_org(org)), gene.family.named_partition, + len(list(gene.family.get_genes_per_org(org))), gene.family.named_partition, nb_pers, nb_shell, nb_cloud] if needRegions: row.append(gene.RGP.name if gene.RGP is not None else gene.RGP) if needSpots: - if len(gene.family.spot) > 0: - spot = ','.join([str(s.ID) for s in gene.family.spot]) + if gene.family.number_of_spots() > 0: + spot = ','.join([str(spot.ID) for spot in gene.family.spots]) row.append(spot) if needModules: - if len(gene.family.modules) > 0: + if gene.family.number_of_modules() > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) row.append(modules) outfile.write("\t".join(map(str, row)) + "\n") @@ -603,9 +603,9 @@ def write_parts(output: Path, soft_core: float = 0.95): part_sets[fam.named_partition].add(fam.name) if fam.partition.startswith("S"): part_sets[fam.partition].add(fam.name) - if len(fam.organisms) >= pan.number_of_organisms() * soft_core: + if fam.number_of_organisms() >= pan.number_of_organisms() * soft_core: part_sets["soft_core"].add(fam.name) - if len(fam.organisms) == pan.number_of_organisms(): + if fam.number_of_organisms() == pan.number_of_organisms(): part_sets["exact_core"].add(fam.name) else: part_sets["exact_accessory"].add(fam.name) @@ -795,7 +795,7 @@ def write_org_modules(output: Path, compress: bool = False): for mod in pan.modules: mod_orgs = set() for fam in mod.families: - mod_orgs |= fam.organisms + mod_orgs |= set(fam.organisms) for org in mod_orgs: completion = round((org.number_of_families() + len(mod.families)) / len(mod.families), 2) fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 2404abd8..172a7614 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -35,7 +35,7 @@ def is_single_copy(family: GeneFamily, dup_margin: float = 0.95) -> bool: for gene_list in family.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / len(family.organisms) + dup_ratio = nb_multi / family.number_of_organisms() if dup_ratio < dup_margin: return True return False @@ -70,7 +70,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", elif partition_filter in ["core", "accessory", "softcore"]: if partition_filter == "core": for family in pangenome.gene_families: - if len(family.organisms) == nb_org: + if family.number_of_organisms() == nb_org: if single_copy: if is_single_copy(family, dup_margin): families.add(family) @@ -78,7 +78,7 @@ def 
get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "accessory": for family in pangenome.gene_families: - if len(family.organisms) < nb_org: + if family.number_of_organisms() < nb_org: if single_copy: if is_single_copy(family, dup_margin): families.add(family) @@ -86,7 +86,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "softcore": for family in pangenome.gene_families: - if len(family.organisms) >= nb_org * soft_core: + if family.number_of_organisms() >= nb_org * soft_core: if single_copy: if is_single_copy(family, dup_margin): families.add(family) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 74dda3a2..7abf1adb 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -112,12 +112,12 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") threshold = pangenome.number_of_organisms() * soft_core for fam in pangenome.gene_families: - if len(fam.organisms) >= threshold: + if fam.number_of_organisms() >= threshold: genefams.add(fam) elif partition == "core": logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: - if len(fam.organisms) == pangenome.number_of_organisms(): + if fam.number_of_organisms() == pangenome.number_of_organisms(): genefams.add(fam) elif "module_" in partition: logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 136b70e4..de498698 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -37,29 +37,14 @@ def __init__(self, family_id: int, name: str): self.ID = family_id self._edges = {} self._genePerOrg = defaultdict(set) - self.genes = set() + self._genes = set() self.removed = False # for the repeated family not added in the main graph self.sequence = "" self.partition = "" - self.spot = set() - self.modules = set() + self._spots = set() + self._modules = set() self.bitarray = None - def add_sequence(self, seq: str): - """Assigns a protein sequence to the gene family. - - :param seq: the sequence to add to the gene family - """ - assert isinstance(seq, str) and str != "", "Sequence must be a string and not empty" - self.sequence = seq - - def add_partition(self, partition: str): - """Assigns a partition to the gene family. It should be the raw partition name provided by NEM. 
- - :param partition: The partition - """ - self.partition = partition - @property def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name @@ -79,6 +64,97 @@ def named_partition(self) -> str: else: return "undefined" + @property + def neighbors(self) -> Set[GeneFamily]: + """Returns all the GeneFamilies that are linked with an edge + + :return: Neighbors + """ + for family in self._edges.keys(): + yield family + + @property + def edges(self) -> List[Edge]: + """Returns all Edges that are linked to this gene family + + :return: Edges of the gene family + """ + for edge in self._edges.values(): + yield edge + + @property + def genes(self): + for gene in self._genes: + yield gene + + @property + def organisms(self) -> Set[Organism]: + """Returns all the Organisms that have this gene family + + :return: Organisms that have this gene family + """ + try: + for org in self._genePerOrg.keys(): + yield org + except AttributeError: # then the genes have been added before they had organisms + for gene in self.genes: + self._genePerOrg[gene.organism].add(gene) + return self.organisms + except Exception: + raise Exception("An unexpected error occurs. Please report in our GitHub") + + @property + def spots(self): + for spot in self._spots: + yield spot + + @property + def modules(self): + for module in self._modules: + yield module + + def number_of_neighbor(self) -> int: + """Get the number of neighbor for the current gene family + """ + return len(self._edges.keys()) + + def number_of_edges(self) -> int: + """Get the number of edges for the current gene family + """ + return len(self._edges.values()) + + def number_of_genes(self) -> int: + """Get the number of genes for the current gene family + """ + return len(self._genes) + + def number_of_organisms(self) -> int: + """Get the number of organisms for the current gene family + """ + return len(self._genePerOrg.keys()) + + def number_of_spots(self) -> int: + """Get the number of spots for the current gene family + """ + return len(self._spots) + + def number_of_modules(self) -> int: + """Get the number of modules for the current gene family + """ + return len(self._modules) + + def set_edge(self, target: GeneFamily, edge: Edge): + self._edges[target] = edge + + def add_sequence(self, seq: str): + """Assigns a protein sequence to the gene family. + + :param seq: the sequence to add to the gene family + """ + assert isinstance(seq, str), "Sequence must be a string" + + self.sequence = seq + def add_gene(self, gene: Gene): """Add a gene to the gene family, and sets the gene's :attr:family accordingly. @@ -88,11 +164,17 @@ def add_gene(self, gene: Gene): """ if not isinstance(gene, Gene): raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") - self.genes.add(gene) + self._genes.add(gene) gene.family = self if hasattr(gene, "organism"): self._genePerOrg[gene.organism].add(gene) + def add_spot(self, spot: Spot): + self._spots.add(spot) + + def add_module(self, module: Module): + self._modules.add(module) + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence/absence of the family in the pangenome using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
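Since `genes`, `organisms`, `spots` and `modules` are now generator properties and the `number_of_*()` helpers replace `len(...)` on them, call sites either use the counters or materialise the generator before doing set arithmetic, which is the pattern the other modules in this patch follow (e.g. `set(fam.organisms)` in nem/partition.py below). A small caller-side sketch, assuming the refactored classes are importable and using illustrative identifiers:

# Migration sketch for the generator-based GeneFamily API (illustrative data, not part of the patch).
from ppanggolin.genome import Gene
from ppanggolin.geneFamily import GeneFamily

fam = GeneFamily(family_id=1, name="fam_0001")
for i in range(2):
    gene = Gene(f"gene_{i}")
    gene.fill_annotations(start=1 + 10 * i, stop=10 + 10 * i, strand="+", position=i, genetic_code=11)
    fam.add_gene(gene)

genes_in_fam = set(fam.genes)                      # materialise the generator before set maths
assert fam.number_of_genes() == len(genes_in_fam) == 2   # number_of_genes() replaces len(fam.genes)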
@@ -138,41 +220,11 @@ def get_genes_per_org(self, org: Organism) -> Set[Gene]: :return: a set of gene(s) """ try: - return self._genePerOrg[org] + for gene in self._genePerOrg[org]: + yield gene except AttributeError: for gene in self.genes: self._genePerOrg[gene.organism].add(gene) - return self._genePerOrg[org] + return self.get_genes_per_org(org) except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") - - @property - def neighbors(self) -> Set[GeneFamily]: - """Returns all the GeneFamilies that are linked with an edge - - :return: Neighbors - """ - return set(self._edges.keys()) - - @property - def edges(self) -> List[Edge]: - """Returns all Edges that are linked to this gene family - - :return: Edges of the gene family - """ - return list(self._edges.values()) - - @property - def organisms(self) -> Set[Organism]: - """Returns all the Organisms that have this gene family - - :return: Organisms that have this gene family - """ - try: - return set(self._genePerOrg.keys()) - except AttributeError: # then the genes have been added before they had organisms - for gene in self.genes: - self._genePerOrg[gene.organism].add(gene) - return set(self._genePerOrg.keys()) - except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") + raise Exception("An unexpected error occurs. Please report in our GitHub") \ No newline at end of file diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index 2f6431b4..1b6e0d97 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -32,17 +32,17 @@ def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): erase_pangenome(pangenome, modules=True) -def compute_mod_graph(organisms: list, t: int = 1, disable_bar: bool = False): +def compute_mod_graph(pangenome: Pangenome, t: int = 1, disable_bar: bool = False): """ Computes a graph using all provided genomes with a transitive closure of size t - :param organisms: the list of organisms to compute the graph with + :param pangenome: pangenome with organisms to compute the graph :param t: the size of the transitive closure :param disable_bar: whether to show a progress bar or not """ g = nx.Graph() - for org in tqdm(organisms, unit="genome", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): for contig in org.contigs: if len(contig) > 0: start_gene = contig[0] @@ -73,7 +73,7 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int """ # removing families with low presence - removed = set([fam for fam in g.nodes if len(fam.organisms) < min_fam]) + removed = set([fam for fam in g.nodes if fam.number_of_organisms() < min_fam]) modules = set() c = 0 @@ -109,7 +109,7 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = # compute the graph with transitive closure size provided as parameter start_time = time.time() logging.getLogger("PPanGGOLiN").info("Building the graph...") - g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) + g = compute_mod_graph(pangenome, t=transitive, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") logging.getLogger("PPanGGOLiN").info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index eccde1b1..4303b8c5 
100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -272,10 +272,11 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> index_org[org] = index for fam in pan.gene_families: + fam_organisms = set(fam.organisms) # could use bitarrays if this part is limiting? - if not organisms.isdisjoint(fam.organisms): + if not organisms.isdisjoint(fam_organisms): curr_dat = list(default_dat) - curr_orgs = fam.organisms & organisms + curr_orgs = fam_organisms & organisms for org in curr_orgs: curr_dat[index_org[org]] = "1" dat_file.write("\t".join(curr_dat) + "\n") diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index 8b0de528..bc54273c 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -89,7 +89,7 @@ def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, flo validated.add(node) for fam in ppp.pan.gene_families: - if not samp.isdisjoint(fam.organisms): # otherwise, useless to keep track of + if not samp.isdisjoint(set(fam.organisms)): # otherwise, useless to keep track of families.add(fam) cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0} diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 53040b49..52f4709f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -417,7 +417,7 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen if fam.named_partition == "persistent" or not persistent: dup = len([genes for org, genes in fam.get_org_dict().items() if len([gene for gene in genes if not gene.is_fragment]) > 1]) - if (dup / len(fam.organisms)) >= dup_margin: # tot / nborgs >= 1.05 + if (dup / fam.number_of_organisms()) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) return multigenics diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 7c91c5ed..c776a41a 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -258,7 +258,7 @@ def add_region(self, region): def spot_2_families(self): """Add to Gene Families a link to spot""" for family in self.families: - family.spot.add(self) + family.add_spot(self) def borders(self, set_size: int, multigenics): """ Extracts all the borders of all RGPs belonging to the spot @@ -402,7 +402,7 @@ def add_family(self, family: GeneFamily): """ if not isinstance(family, GeneFamily): raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - family.modules.add(self) + family.add_module(self) self._families.add(family) def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): From 21f34f2f3cc5316be6bbaa3eb5f98e37355b65a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 1 Aug 2023 10:43:23 +0200 Subject: [PATCH 27/75] Make number_of_* as property in GeneFamily --- VERSION | 2 +- ppanggolin/context/searchGeneContext.py | 4 +- ppanggolin/figures/tile_plot.py | 8 +-- ppanggolin/figures/ucurve.py | 2 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/formats/writeFlat.py | 34 ++++++------- ppanggolin/formats/writeMSA.py | 8 +-- ppanggolin/geneFamily.py | 66 +++++++++++++++++++------ ppanggolin/mod/module.py | 2 +- ppanggolin/pangenome.py | 2 +- 10 files changed, 82 insertions(+), 48 deletions(-) diff --git a/VERSION b/VERSION index 5d6aa2b6..c8b333ff 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.144 +1.2.145 diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 29e0e04f..47512d56 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -219,10 +219,10 @@ def export_to_dataframe(families: Set[GeneFamily], gene_contexts: Set[GeneContex for family in gene_context.families: line = [gene_context.ID] if fam_to_seq is None or fam_to_seq.get(family.ID) is None: - line += [family.name, None, family.number_of_organisms(), family.named_partition] + line += [family.name, None, family.number_of_organisms, family.named_partition] else: line += [family.name, ','.join(fam_to_seq.get(family.ID)), - family.number_of_organisms(), family.named_partition] + family.number_of_organisms, family.named_partition] lines.append(line) df = pd.DataFrame(lines, columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"] diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 21ef9ff8..78fd5032 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -84,13 +84,13 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di partitions_dict[fam.partition].append(fam) if fam.partition.startswith("S"): shell_subs.add(fam.partition) # number of elements will tell the number of subpartitions - ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: n.number_of_organisms(), reverse=True) - ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: n.number_of_organisms(), reverse=True) + ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: n.number_of_organisms, reverse=True) + ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: n.number_of_organisms, reverse=True) sep_p = len(ordered_nodes_p) - 0.5 separators = [sep_p] shell_na = None if len(shell_subs) == 1: - ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: n.number_of_organisms(), reverse=True) + ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: n.number_of_organisms, reverse=True) ordered_nodes = ordered_nodes_p + ordered_nodes_s + ordered_nodes_c separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) separators.append(separators[len(separators) - 1] + len(ordered_nodes_c)) @@ -99,7 +99,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di for subpartition in sorted(shell_subs): if subpartition == "S_": shell_na = len(separators) - 1 - ordered_nodes_s = sorted(partitions_dict[subpartition], 
key=lambda n: n.number_of_organisms(), reverse=True) + ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: n.number_of_organisms, reverse=True) ordered_nodes += ordered_nodes_s separators.append(separators[len(separators) - 1] + len(ordered_nodes_s)) ordered_nodes += ordered_nodes_c diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index e76f51ec..638d3e7e 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -28,7 +28,7 @@ def draw_ucurve(pangenome: Pangenome, output: Path, soft_core: float = 0.95, di is_partitioned = False has_undefined = False for fam in pangenome.gene_families: - nb_org = fam.number_of_organisms() + nb_org = fam.number_of_organisms if fam.partition != "": is_partitioned = True if fam.partition == "U": diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index f31173c0..1b22c15f 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -734,7 +734,7 @@ def getmin(arg: iter) -> float: part_set = set() for fam in pangenome.gene_families: named_part_counter[fam.named_partition] += 1 - part_distribs[fam.named_partition].append(fam.number_of_organisms() / pangenome.number_of_organisms()) + part_distribs[fam.named_partition].append(fam.number_of_organisms / pangenome.number_of_organisms()) if fam.named_partition == "shell": subpart_counter[fam.partition] += 1 if fam.partition != "S_": diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index ce8a7ab5..a1cd5cf3 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -60,7 +60,7 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): :param gene_fam: file-like object, compressed or not :param json: file-like object, compressed or not """ - json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {gene_fam.number_of_genes()}, ' + json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {gene_fam.number_of_genes}, ' f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + '}') org_dict = {} name_counts = Counter() @@ -232,22 +232,22 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') + f'{"exact_accessory" if fam.number_of_organisms != pan.number_of_organisms() else "exact_core"}" />\n') gexf.write(f' = (pan.number_of_organisms() * soft_core) else "soft_accessory"}"' + f'{"soft_core" if fam.number_of_organisms >= (pan.number_of_organisms() * soft_core) else "soft_accessory"}"' f' />\n') gexf.write(f' \n') gexf.write(f' \n') - gexf.write(f' \n') + gexf.write(f' \n') if not light: for org, genes in fam.get_org_dict().items(): gexf.write( @@ -373,9 +373,9 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool matrix.write(sep.join(['"' + fam.name + '"', # 1 '"' + alt + '"', # 2 '"' + str(product.most_common(1)[0][0]) + '"', # 3 - '"' + str(fam.number_of_organisms()) + '"', # 4 - '"' + str(fam.number_of_genes()) + '"', # 5 - '"' + str(round(fam.number_of_genes() / fam.number_of_organisms(), 2)) + '"', # 6 + '"' + str(fam.number_of_organisms) + '"', # 4 + '"' + str(fam.number_of_genes) + '"', # 5 + '"' + str(round(fam.number_of_genes / fam.number_of_organisms, 2)) + '"', # 6 '"NA"', # 7 '"NA"', # 8 
'""', # 9 @@ -436,12 +436,12 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, "\n") for fam in pan.gene_families: if fam.named_partition == "persistent": - mean_pres = fam.number_of_genes() / fam.number_of_organisms() + mean_pres = fam.number_of_genes / fam.number_of_organisms nb_multi = 0 for gene_list in fam.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / fam.number_of_organisms() + dup_ratio = nb_multi / fam.number_of_organisms is_scm = False if dup_ratio < dup_margin: is_scm = True @@ -455,9 +455,9 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: - if fam.number_of_organisms() >= pan.number_of_organisms() * soft_core: + if fam.number_of_organisms >= pan.number_of_organisms() * soft_core: soft.add(fam) - if fam.number_of_organisms() == pan.number_of_organisms(): + if fam.number_of_organisms == pan.number_of_organisms(): core.add(fam) with write_compressed_or_not(output / "organisms_statistics.tsv", compress) as outfile: @@ -558,11 +558,11 @@ def write_org_file(org: Organism, output: Path, compress: bool = False): if needRegions: row.append(gene.RGP.name if gene.RGP is not None else gene.RGP) if needSpots: - if gene.family.number_of_spots() > 0: + if gene.family.number_of_spots > 0: spot = ','.join([str(spot.ID) for spot in gene.family.spots]) row.append(spot) if needModules: - if gene.family.number_of_modules() > 0: + if gene.family.number_of_modules > 0: modules = ','.join(["module_" + str(module.ID) for module in gene.family.modules]) row.append(modules) outfile.write("\t".join(map(str, row)) + "\n") @@ -603,9 +603,9 @@ def write_parts(output: Path, soft_core: float = 0.95): part_sets[fam.named_partition].add(fam.name) if fam.partition.startswith("S"): part_sets[fam.partition].add(fam.name) - if fam.number_of_organisms() >= pan.number_of_organisms() * soft_core: + if fam.number_of_organisms >= pan.number_of_organisms() * soft_core: part_sets["soft_core"].add(fam.name) - if fam.number_of_organisms() == pan.number_of_organisms(): + if fam.number_of_organisms == pan.number_of_organisms(): part_sets["exact_core"].add(fam.name) else: part_sets["exact_accessory"].add(fam.name) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 172a7614..ff0af41e 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -35,7 +35,7 @@ def is_single_copy(family: GeneFamily, dup_margin: float = 0.95) -> bool: for gene_list in family.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 - dup_ratio = nb_multi / family.number_of_organisms() + dup_ratio = nb_multi / family.number_of_organisms if dup_ratio < dup_margin: return True return False @@ -70,7 +70,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", elif partition_filter in ["core", "accessory", "softcore"]: if partition_filter == "core": for family in pangenome.gene_families: - if family.number_of_organisms() == nb_org: + if family.number_of_organisms == nb_org: if single_copy: if is_single_copy(family, dup_margin): families.add(family) @@ -78,7 +78,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "accessory": for family in pangenome.gene_families: - if family.number_of_organisms() < nb_org: + if family.number_of_organisms < nb_org: if single_copy: if is_single_copy(family, 
dup_margin): families.add(family) @@ -86,7 +86,7 @@ def get_families_to_write(pangenome: Pangenome, partition_filter: str = "core", families.add(family) elif partition_filter == "softcore": for family in pangenome.gene_families: - if family.number_of_organisms() >= nb_org * soft_core: + if family.number_of_organisms >= nb_org * soft_core: if single_copy: if is_single_copy(family, dup_margin): families.add(family) diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index de498698..6a721264 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -7,8 +7,7 @@ import logging # installed libraries -from typing import Dict, List, Set - +from typing import Dict, Generator, Set import gmpy2 # local libraries @@ -20,14 +19,44 @@ class GeneFamily(MetaFeatures): """ This represents a single gene family. It will be a node in the pangenome graph, and be aware of its genes and edges. - - :param family_id: The internal identifier to give to the gene family - :type family_id: any - :param name: The name of the gene family (to be printed in output files) - :type name: str + Methods: + - named_partition: returns a meaningful name for the partition associated with the family. + - neighbors: returns all the GeneFamilies that are linked with an edge. + - edges: returns all Edges that are linked to this gene family. + - genes: returns all the genes associated with the family. + - organisms: returns all the Organisms that have this gene family. + - spots: returns all the spots associated with the family. + - modules: returns all the modules associated with the family. + - number_of_neighbor: returns the number of neighbor GeneFamilies. + - number_of_edges: returns the number of edges. + - number_of_genes: returns the number of genes. + - number_of_organisms: returns the number of organisms. + - number_of_spots: returns the number of spots. + - number_of_modules: returns the number of modules. + - set_edge: sets an edge between the current family and a target family. + - add_sequence: assigns a protein sequence to the gene family. + - add_gene: adds a gene to the gene family and sets the gene's family accordingly. + - add_spot: adds a spot to the gene family. + - add_module: adds a module to the gene family. + - mk_bitarray: produces a bitarray representing the presence/absence of the family in the pangenome using the provided index. + - get_org_dict: returns a dictionary of organisms as keys and sets of genes as values. + - get_genes_per_org: returns the genes belonging to the gene family in the given organism. + + Fields: + - name: the name of the gene family. + - ID: the internal identifier of the gene family. + - removed: a boolean indicating whether the family has been removed from the main graph. + - sequence: the protein sequence associated with the family. + - partition: the partition associated with the family. 
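For callers, this commit's change is purely syntactic: the number_of_* counters lose their call parentheses once they become read-only properties. A small sketch, assuming GeneFamily as patched in this commit (the three genes are illustrative):

    from ppanggolin.geneFamily import GeneFamily
    from ppanggolin.genome import Gene, Organism

    fam = GeneFamily(1, "fam_1")
    for i in range(3):
        gene = Gene(f"gene_{i}")
        gene.fill_parents(Organism(f"org_{i}"), None)
        fam.add_gene(gene)

    # before this commit: fam.number_of_organisms() and fam.number_of_genes() were methods
    assert fam.number_of_organisms == 3  # now read-only properties, as in the callers above
    assert fam.number_of_genes == 3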
""" def __init__(self, family_id: int, name: str): + """Constructor method + :param family_id: The internal identifier to give to the gene family + :type family_id: any + :param name: The name of the gene family (to be printed in output files) + :type name: str + """ assert isinstance(family_id, int), "GeneFamily object id should be an integer" assert isinstance(name, str), "GeneFamily object name should be a string" assert name != '', "GeneFamily object cannot be created with an empty name" @@ -49,12 +78,12 @@ def __init__(self, family_id: int, name: str): def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name - :raises Exception: If the gene family has no partition assigned - :return: the partition name of the gene family + + :raises ValueError: If the gene family has no partition assigned """ if self.partition == "": - raise Exception("The gene family has not beed associated to a partition") + raise ValueError("The gene family has not beed associated to a partition") if self.partition.startswith("P"): return "persistent" elif self.partition.startswith("C"): @@ -65,7 +94,7 @@ def named_partition(self) -> str: return "undefined" @property - def neighbors(self) -> Set[GeneFamily]: + def neighbors(self) -> Generator[GeneFamily, None, None]: """Returns all the GeneFamilies that are linked with an edge :return: Neighbors @@ -74,7 +103,7 @@ def neighbors(self) -> Set[GeneFamily]: yield family @property - def edges(self) -> List[Edge]: + def edges(self) -> Generator[Edge, None, None]: """Returns all Edges that are linked to this gene family :return: Edges of the gene family @@ -88,7 +117,7 @@ def genes(self): yield gene @property - def organisms(self) -> Set[Organism]: + def organisms(self) -> Generator[Organism, None, None]: """Returns all the Organisms that have this gene family :return: Organisms that have this gene family @@ -104,40 +133,45 @@ def organisms(self) -> Set[Organism]: raise Exception("An unexpected error occurs. 
Please report in our GitHub") @property - def spots(self): + def spots(self) -> Generator[Spot, None, None]: for spot in self._spots: yield spot @property - def modules(self): + def modules(self) -> Generator[Module, None, None]: for module in self._modules: yield module - + @property def number_of_neighbor(self) -> int: """Get the number of neighbor for the current gene family """ return len(self._edges.keys()) + @property def number_of_edges(self) -> int: """Get the number of edges for the current gene family """ return len(self._edges.values()) + @property def number_of_genes(self) -> int: """Get the number of genes for the current gene family """ return len(self._genes) + @property def number_of_organisms(self) -> int: """Get the number of organisms for the current gene family """ return len(self._genePerOrg.keys()) + @property def number_of_spots(self) -> int: """Get the number of spots for the current gene family """ return len(self._spots) + @property def number_of_modules(self) -> int: """Get the number of modules for the current gene family """ diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index 1b6e0d97..d558b6c8 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -73,7 +73,7 @@ def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int """ # removing families with low presence - removed = set([fam for fam in g.nodes if fam.number_of_organisms() < min_fam]) + removed = set([fam for fam in g.nodes if fam.number_of_organisms < min_fam]) modules = set() c = 0 diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 52f4709f..7abc0c2b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -417,7 +417,7 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen if fam.named_partition == "persistent" or not persistent: dup = len([genes for org, genes in fam.get_org_dict().items() if len([gene for gene in genes if not gene.is_fragment]) > 1]) - if (dup / fam.number_of_organisms()) >= dup_margin: # tot / nborgs >= 1.05 + if (dup / fam.number_of_organisms) >= dup_margin: # tot / nborgs >= 1.05 multigenics.add(fam) return multigenics From 21d8281c6d16b166047ef5dafe3c4e4c877387d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 2 Aug 2023 14:36:29 +0200 Subject: [PATCH 28/75] Tests GeneFamily class --- VERSION | 2 +- ppanggolin/geneFamily.py | 59 ++--- ppanggolin/genome.py | 7 +- tests/test_GeneFamily.py | 534 ++++++++++++++++++++------------------- 4 files changed, 316 insertions(+), 286 deletions(-) diff --git a/VERSION b/VERSION index c8b333ff..70c2ba9d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.145 +1.2.146 diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 6a721264..008de4f1 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -74,6 +74,9 @@ def __init__(self, family_id: int, name: str): self._modules = set() self.bitarray = None + def __repr__(self): + return f"{self.ID}: {self.name}" + @property def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name @@ -99,8 +102,8 @@ def neighbors(self) -> Generator[GeneFamily, None, None]: :return: Neighbors """ - for family in self._edges.keys(): - yield family + for neighbor in self._edges.keys(): + yield neighbor @property def edges(self) -> Generator[Edge, None, None]: @@ -122,15 +125,10 @@ def organisms(self) -> Generator[Organism, None, None]: :return: Organisms that have this gene family """ - try: - for org in 
self._genePerOrg.keys(): - yield org - except AttributeError: # then the genes have been added before they had organisms - for gene in self.genes: - self._genePerOrg[gene.organism].add(gene) - return self.organisms - except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() + for org in self._genePerOrg.keys(): + yield org @property def spots(self) -> Generator[Spot, None, None]: @@ -142,7 +140,7 @@ def modules(self) -> Generator[Module, None, None]: for module in self._modules: yield module @property - def number_of_neighbor(self) -> int: + def number_of_neighbors(self) -> int: """Get the number of neighbor for the current gene family """ return len(self._edges.keys()) @@ -163,6 +161,8 @@ def number_of_genes(self) -> int: def number_of_organisms(self) -> int: """Get the number of organisms for the current gene family """ + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() return len(self._genePerOrg.keys()) @property @@ -200,13 +200,19 @@ def add_gene(self, gene: Gene): raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") self._genes.add(gene) gene.family = self - if hasattr(gene, "organism"): + if gene.organism is not None: self._genePerOrg[gene.organism].add(gene) def add_spot(self, spot: Spot): + from ppanggolin.region import Spot # prevent circular import error + if not isinstance(spot, Spot): + raise TypeError(f"A spot object is expected, you give a {type(spot)}") self._spots.add(spot) def add_module(self, module: Module): + from ppanggolin.region import Module # prevent circular import error + if not isinstance(module, Module): + raise TypeError(f"A module object is expected, you give a {type(module)}") self._modules.add(module) def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): @@ -237,28 +243,23 @@ def get_org_dict(self) -> Dict[Organism, Set[Gene]]: :return: a dictionnary of organism as key and set of genes as values """ - try: - return self._genePerOrg - except AttributeError: + if len(self._genePerOrg) == 0: for gene in self.genes: + if gene.organism is None: + raise AttributeError(f"Gene: {gene.name} is not fill with organism") self._genePerOrg[gene.organism].add(gene) - return self._genePerOrg - except Exception: - raise Exception("An unexpected error occurs. Please report in our GitHub") + return self._genePerOrg - def get_genes_per_org(self, org: Organism) -> Set[Gene]: + def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: """Returns the genes belonging to the gene family in the given Organism :param org: Organism to look for :return: a set of gene(s) """ - try: - for gene in self._genePerOrg[org]: - yield gene - except AttributeError: - for gene in self.genes: - self._genePerOrg[gene.organism].add(gene) - return self.get_genes_per_org(org) - except Exception: - raise Exception("An unexpected error occurs. 
Please report in our GitHub") \ No newline at end of file + if len(self._genePerOrg) == 0: + _ = self.get_org_dict() + if org not in self._genePerOrg: + raise KeyError(f"Organism don't belong to the gene family: {self.name}") + for gene in self._genePerOrg[org]: + yield gene \ No newline at end of file diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 158d289b..ec87838d 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -150,6 +150,7 @@ def fill_parents(self, organism: Organism = None, contig: Contig = None): :param contig: Parent contig """ if organism is not None: + # TODO test type self.organism = organism if contig is not None: self.contig = contig @@ -282,6 +283,7 @@ class Contig: def __init__(self, name: str, is_circular: bool = False): """Constructor method + :param name: Name of the contig :param is_circular: save if the contig is circular """ @@ -341,7 +343,8 @@ def genes(self) -> list: :return: list of gene in contig """ for gene in self._genes_position: - yield gene + if gene is not None: + yield gene @property def organism(self) -> Organism: @@ -438,7 +441,7 @@ def number_of_families(self) -> int: return len(self._families) @property - def genes(self) -> Iterator[Gene]: + def genes(self) -> Generator[Gene, None, None]: """ Generator to get genes in organism :return: Generator of genes in organism diff --git a/tests/test_GeneFamily.py b/tests/test_GeneFamily.py index 53023fe4..9a108f49 100644 --- a/tests/test_GeneFamily.py +++ b/tests/test_GeneFamily.py @@ -2,265 +2,291 @@ import pytest from random import randint, sample - +from typing import Generator, Set from collections import defaultdict +from itertools import combinations from ppanggolin.pangenome import Edge from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene - - -def test_cstr(): - identifier = 33 - name = "33" - o_family = GeneFamily(identifier, name) - assert isinstance(o_family, GeneFamily) - - for attr in "ID", "name", "genes", \ - "removed", "sequence", "partition": - assert hasattr(o_family, attr) - assert o_family.ID == identifier - assert o_family.name == name - assert o_family.genes == set() - assert o_family.removed is False - assert o_family.sequence == "" - assert o_family.partition == "" - - -@pytest.fixture() -def o_family(): - return GeneFamily(33, "trente-trois") - - -def test_add_sequence(o_family): - seq = "un de troa" - o_family.add_sequence(seq) - assert o_family.sequence == seq - - -def test_add_partition(o_family): - partition = "un de troa" - o_family.add_partition(partition) - assert o_family.partition == partition - - -def test_named_partition_error(o_family): - with pytest.raises(Exception): - o_family.named_partition - - -@pytest.mark.parametrize("partition, name", - [ - ("P", "persistent"), - ("Pp", "persistent"), - ("P whatever, only first letter is important", "persistent"), - ("C", "cloud"), - ("C loud", "cloud"), - ("C whatever, only first letter is important", "cloud"), - ("S", "shell"), - ("Shut", "shell"), - ("S whatever, only first letter is important", "shell"), - ("un de troa kvar", "undefined"), - ("1", "undefined"), - ("p", "undefined"), - ("c", "undefined"), - ("s", "undefined"), - ]) -def test_named_partition(o_family, partition, name): - o_family.add_partition(partition) - assert o_family.named_partition == name - - -@pytest.fixture() -def lo_genes(): - return [Gene(str(i)) for i in range(4)] - - -def test_add_gene_error(o_family, lo_genes): - with pytest.raises(TypeError): - o_family.add_gene(33) - - -def 
test_add_gene_solo(o_family, lo_genes): - o_gene = Gene(33) - o_family.add_gene(o_gene) - assert o_family.genes == {o_gene} - assert o_gene.family == o_family - - -def test_add_gene_many(o_family, lo_genes): - """ fill the family with genes from the same organism""" - organism = "organism" - for o_gene in lo_genes * 4: # *4 to assert duplicates are not considered - o_gene.fill_parents(organism, None) - o_family.add_gene(o_gene) - assert o_gene.family == o_family - assert o_family.genes == set(lo_genes) - - -def test_mk_bitarray_no_org(o_family): - # index is meaningless - o_family.mk_bitarray(None) - assert o_family.bitarray == 0 - - -def test_mk_bitarray_with_org(o_family): - organism = "organism" - o_gene = Gene(33) - o_gene.fill_parents(organism, None) - - o_family.add_gene(o_gene) - - for i in 1, 3, 7, 12: - index = {organism: i} - o_family.mk_bitarray(index) - assert o_family.bitarray == 1 << i - - -def test_get_org_dict_error(o_family): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_get_org_dict_empty(o_family): - dd = o_family.get_org_dict() - assert isinstance(dd, defaultdict) - assert 0 == len(dd) - - -def test_get_org_dict(o_family, lo_genes): - """ in lo_genes, none has organism. - I'll add one, several times, creating several sets.""" - n_orgs = randint(2, 10) - for org in range(n_orgs): - for o_gene in lo_genes: - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) +from ppanggolin.genome import Gene, Organism, Contig +from ppanggolin.region import Spot, Module - dd = o_family.get_org_dict() - assert n_orgs == len(dd) - for org in dd: - assert dd[org] == set(lo_genes) - # Note: after integration, genes can be edited - # which leads to inconsistent results. - # here the same genes are refered to 2 orgs. - # IMO this would be user pb as it is insane user behavior. - - -def test_get_genes_per_org_error(o_family): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_get_genes_per_org_no_gene(o_family): - org = "org" - - s_genes = o_family.get_genes_per_org(org) - assert 0 == len(s_genes) - - -def test_get_genes_per_org(o_family, lo_genes): - org = "org" - for o_gene in lo_genes: - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) - s_genes = o_family.get_genes_per_org(org) - assert s_genes == set(lo_genes) - - -def test_organisms_error(o_family, lo_genes): - with pytest.raises(AttributeError): - o_family.discard('_genePerOrg') - # I don't get how this can happen - - -def test_organisms_empty(o_family, lo_genes): - assert set() == o_family.organisms - - -def test_organisms(o_family, lo_genes): - l_org = [] - for o_gene in lo_genes: - org = randint(0, 5) - o_gene.fill_parents(org, None) - o_family.add_gene(o_gene) - l_org.append(org) - - assert set(l_org) == o_family.organisms - - -def test_neighbors_empty(o_family): - assert o_family.neighbors == set() - - -@pytest.fixture -def filled_families(): - """ - return a list of families and genes. - there will be between 3 and 10 genes/families. - Each family has only one gene. 
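The function-style tests removed here are replaced below by a class-based pytest suite built on yield fixtures. A minimal sketch of that layout (the class and test names are illustrative, not part of the patch):

    import pytest

    from ppanggolin.geneFamily import GeneFamily
    from ppanggolin.genome import Gene

    class TestGeneFamilySketch:
        @pytest.fixture
        def family(self):
            yield GeneFamily(1, "test")

        def test_add_gene(self, family):
            gene = Gene("gene_1")
            family.add_gene(gene)
            assert gene in set(family.genes)
            assert gene.family == family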
+class TestGeneFamily: + """Tests the gene family class """ - lo_genes = [] - lo_fam = [] - - n_families = randint(3, 10) - for fam in range(n_families): - o_gene = Gene(fam) - o_gene.fill_parents(None, None) - - o_family = GeneFamily(fam, fam) - o_family.add_gene(o_gene) - - lo_genes.append(o_gene) - lo_fam.append(o_family) - - return lo_fam, lo_genes - - -def test_neighbors(filled_families): - lo_fam, lo_genes = filled_families - - # get several genes and make an edge - # between them and the first of the list - n_genes = randint(2, len(lo_genes)) - sample_genes = sample(lo_genes, n_genes) - for o_gene in sample_genes: - # it is strange to me to update family attribute from another class. - Edge(lo_genes[0], o_gene) - # we have 0->{*} - - # first gene belong to the first family - # let's get the family neighbors - # set because order is not guaranted - s = set(lo_fam[0].neighbors) - print(s) - assert n_genes == len(s) - - xpected = {g.family for g in sample_genes} - assert xpected == s - - -def test_edges_empty(o_family): - d = o_family.edges - assert 0 == len(d) - - -def test_edges(filled_families): - lo_fam, lo_genes = filled_families - - # get several genes and make an edge - # between them and the first of the list - n_genes = randint(2, len(lo_genes)) - sample_genes = sample(lo_genes, n_genes) - l_edges = [] - for o_gene in sample_genes: - # it is strange to me to update family attribute from another class. - l_edges.append(Edge(lo_genes[0], o_gene)) - # we have 0->{*} - edge_list = lo_fam[0].edges - # set because order is not guaranted - assert set(l_edges) == set(edge_list) + def test_create_gene_family(self): + """Tests that a GeneFamily object can be created with valid family_id and name + """ + family = GeneFamily(1, 'test') + assert isinstance(family, GeneFamily) + assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", "sequence", "partition", + "_spots", "_modules", "bitarray", "_metadataGetter"] for attr in + family.__dict__) # Check that no attribute was added else it should be tested + assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", + "sequence", "partition", "_spots", "_modules", + "bitarray"]) # Check that no attribute was removed else it should be tested + assert family.ID == 1 + assert family.name == 'test' + assert family._edges == {} + assert family._genePerOrg == {} + assert family._genes == set() + assert not family.removed # for the repeated family not added in the main graph + assert family.sequence == "" + assert family.partition == "" + assert family._spots == set() + assert family._modules == set() + assert family.bitarray is None + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a gene family for all tests""" + yield GeneFamily(1, "test") + + @pytest.mark.parametrize("partition, name", + [ + ("P", "persistent"), + ("Pp", "persistent"), + ("P whatever, only first letter is important", "persistent"), + ("C", "cloud"), + ("C loud", "cloud"), + ("C whatever, only first letter is important", "cloud"), + ("S", "shell"), + ("Shut", "shell"), + ("S whatever, only first letter is important", "shell"), + ("un de troa kvar", "undefined"), + ("1", "undefined"), + ("p", "undefined"), + ("c", "undefined"), + ("s", "undefined"), + ]) + def test_get_named_partition_of_gene_family_object(self, family, partition, name): + """Tests that the named partition of a GeneFamily object can be retrieved + """ + family.partition = partition + assert 
family.named_partition == name + + def test_get_named_partition_error_partition_empty(self, family): + """Tests that if no partition given to gene family, raise a ValueError + """ + with pytest.raises(ValueError): + _ = family.named_partition + + def test_add_sequence_to_gene_family(self, family): + """Tests that a sequence can be added to a GeneFamily object + """ + family.add_sequence('ATCG') + assert family.sequence == 'ATCG' + + def test_add_gene_to_gene_family(self, family): + """Tests that a Gene object can be added to a GeneFamily object + """ + family = GeneFamily(1, 'test') + gene = Gene('gene1') + family.add_gene(gene) + assert gene in family.genes + assert gene.family == family + + def test_add_gene_error(self, family): + """Tests that a non gene object can't be added to a GeneFamily as gene + """ + with pytest.raises(TypeError): + family.add_gene(33) + + @pytest.fixture + def genes(self) -> Generator[Set[Gene], None, None]: + """Creeate a set of genes to fill gene families + """ + genes = set() + for i in range(1, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10*(i-1) + 1, stop=10*i, strand='+', position=i, genetic_code=4) + genes.add(gene) + yield genes + + def test_get_number_of_genes(self, family, genes): + """Tests that the number of genes can be retrieved + """ + for gene in genes: + family.add_gene(gene) + assert isinstance(family.number_of_genes, int) + assert family.number_of_genes == len(genes) + + @pytest.fixture + def organisms(self, genes) -> Generator[Set[Organism], None, None]: + """Create a set of organisms fill with genes to test edges + """ + organisms = set() + genes = list(genes) + nb_organisms = randint(2, 10) + nb_genes_per_organisms = len(genes) // nb_organisms + idx_org = 1 + while idx_org < nb_organisms: + organism = Organism(f"organism_{idx_org}") + contig = Contig(f"contig_{idx_org}") + organism.add_contig(contig) + idx_genes = 0 + while idx_genes < nb_genes_per_organisms: + gene = genes[(idx_org - 1) * nb_genes_per_organisms + idx_genes] + gene.fill_parents(organism, contig) + contig[gene.start] = gene + idx_genes += 1 + organisms.add(organism) + idx_org += 1 + # last family fill with all the gene left + organism = Organism(f"organism_{idx_org}") + contig = Contig(f"contig_{idx_org}") + organism.add_contig(contig) + idx_genes = (idx_org - 1) * nb_genes_per_organisms + while idx_genes < len(genes): + gene = genes[idx_genes] + gene.fill_parents(organism, contig) + contig[gene.start] = gene + idx_genes += 1 + organisms.add(organism) + yield organisms + + def test_get_org_dict(self, family, genes, organisms): + """""" + for gene in genes: + family.add_gene(gene) + org_dict = family.get_org_dict() + assert isinstance(org_dict, dict) + assert all(isinstance(org, Organism) for org in org_dict.keys()) + assert all(isinstance(gene, Gene) for gene_set in org_dict.values() for gene in gene_set) + assert set(org_dict.keys()) == organisms + assert set([gene for gene_set in org_dict.values() for gene in gene_set]) == genes + + def test_get_org_dict_with_no_organism_fill_to_genes(self, family, genes): + for gene in genes: + family.add_gene(gene) + with pytest.raises(AttributeError): + _ = family.get_org_dict() + + def test_organisms(self, family, organisms, genes): + for gene in genes: + family.add_gene(gene) + assert set(family.organisms) == organisms + + def test_number_of_organism(self, family, organisms, genes): + for gene in genes: + family.add_gene(gene) + assert isinstance(family.number_of_organisms, int) + assert 
family.number_of_organisms == len(organisms) + + def test_get_genes_per_org(self, family, organisms, genes): + for gene in genes: + family.add_gene(gene) + for organism in organisms: + assert set(family.get_genes_per_org(organism)) == set(organism.genes) + + def test_get_genes_per_org_if_org_not_in_family(self, family): + with pytest.raises(KeyError): + org = Organism("organism") + _ = set(family.get_genes_per_org(org)) + + @pytest.fixture + def families(self, genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(2, 10) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + @pytest.fixture + def edges(self, families, genes) -> Generator[Set[Edge], None, None]: + """Create a set of edges fill with genes and gene families to test edges + """ + edges = {} + pair_genes = combinations(genes, 2) + for pair in pair_genes: + key = frozenset([pair[0].family, pair[1].family]) + edge = edges.get(key) + if edge is None: + edge = Edge(pair[0], pair[1]) + edges[key] = edge + else: + edge.add_genes(pair[0], pair[1]) + pair[0].family.set_edge(pair[1].family, edge) + pair[1].family.set_edge(pair[0].family, edge) + yield set(edges.values()) + + def test_get_neighbors_of_gene_family(self, families, edges): + for family in families: + assert all(isinstance(neighbor, GeneFamily) for neighbor in family.neighbors) + expected_neighbors = set([edge.source for edge in edges + if edge.target == family]).union(set([edge.target for edge in edges + if edge.source == family])) + assert set(family.neighbors) == expected_neighbors + + def test_get_number_of_neighbors(self, families, edges): + for family in families: + expected_neighbors = set([edge.source for edge in edges + if edge.target == family]).union(set([edge.target for edge in edges + if edge.source == family])) + assert isinstance(family.number_of_neighbors, int) + assert family.number_of_neighbors == len(expected_neighbors) + + # Tests that the edges of a GeneFamily object can be retrieved + def test_get_edges_of_gene_family(self, families, edges): + for family in families: + expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + assert all(isinstance(edge, Edge) for edge in family.edges) + assert set(family.edges) == expected_edges + + def test_get_number_of_edges(self, families, edges): + for family in families: + expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) + assert isinstance(family.number_of_edges, int) + assert family.number_of_neighbors == len(expected_edges) + + def test_add_spot_to_gene_family(self, family): + """Tests that a Spot object can be added to a GeneFamily object + """ + spot = Spot('spot1') + family.add_spot(spot) + assert spot in family.spots + + def test_add_non_spot_as_spot_in_family(self, family): + 
"""Tests that a non-spot object cannot be added to Gene Family + """ + with pytest.raises(TypeError): + family.add_spot(323) + + def test_add_module_to_gene_family(self, family): + """Tests that a Module object can be added to a GeneFamily object + """ + module = Module('module1') + family.add_module(module) + assert module in family.modules + + def test_add_non_module_as_module_in_family(self, family): + """Tests that a non-module object cannot be added to Gene Family + """ + with pytest.raises(TypeError): + family.add_module(323) + + # TODO test mk_bitarray \ No newline at end of file From 71f6e1ffee3e498fc030f23277aa6e8c3b33aef6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 3 Aug 2023 11:04:58 +0200 Subject: [PATCH 29/75] Test edge class --- VERSION | 2 +- ppanggolin/edge.py | 63 ++++++-- ppanggolin/formats/writeBinaries.py | 9 +- ppanggolin/formats/writeFlat.py | 17 +- ppanggolin/geneFamily.py | 1 + ppanggolin/nem/partition.py | 2 +- tests/test_Edge.py | 231 +++++++++++++--------------- 7 files changed, 168 insertions(+), 157 deletions(-) diff --git a/VERSION b/VERSION index 70c2ba9d..83ae3daf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.146 +1.2.147 diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 2b213f7d..d5353474 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -3,7 +3,7 @@ # default libraries from collections import defaultdict -from typing import Dict, List, Tuple +from typing import Dict, Generator, List, Tuple from ppanggolin.genome import Gene, Organism @@ -11,31 +11,56 @@ class Edge: """The Edge class represents an edge between two gene families in the pangenome graph. It is associated with all the organisms in which the neighborship is found, and all the involved genes as well. + Methods: + - __init__(self, source_gene: Gene, target_gene: Gene): Constructor method that initializes an Edge object with a source gene and a target gene. + - get_org_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: Returns a dictionary with organisms as keys and an iterable of the pairs of genes as values. + - gene_pairs(self) -> List[Tuple[Gene, Gene]]: Returns a list of all the gene pairs of the Edge. + - add_genes(self, source_gene: Gene, target_gene: Gene): Adds genes to the edge. They are supposed to be on the same organism. - :param source_gene: a first gene to initialize the edge - :param target_gene: a second gene to initialize the edge + Fields: + - source: A GeneFamily object representing the source gene family of the edge. + - target: A GeneFamily object representing the target gene family of the edge. + - organisms: A defaultdict object representing the organisms in which the edge is found and the pairs of genes involved. """ def __init__(self, source_gene: Gene, target_gene: Gene): + """Constructor method + + :param source_gene: a first gene to initialize the edge + :param target_gene: a second gene to initialize the edge + """ + # TODO try to change for gene family ? if source_gene.family is None: - raise Exception(f"You cannot create a graph without gene families. " - f"gene {source_gene.ID} did not have a gene family.") + raise AttributeError(f"You cannot create a graph without gene families. " + f"gene {source_gene.ID} did not have a gene family.") if target_gene.family is None: - raise Exception(f"You cannot create a graph without gene families. " - f"gene {target_gene.ID} did not have a gene family.") + raise AttributeError(f"You cannot create a graph without gene families. 
" + f"gene {target_gene.ID} did not have a gene family.") self.source = source_gene.family self.target = target_gene.family self.source.set_edge(self.target, self) self.target.set_edge(self.source, self) - self.organisms = defaultdict(list) + self._organisms = defaultdict(list) self.add_genes(source_gene, target_gene) - def get_org_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: - """ Create a dictionnary of the Organisms in which the edge is found + @property + def organisms(self) -> Generator[Organism, None, None]: + """Get all the organisms belonging to the edge - :return: Dictionary with organisms as key and an iterable of the pairs of genes as value + :return: Generator with organisms as key and an iterable of the pairs of genes as value """ - return self.organisms + for organism in self._organisms.keys(): + yield organism + + @property + def number_of_organisms(self): + return len(self._organisms) + + def get_organism_genes_pairs(self, organism: Organism): + return self._organisms[organism] + + def get_organisms_dict(self): + return self._organisms @property def gene_pairs(self) -> List[Tuple[Gene, Gene]]: @@ -43,7 +68,7 @@ def gene_pairs(self) -> List[Tuple[Gene, Gene]]: :return: A list of all the gene pairs of the Edge """ - return [gene_pair for gene_list in self.organisms.values() for gene_pair in gene_list] + return [gene_pair for gene_list in self.get_organisms_dict().values() for gene_pair in gene_list] def add_genes(self, source_gene: Gene, target_gene: Gene): """Adds genes to the edge. They are supposed to be on the same organism. @@ -51,10 +76,16 @@ def add_genes(self, source_gene: Gene, target_gene: Gene): :param source_gene: a source gene to add to the edge :param target_gene: a target gene to add to the edge + :raises TypeError: If the genes are not with Gene type + :raises ValueError: If genes are not associated to an organism :raises Exception: If the genes are not on the same organism. """ - org = source_gene.organism - if org != target_gene.organism: + if not isinstance(source_gene, Gene) or not isinstance(target_gene, Gene): + raise TypeError(f"Genes are expected to be added to edge. " + f"Given type for source: {type(source_gene)} and target: {type(target_gene)}") + if source_gene.organism is None or target_gene.organism is None: + raise ValueError("Genes are not associated to organism. It's needed to create add genes to edge") + if source_gene.organism != target_gene.organism: raise Exception(f"You tried to create an edge between two genes that are not even in the same organism ! 
" f"(genes are '{source_gene.ID}' and '{target_gene.ID}')") - self.organisms[org].append((source_gene, target_gene)) + self._organisms[source_gene.organism].append((source_gene, target_gene)) diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 1b22c15f..3f555f29 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -463,11 +463,10 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis expectedrows=pangenome.number_of_edges()) edge_row = edge_table.row for edge in tqdm(pangenome.edges, total=pangenome.number_of_edges(), unit="edge", disable=disable_bar): - for gene_pairs in edge.organisms.values(): - for gene1, gene2 in gene_pairs: - edge_row["geneTarget"] = gene1.ID - edge_row["geneSource"] = gene2.ID - edge_row.append() + for gene1, gene2 in edge.gene_pairs: + edge_row["geneTarget"] = gene1.ID + edge_row["geneSource"] = gene2.ID + edge_row.append() edge_table.flush() diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index a1cd5cf3..fcdb2eb2 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -125,12 +125,12 @@ def write_json_edge(edge: Edge, json: TextIO): json.write(f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"') json.write(', "organisms": {') orgstr = [] - for org in edge.get_org_dict(): + for org in edge.organisms: orgstr.append('"' + org.name + '": [') genepairstr = [] - for genepair in edge.get_org_dict()[org]: - genepairstr.append('{"source": "' + genepair[0].ID + '", "target": "' + genepair[ - 1].ID + f'", "length": {genepair[0].start - genepair[1].stop}' + '}') + for gene_pair in edge.get_organism_genes_pairs(org): + genepairstr.append('{"source": "' + gene_pair[0].ID + '", "target": "' + gene_pair[ + 1].ID + f'", "length": {gene_pair[0].start - gene_pair[1].stop}' + '}') orgstr[-1] += ', '.join(genepairstr) + ']' json.write(', '.join(orgstr) + "}}") @@ -271,13 +271,14 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): for edge in pan.edges: gexf.write(f' \n') - gexf.write(f' \n') + f'{edge.source.ID}" target="{edge.target.ID}" weight="{edge.number_of_organisms}">\n') + gexf.write(f' \n') gexf.write(' \n') gexf.write(f' \n') if not light: - for org, genes in edge.get_org_dict().items(): - gexf.write(f' \n') + print(edge.get_organisms_dict()) + for org, genes_pairs in edge.get_organisms_dict().items(): + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') edgeids += 1 diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 008de4f1..03abd2bb 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -51,6 +51,7 @@ class GeneFamily(MetaFeatures): """ def __init__(self, family_id: int, name: str): + # TODO edges as genes in contig to get and set """Constructor method :param family_id: The internal identifier to give to the gene family :type family_id: any diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 4303b8c5..137f2f27 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -289,7 +289,7 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> neighbor_number = 0 sum_dist_score = 0 for edge in fam.edges: # iter on the family's edges. 
- coverage = sum([len(gene_list) for org, gene_list in edge.organisms.items() if org in organisms]) + coverage = sum([len(gene_list) for org, gene_list in edge.get_organisms_dict().items() if org in organisms]) if coverage == 0: continue # nothing interesting to write, this edge does not exist with this subset of organisms. distance_score = coverage / len(organisms) diff --git a/tests/test_Edge.py b/tests/test_Edge.py index 6a5bf715..e94da982 100644 --- a/tests/test_Edge.py +++ b/tests/test_Edge.py @@ -1,134 +1,113 @@ #! /usr/bin/env python3 import pytest +from typing import Generator, Tuple -from ppanggolin.genome import Gene +from ppanggolin.genome import Gene, Organism from ppanggolin.edge import Edge from ppanggolin.geneFamily import GeneFamily -def test_cstr_error(): - o_src = Gene('source') - o_tgt = Gene('target') - # genes should have a family - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - o_family = GeneFamily(None, None) - o_family.add_gene(o_src) - # both genes sould have a family - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - # gene should belong to the same organism - o_family.add_gene(o_tgt) - o_src.fill_parents("", None) - o_tgt.fill_parents(None, None) - with pytest.raises(Exception): - _ = Edge(o_src, o_tgt) - - -def test_cstr(): - o_src = Gene('source') - o_tgt = Gene('target') - - # set organism and contig to None. - o_src.fill_parents(None, None) - o_tgt.fill_parents(None, None) - - # define the None GeneFamily, and add the 2 genes to it. - o_family = GeneFamily(None, None) - o_family.add_gene(o_src) - o_family.add_gene(o_tgt) - - o_edge = Edge(o_src, o_tgt) - assert isinstance(o_edge, Edge) - - assert o_edge.source == o_src.family - assert o_edge.target == o_tgt.family - assert dict(o_edge.organisms) == {None: [(o_src, o_tgt)]} - - -@pytest.fixture() -def make_gene_pair(): - def _make_gene_pair(org, gene_id1, gene_id2): - """create 2 genes from org. - each gene belong to its own family.""" - lo_genes = [] - for k in gene_id1, gene_id2: - o_gene = Gene(k) - o_gene.fill_parents(org, None) - - lo_genes.append(o_gene) - - o_family = GeneFamily(k, k) - o_family.add_gene(o_gene) - - return tuple(lo_genes) - - return _make_gene_pair - - -@pytest.fixture() -def o_edge(make_gene_pair): - p = make_gene_pair("org", "src", "tgt") - return Edge(*p) - - -def test_add_enes(make_gene_pair): - p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - assert set(o_edge.organisms.keys()) == {"org1", "org2"} - assert o_edge.organisms["org1"] == [p1, p2] - assert o_edge.organisms["org2"] == [p3, p4] - - -@pytest.fixture() -def filled_edge(make_gene_pair): - # Note that the same edge here links 4 families. - p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - - return o_edge - - -def test_get_org_dict(o_edge, filled_edge): - assert o_edge.get_org_dict() == o_edge.organisms - assert filled_edge.get_org_dict() == filled_edge.organisms - - -def test_gene_pairs(make_gene_pair): - # cannot use filled_edge because I need access to pair. 
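As a rough walk-through of the reworked Edge API that the callers above now rely on (the two-gene setup is made up; only the class, property and method names come from this commit):

    from ppanggolin.edge import Edge
    from ppanggolin.geneFamily import GeneFamily
    from ppanggolin.genome import Gene, Organism

    org = Organism("org_1")
    gene1, gene2 = Gene("gene_1"), Gene("gene_2")
    gene1.fill_parents(org, None)
    gene2.fill_parents(org, None)
    gene1.family, gene2.family = GeneFamily(1, "fam_1"), GeneFamily(2, "fam_2")

    edge = Edge(gene1, gene2)                  # registers itself on both families via set_edge()
    assert edge.number_of_organisms == 1       # property backed by the private _organisms dict
    assert set(edge.organisms) == {org}        # `organisms` now yields Organism objects only
    assert edge.get_organism_genes_pairs(org) == [(gene1, gene2)]
    assert edge.get_organisms_dict() == {org: [(gene1, gene2)]}  # what the NEM writer iterates over

Keeping the per-organism gene pairs in a private defaultdict lets the gexf/json writers and the NEM input writer share the same accessors instead of reaching into the dict directly.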
- p1 = make_gene_pair("org1", "s1", "t1") - p2 = make_gene_pair("org1", "s2", "t1") - p3 = make_gene_pair("org2", "s1", "t2") - p4 = make_gene_pair("org2", "s1", "s2") - # org1: s1,s2 -- t1 - # org2: s1 -- t2,s2 - - o_edge = Edge(*p1) - o_edge.add_genes(*p2) - o_edge.add_genes(*p3) - o_edge.add_genes(*p4) - - # 'set' because the order is not guaranted due to '.values()'. - l_pairs = o_edge.gene_pairs - assert set(l_pairs) == {p1, p2, p3, p4} +class TestEdge: + """Test edge class + """ + # Tests that an Edge object can be created with two genes belonging to different families + + @pytest.fixture + def organism(self) -> Generator[Organism, None, None]: + yield Organism("organism") + + @pytest.fixture + def families_pair(self) -> Generator[Tuple[GeneFamily, GeneFamily], None, None]: + yield GeneFamily(1, "family1"), GeneFamily(2, "family2") + + @pytest.fixture + def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], None, None]: + gene1, gene2 = Gene("gene1"), Gene("gene2") + gene1.fill_parents(organism, None) + gene2.fill_parents(organism, None) + gene1.family, gene2.family = GeneFamily(1, "family1"), GeneFamily(2, "family2") + yield gene1, gene2 + + @pytest.fixture + def edge(self, genes_pair): + edge = Edge(*genes_pair) + yield edge + + def test_constructor(self, genes_pair, organism, families_pair): + gene1, gene2 = genes_pair + edge = Edge(gene1, gene2) + assert edge.source == gene1.family + assert edge.target == gene2.family + assert edge.source._edges[edge.target] == edge + assert edge.target._edges[edge.source] == edge + assert edge._organisms == {organism: [(gene1, gene2)]} + + def test_constructor_attribute_error(self): + """Tests that an AttributeError is raised when creating an Edge object + with a gene that does not belong to any family + """ + gene1 = Gene('gene1') + gene1.family = GeneFamily(0, 'test') + gene2 = Gene('gene2') + with pytest.raises(AttributeError): + # Test target attribute error + Edge(gene1, gene2) + with pytest.raises(AttributeError): + # Test source attribute error + Edge(gene2, gene1) + + def test_gene_pairs(self, edge, genes_pair): + assert set(edge.gene_pairs) == {genes_pair} + + def test_get_organisms(self, edge, organism): + assert set(edge.organisms) == {organism} + + def test_get_number_of_organisms(self, edge): + assert isinstance(edge.number_of_organisms, int) + assert edge.number_of_organisms == 1 + + def test_get_organisms_dict(self, edge, organism, genes_pair): + assert edge.get_organisms_dict() == {organism: [genes_pair]} + + def test_get_organism_genes_pairs(self, edge, organism, genes_pair): + assert edge.get_organism_genes_pairs(organism) == [genes_pair] + + def test_edge_add_genes_same_organism(self, edge, genes_pair, organism): + """Tests that genes can be added to the edge that are on the same organism + """ + gene1, gene2, gene3, gene4 = *genes_pair, Gene('gene3'), Gene('gene4') + gene3.fill_parents(organism, None) + gene4.fill_parents(organism, None) + edge.add_genes(gene3, gene4) + assert edge.get_organism_genes_pairs(organism) == [(gene1, gene2), (gene3, gene4)] + + def test_edge_add_genes_different_organisms(self, edge, organism): + """Tests that an Exception is raised when adding genes to the edge that are not on the same organism + """ + gene1, gene2 = Gene('gene3'), Gene('gene4') + gene1.fill_parents(organism, None) + org = Organism("org") + gene2.fill_parents(org, None) + with pytest.raises(Exception): + edge.add_genes(gene1, gene2) + + def test_edge_add_genes_one_none_gene(self, edge, organism): + """Tests 
that a TypeError is raised when adding genes to the edge where one gene is None + """ + gene1 = Gene('gene1') + gene1.fill_parents(organism, None) + with pytest.raises(TypeError): + edge.add_genes(gene1, None) + with pytest.raises(TypeError): + edge.add_genes(None, gene1) + + def test_edge_add_genes_without_organisms(self, edge, organism): + """Tests that a ValueError is raised when adding genes not filled with organism + """ + gene1, gene2 = Gene('gene1'), Gene('gene2') + gene1.fill_parents(organism, None) + with pytest.raises(ValueError): + edge.add_genes(gene1, gene2) + with pytest.raises(ValueError): + edge.add_genes(gene2, gene1) From c9e107cccecec0285c658cd90eaf29d66f1d136a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 4 Aug 2023 13:56:46 +0200 Subject: [PATCH 30/75] Refactor Region Class --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 4 +- ppanggolin/figures/draw_spot.py | 2 +- ppanggolin/formats/readBinaries.py | 6 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/formats/writeFlat.py | 8 +-- ppanggolin/region.py | 108 ++++++++++++---------------- 7 files changed, 58 insertions(+), 74 deletions(-) diff --git a/VERSION b/VERSION index 83ae3daf..e3ad4344 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.147 +1.2.148 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index eef66050..9a63c9b8 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -41,7 +41,7 @@ def extract_rgp(contig, node, rgp_id, naming) -> Region: elif naming == "organism": new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id)) while node.state: - new_region.append(node.gene) + new_region[node.gene.position] = node.gene node.state = 0 node.score = 0 node = node.prev @@ -184,7 +184,7 @@ def max_index_node(lst): while val >= min_score: new_region = extract_rgp(contig, matrix[index], len(contig_regions), naming) new_region.score = val - if (new_region[0].stop - new_region[-1].start) > min_length: + if new_region.lenght > min_length: contig_regions.add(new_region) rewrite_matrix(contig, matrix, index, persistent, continuity, multi) val, index = max_index_node(matrix) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 3764c083..1092b805 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -174,7 +174,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = minpos = min([gene.position for border in borders for gene in border]) maxpos = max([gene.position for border in borders for gene in border]) else: - minpos = rgp.start_gene.position + minpos = rgp.starter.position maxpos = rgp.stop_gene.position gene_list = rgp.contig.get_genes(minpos, maxpos + 1) prev = None diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 820c86f2..e9e132b2 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -386,10 +386,8 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): except KeyError: region = Region(row["RGP"].decode()) pangenome.add_region(region) - region.append(pangenome.get_gene(row["gene"].decode())) - # order the genes properly in the regions - for region in pangenome.regions: - region.genes = sorted(region.genes, key=lambda x: x.position) # order the same way as on the contig + gene = pangenome.get_gene(row["gene"].decode()) + region[gene.position] = gene pangenome.status["predictedRGP"] = 
"Loaded" diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 3f555f29..dec55572 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -518,7 +518,7 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab h5f.remove_node('/', 'RGP') rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), - expectedrows=sum([len(region.genes) for region in pangenome.regions])) + expectedrows=sum([len(region) for region in pangenome.regions])) rgp_row = rgp_table.row for region in tqdm(pangenome.regions, total=pangenome.number_of_rgp(), unit="region", disable=disable_bar): for gene in region.genes: diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index fcdb2eb2..c0788db6 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -276,7 +276,6 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): gexf.write(' \n') gexf.write(f' \n') if not light: - print(edge.get_organisms_dict()) for org, genes_pairs in edge.get_organisms_dict().items(): gexf.write(f' \n') gexf.write(' \n') @@ -651,10 +650,11 @@ def write_regions(output: Path, compress: bool = False): fname = output / "plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") - regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.start)) + regions = sorted(pan.regions, key=lambda x: (x.organism.name, x.contig.name, x.starter.start)) for region in regions: - tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.start, region.stop, - len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") + tab.write('\t'.join(map(str, [region.name, region.organism, region.contig, region.starter.start, + region.stopper.stop, len(region), region.is_contig_border, + region.is_whole_contig])) + "\n") def summarize_spots(spots: set, output: Path, compress: bool = False): diff --git a/ppanggolin/region.py b/ppanggolin/region.py index c776a41a..3e769d93 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -7,7 +7,7 @@ from collections.abc import Iterable # installed libraries -from typing import Dict, Set +from typing import Dict, Generator, Set import gmpy2 @@ -20,15 +20,22 @@ class Region(MetaFeatures): """ This class represent a region of genomic plasticity. 
- - :param region_id: identifier of the region """ def __init__(self, region_id: str): + """Constructor method + + :param region_id: identifier of the region + """ super().__init__() - self.genes = [] + self._genes_getter = {} self.name = region_id self.score = 0 + self.starter = None + self.stopper = None + + def __repr__(self): + return f"RGP name:{self.name}" def __hash__(self): return id(self) @@ -45,71 +52,50 @@ def __eq__(self, other: Region) -> bool: raise TypeError(f"'Region' type object was expected, but '{type(other)}' type object was provided.") if [gene.family for gene in self.genes] == [gene.family for gene in other.genes]: return True - if [gene.family for gene in self.genes] == [gene.family for gene in other.genes[::-1]]: + if [gene.family for gene in self.genes] == [gene.family for gene in list(other.genes)[::-1]]: return True return False def __len__(self): - return len(self.genes) - - def __getitem__(self, index): - return self.genes[index] - - def append(self, gene: Gene): - # TODO change name foir add_gene - """allowing only gene-class objects in a region + return len(self._genes_getter) - :param gene: gene which will be added - - :raise TypeError: If gene is not Gene type raise TypeError - """ - - if isinstance(gene, Gene): - self.genes.append(gene) - gene.RGP = self - else: + def __setitem__(self, position, gene): + if not isinstance(gene, Gene): raise TypeError(f"Unexpected class / type for {type(gene)} " f"when adding it to a region of genomic plasticity") + if len(self) > 0 and gene.organism != self.organism: + raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. " + f"That's not possible") + self._genes_getter[position] = gene + self.starter = self._genes_getter[min(self._genes_getter.keys())] + self.stopper = self._genes_getter[max(self._genes_getter.keys())] + gene.RGP = self - @property - def families(self) -> Set[GeneFamily]: - """Get the gene families in the RGP + def __getitem__(self, position): + return self._genes_getter[position] - :return: Set of gene families - """ - return {gene.family for gene in self.genes} + def __delitem__(self, position): + del self._genes_getter[position] @property - def start(self) -> int: - """ Get RGP starting position - - :return: Start position + def genes(self) -> Generator[Gene, None, None]: + """Generate the gene as they are ordered in contigs """ - return min(self.genes, key=lambda x: x.start).start - - @property # TODO try to change start with this method - def start_gene(self) -> Gene: - """ Get RGP starting gene - - :return: Start gene - """ - return min(self.genes, key=lambda x: x.position) + for gene in sorted(self._genes_getter.values(), key=lambda x: x.position): + yield gene @property - def stop_gene(self) -> Gene: - """ Get RGP stoping position + def families(self) -> Generator[GeneFamily, None, None]: + """Get the gene families in the RGP - :return: Stoping position + :return: Set of gene families """ - return max(self.genes, key=lambda x: x.position) + for gene in self.genes: + yield gene.family @property - def stop(self): - """ Get RGP stoping position - - :return: Stop position - """ - return max(self.genes, key=lambda x: x.stop).stop + def lenght(self): + return self.stopper.stop - self.starter.start @property def organism(self) -> Organism: @@ -117,15 +103,15 @@ def organism(self) -> Organism: :return: Organism """ - return self.genes[0].organism + return self.starter.organism @property def contig(self) -> Contig: - """ Get the Contig link to RGP + """ Get the 
starter contig link to RGP :return: Contig """ - return self.genes[0].contig + return self.starter.contig @property def is_whole_contig(self) -> bool: @@ -133,7 +119,7 @@ def is_whole_contig(self) -> bool: :return: True if whole contig """ - if self.start_gene.position == 0 and self.stop_gene.position == len(self.contig) - 1: + if self.starter.position == 0 and self.stopper.position == len(self.contig) - 1: return True return False @@ -143,10 +129,10 @@ def is_contig_border(self) -> bool: :return: True if bordering """ - if len(self.genes) == 0: + if len(self) == 0: raise Exception("Your region has no genes. Something wrong happenned.") - if (self.start_gene.position == 0 and not self.contig.is_circular) or \ - (self.stop_gene.position == len(self.contig) - 1 and not self.contig.is_circular): + if (self.starter.position == 0 and not self.contig.is_circular) or \ + (self.stopper.position == len(self.contig) - 1 and not self.contig.is_circular): return True return False @@ -157,7 +143,7 @@ def get_rnas(self) -> set: """ rnas = set() for rna in self.contig.RNAs: - if self.start < rna.start < self.stop: + if self.starter.start < rna.start < self.stopper.stop: rnas.add(rna) return rnas @@ -170,7 +156,7 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: :return: A list of bordering gene in start and stop position List[List[Start Gene], [Stop Gene]] """ border = [[], []] - pos = self.start_gene.position + pos = self.starter.position init = pos while len(border[0]) < n and (pos != 0 or self.contig.is_circular): curr_gene = None @@ -187,7 +173,7 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: pos = len(self.contig) if pos == init: break # looped around the contig - pos = self.stop_gene.position + pos = self.stopper.position init = pos while len(border[1]) < n and (pos != len(self.contig) - 1 or self.contig.is_circular): curr_gene = None From 6c582e0b9627a048980057ef8d717f43b66e2517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 4 Aug 2023 15:44:10 +0200 Subject: [PATCH 31/75] Fix bug overlaps --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 2 +- ppanggolin/annotate/synta.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/VERSION b/VERSION index e3ad4344..c0786106 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.148 +1.2.149 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 8c822bf6..d735f22d 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -539,7 +539,7 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, overlap: bool = True, procedure: str = None, + kingdom: str = "bacteria", norna: bool = False, overlap: bool = False, procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index cd677e98..58f451be 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -241,12 +241,12 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, n return genes -def overlap_filter(all_genes: defaultdict, overlap: bool = True) -> defaultdict: +def overlap_filter(all_genes: defaultdict, overlap: bool = False) -> defaultdict: """ Removes the CDS that overlap with RNA genes. 
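# Illustrative sketch, not part of the patch: this commit inverts the meaning of the
# ``overlap`` flag -- it now reads as "allow overlaps" (skip the filtering) and
# defaults to False, so overlapping CDS are removed unless the caller opts out.  The
# helper below is a minimal, hypothetical illustration of the corrected
# ``if not overlap`` condition on plain (start, stop) tuples; it does not reproduce
# the real CDS/RNA logic of overlap_filter().
def filter_overlaps(genes, overlap=False):
    genes = sorted(genes)
    if not overlap:  # filtering is now the default behaviour
        kept = []
        for start, stop in genes:
            if kept and start <= kept[-1][1]:
                continue  # drop a gene overlapping the previously kept one
            kept.append((start, stop))
        return kept
    return genes

assert filter_overlaps([(1, 10), (8, 20), (25, 30)]) == [(1, 10), (25, 30)]
assert filter_overlaps([(1, 10), (8, 20)], overlap=True) == [(1, 10), (8, 20)]
# end of illustrative sketch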
:param all_genes: Dictionary with complete list of genes - :param overlap: Allow to filter overlap + :param overlap: Use to not remove genes overlapping with RNA features :return: Dictionary with genes filtered """ @@ -255,7 +255,7 @@ def overlap_filter(all_genes: defaultdict, overlap: bool = True) -> defaultdict: for key, genes in all_genes.items(): tmp_genes = sorted(genes, key=lambda x: x.start) rm_genes = set() - if overlap: + if not overlap: for i, gene_i in enumerate(tmp_genes): if i + 1 < len(tmp_genes): gene_j = tmp_genes[i + 1] @@ -292,7 +292,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", - overlap: bool = True, procedure: str = None) -> Organism: + overlap: bool = False, procedure: str = None) -> Organism: """ Function to annotate a single organism From 7f7094fc2ce5e3acf40b9a356049893f46e7940a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 7 Aug 2023 11:31:51 +0200 Subject: [PATCH 32/75] Refactor Class Region --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/align/alignOnPang.py | 2 +- ppanggolin/figures/draw_spot.py | 4 ++-- ppanggolin/formats/writeFlat.py | 4 ++-- ppanggolin/formats/writeSequences.py | 2 +- ppanggolin/region.py | 16 +++++++++++----- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/VERSION b/VERSION index c0786106..99b29b9d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.149 +1.2.150 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 9a63c9b8..d0bc6d08 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -184,7 +184,7 @@ def max_index_node(lst): while val >= min_score: new_region = extract_rgp(contig, matrix[index], len(contig_regions), naming) new_region.score = val - if new_region.lenght > min_length: + if new_region.length > min_length: contig_regions.add(new_region) rewrite_matrix(contig, matrix, index, persistent, continuity, multi) val, index = max_index_node(matrix) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 126b0047..af00f4cd 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -185,7 +185,7 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \ fams = set() fams_border = set() for rgp in spot.regions: - fams |= rgp.families + fams |= set(rgp.families) fams_border |= set([gene.family for border in # Set of families in border of spot rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics) for gene in border]) diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 1092b805..1dfd0528 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -175,7 +175,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = maxpos = max([gene.position for border in borders for gene in border]) else: minpos = rgp.starter.position - maxpos = rgp.stop_gene.position + maxpos = rgp.stopper.position gene_list = rgp.contig.get_genes(minpos, maxpos + 1) prev = None for gene in gene_list: @@ -201,7 +201,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = try: g[gene.family.name][prev]["rgp"].add(rgp) except KeyError: - g[gene.family.name][prev]["rgp"] = set(rgp) + g[gene.family.name][prev]["rgp"] = {rgp} prev = gene.family.name for node1, node2 in 
g.edges: g[node1][node2]["weight"] = len(g[node1][node2]["rgp"]) / len(spot.regions) diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index c0788db6..a574abfa 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -679,8 +679,8 @@ def r_and_s(value: float): len_uniq_content = len(spot.get_uniq_content()) size_list = [] for rgp in spot.regions: - tot_fams |= rgp.families - size_list.append(len(rgp.genes)) + tot_fams |= set(rgp.families) + size_list.append(len(rgp)) mean_size = mean(size_list) stdev_size = stdev(size_list) if len(size_list) > 1 else 0 max_size = max(size_list) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 7abf1adb..033aacca 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -326,7 +326,7 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa loaded_genome = region.organism.name genome_sequence = read_genome_file(org_dict, loaded_genome) fasta.write(f">{region.name}\n") - fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) + fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.starter.start:region.stopper.stop], 60)) logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 3e769d93..ecd05fa3 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -63,9 +63,13 @@ def __setitem__(self, position, gene): if not isinstance(gene, Gene): raise TypeError(f"Unexpected class / type for {type(gene)} " f"when adding it to a region of genomic plasticity") - if len(self) > 0 and gene.organism != self.organism: - raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. " - f"That's not possible") + if len(self) > 0: + if gene.organism != self.organism: + raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. " + f"That's not possible") + if gene.contig != self.contig: + raise Exception(f"Gene {gene.name} is from a different contig than the first defined in RGP. 
" + f"That's not possible") self._genes_getter[position] = gene self.starter = self._genes_getter[min(self._genes_getter.keys())] self.stopper = self._genes_getter[max(self._genes_getter.keys())] @@ -94,7 +98,9 @@ def families(self) -> Generator[GeneFamily, None, None]: yield gene.family @property - def lenght(self): + def length(self): + """Get the length of the region + """ return self.stopper.stop - self.starter.start @property @@ -216,7 +222,7 @@ def families(self) -> set: union = set() for region in self.regions: - union |= region.families + union |= set(region.families) return union def add_regions(self, regions): From f3bd638a21825d884b1b097ea1fc2c0834463a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 8 Aug 2023 11:29:20 +0200 Subject: [PATCH 33/75] test Region class --- VERSION | 2 +- ppanggolin/genome.py | 4 + ppanggolin/region.py | 54 ++++--- tests/region/test_Region.py | 203 ------------------------- tests/test_Region.py | 294 ++++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+), 222 deletions(-) delete mode 100644 tests/region/test_Region.py create mode 100644 tests/test_Region.py diff --git a/VERSION b/VERSION index 99b29b9d..f0e06499 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.150 +1.2.151 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index ec87838d..832024a2 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -306,6 +306,8 @@ def __setitem__(self, start: int, gene: Gene): :param start: Start position of the gene :param gene: Gene object to add """ + # TODO look at change start for position + if not isinstance(gene, Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") if start in self._genes_getter: @@ -324,6 +326,8 @@ def __getitem__(self, index: int) -> Gene: raise TypeError(f"Expected type is int, given type was '{type(index)}'") return self._genes_position[index] + # TODO define delitem + def get_genes(self, begin: int, end: int): """Gets a list of genes within a range :param begin: Position of first gene to retrieve diff --git a/ppanggolin/region.py b/ppanggolin/region.py index ecd05fa3..e9bc4f74 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -12,14 +12,30 @@ import gmpy2 # local libraries -from ppanggolin.genome import Gene, Organism, Contig +from ppanggolin.genome import Gene, Organism, Contig, RNA from ppanggolin.geneFamily import GeneFamily from ppanggolin.metadata import MetaFeatures class Region(MetaFeatures): """ - This class represent a region of genomic plasticity. + The 'Region' class represents a region of genomic plasticity. + Methods: + - 'genes': the property that generates the genes in the region as they are ordered in contigs. + - 'families': the property that generates the gene families in the region. + - 'length': the property that gets the length of the region. + - 'organism': the property that gets the organism linked to the region. + - 'contig': the property that gets the starter contig linked to the region. + - 'is_whole_contig': the property that indicates if the region is an entire contig. + - 'is_contig_border': the property that indicates if the region is bordering a contig. + - 'get_rnas(self) -> set': the method that gets the RNA in the region. + - 'get_bordering_genes(self, n: int, multigenics: set) -> list': the method that gets the bordered genes in the region. + + Fields: + - 'name': the name of the region. + - 'score': the score of the region. + - 'starter': the first gene in the region. 
+ - 'stopper': the last gene in the region. """ def __init__(self, region_id: str): @@ -70,13 +86,18 @@ def __setitem__(self, position, gene): if gene.contig != self.contig: raise Exception(f"Gene {gene.name} is from a different contig than the first defined in RGP. " f"That's not possible") + if position in self._genes_getter and self[position] != gene: + raise ValueError("Another gene already exist at this position") self._genes_getter[position] = gene self.starter = self._genes_getter[min(self._genes_getter.keys())] self.stopper = self._genes_getter[max(self._genes_getter.keys())] gene.RGP = self def __getitem__(self, position): - return self._genes_getter[position] + try: + return self._genes_getter[position] + except KeyError: + raise KeyError(f"There is no gene at position {position} in RGP {self.name}") def __delitem__(self, position): del self._genes_getter[position] @@ -97,6 +118,11 @@ def families(self) -> Generator[GeneFamily, None, None]: for gene in self.genes: yield gene.family + def number_of_families(self) -> int: + """Get the number of different gene families in the region + """ + return len(set(self.families)) + @property def length(self): """Get the length of the region @@ -137,22 +163,13 @@ def is_contig_border(self) -> bool: """ if len(self) == 0: raise Exception("Your region has no genes. Something wrong happenned.") - if (self.starter.position == 0 and not self.contig.is_circular) or \ - (self.stopper.position == len(self.contig) - 1 and not self.contig.is_circular): - return True + min_pos = min(self.contig.genes, key=lambda x: x.position).position + max_pos = max(self.contig.genes, key=lambda x: x.position).position + if not self.contig.is_circular: + if self.starter.position == min_pos or self.stopper.position == max_pos: + return True return False - def get_rnas(self) -> set: - """ Get RNA in region - - :return: Set of RNA - """ - rnas = set() - for rna in self.contig.RNAs: - if self.starter.start < rna.start < self.stopper.stop: - rnas.add(rna) - return rnas - def get_bordering_genes(self, n: int, multigenics: set) -> list: """ Get the bordered genes in the region @@ -161,12 +178,13 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: :return: A list of bordering gene in start and stop position List[List[Start Gene], [Stop Gene]] """ + # TODO add Exception border = [[], []] pos = self.starter.position init = pos while len(border[0]) < n and (pos != 0 or self.contig.is_circular): curr_gene = None - if pos == 0: + if pos == 0: # TODO change for variable to be more flexible if self.contig.is_circular: curr_gene = self.contig[pos - 1] else: diff --git a/tests/region/test_Region.py b/tests/region/test_Region.py deleted file mode 100644 index 3fca27ae..00000000 --- a/tests/region/test_Region.py +++ /dev/null @@ -1,203 +0,0 @@ -#! 
/usr/bin/env python3 - -import pytest - -from ppanggolin.region import Region -from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene, Contig, Organism, RNA - - -# ================================================ -def test_cstr(): - identifier = 4 - o_region = Region(identifier) - assert isinstance(o_region, Region) - for attr in "genes", "name", "score": - assert hasattr(o_region, attr) - - assert o_region.score == 0 - assert o_region.name == identifier - assert o_region.genes == [] - - -# ================================================ -@pytest.fixture -def o_region(): - return Region(4) - - -@pytest.fixture -def o_org(): - return Organism("toto") - - -@pytest.fixture -def o_contig(): - return Contig(1) - - -@pytest.fixture -def o_rna(o_contig): - o_rna = RNA("Ah") - o_rna.fill_annotations(35, 45, "-") - o_contig.add_rna(o_rna) - return o_rna - - -@pytest.fixture -def l_genes(o_org, o_contig): - """ creates a small gene set for testing. - - returns a list of 4 genes that belongs - to the same contig and the same organism.""" - l_genes = [] - c = 10 - for i, gene_id in enumerate([ - "toto", "tata", "titi", "tutu", - "lolo", "lala", "lili", "lulu", - ]): - gene = Gene(gene_id) - gene.fill_annotations(c, c + 30, "+", position=i) - gene.fill_parents(o_org, o_contig) - o_contig.add_gene(gene) - gene.family = GeneFamily(i, gene_id) - gene.family.add_partition("c-cloud") - l_genes.append(gene) - c += 35 - return l_genes - - -# ================================================ -def test_append(l_genes, o_region): - for gene in l_genes: - o_region.append(gene) - - assert set(o_region.genes) == set(l_genes) - - -def test_append__error(o_region): - """append should raise a TypeError is used with non Gene param.""" - with pytest.raises(TypeError): - o_region.append(42) - - -def test_properties(l_genes, o_region, o_org, o_contig): - """All properties expect a region with genes.""" - s_families = set() - for gene in l_genes: - o_region.append(gene) - s_families.add(gene.family) - - # checking properties sanity - assert o_region.start == o_region.start_gene.start - assert o_region.stop == o_region.stop_gene.stop - assert o_region.organism == o_org - assert o_region.families == s_families - assert o_region.contig == o_contig - assert o_region.is_whole_contig is True - assert o_region.is_contig_border is True # first contig gene is in the region - - # remove the first gene of the contig - o_region.genes.pop(0) - assert o_region.is_contig_border is True # last contig gene is in the region - - # remove the last gene of the contig - # => the whole contig is not in the Region anymore - o_region.genes.pop() - assert o_region.is_whole_contig is False - assert o_region.is_contig_border is False - - -def test_is_contig_border(o_region): - """is_contig_border raise an exception - when the region contain no genes. 
- """ - with pytest.raises(Exception): - o_region.is_contig_border - - -def test_get_rnas(o_rna, o_region, l_genes): - for gene in l_genes: - o_region.append(gene) - assert set(o_region.get_rnas()) == {o_rna} - - -def test_hash(o_region): - """ a hash function returns an integer""" - # the same int if called twice on the same object - h = hash(o_region) - assert isinstance(h, int) - assert h == hash(o_region) - - # different ints if called on objects representing the same entity - name = "charming" - assert hash(Region(name)) != hash(Region(name)) - - -def test_equality(o_region, l_genes): - """2 regions are equals if they contain the same list of genes.""" - for gene in l_genes: - o_region.append(gene) - - # not the same list => False - o_other = Region("other") - assert o_region != o_other - - # the exact same list => True - o_other = Region("other") - for gene in l_genes: - o_other.append(gene) - assert o_region == o_other - - # the same list in reverse order => True - o_other = Region("other") - for gene in reversed(l_genes): - o_other.append(gene) - assert o_region == o_other - - -def test_equality__error(o_region): - """equality raises error if not compared to another Region""" - with pytest.raises(TypeError): - o_region == 42 - - -def test_len(o_region, l_genes): - assert 0 == len(o_region) - - for gene in l_genes: - o_region.append(gene) - assert len(l_genes) == len(o_region) - - -def test_get_item(o_region, l_genes): - with pytest.raises(IndexError): - o_region[1] - - for gene in l_genes: - o_region.append(gene) - assert o_region[2] == l_genes[2] - - -def test_get_bordering_genes(o_region, l_genes): - # return at most n-1 genes not in multigenics families - # nor in family with persistent partition. - - print("\n") - for gene in l_genes: - o_region.append(gene) - - l_first, l_last = o_region.get_bordering_genes(0, ['f1', 'f2']) - assert [] == l_first - assert [] == l_last - - # line 101 & 125 != while condition. => unreachable lines. - # return nothing if is_contig_border - l_first, l_last = o_region.get_bordering_genes(2, ['f1', 'f2']) - assert [] == l_first - assert [] == l_last - - # remove first and last gene - o_region.genes.pop(0) - o_region.genes.pop() - o_region.get_bordering_genes(4, ['f1', 'f2']) diff --git a/tests/test_Region.py b/tests/test_Region.py new file mode 100644 index 00000000..a1700056 --- /dev/null +++ b/tests/test_Region.py @@ -0,0 +1,294 @@ +#! 
/usr/bin/env python3 + +import pytest +from typing import Generator, Set +from random import randint + +from ppanggolin.region import Region +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Gene, Contig, Organism + + +class TestRegion: + """Tests for region class + """ + attr_val = {'score': 0, 'starter': None, 'stopper': None} + + @pytest.fixture + def region(self) -> Generator[Region, None, None]: + """Generate a region object to test class + """ + yield Region("RGP") + + def test_cstr(self, region: Region): + assert isinstance(region, Region) + assert region.name == "RGP" + assert isinstance(region._genes_getter, dict) + for attr, value in self.attr_val.items(): + assert region.__getattribute__(attr) == value + + def test_add_gene(self, region): + """Tests that genes can be aadded to a region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region[0] = gene + assert len(region._genes_getter) == 1 + assert region._genes_getter[0] == gene + assert region.starter == gene + assert region.stopper == gene + assert gene.RGP == region + + def test_add_gene_not_is_instance_gene(self, region): + """Test that adding object with instance not Gene return a TypeError + """ + with pytest.raises(TypeError): + region[0] = 0 + + def test_add_genes_at_position_already_taken(self, region): + """Test that adding genes with same position return a ValueError + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region[0] = gene + with pytest.raises(ValueError): + gene = Gene('gene') + gene.fill_annotations(start=4, stop=12, strand='-', position=0) + region[0] = gene + + def test_add_genes_from_different_contigs(self, region): + """Test that adding genes from different contigs return an Exception + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + gene1.fill_parents(None, Contig('contig_1')) + region[0] = gene1 + gene2.fill_parents(None, Contig('contig_2')) + with pytest.raises(Exception): + region[1] = gene2 + + def test_add_genes_from_different_organisms(self, region): + """Test that adding genes from different organisms return an Exception + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, stop=20, strand='+', position=1) + gene1.fill_parents(Organism("org_1")) + region[0] = gene1 + gene2.fill_parents(Organism("org_2")) + with pytest.raises(Exception): + region[1] = gene2 + + def test_get_genes(self, region): + """Tests that genes can be retrieved from the region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region[0] = gene + assert region[0] == gene + + def test_get_genes_with_position_not_in_region(self, region): + with pytest.raises(KeyError): + _ = region[randint(0, 20)] + + def test_del_gene(self, region): + """Tests that genes can be deleted from the region + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + region[0] = gene + assert region[0] == gene + del region[0] + assert 0 not in region._genes_getter + + def test_get_length(self, region): + """Tests that the length of the region can be retrieved + """ + gene1, gene2 = Gene('gene_1'), Gene('gene_2') + gene1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene2.fill_annotations(start=11, 
stop=20, strand='+', position=1) + region[0] = gene1 + region[1] = gene2 + assert region.length == 20 + + def test_get_organism(self, region): + """Tests that the organism linked to the region can be retrieved + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_parents(Organism("org")) + region[0] = gene + assert region.organism.name == 'org' + + def test_get_contig(self, region): + """Tests that the contig linked to the region can be retrieved + """ + gene = Gene('gene') + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.fill_parents(None, Contig("contig")) + region[0] = gene + assert region.contig.name == 'contig' + + def test_is_whole_contig_true(self, region): + """Tests that the property is_whole_contig return True if region is same length as contig + """ + starter, stopper = Gene('starter'), Gene('stopper') + starter.fill_annotations(start=0, stop=10, strand='+', position=0) + stopper.fill_annotations(start=11, stop=20, strand='+', position=1) + contig = Contig("contig") + contig[starter.start], contig[stopper.start] = starter, stopper + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region[starter.position], region[stopper.position] = starter, stopper + assert region.is_whole_contig is True + + def test_is_whole_contig_false(self, region): + """Tests that the property is_whole_contig return False if region is not same length as contig + """ + before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + contig[before.start], contig[after.start] = before, after + contig[starter.start], contig[stopper.start] = starter, stopper + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region[starter.position], region[stopper.position] = starter, stopper + assert region.is_whole_contig is False + + def test_is_contig_border_true(self, region): + """Test that property is_contig_border return true if the region is bordering the contig + """ + before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + # Test bordering right + contig[before.start], contig[starter.start], contig[stopper.start] = before, starter, stopper + region[starter.position], region[stopper.position] = starter, stopper + assert region.is_contig_border is True + # Test bordering left + del contig._genes_position[before.position] + del contig._genes_getter[before.start] + contig[after.start] = after + assert region.is_contig_border is True + + def test_is_contig_border_false(self, region): + """Tests that the property is_contig_border return False if region is not bordering the contig + """ + before, starter, stopper, 
after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') + before.fill_annotations(start=0, stop=10, strand='+', position=0) + starter.fill_annotations(start=11, stop=20, strand='+', position=1) + stopper.fill_annotations(start=21, stop=30, strand='+', position=2) + after.fill_annotations(start=31, stop=40, strand='+', position=3) + contig = Contig("contig") + contig[before.start], contig[after.start] = before, after + contig[starter.start], contig[stopper.start] = starter, stopper + before.fill_parents(None, contig), after.fill_parents(None, contig) + starter.fill_parents(None, contig), stopper.fill_parents(None, contig) + region[starter.position], region[stopper.position] = starter, stopper + assert region.is_contig_border is False + + @pytest.fixture + def genes(self) -> Generator[Set[Gene], None, None]: + """Create a set of genes to fill gene families + """ + genes = set() + for i in range(0, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10*i + 1, stop=10*(i+1), strand='+', position=i, genetic_code=4) + genes.add(gene) + yield genes + + def test_len(self, region, genes): + for gene in genes: + region[gene.position] = gene + assert isinstance(len(region), int) + assert len(region) == len(genes) + + def test_equality(self, genes): + """Test equality between two regions + """ + region_1, region_2, region_3 = Region("RGP_1"), Region("RGP_2"), Region("RGP_3") + max_pos = max(genes, key=lambda gene: gene.position).position + for gene in genes: + region_1[gene.position] = gene + region_2[gene.position] = gene + region_3[max_pos - gene.position + 1] = gene + assert region_1 == region_2 + assert region_1 == region_3 + + def test_not_equal(self, region, genes): + """Test difference between two regions + """ + for gene in genes: + region[gene.position] = gene + assert region != Region("other_RGP") + + def test_equality_with_not_instance_region(self, region): + """Test comparison between a region and another object raise a TypeError + """ + with pytest.raises(TypeError): + assert region == 4 + + @pytest.fixture + def families(self, genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(2, 10) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + def test_get_gene_families(self, region, genes, families): + """Tests that gene families can be retrieved from the region + """ + for gene in genes: + region[gene.position] = gene + assert all(isinstance(family, GeneFamily) for family in region.families) + assert set(region.families) == families + + def test_get_number_of_gene_families(self, region, genes, families): + """Tests that gene families can be retrieved from the region + """ + for gene in genes: + region[gene.position] = gene + assert isinstance(region.number_of_families(), int) + assert 
region.number_of_families() == len(families) + + # def test_get_bordering_genes(self, region, genes): + # # TODO test multigenic + # contig = Contig("contig") + # for gene in genes: + # contig[gene.start] = gene + # gene.fill_parents(None, contig) + # region[gene.position] = gene + # min_gene, max_gene = min(genes, key=lambda gene: gene.position), max(genes, key=lambda gene: gene.position) + # assert region.get_bordering_genes(1, {}) == [[min_gene], [max_gene]] \ No newline at end of file From c9527ff1700485ce2d46d09ee5dc0685914d4b61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 9 Aug 2023 09:56:08 +0200 Subject: [PATCH 34/75] Refactor Spot class --- VERSION | 2 +- ppanggolin/RGP/spot.py | 3 +- ppanggolin/figures/draw_spot.py | 2 +- ppanggolin/formats/readBinaries.py | 3 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/formats/writeFlat.py | 7 +- ppanggolin/region.py | 104 ++++++++++++++++------------ 7 files changed, 68 insertions(+), 55 deletions(-) diff --git a/VERSION b/VERSION index f0e06499..ce592228 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.151 +1.2.152 diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index d27f3d4a..87977d13 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -134,7 +134,8 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): curr_spot = Spot(spot_id) spots.append(curr_spot) for node in comp: - curr_spot.add_regions(graph_spot.nodes[node]["rgp"]) + for region in graph_spot.nodes[node]["rgp"]: + curr_spot[region.name] = region spot_id += 1 if spot_graph: diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 1dfd0528..cf3b9660 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -204,7 +204,7 @@ def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = g[gene.family.name][prev]["rgp"] = {rgp} prev = gene.family.name for node1, node2 in g.edges: - g[node1][node2]["weight"] = len(g[node1][node2]["rgp"]) / len(spot.regions) + g[node1][node2]["weight"] = len(g[node1][node2]["rgp"]) / len(spot) del g[node1][node2]["rgp"] for node in g.nodes: if "name" in g.nodes[node]: diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index e9e132b2..649c5b7d 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -406,7 +406,8 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False if curr_spot is None: curr_spot = Spot(row["spot"]) spots[row["spot"]] = curr_spot - curr_spot.add_region(pangenome.get_region(row["RGP"].decode())) + region = pangenome.get_region(row["RGP"].decode()) + curr_spot[region.name] = region curr_spot.spot_2_families() for spot in spots.values(): pangenome.add_spot(spot) diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index dec55572..06399939 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -572,7 +572,7 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis h5f.remove_node("/", "spots") spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), - expectedrows=sum([len(spot.regions) for spot in pangenome.spots])) + expectedrows=sum([len(spot) for spot in pangenome.spots])) spot_row = spot_table.row for spot in tqdm(pangenome.spots, total=pangenome.number_of_spots(), unit="spot", disable=disable_bar): for region in spot.regions: diff --git 
a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index a574abfa..c2004cd0 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -673,9 +673,8 @@ def r_and_s(value: float): with write_compressed_or_not(output / "summarize_spots.tsv", compress) as fout: fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" "stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n") - for spot in sorted(spots, key=lambda x: len(x.regions), reverse=True): + for spot in sorted(spots, key=lambda x: len(x), reverse=True): tot_fams = set() - rgp_list = list(spot.regions) len_uniq_content = len(spot.get_uniq_content()) size_list = [] for rgp in spot.regions: @@ -685,7 +684,7 @@ def r_and_s(value: float): stdev_size = stdev(size_list) if len(size_list) > 1 else 0 max_size = max(size_list) min_size = min(size_list) - fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, + fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(spot), len(tot_fams), len_uniq_content, mean_size, stdev_size, max_size, min_size])) + "\n") logging.getLogger("PPanGGOLiN").info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'") @@ -726,7 +725,7 @@ def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False all_fams = set() with write_compressed_or_not(output / "spot_borders.tsv", compress) as fout: fout.write("spot_id\tnumber\tborder1\tborder2\n") - for spot in sorted(pan.spots, key=lambda x: len(x.regions), reverse=True): + for spot in sorted(pan.spots, key=lambda x: len(x), reverse=True): curr_borders = spot.borders(pan.parameters["spots"]["set_size"], multigenics) for c, border in curr_borders: famstring1 = ",".join([fam.name for fam in border[0]]) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index e9bc4f74..17ec19f1 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -7,7 +7,7 @@ from collections.abc import Iterable # installed libraries -from typing import Dict, Generator, Set +from typing import Dict, Generator, List, Set import gmpy2 @@ -23,9 +23,9 @@ class Region(MetaFeatures): Methods: - 'genes': the property that generates the genes in the region as they are ordered in contigs. - 'families': the property that generates the gene families in the region. - - 'length': the property that gets the length of the region. + - 'Length': the property that gets the length of the region. - 'organism': the property that gets the organism linked to the region. - - 'contig': the property that gets the starter contig linked to the region. + - 'Contig': the property that gets the starter contig linked to the region. - 'is_whole_contig': the property that indicates if the region is an entire contig. - 'is_contig_border': the property that indicates if the region is bordering a contig. - 'get_rnas(self) -> set': the method that gets the RNA in the region. @@ -34,7 +34,7 @@ class Region(MetaFeatures): Fields: - 'name': the name of the region. - 'score': the score of the region. - - 'starter': the first gene in the region. + - 'Starter': the first gene in the region. - 'stopper': the last gene in the region. 
""" @@ -170,13 +170,13 @@ def is_contig_border(self) -> bool: return True return False - def get_bordering_genes(self, n: int, multigenics: set) -> list: + def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List[Gene]]: """ Get the bordered genes in the region :param n: number of genes to get :param multigenics: pangenome graph multigenic persistent families - :return: A list of bordering gene in start and stop position List[List[Start Gene], [Stop Gene]] + :return: A list of bordering gene in start and stop position """ # TODO add Exception border = [[], []] @@ -218,59 +218,73 @@ def get_bordering_genes(self, n: int, multigenics: set) -> list: class Spot(MetaFeatures): """ - This class represent a hotspot. + The 'Spot' class represents a region of genomic plasticity. + Methods: + - 'regions': the property that generates the regions in the spot. + - 'families': the property that generates the gene families in the spot. + - 'spot_2_families': add to Gene Families a link to spot. + - 'borders': Extracts all the borders of all RGPs belonging to the spot + - 'get_uniq_to_rgp': Get dictionnary with a representing RGP as key, and all identical RGPs as value + - 'get_uniq_ordered_set': Get an Iterable of all the unique syntenies in the spot + - 'get_uniq_content': Get an Iterable of all the unique rgp (in terms of gene family content) in the spot + - 'count_uniq_content': Get a counter of uniq RGP and number of identical RGP (in terms of gene family content) + - 'count_uniq_ordered_set': Get a counter of uniq RGP and number of identical RGP (in terms of synteny content) - :param spot_id: identifier of the spot + Fields: + - 'ID': Identifier of the spot """ - def __init__(self, spot_id): + def __init__(self, spot_id: int): + """Constructor method + + :param spot_id: identifier of the spot + """ super().__init__() self.ID = spot_id - self.regions = set() + self._region_getter = {} self._uniqOrderedSet = {} - self._compOrderedSet = False self._uniqContent = {} - self._compContent = False - @property - def families(self) -> set: - """Get the gene families in the RGP - - :return: Set of gene families - """ + def __setitem__(self, name, region): + if not isinstance(region, Region): + raise TypeError(f"A Region object is expected to be added to the spot. 
find type is {type(region)}") + if name in self._region_getter: + raise KeyError("A Region with the same name already exist in spot") + self._region_getter[name] = region - union = set() - for region in self.regions: - union |= set(region.families) - return union + def __getitem__(self, name): + return self._region_getter[name] - def add_regions(self, regions): - """ - Adds region(s) contained in an Iterable to the spot which all have the same bordering persistent genes - provided with 'borders' + def __delitem__(self, name): + del self._region_getter[name] - :param regions: Iterable list of RGP to add to spot - """ - if isinstance(regions, Iterable): - for region in regions: - self.add_region(region) - else: - raise Exception("The provided 'regions' variable was not an Iterable") + def __len__(self): + return len(self._region_getter) - def add_region(self, region): + @property + def regions(self) -> Generator[Region, None, None]: + """Generates the regions in the spot """ - Add one RGP to the spot + for region in self._region_getter.values(): + yield region - :param region: RGP to add to spot + @property + def families(self) -> Generator[GeneFamily, None, None]: + """Get the gene families in the RGP """ - if isinstance(region, Region): - self.regions.add(region) + families = set() + for region in self.regions: + for family in region.families: + if family not in families: + families.add(family) + yield family def spot_2_families(self): - """Add to Gene Families a link to spot""" + """Add to Gene Families a link to spot + """ for family in self.families: family.add_spot(self) - def borders(self, set_size: int, multigenics): + def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily], List[GeneFamily]]]: """ Extracts all the borders of all RGPs belonging to the spot :param set_size: number of genes to get @@ -323,9 +337,8 @@ def _get_content(self): :return: RGP groups that have identical gene content """ - if not self._compContent: + if len(self._uniqContent) == 0: self._mk_uniq_content() - self._compContent = True return self._uniqContent def _get_ordered_set(self): @@ -333,9 +346,8 @@ def _get_ordered_set(self): :return: RGP groups that have an identical synteny """ - if not self._compOrderedSet: + if len(self._uniqOrderedSet) == 0: self._mk_uniq_ordered_set_obj() - self._compOrderedSet = True return self._uniqOrderedSet def get_uniq_to_rgp(self) -> dict: @@ -345,14 +357,14 @@ def get_uniq_to_rgp(self) -> dict: """ return self._get_ordered_set() - def get_uniq_ordered_set(self): + def get_uniq_ordered_set(self) -> Set[Region]: """Get an Iterable of all the unique syntenies in the spot :return: Iterable of all the unique syntenies in the spot """ return set(self._get_ordered_set().keys()) - def get_uniq_content(self): + def get_uniq_content(self) -> Set[Region]: """ Get an Iterable of all the unique rgp (in terms of gene family content) in the spot :return: Iterable of all the unique rgp (in terms of gene family content) in the spot From 0182e30cb25553204ac299ea34a540e73dd95321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 9 Aug 2023 15:11:53 +0200 Subject: [PATCH 35/75] Test Spot class --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 4 +- ppanggolin/region.py | 64 ++++--- tests/genome/test_Contig.py | 79 -------- tests/genome/test_Gene.py | 57 ------ tests/genome/test_Organism.py | 88 --------- tests/test_Region.py | 277 ++++++++++++++++++++++++----- 7 files changed, 277 insertions(+), 294 deletions(-) delete mode 100644 
tests/genome/test_Contig.py delete mode 100644 tests/genome/test_Gene.py delete mode 100644 tests/genome/test_Organism.py diff --git a/VERSION b/VERSION index ce592228..92b8c188 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.152 +1.2.153 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 649c5b7d..288906bb 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -402,9 +402,9 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False table = h5f.root.spots spots = {} for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar): - curr_spot = spots.get(row["spot"]) + curr_spot = spots.get(int(row["spot"])) if curr_spot is None: - curr_spot = Spot(row["spot"]) + curr_spot = Spot(int(row["spot"])) spots[row["spot"]] = curr_spot region = pangenome.get_region(row["RGP"].decode()) curr_spot[region.name] = region diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 17ec19f1..a33dc7a4 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -238,21 +238,32 @@ def __init__(self, spot_id: int): :param spot_id: identifier of the spot """ + if not isinstance(spot_id, int): + raise TypeError(f"Spot identifier must be an integer. Given type is {type(spot_id)}") super().__init__() self.ID = spot_id self._region_getter = {} self._uniqOrderedSet = {} self._uniqContent = {} + def __repr__(self): + return f"Spot {self.ID} - #RGP: {len(self)}" + + def __str__(self): + return f"spot_{self.ID}" + def __setitem__(self, name, region): if not isinstance(region, Region): raise TypeError(f"A Region object is expected to be added to the spot. find type is {type(region)}") - if name in self._region_getter: + if name in self._region_getter and self[name] != region: raise KeyError("A Region with the same name already exist in spot") self._region_getter[name] = region def __getitem__(self, name): - return self._region_getter[name] + try: + return self._region_getter[name] + except KeyError: + raise KeyError(f"Region with {name} does not exist in spot") def __delitem__(self, name): del self._region_getter[name] @@ -278,6 +289,11 @@ def families(self) -> Generator[GeneFamily, None, None]: families.add(family) yield family + def number_of_families(self) -> int: + """Return the number of different families in the spot + """ + return len({family for region in self.regions for family in region.families}) + def spot_2_families(self): """Add to Gene Families a link to spot """ @@ -321,26 +337,6 @@ def _mk_uniq_ordered_set_obj(self): if z: self._uniqOrderedSet[rgp] = {rgp} - def _mk_uniq_content(self): - """cluster RGP into groups that have identical gene content""" - for rgp in self.regions: - z = True - for seen_rgp in self._uniqContent: - if rgp.families == seen_rgp.families: - z = False - self._uniqContent[seen_rgp].add(rgp) - if z: - self._uniqContent[rgp] = {rgp} - - def _get_content(self): - """Creates the _uniqContent object if it was never computed. Return it in any case - - :return: RGP groups that have identical gene content - """ - if len(self._uniqContent) == 0: - self._mk_uniq_content() - return self._uniqContent - def _get_ordered_set(self): """ Creates the _uniqSyn object if it was never computed. 
Return it in any case @@ -350,10 +346,10 @@ def _get_ordered_set(self): self._mk_uniq_ordered_set_obj() return self._uniqOrderedSet - def get_uniq_to_rgp(self) -> dict: + def get_uniq_to_rgp(self) -> Dict[Region, Set[Region]]: """ Get dictionnary with a representing RGP as key, and all identical RGPs as value - :return: Dictionnary with a representing RGP as key, and all identical RGPs as value + :return: Dictionnary with a representing RGP as key, and set of identical RGPs as value """ return self._get_ordered_set() @@ -364,6 +360,26 @@ def get_uniq_ordered_set(self) -> Set[Region]: """ return set(self._get_ordered_set().keys()) + def _mk_uniq_content(self): + """cluster RGP into groups that have identical gene content""" + for rgp in self.regions: + z = True + for seen_rgp in self._uniqContent: + if rgp.families == seen_rgp.families: + z = False + self._uniqContent[seen_rgp].add(rgp) + if z: + self._uniqContent[rgp] = {rgp} + + def _get_content(self): + """Creates the _uniqContent object if it was never computed. + + :return: RGP groups that have identical gene content + """ + if len(self._uniqContent) == 0: + self._mk_uniq_content() + return self._uniqContent + def get_uniq_content(self) -> Set[Region]: """ Get an Iterable of all the unique rgp (in terms of gene family content) in the spot diff --git a/tests/genome/test_Contig.py b/tests/genome/test_Contig.py deleted file mode 100644 index 6d7c79aa..00000000 --- a/tests/genome/test_Contig.py +++ /dev/null @@ -1,79 +0,0 @@ -#! /usr/bin/env python3 -import random - -import pytest - -from ppanggolin.genome import Contig, Gene, RNA - - -@pytest.fixture() -def o_ctg(): - return Contig("toto") - - -def test_cstr(): - name = 4 - o_ctg = Contig(name) - assert isinstance(o_ctg, Contig) - for attr in "name", "is_circular", "RNAs": - assert hasattr(o_ctg, attr) - assert o_ctg.name == name - assert o_ctg.is_circular is False - assert o_ctg.RNAs == set() - - o_ctg = Contig(name, True) - assert o_ctg.is_circular is True - - -def test_str(): - name = "ppoiu" - o_ctg = Contig(name) - assert str(o_ctg) == name - - -def test_add_rna(o_ctg): - with pytest.raises(TypeError): - o_ctg.add_rna(33) - - l_rnas = [] - for i in "abdc": - o_rna = RNA(i) - o_ctg.add_rna(o_rna) - l_rnas.append(o_rna) - assert o_ctg.RNAs == set(l_rnas) - - -@pytest.fixture() -def l_genes(): - l_genes = [] - for i in range(6, -1, -1): # Create 7 Gene - o_gene = Gene(i) - o_gene.fill_annotations(start=i*10, stop=i*10 - 1, strand='+', position=i) - l_genes.append(o_gene) - - return l_genes - - -def test_add_gene(o_ctg, l_genes): - with pytest.raises(TypeError): - o_ctg.add_gene(33) - - # gene must have a position before beeing added. - with pytest.raises(TypeError): - o_ctg.add_gene(Gene(33)) - - for o_gene in l_genes: - o_ctg.add_gene(o_gene) - - assert o_ctg.genes == sorted(l_genes, key=lambda x: x.position) - - -def test_iterator_behavior(o_ctg, l_genes): - # FIXME: is there a better way to check this ? - assert iter(o_ctg) - - for o_gene in l_genes: - o_ctg.add_gene(o_gene) - - l_ = [o_gene for o_gene in o_ctg] - assert l_ == sorted(l_genes, key=lambda x: x.start) diff --git a/tests/genome/test_Gene.py b/tests/genome/test_Gene.py deleted file mode 100644 index 45e02713..00000000 --- a/tests/genome/test_Gene.py +++ /dev/null @@ -1,57 +0,0 @@ -#! 
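
# Sketch of the RGP de-duplication helpers above: regions holding the same gene
# content are collapsed under a single representative region. The calls mirror the
# tests added later in this series; the names are illustrative only.
from ppanggolin.genome import Gene
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.region import Region, Spot

gene = Gene("gene_0")
gene.fill_annotations(start=1, stop=10, strand="+", position=0)
gene.family = GeneFamily(0, "fam_0")

rgp, rgp_copy = Region("RGP_0"), Region("sRGP_0")
rgp[gene.position] = gene
rgp_copy[gene.position] = gene         # same gene content as rgp

spot = Spot(1)
spot[rgp.name] = rgp
spot[rgp_copy.name] = rgp_copy

uniq2rgp = spot.get_uniq_to_rgp()      # {representative Region: set of identical Regions}
assert len(uniq2rgp) == 1
assert len(spot.get_uniq_content()) == 1   # one representative per distinct family content
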
/usr/bin/env python3 - -import pytest - -from ppanggolin.genome import Feature, Gene - - -def test_cstr(): - """ By checking o_gene is a Feature, I rely on Feature tests.""" - identifier = 4 - o_gene = Gene(identifier) - assert isinstance(o_gene, Feature) - assert isinstance(o_gene, Gene) - - for attr in "position", "family": - assert hasattr(o_gene, attr) - assert o_gene.position is None - assert o_gene.family is None - - -def test_str(): - identifier = "un truc" - o_gene = Gene(identifier) - assert str(o_gene) == identifier - - -@pytest.fixture() -def o_gene(): - return Gene(4) - - -def test_fill_annotations_defaults(o_gene): - o_gene.fill_annotations(start=1, stop=9, strand='+') - for attr in "position", "genetic_code": - assert hasattr(o_gene, attr) - - assert o_gene.position is None - assert o_gene.genetic_code == 11 - - -def test_fill_annotations(o_gene): - position = 44 - genetic_code = 11 - o_gene.fill_annotations(start=1, stop=9, strand='+', position=44, genetic_code=11) - assert o_gene.position == position - assert o_gene.genetic_code == genetic_code - - -def test_add_protein_error(o_gene): - with pytest.raises(TypeError): - o_gene.add_protein(42) - - -def test_add_protein(o_gene): - prot = "une jolie protéïne, même avec des caractères bizarres ;)" - o_gene.add_protein(prot) - assert o_gene.protein == prot diff --git a/tests/genome/test_Organism.py b/tests/genome/test_Organism.py deleted file mode 100644 index 18c7a2cf..00000000 --- a/tests/genome/test_Organism.py +++ /dev/null @@ -1,88 +0,0 @@ -#! /usr/bin/env python3 - -import pytest -from random import randint - -from ppanggolin.genome import Contig, Gene, Organism - - -def test_cstr(): - name = 4 - o_org = Organism(name) - assert isinstance(o_org, Organism) - assert hasattr(o_org, "name") - assert o_org.name == name - - -def test_str(): - name = "ppoiu" - o_org = Organism(name) - assert str(o_org) == name - - -@pytest.fixture() -def o_org(): - return Organism("toto") - - -def test_get_or_add_contig(o_org): - o_ctg = o_org.get_contig('i') - assert isinstance(o_ctg, Contig) - - -@pytest.fixture() -def t_filled_org(o_org): - n = 0 - for k in "azerty'": - o_ctg = o_org.get_contig(k) - for i in range(randint(0, 5)): - o_gene = Gene(k + "-" + str(i)) - o_gene.fill_annotations(6, 1, k, position=i) - o_ctg.add_gene(o_gene) - n += 1 - - return o_org, n - - -def test_families(t_filled_org): - o_filled_org, _ = t_filled_org - - # families are never set - assert o_filled_org.families == {None} - - -def test_number_of_genes(t_filled_org): - o_filled_org, n = t_filled_org - - assert o_filled_org.number_of_genes() == n - - -def get_genes(): - for i in range(randint(0, 5)): - o_gene = Gene(str(i)) - start = randint(0, 100) - stop = randint(0, 100) - o_gene.fill_annotations(start, stop, 'x', position=i) - yield o_gene - - -def test_contigs(o_org): - l_contigs = [] - for k in "azer'": - o_ctg = o_org.get_contig(k) - for o_gene in get_genes(): - o_ctg.add_gene(o_gene) - l_contigs.append(o_ctg) - - assert list(o_org.contigs) == l_contigs - - -def test_genes(o_org): - o_ctg = o_org.get_contig("scrap") - for o_gene in get_genes(): - o_ctg.add_gene(o_gene) - - assert list(o_org.genes) == o_ctg.genes - - # FIXME: find a way to test when several contigs. - # => order of contig is not predictable. diff --git a/tests/test_Region.py b/tests/test_Region.py index a1700056..e9524bf3 100644 --- a/tests/test_Region.py +++ b/tests/test_Region.py @@ -1,14 +1,58 @@ #! 
/usr/bin/env python3 +import re import pytest from typing import Generator, Set from random import randint -from ppanggolin.region import Region +from ppanggolin.region import Region, Spot from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig, Organism +@pytest.fixture +def genes() -> Generator[Set[Gene], None, None]: + """Create a set of genes to fill gene families + """ + genes = set() + for i in range(0, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + genes.add(gene) + yield genes + + +@pytest.fixture +def families(genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(2, 10) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families + + class TestRegion: """Tests for region class """ @@ -195,17 +239,6 @@ def test_is_contig_border_false(self, region): region[starter.position], region[stopper.position] = starter, stopper assert region.is_contig_border is False - @pytest.fixture - def genes(self) -> Generator[Set[Gene], None, None]: - """Create a set of genes to fill gene families - """ - genes = set() - for i in range(0, randint(11, 20)): - gene = Gene(f"gene_{str(i)}") - gene.fill_annotations(start=10*i + 1, stop=10*(i+1), strand='+', position=i, genetic_code=4) - genes.add(gene) - yield genes - def test_len(self, region, genes): for gene in genes: region[gene.position] = gene @@ -237,36 +270,6 @@ def test_equality_with_not_instance_region(self, region): with pytest.raises(TypeError): assert region == 4 - @pytest.fixture - def families(self, genes) -> Generator[Set[GeneFamily], None, None]: - """Create a set of gene families fill with genes to test edges - """ - families = set() - genes = list(genes) - nb_families = randint(2, 10) - nb_genes_per_family = len(genes) // nb_families - idx_fam = 1 - while idx_fam < nb_families: - family = GeneFamily(idx_fam, f"family_{idx_fam}") - idx_genes = 0 - while idx_genes < nb_genes_per_family: - gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] - family.add_gene(gene) - gene.family = family - idx_genes += 1 - families.add(family) - idx_fam += 1 - # last family fill with all the gene left - family = GeneFamily(idx_fam, f"family_{idx_fam}") - idx_genes = (idx_fam - 1) * nb_genes_per_family - while idx_genes < len(genes): - gene = genes[idx_genes] - family.add_gene(gene) - gene.family = family - idx_genes += 1 - families.add(family) - yield families - def test_get_gene_families(self, region, genes, families): """Tests that gene families can be retrieved from the region """ @@ -291,4 +294,192 @@ def test_get_number_of_gene_families(self, region, genes, families): # gene.fill_parents(None, contig) # region[gene.position] = gene # min_gene, max_gene = min(genes, 
key=lambda gene: gene.position), max(genes, key=lambda gene: gene.position) - # assert region.get_bordering_genes(1, {}) == [[min_gene], [max_gene]] \ No newline at end of file + # assert region.get_bordering_genes(1, {}) == [[min_gene], [max_gene]] + + +class TestSpot: + @pytest.fixture + def spot(self) -> Generator[Spot, None, None]: + """Generate a spot for test + """ + yield Spot(0) + + def test_cstr(self, spot): + assert spot.ID == 0 + assert isinstance(spot._region_getter, dict) and len(spot._region_getter) == 0 + assert isinstance(spot._uniqOrderedSet, dict) and len(spot._uniqOrderedSet) == 0 + assert isinstance(spot._uniqContent, dict) and len(spot._uniqContent) == 0 + + def test_cstr_assert_error(self): + with pytest.raises(TypeError): + Spot("spot_0") + + def test_repr(self, spot): + """Test that the canonical string representing a spot does not change + """ + assert repr(spot) == "Spot 0 - #RGP: 0" + + def test_str(self, spot): + """Test that the writing spot method does not change + """ + assert str(spot) == "spot_0" + + @pytest.fixture + def region(self) -> Generator[Region, None, None]: + """Create a region for test + """ + yield Region("RGP_0") + + def test_add_region(self, spot, region): + """Tests that adding a Region object to the Spot object works as expected + """ + spot[region.name] = region + assert region == spot._region_getter[region.name] + + def test_add_not_instance_region(self, spot): + with pytest.raises(TypeError): + spot["spot"] = "spot" + + def test_add_different_region_with_same_name(self, spot): + """Test that adding a new Region same name than another in the spot return a KeyError + """ + region_1, region_2 = Region("RGP"), Region("RGP") + gene_1, gene_2 = Gene("gene_1"), Gene("gene_2") + gene_1.fill_annotations(start=0, stop=10, strand='+', position=0) + gene_2.fill_annotations(start=0, stop=10, strand='+', position=0) + gene_1.family, gene_2.family = GeneFamily(0, "Fam_0"), GeneFamily(1, "Fam_1") + region_1[0], region_2[0] = gene_1, gene_2 + spot[region_1.name] = region_1 + with pytest.raises(KeyError): + spot[region_2.name] = region_2 + + def test_add_two_time_the_same_region(self, spot, region): + """Test that adding a new Region same name than another in the spot return a KeyError + """ + gene = Gene("gene") + gene.fill_annotations(start=0, stop=10, strand='+', position=0) + gene.family = GeneFamily(0, "Fam") + region[0] = gene + spot[region.name] = region + assert region in spot._region_getter.values() + spot[region.name] = region + assert region in spot._region_getter.values() + + def test_get_region(self, spot, region): + """Tests that getting the region in the Spot object works as expected + """ + spot[region.name] = region + assert spot[region.name] == region + + def test_get_region_not_in_spot(self, spot): + with pytest.raises(KeyError): + _ = spot["rgp"] + + def test_delete_region_in_spot(self, spot, region): + spot[region.name] = region + del spot[region.name] + assert region.name not in spot._region_getter + + def test_len(self, spot, region): + assert isinstance(len(spot), int) + assert len(spot) == 0 + spot[region.name] = region + assert len(spot) == 1 + + @pytest.fixture + def regions(self, genes): + regions = set() + genes = sorted(list(genes), key=lambda x: x.position) + nb_regions = randint(2, len(genes)) + nb_genes_per_region = len(genes) // nb_regions + idx_region = 1 + while idx_region < nb_regions: + region = Region(f"RGP_{idx_region}") + genes_counter = 0 + while genes_counter < nb_genes_per_region: + gene = genes.pop(0) + 
region[gene.position] = gene + gene.RGP = region + genes_counter += 1 + regions.add(region) + idx_region += 1 + # last region fill with all the gene left + region = Region(f"RGP_{idx_region}") + while len(genes) > 0: + gene = genes.pop(0) + region[gene.position] = gene + gene.RGP = region + regions.add(region) + yield regions + + def test_get_all_regions(self, spot, regions): + """Tests that getting all the region in the spot works as expected + """ + for region in regions: + spot[region.name] = region + assert len(spot) == len(regions) + assert all(type(region) == Region for region in spot.regions) + assert regions == set(spot.regions) + + def test_get_families(self, spot, regions, families): + """Tests that getting the gene families in the Spot object works as expected + """ + for region in regions: + spot[region.name] = region + assert set(spot.families) == families + + def test_number_of_families(self, spot, regions, families): + """Tests that getting the number of families in the spot works as expected + """ + for region in regions: + spot[region.name] = region + assert spot.number_of_families() == len(families) + + def test_add_spot_to_families(self, spot, regions, families): + """Tests that adding spot to families works as expected + """ + for region in regions: + spot[region.name] = region + spot.spot_2_families() + assert all(set(family.spots) == {spot} for family in spot.families) + + @pytest.fixture + def srgps(self, regions): + """Create a random number of same rgp for all regions + """ + srgps = set() + for region in regions: + nb_sim_rgp = randint(1, 3) + for idx_sim_rgp in range(1, nb_sim_rgp + 1): + sim_rgp = Region(f"s{region.name}.{idx_sim_rgp}") + for gene in region.genes: + sim_rgp[gene.position] = gene + srgps.add(sim_rgp) + yield srgps + + def test_get_uniq_rgp_set(self, spot, regions, families, srgps): + """Tests that getting identical rgp in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + uniq2rgp = spot.get_uniq_to_rgp() + for region, sim_rgps in uniq2rgp.items(): + assert region in regions + assert set(region.families) == set.union(*[set(srgp.families) for srgp in sim_rgps]) + + def test_get_uniq_ordered_set(self, spot, regions, families, srgps): + """Tests that getting the unique synteny in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + assert spot.get_uniq_ordered_set().issubset(regions) + + def test_get_uniq_content(self, spot, regions, families, srgps): + """Tests that getting the unique RGP in the Spot object works as expected + """ + for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict + spot[region.name] = region + assert len(spot) == len(regions) + len(srgps) + assert spot.get_uniq_ordered_set().issubset(regions) \ No newline at end of file From 31fd993194f71802682760942a05a0ade31c7322 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 9 Aug 2023 16:15:32 +0200 Subject: [PATCH 36/75] Refactor the module class --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 4 +-- ppanggolin/formats/writeBinaries.py | 4 +-- ppanggolin/formats/writeFlat.py | 6 ++--- ppanggolin/mod/module.py | 2 +- ppanggolin/region.py | 40 ++++++++++++++++++++--------- 6 files changed, 37 
insertions(+), 21 deletions(-) diff --git a/VERSION b/VERSION index 92b8c188..ad0f46ac 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.153 +1.2.154 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 288906bb..0b6107cc 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -427,9 +427,9 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal table = h5f.root.modules modules = {} # id2mod for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="module", disable=disable_bar): - curr_module = modules.get(row['module']) + curr_module = modules.get(int(row['module'])) if curr_module is None: - curr_module = Module(row['module']) + curr_module = Module(int(row['module'])) modules[row["module"]] = curr_module curr_module.add_family(pangenome.get_gene_family(row['geneFam'].decode())) for module in modules.values(): diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 06399939..34635fb0 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -626,7 +626,7 @@ def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, d h5f.remove_node("/", "modules") mod_table = h5f.create_table('/', 'modules', mod_desc(get_mod_desc(pangenome)), - expectedrows=sum([len(mod.families) for mod in pangenome.modules])) + expectedrows=sum([len(mod) for mod in pangenome.modules])) mod_row = mod_table.row for mod in tqdm(pangenome.modules, total=pangenome.number_of_modules(), unit="modules", disable=disable_bar): @@ -760,7 +760,7 @@ def getmin(arg: iter) -> float: info_group._v_attrs.numberOfSpots = pangenome.number_of_spots() if pangenome.status["modules"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfModules = pangenome.number_of_modules() - info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod.families) for mod in pangenome.modules]) + info_group._v_attrs.numberOfFamiliesInModules = sum([len(mod) for mod in pangenome.modules]) info_group._v_attrs.parameters = pangenome.parameters # saving the pangenome parameters diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index c2004cd0..ea34c209 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -758,8 +758,8 @@ def write_module_summary(output: Path, compress: bool = False): for gene in family.genes: org_dict[gene.organism].add(gene) fout.write( - f"module_{mod.ID}\t{len(mod.families)}\t{len(org_dict)}\t{partition_counter.most_common(1)[0][0]}\t" - f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod.families), 3)}\n") + f"module_{mod.ID}\t{len(mod)}\t{len(org_dict)}\t{partition_counter.most_common(1)[0][0]}\t" + f"{round((sum([len(genes) for genes in org_dict.values()]) / len(org_dict)) / len(mod), 3)}\n") fout.close() logging.getLogger("PPanGGOLiN").info(f"Done writing module summary: '{output.as_posix() + '/modules_summary.tsv'}'") @@ -797,7 +797,7 @@ def write_org_modules(output: Path, compress: bool = False): for fam in mod.families: mod_orgs |= set(fam.organisms) for org in mod_orgs: - completion = round((org.number_of_families() + len(mod.families)) / len(mod.families), 2) + completion = round((org.number_of_families() + len(mod)) / len(mod), 2) fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") fout.close() logging.getLogger("PPanGGOLiN").info( diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index d558b6c8..737fd3a7 
100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -122,7 +122,7 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int = fams = set() for mod in modules: - fams |= mod.families + fams |= set(mod.families) pangenome.add_module(mod) logging.getLogger("PPanGGOLiN").info(f"There are {len(fams)} families among {len(modules)} modules") diff --git a/ppanggolin/region.py b/ppanggolin/region.py index a33dc7a4..ef42e9d8 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -417,20 +417,39 @@ def __init__(self, module_id: int, families: set = None): 'associated_families' are gene families that you believe are associated to the module in some way, but do not define it. """ + if not isinstance(module_id, int): + raise TypeError(f"Module identifier must be an integer. Given type is {type(module_id)}") super().__init__() self.ID = module_id self._families = set() - if families is not None: - if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object. " - "Modules are only made of GeneFamily") - self._families |= set(families) + self._families_getter = {} + [self.add_family(family) for family in families] if families is not None else None self.bitarray = None + def __setitem__(self, name, family): + if not isinstance(family, GeneFamily): + raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") + if name in self._families_getter and self[name] != family: + raise KeyError("A different gene family with the same name already exist in the module") + self._families_getter[name] = family + family.add_module(self) + + def __getitem__(self, name) -> GeneFamily: + try: + return self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the module") + + def __delitem__(self, name): + del self._families_getter[name] + + def __len__(self): + return len(self._families_getter) + @property - def families(self) -> Set[GeneFamily]: - # TODO made as generator - return self._families + def families(self) -> Generator[GeneFamily, None, None]: + for family in self._families_getter.values(): + yield family def add_family(self, family: GeneFamily): """ @@ -438,10 +457,7 @@ def add_family(self, family: GeneFamily): :param family: the family that will ba added to the module """ - if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - family.add_module(self) - self._families.add(family) + self._families_getter[family.name] = family def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index From ef2375e5a1e47a25d78346ec1670afb6e5d5cb40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 10 Aug 2023 16:25:33 +0200 Subject: [PATCH 37/75] Tests for the module class --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 3 +- ppanggolin/geneFamily.py | 2 + ppanggolin/region.py | 91 ++++++++---------- tests/test_Region.py | 147 ++++++++++++++++++++++++++++- 5 files changed, 187 insertions(+), 58 deletions(-) diff --git a/VERSION b/VERSION index ad0f46ac..ecb3176a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.154 +1.2.155 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 0b6107cc..bfa51e77 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -431,7 +431,8 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal if curr_module is None: curr_module = Module(int(row['module'])) modules[row["module"]] = curr_module - curr_module.add_family(pangenome.get_gene_family(row['geneFam'].decode())) + family = pangenome.get_gene_family(row['geneFam'].decode()) + curr_module[family.name] = family for module in modules.values(): pangenome.add_module(module) pangenome.status["modules"] = "Loaded" diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 03abd2bb..0c553ad5 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -78,6 +78,8 @@ def __init__(self, family_id: int, name: str): def __repr__(self): return f"{self.ID}: {self.name}" + #TODO define __eq__ + @property def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name diff --git a/ppanggolin/region.py b/ppanggolin/region.py index ef42e9d8..60fe17d5 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -405,27 +405,52 @@ def count_uniq_ordered_set(self): class Module(MetaFeatures): - """ - This class represent a hotspot. + """The `Module` class represents a module in a pangenome analysis. + + The `Module` class has the following attributes: + - `ID`: An integer identifier for the module. + - `bitarray`: A bitarray representing the presence/absence of the gene families in an organism. - :param module_id: identifier of the module - :param families: Set of families which define the module + The `Module` class has the following methods: + - `families`: Returns a generator that yields the gene families in the module. + - `mk_bitarray`: Generates a bitarray representing the presence/absence of the gene families in an organism using the provided index. """ def __init__(self, module_id: int, families: set = None): - """ - 'core' are gene families that define the module. - 'associated_families' are gene families that you believe are associated to the module in some way, - but do not define it. + """Constructor method + + :param module_id: Module identifier + :param families: Set of families which define the module """ if not isinstance(module_id, int): raise TypeError(f"Module identifier must be an integer. 
Given type is {type(module_id)}") super().__init__() self.ID = module_id - self._families = set() self._families_getter = {} - [self.add_family(family) for family in families] if families is not None else None + if families is not None: + for family in families: + self[family.name] = family self.bitarray = None + def __repr__(self): + return f"Module {self.ID} - #Families: {len(self)}" + + def __str__(self): + return f"module_{self.ID}" + + def __hash__(self): + return id(self) + + def __len__(self): + return len(self._families_getter) + + def __eq__(self, other: Module): + if not isinstance(other, Module): + raise TypeError(f"Another module is expected to be compared to the first one. You give a {type(other)}") + if set(self.families) == set(other.families): + return True + else: + return False + def __setitem__(self, name, family): if not isinstance(family, GeneFamily): raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") @@ -441,54 +466,16 @@ def __getitem__(self, name) -> GeneFamily: raise KeyError(f"There isn't gene family with the name {name} in the module") def __delitem__(self, name): - del self._families_getter[name] - - def __len__(self): - return len(self._families_getter) + try: + del self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the module") @property def families(self) -> Generator[GeneFamily, None, None]: for family in self._families_getter.values(): yield family - def add_family(self, family: GeneFamily): - """ - Add a family to the module - - :param family: the family that will ba added to the module - """ - self._families_getter[family.name] = family - - def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): - """Produces a bitarray representing the presence / absence of families in the organism using the provided index - The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
- - :param partition: filter module by partition - :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` - """ - self.bitarray = gmpy2.xmpz() # pylint: disable=no-member - if partition == 'all': - logging.getLogger("PPanGGOLiN").debug("all") - for fam in self.families: - self.bitarray[index[fam]] = 1 - elif partition == 'persistent': - logging.getLogger("PPanGGOLiN").debug("persistent") - for fam in self.families: - if fam.named_partition in ['persistent']: - self.bitarray[index[fam]] = 1 - elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug("shell, cloud") - for fam in self.families: - if fam.named_partition == partition: - self.bitarray[index[fam]] = 1 - elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug("accessory") - for fam in self.families: - if fam.named_partition in ['shell', 'cloud']: - self.bitarray[index[fam]] = 1 - else: - raise Exception("There is not any partition corresponding please report a github issue") - class GeneContext: """ diff --git a/tests/test_Region.py b/tests/test_Region.py index e9524bf3..1e99268c 100644 --- a/tests/test_Region.py +++ b/tests/test_Region.py @@ -4,8 +4,9 @@ import pytest from typing import Generator, Set from random import randint +import gmpy2 -from ppanggolin.region import Region, Spot +from ppanggolin.region import Region, Spot, Module from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig, Organism @@ -52,6 +53,36 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: families.add(family) yield families +@pytest.fixture +def organisms(genes) -> Generator[Set[Organism], None, None]: + """Create a set of organism object for test + + :return: Generator with set of organism object + """ + orgs = set() + genes = list(genes) + nb_organisms = randint(2, 10) + nb_genes_per_organism = len(genes) // nb_organisms + idx_org = 1 + while idx_org < nb_organisms: + org = Organism(f"organism_{idx_org}") + idx_genes = 0 + while idx_genes < nb_genes_per_organism: + gene = genes[(idx_org - 1) * nb_genes_per_organism + idx_genes] + gene.fill_parents(organism=org) + idx_genes += 1 + orgs.add(org) + idx_org += 1 + # last organism fill with all the gene left + org = Organism(f"organism_{idx_org}") + idx_genes = (idx_org - 1) * nb_genes_per_organism + while idx_genes < len(genes): + gene = genes[idx_genes] + gene.fill_parents(organism=org) + idx_genes += 1 + orgs.add(org) + yield orgs + class TestRegion: """Tests for region class @@ -310,7 +341,7 @@ def test_cstr(self, spot): assert isinstance(spot._uniqOrderedSet, dict) and len(spot._uniqOrderedSet) == 0 assert isinstance(spot._uniqContent, dict) and len(spot._uniqContent) == 0 - def test_cstr_assert_error(self): + def test_cstr_type_error(self): with pytest.raises(TypeError): Spot("spot_0") @@ -354,7 +385,7 @@ def test_add_different_region_with_same_name(self, spot): spot[region_2.name] = region_2 def test_add_two_time_the_same_region(self, spot, region): - """Test that adding a new Region same name than another in the spot return a KeyError + """Test that adding a two time the same region is working as expected """ gene = Gene("gene") gene.fill_annotations(start=0, stop=10, strand='+', position=0) @@ -482,4 +513,112 @@ def test_get_uniq_content(self, spot, regions, families, srgps): for region in list(regions) + list(srgps): # With lists provide sRGP to be key RGP in dict spot[region.name] = region assert len(spot) == len(regions) + len(srgps) - assert 
spot.get_uniq_ordered_set().issubset(regions) \ No newline at end of file + assert spot.get_uniq_ordered_set().issubset(regions) + + +class TestModule: + @pytest.fixture + def module(self): + yield Module(0) + + def test_cstr(self, module): + """Test that a module is construct as expected + """ + assert module.ID == 0 + assert isinstance(module._families_getter, dict) and module._families_getter == {} + assert module.bitarray is None + + def test_cstr_type_error(self): + """Test that if the identifier is not an integer it raises a TypeError + """ + with pytest.raises(TypeError): + Spot("mod_0") + + def test_repr(self, module): + """Test that the canonical string representing a module does not change + """ + assert repr(module) == "Module 0 - #Families: 0" + + def test_str(self, module): + """Test that the writing spot method does not change + """ + assert str(module) == "module_0" + + def test_hash(self, module): + """Test that len method work as expected + """ + assert isinstance(hash(module), int) + + def test_len(self, module): + """Test that len method work as expected + """ + module._families_getter["fam"] = GeneFamily(randint(1,5), "fam") + assert isinstance(len(module), int) + assert len(module) == 1 + + def test_eq(self, families): + module1, module2, module3 = Module(1), Module(2), Module(3) + for family in families: + module1[family.name] = family + module2[family.name] = family + assert module1 == module2 + assert module1 != module3 + + def test_eq_with_is_not_instance_module(self, module): + with pytest.raises(TypeError): + module == 4 + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a basic gene family for test + """ + yield GeneFamily(0, 'family') + + def test_add_family(self, module, family): + """Tests that a gene family can be added to the module + """ + module[family.name] = family + assert len(module._families_getter) == 1 + assert module._families_getter['family'] == family + + def test_add_different_families_with_same_name(self, module): + """Test that adding a new family with same name than another in the module return a KeyError + """ + family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + module[family_1.name] = family_1 + with pytest.raises(KeyError): + module[family_2.name] = family_2 + + def test_add_two_time_the_same_family(self, module, family): + """Test that adding a two time the same family is working as expected + """ + module[family.name] = family + assert family in module._families_getter.values() + module[family.name] = family + assert family in module._families_getter.values() + + def test_get_family(self, module, family): + """Tests that a gene family can be retrieved from the module + """ + module[family.name] = family + assert module['family'] == family + + def test_get_family_which_does_not_exist(self, module): + """Tests that if a gene family does not exist it raises a KeyError""" + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + _ = module[fam.name] + + def test_delete_family(self, module, family): + """Tests that a gene family can be deleted from the module + """ + module[family.name] = family + del module['family'] + assert len(module) == 0 + + def test_delete_family_which_does_not_exist(self, module): + """Tests that if a gene family does not exist it raises a KeyError + """ + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + del module[fam.name] From 72feee792a1a5d8ec2dcb8d9595b2d402465e683 Mon Sep 
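
# Sketch of the refactored dict-like Module interface above: gene families are
# stored by name and two modules compare equal when they carry the same set of
# families. The names below are illustrative only.
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.region import Module

fam_a, fam_b = GeneFamily(1, "fam_a"), GeneFamily(2, "fam_b")

mod_1 = Module(1, families={fam_a, fam_b})
mod_2 = Module(2)
mod_2[fam_a.name] = fam_a              # families are added through __setitem__
mod_2[fam_b.name] = fam_b

assert len(mod_1) == len(mod_2) == 2
assert set(mod_1.families) == {fam_a, fam_b}
assert mod_1 == mod_2                  # equality compares family content, not the ID
assert str(mod_2) == "module_2"
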
17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 28 Aug 2023 16:36:29 +0200 Subject: [PATCH 38/75] Refactor and test GeneContext class --- VERSION | 2 +- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/region.py | 96 ++++++++++++++------- tests/test_Region.py | 110 +++++++++++++++++++++++- 4 files changed, 175 insertions(+), 35 deletions(-) diff --git a/VERSION b/VERSION index ecb3176a..5fdc6d63 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.155 +1.2.156 diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 47512d56..803e5c48 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -82,7 +82,7 @@ def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: families = set() for gene_context in common_components: - families |= gene_context.families + families |= set(gene_context.families) if len(families) != 0: export_to_dataframe(families, common_components, fam_2_seq, output) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 60fe17d5..1cba6fef 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -425,11 +425,7 @@ def __init__(self, module_id: int, families: set = None): raise TypeError(f"Module identifier must be an integer. Given type is {type(module_id)}") super().__init__() self.ID = module_id - self._families_getter = {} - if families is not None: - for family in families: - self[family.name] = family - self.bitarray = None + self._families_getter = {family.name: family for family in families} if families is not None else {} def __repr__(self): return f"Module {self.ID} - #Families: {len(self)}" @@ -446,10 +442,7 @@ def __len__(self): def __eq__(self, other: Module): if not isinstance(other, Module): raise TypeError(f"Another module is expected to be compared to the first one. You give a {type(other)}") - if set(self.families) == set(other.families): - return True - else: - return False + return set(self.families) == set(other.families) def __setitem__(self, name, family): if not isinstance(family, GeneFamily): @@ -467,38 +460,79 @@ def __getitem__(self, name) -> GeneFamily: def __delitem__(self, name): try: - del self._families_getter[name] + fam = self._families_getter[name] except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the module") + else: + del self._families_getter[name] + fam._modules.remove(self) @property def families(self) -> Generator[GeneFamily, None, None]: - for family in self._families_getter.values(): - yield family + """Generator of the family in the module + """ + yield from self._families_getter.values() class GeneContext: - """ - A class used to represent a gene context + """Summary + The GeneContext class represents a gene context, which is a collection of gene families related to a specific genomic context. - :param gc_id : identifier of the Gene context - :param families: Gene families related to the GeneContext - """ + Methods + families: Generator that yields all the gene families in the gene context. + Fields + ID: The identifier of the gene context. + """ def __init__(self, gc_id: int, families: set = None): - self.ID = gc_id - self.families = set() - if families is not None: - if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception("You provided elements that were not GeneFamily object." 
- " GeneContext are only made of GeneFamily") - self.families |= set(families) - - def add_family(self, family: GeneFamily): - """ - Allow to add one family in the GeneContext - :param family: family to add + """Constructor method + + :param gc_id : identifier of the Gene context + :param families: Gene families related to the GeneContext """ + if not isinstance(gc_id, int): + raise TypeError(f"Gene context identifier must be an integer. Given type is {type(gc_id)}") + self.ID = gc_id + self._families_getter = {family.name: family for family in families} if families is not None else {} + + def __repr__(self): + return f"Context {self.ID} - #Families: {len(self)}" + + def __str__(self): + return f"context_{self.ID}" + + def __hash__(self): + return id(self) + + def __len__(self): + return len(self._families_getter) + + def __eq__(self, other: GeneContext): + if not isinstance(other, GeneContext): + raise TypeError(f"Another context is expected to be compared to the first one. You give a {type(other)}") + return set(self.families) == set(other.families) + + def __setitem__(self, name, family): if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. Modules are only made of GeneFamily") - self.families.add(family) + raise TypeError(f"A gene family is expected to be added to gene context. Given type was {type(family)}") + if name in self._families_getter and self[name] != family: + raise KeyError("A different gene family with the same name already exist in the gene context") + self._families_getter[name] = family + + def __getitem__(self, name) -> GeneFamily: + try: + return self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the gene context") + + def __delitem__(self, name): + try: + del self._families_getter[name] + except KeyError: + raise KeyError(f"There isn't gene family with the name {name} in the gene context") + + @property + def families(self): + """Generator of the family in the context + """ + yield from self._families_getter.values() diff --git a/tests/test_Region.py b/tests/test_Region.py index 1e99268c..41820e30 100644 --- a/tests/test_Region.py +++ b/tests/test_Region.py @@ -6,7 +6,7 @@ from random import randint import gmpy2 -from ppanggolin.region import Region, Spot, Module +from ppanggolin.region import Region, Spot, Module, GeneContext from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Contig, Organism @@ -526,7 +526,6 @@ def test_cstr(self, module): """ assert module.ID == 0 assert isinstance(module._families_getter, dict) and module._families_getter == {} - assert module.bitarray is None def test_cstr_type_error(self): """Test that if the identifier is not an integer it raises a TypeError @@ -622,3 +621,110 @@ def test_delete_family_which_does_not_exist(self, module): fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") with pytest.raises(KeyError): del module[fam.name] + + +class TestGeneContext: + @pytest.fixture + def context(self): + yield GeneContext(0) + + def test_cstr(self, context): + """Test that a gene context is construct as expected + """ + assert context.ID == 0 + assert isinstance(context._families_getter, dict) and context._families_getter == {} + + def test_cstr_type_error(self): + """Test that if the identifier is not an integer it raises a TypeError + """ + with pytest.raises(TypeError): + Spot("gc_0") + + def test_repr(self, context): + """Test that the canonical string representing a context does not change + """ 
+ assert repr(context) == "Context 0 - #Families: 0" + + def test_str(self, context): + """Test that the writing spot method does not change + """ + assert str(context) == "context_0" + + def test_hash(self, context): + """Test that len method work as expected + """ + assert isinstance(hash(context), int) + + def test_len(self, context): + """Test that len method work as expected + """ + context._families_getter["fam"] = GeneFamily(randint(1, 5), "fam") + assert isinstance(len(context), int) + assert len(context) == 1 + + def test_eq(self, families): + context1, context2, context3 = GeneContext(1), GeneContext(2), GeneContext(3) + for family in families: + context1[family.name] = family + context2[family.name] = family + assert context1 == context2 + assert context1 != context3 + + def test_eq_with_is_not_instance_context(self, context): + with pytest.raises(TypeError): + context == 4 + + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a basic gene family for test + """ + yield GeneFamily(0, 'family') + + def test_add_family(self, context, family): + """Tests that a gene family can be added to the context + """ + context[family.name] = family + assert len(context._families_getter) == 1 + assert context._families_getter['family'] == family + + def test_add_different_families_with_same_name(self, context): + """Test that adding a new family with same name than another in the context return a KeyError + """ + family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') + context[family_1.name] = family_1 + with pytest.raises(KeyError): + context[family_2.name] = family_2 + + def test_add_two_time_the_same_family(self, context, family): + """Test that adding a two time the same family is working as expected + """ + context[family.name] = family + assert family in context._families_getter.values() + context[family.name] = family + assert family in context._families_getter.values() + + def test_get_family(self, context, family): + """Tests that a gene family can be retrieved from the context + """ + context[family.name] = family + assert context['family'] == family + + def test_get_family_which_does_not_exist(self, context): + """Tests that if a gene family does not exist it raises a KeyError""" + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + _ = context[fam.name] + + def test_delete_family(self, context, family): + """Tests that a gene family can be deleted from the context + """ + context[family.name] = family + del context['family'] + assert len(context) == 0 + + def test_delete_family_which_does_not_exist(self, context): + """Tests that if a gene family does not exist it raises a KeyError + """ + fam = GeneFamily(randint(1, 20), f"fam{randint(1, 20)}") + with pytest.raises(KeyError): + del context[fam.name] From 947a89815795d1ed3ae887b54042f5ec2495d619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 29 Aug 2023 14:36:34 +0200 Subject: [PATCH 39/75] Refactor and test metadata --- VERSION | 2 +- ppanggolin/formats/writeMetadata.py | 4 +- ppanggolin/metadata.py | 180 +++++++++++++------- ppanggolin/pangenome.py | 2 +- tests/test_GeneFamily.py | 2 +- tests/test_metadata.py | 253 ++++++++++++++++------------ 6 files changed, 267 insertions(+), 176 deletions(-) diff --git a/VERSION b/VERSION index 5fdc6d63..7d8037d2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.156 +1.2.157 diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 
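
# Sketch of the refactored GeneContext container tested above: like Module, it
# keeps gene families indexed by name and compares by family content. Names are
# illustrative only.
from ppanggolin.geneFamily import GeneFamily
from ppanggolin.region import GeneContext

fam = GeneFamily(0, "fam_0")
context = GeneContext(0, families={fam})

assert str(context) == "context_0"
assert len(context) == 1
assert context[fam.name] == fam        # lookup by family name
assert set(context.families) == {fam}
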
974a6a02..139ef66c 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -114,7 +114,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str, else: raise Exception("Unexpected attribute. A recent change could create this error." " Please report the error on our github.") - for metadata in element.get_source(source=source): + for metadata in element[source]: for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -160,7 +160,7 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): - for metadata in element.get_source(source=source): + for metadata in element[source]: for desc in source_table.colnames: if desc == "ID": if hasattr(element, 'name') and len(element.name) > 0: diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index e1386d4c..31cb06a5 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -2,36 +2,39 @@ # coding: utf8 # default libraries -from typing import Generator, List, Tuple, Union +from typing import Generator, List, Tuple, Union, Any # installed libraries from pandas import isna class Metadata: - """The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. It allows the creation of metadata objects with different attributes and values, and provides methods to access and manipulate these attributes. The class has a constructor method that initializes the Metadata object with a source and a dictionary of attributes and values. The class also has methods to get the value of a specific attribute, return a list of all the attributes, and join a list of strings into a single string separated by commas. The class has two fields: source, which represents the source of the metadata, and **kwargs, which is a dictionary of attributes and values representing the metadata. + """The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. Methods: - - __init__(self, source: str, **kwargs): Constructor method that initializes the Metadata object with a source and a dictionary of attributes and values. - - number_of_attribute(self): Returns the number of attributes in the Metadata object. - - get(self, name: str, skip_error: bool = False): Returns the value of a specific attribute in the Metadata object, or None if the attribute does not exist. If skip_error is True, it does not raise an AttributeError if the attribute does not exist. - - fields(self) -> List[str]: Returns a list of all the attributes in the Metadata object. - - _join_list(attr_list: Union[str, List[str]]): Joins a list of strings into a single string separated by commas. + - number_of_attribute: Returns the number of attributes in the Metadata object. + - get: Returns the value of a specific attribute, or None if the attribute does not exist. + - fields: Returns a list of all the attributes in the Metadata object. + Fields: - source: A string representing the source of the metadata. - **kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. 
""" def __init__(self, source: str, **kwargs): - """ - The Metadata class represents a metadata link to genes, gene families, organisms, regions, spot or modules. - It allows the creation of metadata objects with different attributes and values, and provides methods to access - and manipulate these attributes. Add attributes and values representing the metadata as mush as you want. - The attributes can be any string, and the values can be any type except None or NaN. + """Constructor Method :param source: A string representing the source of the metadata. :param kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. + + :raises TypeError: Source name is not a string + :raises Exception: Source name is empty + :raises Exception: Metadata is empty """ + if not isinstance(source, str): + raise TypeError(f"Metadata source name must be a string. Given type {type(source)}") + if source == "": + raise ValueError("Metadata source name should not be empty.") self.source = source if len(kwargs) == 0: raise Exception(f"No metadata given for source: {source}") @@ -41,20 +44,36 @@ def __init__(self, source: str, **kwargs): if value is not None and not isna(value): setattr(self, attr, value) - def number_of_attribute(self): - return len(self.__dict__.keys()) + def __repr__(self): + return f"Metadata source: {self.source}, #attr: {len(self)}" - def get(self, name: str): - try: - value = self.__getattribute__(name) - except AttributeError as attr_error: - raise AttributeError(attr_error) - else: - return value + def __len__(self) -> int: + """Get the number of attribute links to the metadata object + + :return: Number of fields (atribute) of the metadata + """ + return len(self.__dict__) - 1 + + def __getattr__(self, item: str) -> Any: + """Get the value corresponding to the given attibute + + :return: Value of the attribute + + :raises AttributeError: The attribute does not exist in the metadata + """ + if item not in self.__dict__: + raise AttributeError(f"{item} is not an attribute of metadata") + return self.__dict__[item] @property def fields(self) -> List[str]: - return list(self.__dict__.keys()) + """Get all the field of the metadata + + :return: List of the field in the metadata + """ + fields = list(self.__dict__) + fields.remove("source") + return fields @staticmethod def _join_list(attr_list: Union[str, List[str]]): @@ -63,71 +82,106 @@ def _join_list(attr_list: Union[str, List[str]]): class MetaFeatures: """ - This represents a methods to access metadata in genes, gene families, organisms, regions, spot or modules + The MetaFeatures class provides methods to access and manipulate metadata in all ppanggolin classes. + + Methods + metadata: Generate all metadata from all sources. + sources: Generate all metadata sources. + get_metadata: Get metadata based on attribute values. + max_metadata_by_source: Gets the source with the maximum number of metadata and the corresponding count. 
""" + def __init__(self): - self._metadataGetter = {} + """Constructor method + """ + self._metadata_getter = {} + + def __setitem__(self, source: str, metadata: Metadata): + """Add metadata to metadata getter + + :param source: Name of the metadata source + :param metadata: metadata value to add for the source + + :raises AssertionError: Source or metadata is not with the correct type + """ + assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + + self._metadata_getter[source] = [metadata] + + def __getitem__(self, source: str) -> Union[List[Metadata], None]: + """Get all the metadata feature corresponding to the source + + :param source: Name of the source to get + + :return: List of metadata corresponding to the source + + :raises AssertionError: Source is not with the correct type + """ + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + return self._metadata_getter.get(source) # if source in _metadata_getter return value else None + + def __delitem__(self, source: str): + """Remove a source from the feature + + :param source: Name of the source to delete + + :raises AssertionError: Source is not with the correct type + :raises KeyError: Source does not belong in the MetaFeature + """ + assert isinstance(source, str), f"Source is not a string but with {type(source)}" + try: + del self._metadata_getter[source] + except KeyError: + raise KeyError(f"Given source: {source} is not in {type(self)}") + + def add_metadata(self, source, metadata): + """Add metadata to metadata getter + + :param source: Name of the metadata source + :param metadata: metadata value to add for the source + """ + if self[source] is None: + self[source] = metadata + else: + self[source].append(metadata) @property def metadata(self) -> Generator[Metadata, None, None]: - """Generate metadatas in gene families + """Generate metadata in gene families - :return: Generator with all metadata from all sources + :return: Metadata from all sources """ - for meta_list in self._metadataGetter.values(): + for meta_list in self._metadata_getter.values(): for metadata in meta_list: yield metadata @property - def sources(self) -> List[str]: + def sources(self) -> Generator[str, None, None]: """ Get all metadata source in gene family - :return: List of metadata source - """ - return list(self._metadataGetter.keys()) - - def get_source(self, source: str) -> Union[List[Metadata], None]: - """ Get the metadata for a specific source in gene family - - :param source: Name of the source - - :return: All the metadata from the source if exist else None + :return: Metadata source """ - return self._metadataGetter[source] if source in self.sources else None + yield from self._metadata_getter.keys() def get_metadata(self, **kwargs) -> Generator[Metadata, None, None]: """Get metadata by one or more attribute - :return: metadata searched + :return: Metadata searched """ for metadata in self.metadata: for attr, value in kwargs.items(): if hasattr(metadata, attr): - if metadata.__getattribute__(attr) in value or metadata.__getattribute__(attr) == value: + # BUG If value is a list, the join block detection. 
+ # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: yield metadata - def add_metadata(self, source: str, metadata: Metadata): - """ Add metadata - - :param source: Name of database source - :param metadata: Identifier of the metadata - """ - assert isinstance(metadata, Metadata) - source_annot = self.get_source(source) - if source_annot is not None: - self._metadataGetter[source].append(metadata) - else: - self._metadataGetter[source] = [metadata] - def max_metadata_by_source(self) -> Tuple[str, int]: - """Get the maximum number of annotation for one source - :return: Name of the source with the maximum annotation and the number of annotation corresponding + """Get the maximum number of metadata for one source + + :return: Name of the source with the maximum annotation and the number of metadata corresponding """ - max_meta = 0 - max_source = None - for source, metadata in self._metadataGetter.items(): - if len(metadata) > max_meta: - max_meta = len(metadata) - max_source = source - return max_source, max_meta \ No newline at end of file + max_source, max_meta = max(self._metadata_getter.items(), key=lambda x: len(x[1])) + return max_source, len(max_meta) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 7abc0c2b..f28b026e 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -660,5 +660,5 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): - if elem.get_source(source) is not None: + if elem[source] is not None: yield elem diff --git a/tests/test_GeneFamily.py b/tests/test_GeneFamily.py index 9a108f49..d4342334 100644 --- a/tests/test_GeneFamily.py +++ b/tests/test_GeneFamily.py @@ -22,7 +22,7 @@ def test_create_gene_family(self): family = GeneFamily(1, 'test') assert isinstance(family, GeneFamily) assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", "sequence", "partition", - "_spots", "_modules", "bitarray", "_metadataGetter"] for attr in + "_spots", "_modules", "bitarray", "_metadata_getter"] for attr in family.__dict__) # Check that no attribute was added else it should be tested assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", "sequence", "partition", "_spots", "_modules", diff --git a/tests/test_metadata.py b/tests/test_metadata.py index b95a1833..c26131d9 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,126 +1,163 @@ #! 
/usr/bin/env python3 +# coding: utf8 import pytest -from random import choices, randint, sample +from random import randint from typing import Generator, Set from ppanggolin.metadata import Metadata, MetaFeatures class TestMetadata: - def test_create_metadata_with_attributes(self): - metadata = Metadata('source', attribute='value') - assert metadata.__getattribute__("attribute") == 'value' - assert metadata.source == 'source' - - def test_create_metadata_with_no_attributes(self): + @pytest.fixture + def metadata(self) -> Generator[Metadata, None, None]: + """Create a simple metadata + """ + yield Metadata("source", attribute1="value1", attribute2=["value2", "value3"]) + + def test_constructor(self, metadata): + """Tests that the Metadata object is created successfully with a valid source and attributes + """ + assert metadata.source == "source" + assert metadata.attribute1 == "value1" + assert metadata.attribute2 == "value2,value3" + + def test_constructor_with_empty_source_name(self): + """Tests that a ValueError is raised when creating a Metadata object with an empty source name + """ + with pytest.raises(ValueError): + Metadata("", attribute="value") + + def test_constructor_with_non_string_source_name(self): + """Tests that a TypeError is raised when creating a Metadata object with a non-string source name + """ + with pytest.raises(TypeError): + Metadata(123, attribute="value") + + def test_constructor_with_no_attributes(self): + """Tests that an Exception is raised when creating a Metadata object with no attributes + """ with pytest.raises(Exception): - Metadata(source='source') + Metadata("source") - def test_get_existing_attribute_value(self): - metadata = Metadata('source', attribute='value') - assert metadata.get('attribute') == 'value' + def test_get_existing_attribute_value(self, metadata): + """Tests that the value of an existing attribute is returned correctly + """ + assert metadata.attribute1 == "value1" - def test_get_non_existing_attribute_value(self): - metadata = Metadata('source', attribute='value') + def test_get_non_existing_attribute_value(self, metadata): + """Tests that an AttributeError is raised when getting the value of a non-existing attribute + """ with pytest.raises(AttributeError): - metadata.get('non_existing_attribute') - - def test_get_all_attributes(self): - metadata = Metadata('source', attribute='value', another_attribute='another_value') - assert metadata.fields == ['source', 'attribute', 'another_attribute'] + _ = metadata.non_existing_attribute - def test_join_list_attribute(self): - metadata = Metadata('source', attribute=['value1', 'value2']) - assert metadata.get("attribute") == 'value1,value2' + def test_attribute_fields(self, metadata): + """Tests that the 'fields' method returns a list of all the attributes in the Metadata object + """ + assert metadata.fields == ["attribute1", "attribute2"] - def test_metadata_number_of_attributes(self): - metadata = Metadata('source', attribute='value', another_attribute='another_value') - assert metadata.number_of_attribute() == 3 + def test_length(self, metadata): + """Tests that the number_of_attribute method returns the correct number of attributes in the Metadata object + """ + assert isinstance(len(metadata), int) + assert len(metadata) == 2 class TestMetaFeatures: - # Tests that metadata can be added to MetaFeatures and checking if it was added successfully - def test_add_metadata(self): - meta_features = MetaFeatures() - metadata = Metadata('source1', attribute1='value1') - 
meta_features.add_metadata('source1', metadata) - assert meta_features._metadataGetter['source1'] == [metadata] - - # Tests that metadata can be gotten from MetaFeatures by source and checking if it returns the correct metadata - def test_get_metadata_by_source(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='value1') - metadata2 = Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert meta_features.get_source('source1') == [metadata1] - assert meta_features.get_source('source2') == [metadata2] - - # Tests that metadata can be gotten from MetaFeatures by attribute and checking if it returns the correct metadata - def test_get_metadata_by_attribute(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='value1') - metadata2 = Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert list(meta_features.get_metadata(attribute1='value1')) == [metadata1] - - # Tests that all metadata can be gotten from MetaFeatures and checking if it returns all metadata - def test_get_all_metadata(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='value1') - metadata2 = Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert list(meta_features.metadata) == [metadata1, metadata2] - - # Tests that all metadata sources can be gotten from MetaFeatures and checking if it returns all sources - def test_get_all_metadata_sources(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='value1') - metadata2 = Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert meta_features.sources == ['source1', 'source2'] - - # Tests that the source with the maximum number of metadata can be gotten from MetaFeatures and checking if it returns the correct source and number - def test_get_source_with_maximum_metadata(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='value1') - metadata2 = Metadata('source1', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source1', metadata2) - assert meta_features.max_metadata_by_source() == ('source1', 2) - - # Tests that getting metadata from MetaFeatures with non-existent source returns None - def test_get_metadata_with_non_existent_source_returns_none(self): - meta_features = MetaFeatures() - metadata = Metadata('source1', attribute1='value1') - meta_features.add_metadata('source1', metadata) - assert meta_features.get_source('source2') == None - - # Tests that getting metadata from MetaFeatures with non-existent attribute returns None - def test_get_metadata_with_non_existent_attribute_returns_none(self): - meta_features = MetaFeatures() - metadata = Metadata('source1', attribute1='value1') - meta_features.add_metadata('source1', metadata) - assert list(meta_features.get_metadata(attribute2='value2')) == [] - - # Tests that getting metadata from MetaFeatures with empty attribute value returns the correct metadata - def test_get_metadata_with_empty_attribute_value_returns_correct_metadata(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1='') - metadata2 = 
Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert list(meta_features.get_metadata(attribute1='')) == [metadata1] - - # Tests that getting metadata from MetaFeatures with list attribute value returns the correct metadata - def test_get_metadata_with_list_attribute_value_returns_correct_metadata(self): - meta_features = MetaFeatures() - metadata1 = Metadata('source1', attribute1=['value1', 'value2']) - metadata2 = Metadata('source2', attribute2='value2') - meta_features.add_metadata('source1', metadata1) - meta_features.add_metadata('source2', metadata2) - assert list(meta_features.get_metadata(attribute1='value1,value2')) == [metadata1] + @pytest.fixture + def metadata(self) -> Generator[Set[Metadata], None, None]: + """Create a random number of metadata + + :return: Set of metadata + """ + metadata = set() + for i in range(randint(5, 10)): + metadata.add(Metadata(f"source_{i}", **{f"attr_{j}": j for j in range(randint(1, 5))})) + yield metadata + + @pytest.fixture + def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: + """Create a simple metafeature object + + :return: metafeature fill with metadata + """ + metafeatures = MetaFeatures() + for meta in metadata: + metafeatures[meta.source] = meta + yield metafeatures + + def test_set_metadata_to_metadata_getter(self, metafeatures, metadata): + """Tests that metadata can be added to the metadata getter + """ + assert all(metafeatures._metadata_getter[meta.source] == [meta] for meta in metadata) + + def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): + """Tests that all the metadata features corresponding to a source can be retrieved + """ + assert all(metafeatures[meta.source] == [meta] for meta in metadata) + + def test_remove_source_from_feature(self, metafeatures): + """Tests that a source can be removed from the feature + """ + metadata = Metadata("source_del", attribute1="value") + metafeatures["source_del"] = metadata + del metafeatures["source_del"] + assert metafeatures["source_del"] is None + + def test_add_metadata_feature(self, metafeatures): + """Tests that adding metadata works as expected + """ + metadata1 = Metadata("source_add", attribute1="value1") + metadata2 = Metadata("source_add", attribute2="value2") + metafeatures.add_metadata("source_add", metadata1) + metafeatures.add_metadata("source_add", metadata2) + assert metafeatures["source_add"] == [metadata1, metadata2] + + def test_generate_metadata_in_gene_families(self, metafeatures, metadata): + """Tests that metadata can be generated in gene families + """ + assert set(metafeatures.metadata) == metadata + + def test_generate_all_metadata_sources(self, metafeatures, metadata): + """Tests that all metadata sources can be generated + """ + assert list(metafeatures.sources) == [meta.source for meta in metadata] + + def test_get_metadata_by_attribute_values(self, metafeatures): + """Tests that metadata can be retrieved based on attribute values + """ + meta = Metadata("source_test", attribute1="value_to_retrieve") + # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) + metafeatures[meta.source] = meta + # metafeatures[meta_list.source] = meta_list + assert list(metafeatures.get_metadata(attribute1="value_to_retrieve")) == [meta] + # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] + + def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metadata): + """Tests 
that the maximum number of metadata for one source can be retrieved + """ + metadata1 = Metadata("source_max", attribute1="value1") + metadata2 = Metadata("source_max", attribute2="value2") + metafeatures.add_metadata("source_max", metadata1) + metafeatures.add_metadata("source_max", metadata2) + assert metafeatures.max_metadata_by_source() == ("source_max", 2) + + def test_metadata_is_not_with_type_metadata(self, metafeatures): + """Tests that an AssertionError is raised when metadata is not with type Metadata + """ + with pytest.raises(AssertionError): + metafeatures["source1"] = "not_metadata" + + def test_source_is_not_a_string(self, metafeatures): + """Tests that an AssertionError is raised when the source is not a string + """ + + metadata = Metadata("source1", attribute1="value1") + with pytest.raises(AssertionError): + metafeatures[1] = metadata + + def test_source_or_metadata_is_not_with_correct_type(self, metafeatures, metadata): + """Tests that an AssertionError is raised when the source or metadata is not with the correct type + """ + with pytest.raises(AssertionError): + metafeatures[1] = "not_metadata" From 9608e00cf8b82b3bc00ae69c3cea407e43f7637c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 29 Aug 2023 14:39:22 +0200 Subject: [PATCH 40/75] Rename test files --- VERSION | 2 +- tests/{test_Edge.py => tests_Edge.py} | 0 tests/{test_GeneFamily.py => tests_GeneFamily.py} | 0 tests/{test_metadata.py => tests_Metadata.py} | 0 tests/{test_Pangenome.py => tests_Pangenome.py} | 0 tests/{test_Region.py => tests_Region.py} | 0 6 files changed, 1 insertion(+), 1 deletion(-) rename tests/{test_Edge.py => tests_Edge.py} (100%) rename tests/{test_GeneFamily.py => tests_GeneFamily.py} (100%) rename tests/{test_metadata.py => tests_Metadata.py} (100%) rename tests/{test_Pangenome.py => tests_Pangenome.py} (100%) rename tests/{test_Region.py => tests_Region.py} (100%) diff --git a/VERSION b/VERSION index 7d8037d2..e8574990 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.157 +1.2.158 diff --git a/tests/test_Edge.py b/tests/tests_Edge.py similarity index 100% rename from tests/test_Edge.py rename to tests/tests_Edge.py diff --git a/tests/test_GeneFamily.py b/tests/tests_GeneFamily.py similarity index 100% rename from tests/test_GeneFamily.py rename to tests/tests_GeneFamily.py diff --git a/tests/test_metadata.py b/tests/tests_Metadata.py similarity index 100% rename from tests/test_metadata.py rename to tests/tests_Metadata.py diff --git a/tests/test_Pangenome.py b/tests/tests_Pangenome.py similarity index 100% rename from tests/test_Pangenome.py rename to tests/tests_Pangenome.py diff --git a/tests/test_Region.py b/tests/tests_Region.py similarity index 100% rename from tests/test_Region.py rename to tests/tests_Region.py From 387b761c66437cde560cd74e8b74407b14e11832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 29 Aug 2023 15:26:10 +0200 Subject: [PATCH 41/75] Refactor Edge class and tests --- VERSION | 2 +- ppanggolin/edge.py | 49 +++++++++++++++++++++++++++++---------------- tests/tests_Edge.py | 31 +++++++++++++++++++++------- 3 files changed, 57 insertions(+), 25 deletions(-) diff --git a/VERSION b/VERSION index e8574990..5dc98f7b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.158 +1.2.159 diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index d5353474..903f8aa6 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -12,10 +12,9 @@ class Edge: """The Edge class represents an edge between two gene 
     families in the pangenome graph. It is associated with all the organisms in which the neighborship is found, and all the involved genes as well.

     Methods:
-    - __init__(self, source_gene: Gene, target_gene: Gene): Constructor method that initializes an Edge object with a source gene and a target gene.
-    - get_org_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: Returns a dictionary with organisms as keys and an iterable of the pairs of genes as values.
-    - gene_pairs(self) -> List[Tuple[Gene, Gene]]: Returns a list of all the gene pairs of the Edge.
-    - add_genes(self, source_gene: Gene, target_gene: Gene): Adds genes to the edge. They are supposed to be on the same organism.
+    - get_org_dict: Returns a dictionary with organisms as keys and an iterable of the pairs of genes as values.
+    - gene_pairs: Returns a list of all the gene pairs in the Edge.
+    - add_genes: Adds genes to the edge. They are supposed to be in the same organism.
     Fields:
     - source: A GeneFamily object representing the source gene family of the edge.
@@ -26,8 +25,8 @@ class Edge:
     def __init__(self, source_gene: Gene, target_gene: Gene):
         """Constructor method

-        :param source_gene: a first gene to initialize the edge
-        :param target_gene: a second gene to initialize the edge
+        :param source_gene: First gene to initialize the edge
+        :param target_gene: Second gene to initialize the edge
         """
         # TODO try to change for gene family ?
         if source_gene.family is None:
@@ -47,38 +46,54 @@ def __init__(self, source_gene: Gene, target_gene: Gene):
     def organisms(self) -> Generator[Organism, None, None]:
         """Get all the organisms belonging to the edge

-        :return: Generator with organisms as key and an iterable of the pairs of genes as value
+        :return: Generator of the organisms belonging to the edge
         """
         for organism in self._organisms.keys():
             yield organism

     @property
-    def number_of_organisms(self):
+    def number_of_organisms(self) -> int:
+        """Get the number of organisms in the edge
+
+        :return: Number of organisms
+        """
         return len(self._organisms)

-    def get_organism_genes_pairs(self, organism: Organism):
+    def get_organism_genes_pairs(self, organism: Organism) -> List[Tuple[Gene, Gene]]:
+        """Get the gene pairs corresponding to the given organism
+
+        :param organism: Wanted organism
+
+        :return: Pairs of genes in the edge corresponding to the given organism
+        """
         return self._organisms[organism]

-    def get_organisms_dict(self):
+    def get_organisms_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]:
+        """Get all the organisms with their corresponding pairs of genes in the edge
+
+        :return: Dictionary with the organism as key and a list of gene pairs as value
+        """
         return self._organisms

     @property
     def gene_pairs(self) -> List[Tuple[Gene, Gene]]:
-        """ Get list of all the gene pairs of the Edge
+        """ Get the list of all the gene pairs in the Edge

-        :return: A list of all the gene pairs of the Edge
+        :return: A list of all the gene pairs in the Edge
         """
         return [gene_pair for gene_list in self.get_organisms_dict().values() for gene_pair in gene_list]

     def add_genes(self, source_gene: Gene, target_gene: Gene):
-        """Adds genes to the edge. They are supposed to be on the same organism.
+        """
+        Adds genes to the edge.
+        They are supposed to be in the same organism.
- :param source_gene: a source gene to add to the edge - :param target_gene: a target gene to add to the edge + :param source_gene: Gene corresponding to the source of the edge + :param target_gene: Gene corresponding to the target of the edge :raises TypeError: If the genes are not with Gene type - :raises ValueError: If genes are not associated to an organism - :raises Exception: If the genes are not on the same organism. + :raises ValueError: If genes are not associated with an organism + :raises Exception: If the genes are not in the same organism. """ if not isinstance(source_gene, Gene) or not isinstance(target_gene, Gene): raise TypeError(f"Genes are expected to be added to edge. " diff --git a/tests/tests_Edge.py b/tests/tests_Edge.py index e94da982..818a6864 100644 --- a/tests/tests_Edge.py +++ b/tests/tests_Edge.py @@ -9,20 +9,22 @@ class TestEdge: - """Test edge class - """ - # Tests that an Edge object can be created with two genes belonging to different families - @pytest.fixture def organism(self) -> Generator[Organism, None, None]: + """Generate a basic organism object + """ yield Organism("organism") @pytest.fixture def families_pair(self) -> Generator[Tuple[GeneFamily, GeneFamily], None, None]: + """Generate a families pair + """ yield GeneFamily(1, "family1"), GeneFamily(2, "family2") @pytest.fixture def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], None, None]: + """Generate genes_pair + """ gene1, gene2 = Gene("gene1"), Gene("gene2") gene1.fill_parents(organism, None) gene2.fill_parents(organism, None) @@ -31,10 +33,14 @@ def genes_pair(self, organism, families_pair) -> Generator[Tuple[Gene, Gene], No @pytest.fixture def edge(self, genes_pair): + """Generate a basic edge + """ edge = Edge(*genes_pair) yield edge def test_constructor(self, genes_pair, organism, families_pair): + """Tests that an Edge object can be created with two genes belonging to different families + """ gene1, gene2 = genes_pair edge = Edge(gene1, gene2) assert edge.source == gene1.family @@ -44,8 +50,9 @@ def test_constructor(self, genes_pair, organism, families_pair): assert edge._organisms == {organism: [(gene1, gene2)]} def test_constructor_attribute_error(self): - """Tests that an AttributeError is raised when creating an Edge object - with a gene that does not belong to any family + """ + Tests that an AttributeError is raised when creating an Edge object + with a gene that does not belong to any family """ gene1 = Gene('gene1') gene1.family = GeneFamily(0, 'test') @@ -58,19 +65,29 @@ def test_constructor_attribute_error(self): Edge(gene2, gene1) def test_gene_pairs(self, edge, genes_pair): + """Tests that gene pairs' generator return what's expected + """ assert set(edge.gene_pairs) == {genes_pair} def test_get_organisms(self, edge, organism): + """Tests that organism generator return what's expected + """ assert set(edge.organisms) == {organism} def test_get_number_of_organisms(self, edge): + """Tests that the good number of organism is returned + """ assert isinstance(edge.number_of_organisms, int) assert edge.number_of_organisms == 1 def test_get_organisms_dict(self, edge, organism, genes_pair): + """Tests that organism-gene_pairs dict is built as expected + """ assert edge.get_organisms_dict() == {organism: [genes_pair]} def test_get_organism_genes_pairs(self, edge, organism, genes_pair): + """Tests that the gene pairs corresponding to the organism is returned + """ assert edge.get_organism_genes_pairs(organism) == [genes_pair] def 
test_edge_add_genes_same_organism(self, edge, genes_pair, organism): @@ -96,7 +113,7 @@ def test_edge_add_genes_one_none_gene(self, edge, organism): """Tests that a TypeError is raised when adding genes to the edge where one gene is None """ gene1 = Gene('gene1') - gene1.fill_parents(organism, None) + gene1.fill_parents(organism) with pytest.raises(TypeError): edge.add_genes(gene1, None) with pytest.raises(TypeError): From 6a602697c326e84cfae4ce306ecb409ad8df17c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 29 Aug 2023 15:51:17 +0200 Subject: [PATCH 42/75] Refactor GeneFamily class and tests --- VERSION | 2 +- ppanggolin/geneFamily.py | 43 +++++++++++++++++++++++++++++++-------- tests/tests_GeneFamily.py | 42 ++++++++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 22 deletions(-) diff --git a/VERSION b/VERSION index 5dc98f7b..892197b6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.159 +1.2.160 diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 0c553ad5..e8b0efb0 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -38,7 +38,7 @@ class GeneFamily(MetaFeatures): - add_gene: adds a gene to the gene family and sets the gene's family accordingly. - add_spot: adds a spot to the gene family. - add_module: adds a module to the gene family. - - mk_bitarray: produces a bitarray representing the presence/absence of the family in the pangenome using the provided index. + - Mk_bitarray: produces a bitarray representing the presence/absence of the family in the pangenome using the provided index. - get_org_dict: returns a dictionary of organisms as keys and sets of genes as values. - get_genes_per_org: returns the genes belonging to the gene family in the given organism. @@ -47,7 +47,7 @@ class GeneFamily(MetaFeatures): - ID: the internal identifier of the gene family. - removed: a boolean indicating whether the family has been removed from the main graph. - sequence: the protein sequence associated with the family. - - partition: the partition associated with the family. + - Partition: the partition associated with the family. 
""" def __init__(self, family_id: int, name: str): @@ -75,7 +75,9 @@ def __init__(self, family_id: int, name: str): self._modules = set() self.bitarray = None - def __repr__(self): + def __repr__(self) -> str: + """Family representation + """ return f"{self.ID}: {self.name}" #TODO define __eq__ @@ -84,7 +86,7 @@ def __repr__(self): def named_partition(self) -> str: """Reads the partition attribute and returns a meaningful name - :return: the partition name of the gene family + :return: The partition name of the gene family :raises ValueError: If the gene family has no partition assigned """ @@ -119,6 +121,10 @@ def edges(self) -> Generator[Edge, None, None]: @property def genes(self): + """Return all the genes belonging to the family + + :return: Generator of genes + """ for gene in self._genes: yield gene @@ -135,11 +141,19 @@ def organisms(self) -> Generator[Organism, None, None]: @property def spots(self) -> Generator[Spot, None, None]: + """Return all the spots belonging to the family + + :return: Generator of spots + """ for spot in self._spots: yield spot @property def modules(self) -> Generator[Module, None, None]: + """Return all the modules belonging to the family + + :return: Generator of modules + """ for module in self._modules: yield module @property @@ -181,12 +195,17 @@ def number_of_modules(self) -> int: return len(self._modules) def set_edge(self, target: GeneFamily, edge: Edge): + """Set the edge between the gene family and another one + + :param target: Neighbor family + :param edge: Edge connecting families + """ self._edges[target] = edge def add_sequence(self, seq: str): """Assigns a protein sequence to the gene family. - :param seq: the sequence to add to the gene family + :param seq: The sequence to add to the gene family """ assert isinstance(seq, str), "Sequence must be a string" @@ -195,7 +214,7 @@ def add_sequence(self, seq: str): def add_gene(self, gene: Gene): """Add a gene to the gene family, and sets the gene's :attr:family accordingly. 
- :param gene: the gene to add + :param gene: The gene to add :raises TypeError: If the provided `gene` is of the wrong type """ @@ -207,12 +226,20 @@ def add_gene(self, gene: Gene): self._genePerOrg[gene.organism].add(gene) def add_spot(self, spot: Spot): + """Add the given spot to the family + + :param spot: Spot belonging to the family + """ from ppanggolin.region import Spot # prevent circular import error if not isinstance(spot, Spot): raise TypeError(f"A spot object is expected, you give a {type(spot)}") self._spots.add(spot) def add_module(self, module: Module): + """Add the given module to the family + + :param module: Module belonging to the family + """ from ppanggolin.region import Module # prevent circular import error if not isinstance(module, Module): raise TypeError(f"A module object is expected, you give a {type(module)}") @@ -244,7 +271,7 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): def get_org_dict(self) -> Dict[Organism, Set[Gene]]: """Returns the organisms and the genes belonging to the gene family - :return: a dictionnary of organism as key and set of genes as values + :return: A dictionnary of organism as key and set of genes as values """ if len(self._genePerOrg) == 0: for gene in self.genes: @@ -258,7 +285,7 @@ def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: :param org: Organism to look for - :return: a set of gene(s) + :return: A set of gene(s) """ if len(self._genePerOrg) == 0: _ = self.get_org_dict() diff --git a/tests/tests_GeneFamily.py b/tests/tests_GeneFamily.py index d4342334..54a546df 100644 --- a/tests/tests_GeneFamily.py +++ b/tests/tests_GeneFamily.py @@ -1,9 +1,8 @@ #! /usr/bin/env python3 import pytest -from random import randint, sample +from random import randint from typing import Generator, Set -from collections import defaultdict from itertools import combinations from ppanggolin.pangenome import Edge @@ -15,11 +14,15 @@ class TestGeneFamily: """Tests the gene family class """ + @pytest.fixture + def family(self) -> Generator[GeneFamily, None, None]: + """Create a gene family for all tests + """ + yield GeneFamily(1, "test") - def test_create_gene_family(self): + def test_construct_gene_family(self, family): """Tests that a GeneFamily object can be created with valid family_id and name """ - family = GeneFamily(1, 'test') assert isinstance(family, GeneFamily) assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", "sequence", "partition", "_spots", "_modules", "bitarray", "_metadata_getter"] for attr in @@ -39,11 +42,6 @@ def test_create_gene_family(self): assert family._modules == set() assert family.bitarray is None - @pytest.fixture - def family(self) -> Generator[GeneFamily, None, None]: - """Create a gene family for all tests""" - yield GeneFamily(1, "test") - @pytest.mark.parametrize("partition, name", [ ("P", "persistent"), @@ -82,14 +80,13 @@ def test_add_sequence_to_gene_family(self, family): def test_add_gene_to_gene_family(self, family): """Tests that a Gene object can be added to a GeneFamily object """ - family = GeneFamily(1, 'test') gene = Gene('gene1') family.add_gene(gene) assert gene in family.genes assert gene.family == family def test_add_gene_error(self, family): - """Tests that a non gene object can't be added to a GeneFamily as gene + """Tests that a non-gene object can't be added to a GeneFamily as gene """ with pytest.raises(TypeError): family.add_gene(33) @@ -148,7 +145,8 @@ def organisms(self, genes) -> Generator[Set[Organism], None, 
None]: yield organisms def test_get_org_dict(self, family, genes, organisms): - """""" + """Tests that all organisms and genes are retrieved as expected + """ for gene in genes: family.add_gene(gene) org_dict = family.get_org_dict() @@ -159,29 +157,39 @@ def test_get_org_dict(self, family, genes, organisms): assert set([gene for gene_set in org_dict.values() for gene in gene_set]) == genes def test_get_org_dict_with_no_organism_fill_to_genes(self, family, genes): + """Tests that if genes are not fill with organism an AttributeError is returned + """ for gene in genes: family.add_gene(gene) with pytest.raises(AttributeError): _ = family.get_org_dict() def test_organisms(self, family, organisms, genes): + """Tests that all organisms are retrieved as expected + """ for gene in genes: family.add_gene(gene) assert set(family.organisms) == organisms def test_number_of_organism(self, family, organisms, genes): + """Tests that the expected number of organisms is found + """ for gene in genes: family.add_gene(gene) assert isinstance(family.number_of_organisms, int) assert family.number_of_organisms == len(organisms) def test_get_genes_per_org(self, family, organisms, genes): + """Tests that for a giver organism, all the genes are retrieved as expected + """ for gene in genes: family.add_gene(gene) for organism in organisms: assert set(family.get_genes_per_org(organism)) == set(organism.genes) def test_get_genes_per_org_if_org_not_in_family(self, family): + """Test that a KeyError is generated if an organism not belonging to the family is given + """ with pytest.raises(KeyError): org = Organism("organism") _ = set(family.get_genes_per_org(org)) @@ -235,6 +243,8 @@ def edges(self, families, genes) -> Generator[Set[Edge], None, None]: yield set(edges.values()) def test_get_neighbors_of_gene_family(self, families, edges): + """Tests get all the expected neighbor of the family in the graph + """ for family in families: assert all(isinstance(neighbor, GeneFamily) for neighbor in family.neighbors) expected_neighbors = set([edge.source for edge in edges @@ -243,6 +253,8 @@ def test_get_neighbors_of_gene_family(self, families, edges): assert set(family.neighbors) == expected_neighbors def test_get_number_of_neighbors(self, families, edges): + """Tests that the expected number of neighbors is found + """ for family in families: expected_neighbors = set([edge.source for edge in edges if edge.target == family]).union(set([edge.target for edge in edges @@ -252,12 +264,16 @@ def test_get_number_of_neighbors(self, families, edges): # Tests that the edges of a GeneFamily object can be retrieved def test_get_edges_of_gene_family(self, families, edges): + """Tests that all the edges belonging to the family are retrieved + """ for family in families: expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) assert all(isinstance(edge, Edge) for edge in family.edges) assert set(family.edges) == expected_edges def test_get_number_of_edges(self, families, edges): + """Tests that the expected number of edges is found + """ for family in families: expected_edges = set([edge for edge in edges if edge.source == family or edge.target == family]) assert isinstance(family.number_of_edges, int) @@ -289,4 +305,4 @@ def test_add_non_module_as_module_in_family(self, family): with pytest.raises(TypeError): family.add_module(323) - # TODO test mk_bitarray \ No newline at end of file + # TODO test mk_bitarray From 6ca836ca3b447079fc897c36b4046caf1c171421 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 29 Aug 2023 17:34:25 +0200 Subject: [PATCH 43/75] Refactor Region Spot Module and Context class and tests --- VERSION | 2 +- ppanggolin/region.py | 270 +++++++++++++++++++++++++++++++++--------- tests/tests_Region.py | 64 ++++++++-- 3 files changed, 266 insertions(+), 70 deletions(-) diff --git a/VERSION b/VERSION index 892197b6..30a7171c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.160 +1.2.161 diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 1cba6fef..4f02d593 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -3,16 +3,12 @@ # default libraries from __future__ import annotations -import logging -from collections.abc import Iterable # installed libraries from typing import Dict, Generator, List, Set -import gmpy2 - # local libraries -from ppanggolin.genome import Gene, Organism, Contig, RNA +from ppanggolin.genome import Gene, Organism, Contig from ppanggolin.geneFamily import GeneFamily from ppanggolin.metadata import MetaFeatures @@ -28,8 +24,8 @@ class Region(MetaFeatures): - 'Contig': the property that gets the starter contig linked to the region. - 'is_whole_contig': the property that indicates if the region is an entire contig. - 'is_contig_border': the property that indicates if the region is bordering a contig. - - 'get_rnas(self) -> set': the method that gets the RNA in the region. - - 'get_bordering_genes(self, n: int, multigenics: set) -> list': the method that gets the bordered genes in the region. + - 'get_rnas': the method that gets the RNA in the region. + - 'Get_bordering_genes': the method that gets the bordered genes in the region. Fields: - 'name': the name of the region. @@ -41,7 +37,7 @@ class Region(MetaFeatures): def __init__(self, region_id: str): """Constructor method - :param region_id: identifier of the region + :param region_id: Identifier of the region """ super().__init__() self._genes_getter = {} @@ -50,19 +46,25 @@ def __init__(self, region_id: str): self.starter = None self.stopper = None - def __repr__(self): + def __repr__(self) -> str: + """Region representation + """ return f"RGP name:{self.name}" - def __hash__(self): + def __hash__(self) -> int: + """Create a hash value for the region + """ return id(self) def __eq__(self, other: Region) -> bool: """ - Expects another Region type object. 
Will test whether two Region objects have the same gene families + Test whether two Region objects have the same gene families - :param other: Other region to test equality of region + :param other: Another region to test equality of regions - :return: equal or not + :return: Equal or not + + :raises TypeError: Try to compare a region with another type object """ if not isinstance(other, Region): raise TypeError(f"'Region' type object was expected, but '{type(other)}' type object was provided.") @@ -72,10 +74,21 @@ def __eq__(self, other: Region) -> bool: return True return False - def __len__(self): + def __len__(self) -> int: + """Get the number of genes in the region + """ return len(self._genes_getter) - def __setitem__(self, position, gene): + def __setitem__(self, position: int, gene: Gene): + """Set a gene by is position in the region + + :param position: Position of the gene in the contig + :param gene: Gene to add in the region + + :raises TypeError: Gene is not instance Gene + :raises Exception: Organism or contig of the gene is different from the region + :raises KeyError: Another gene already exists at the position + """ if not isinstance(gene, Gene): raise TypeError(f"Unexpected class / type for {type(gene)} " f"when adding it to a region of genomic plasticity") @@ -87,24 +100,42 @@ def __setitem__(self, position, gene): raise Exception(f"Gene {gene.name} is from a different contig than the first defined in RGP. " f"That's not possible") if position in self._genes_getter and self[position] != gene: - raise ValueError("Another gene already exist at this position") + raise KeyError("Another gene already exist at this position") self._genes_getter[position] = gene self.starter = self._genes_getter[min(self._genes_getter.keys())] self.stopper = self._genes_getter[max(self._genes_getter.keys())] gene.RGP = self - def __getitem__(self, position): + def __getitem__(self, position: int) -> Gene: + """Get the gene at the given position + + :param position: Position of the gene + + :return: Gene in the Region at the given position + + :raises KeyError: Gene at the given position does not exist + """ try: return self._genes_getter[position] except KeyError: raise KeyError(f"There is no gene at position {position} in RGP {self.name}") def __delitem__(self, position): - del self._genes_getter[position] + """Remove the gene at the given position + + :param position: Position of the gene + + :raises KeyError: Gene at the given position does not exist""" + try: + del self._genes_getter[position] + except KeyError: + raise KeyError(f"There is no gene at position {position} in RGP {self.name}") @property def genes(self) -> Generator[Gene, None, None]: """Generate the gene as they are ordered in contigs + + :return: Genes in the region """ for gene in sorted(self._genes_getter.values(), key=lambda x: x.position): yield gene @@ -113,19 +144,23 @@ def genes(self) -> Generator[Gene, None, None]: def families(self) -> Generator[GeneFamily, None, None]: """Get the gene families in the RGP - :return: Set of gene families + :return: Gene families """ for gene in self.genes: yield gene.family def number_of_families(self) -> int: """Get the number of different gene families in the region + + :return: Number of families """ return len(set(self.families)) @property def length(self): """Get the length of the region + + :return: Size of the region """ return self.stopper.stop - self.starter.start @@ -133,7 +168,7 @@ def length(self): def organism(self) -> Organism: """ Get the Organism link to RGP - :return: 
Organism + :return: Organism corresponding to the region """ return self.starter.organism @@ -141,7 +176,7 @@ def organism(self) -> Organism: def contig(self) -> Contig: """ Get the starter contig link to RGP - :return: Contig + :return: Contig corresponding to the region """ return self.starter.contig @@ -149,7 +184,7 @@ def contig(self) -> Contig: def is_whole_contig(self) -> bool: """Indicates if the region is an entire contig - :return: True if whole contig + :return: True if whole contig else False """ if self.starter.position == 0 and self.stopper.position == len(self.contig) - 1: return True @@ -159,10 +194,12 @@ def is_whole_contig(self) -> bool: def is_contig_border(self) -> bool: """Indicates if the region is bordering a contig - :return: True if bordering + :return: True if bordering else False + + :raises AssertionError: No genes in the regions, it's not expected """ - if len(self) == 0: - raise Exception("Your region has no genes. Something wrong happenned.") + assert len(self) > 0, "Your region has no genes. Something wrong happenned." + min_pos = min(self.contig.genes, key=lambda x: x.position).position max_pos = max(self.contig.genes, key=lambda x: x.position).position if not self.contig.is_circular: @@ -173,10 +210,10 @@ def is_contig_border(self) -> bool: def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List[Gene]]: """ Get the bordered genes in the region - :param n: number of genes to get + :param n: Number of genes to get :param multigenics: pangenome graph multigenic persistent families - :return: A list of bordering gene in start and stop position + :return: A list of bordering genes in start and stop position """ # TODO add Exception border = [[], []] @@ -236,7 +273,7 @@ class Spot(MetaFeatures): def __init__(self, spot_id: int): """Constructor method - :param spot_id: identifier of the spot + :param spot_id: Identifier of the spot """ if not isinstance(spot_id, int): raise TypeError(f"Spot identifier must be an integer. Given type is {type(spot_id)}") @@ -246,34 +283,67 @@ def __init__(self, spot_id: int): self._uniqOrderedSet = {} self._uniqContent = {} - def __repr__(self): + def __repr__(self) -> str: + """Spot representation + """ return f"Spot {self.ID} - #RGP: {len(self)}" def __str__(self): + """String representation of the spot + """ return f"spot_{self.ID}" def __setitem__(self, name, region): + """Set the region belonging to the spot + + :param name: Name of the region + :param region: Region to add in the spot + + :raises TypeError: Region is not an instance Region + :raises KeyError: Name of the region is already in the spot for a different region + """ if not isinstance(region, Region): raise TypeError(f"A Region object is expected to be added to the spot. 
find type is {type(region)}") if name in self._region_getter and self[name] != region: raise KeyError("A Region with the same name already exist in spot") self._region_getter[name] = region - def __getitem__(self, name): + def __getitem__(self, name) -> Region: + """Get the region with the given name + + :param name: Name of the wanted region + + :return: Region in the spot for the given name + + :raises KeyError: Name does not exist in the spot + """ try: return self._region_getter[name] except KeyError: raise KeyError(f"Region with {name} does not exist in spot") def __delitem__(self, name): - del self._region_getter[name] + """Delete the region for the given name + + :param name: Name of the wanted region + + :raises KeyError: Name does not exist in the spot + """ + try: + del self._region_getter[name] + except KeyError: + raise KeyError(f"Region with {name} does not exist in spot") - def __len__(self): + def __len__(self) -> int: + """Get the number of regions in the spot + """ return len(self._region_getter) @property def regions(self) -> Generator[Region, None, None]: """Generates the regions in the spot + + :return: Regions in the spot """ for region in self._region_getter.values(): yield region @@ -281,6 +351,8 @@ def regions(self) -> Generator[Region, None, None]: @property def families(self) -> Generator[GeneFamily, None, None]: """Get the gene families in the RGP + + :return: Family in the spot """ families = set() for region in self.regions: @@ -290,7 +362,9 @@ def families(self) -> Generator[GeneFamily, None, None]: yield family def number_of_families(self) -> int: - """Return the number of different families in the spot + """Get the number of different families in the spot + + :return: Number of families """ return len({family for region in self.regions for family in region.families}) @@ -303,10 +377,10 @@ def spot_2_families(self): def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily], List[GeneFamily]]]: """ Extracts all the borders of all RGPs belonging to the spot - :param set_size: number of genes to get + :param set_size: Number of genes to get :param multigenics: pangenome graph multigenic persistent families - :return: families that bordering spot + :return: Families that bordering spot """ all_borders = [] for rgp in self.regions: @@ -327,7 +401,8 @@ def borders(self, set_size: int, multigenics) -> List[List[int, List[GeneFamily] return family_borders def _mk_uniq_ordered_set_obj(self): - """cluster RGP into groups that have an identical synteny""" + """cluster RGP into groups that have an identical synteny + """ for rgp in self.regions: z = True for seen_rgp in self._uniqOrderedSet: @@ -337,7 +412,7 @@ def _mk_uniq_ordered_set_obj(self): if z: self._uniqOrderedSet[rgp] = {rgp} - def _get_ordered_set(self): + def _get_ordered_set(self) -> Dict[Region, Set[Region]]: """ Creates the _uniqSyn object if it was never computed. 
Return it in any case :return: RGP groups that have an identical synteny @@ -347,9 +422,9 @@ def _get_ordered_set(self): return self._uniqOrderedSet def get_uniq_to_rgp(self) -> Dict[Region, Set[Region]]: - """ Get dictionnary with a representing RGP as key, and all identical RGPs as value + """ Get dictionnary with a representing RGP as the key, and all identical RGPs as value - :return: Dictionnary with a representing RGP as key, and set of identical RGPs as value + :return: Dictionnary with a representing RGP as the key, and set of identical RGPs as value """ return self._get_ordered_set() @@ -361,7 +436,8 @@ def get_uniq_ordered_set(self) -> Set[Region]: return set(self._get_ordered_set().keys()) def _mk_uniq_content(self): - """cluster RGP into groups that have identical gene content""" + """cluster RGP into groups that have identical gene content + """ for rgp in self.regions: z = True for seen_rgp in self._uniqContent: @@ -371,7 +447,7 @@ def _mk_uniq_content(self): if z: self._uniqContent[rgp] = {rgp} - def _get_content(self): + def _get_content(self) -> Dict[Region, Set[Region]]: """Creates the _uniqContent object if it was never computed. :return: RGP groups that have identical gene content @@ -391,7 +467,7 @@ def count_uniq_content(self) -> dict: """ Get a counter of uniq RGP and number of identical RGP (in terms of gene family content) - :return: dictionary with a representative rgp as key and number of identical rgp as value + :return: Dictionary with a representative rgp as the key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in self._get_content().items()]) @@ -399,7 +475,7 @@ def count_uniq_ordered_set(self): """ Get a counter of uniq RGP and number of identical RGP (in terms of synteny content) - :return: dictionary with a representative rgp as key and number of identical rgp as value + :return: Dictionary with a representative rgp as the key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in self._get_ordered_set().items()]) @@ -427,24 +503,49 @@ def __init__(self, module_id: int, families: set = None): self.ID = module_id self._families_getter = {family.name: family for family in families} if families is not None else {} - def __repr__(self): + def __repr__(self) -> str: + """Module representation + """ return f"Module {self.ID} - #Families: {len(self)}" - def __str__(self): + def __str__(self) -> str: + """String representation of the module + """ return f"module_{self.ID}" - def __hash__(self): + def __hash__(self) -> int: + """Create a hash value for the module + """ return id(self) - def __len__(self): + def __len__(self) -> int: + """Get the number of families in the module + """ return len(self._families_getter) - def __eq__(self, other: Module): + def __eq__(self, other: Module) -> bool: + """ + Test whether two Module objects have the same gene families + + :param other: Another module to test equality + + :return: Equal or not + + :raises TypeError: Try to compare a module with another type object + """ if not isinstance(other, Module): raise TypeError(f"Another module is expected to be compared to the first one. 
You give a {type(other)}") return set(self.families) == set(other.families) def __setitem__(self, name, family): + """Set a gene family in the module + + :param name: Name of the family + :param family: Gene family belonging to the module + + :raises TypeError: Family is not instance GeneFamily + :raises KeyError: Another family with the same name already exists in the module + """ if not isinstance(family, GeneFamily): raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") if name in self._families_getter and self[name] != family: @@ -453,12 +554,26 @@ def __setitem__(self, name, family): family.add_module(self) def __getitem__(self, name) -> GeneFamily: + """Get the gene family for the given name in the module + + :param name: Name of the gene family + + :return: Gene family with the given name + + :raises KeyError: Family with the given name does not exist in the module + """ try: return self._families_getter[name] except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the module") def __delitem__(self, name): + """Remove the gene family for the given name in the module + + :param name: Name of the gene family + + :raises KeyError: Family with the given name does not exist in the module + """ try: fam = self._families_getter[name] except KeyError: @@ -470,12 +585,14 @@ def __delitem__(self, name): @property def families(self) -> Generator[GeneFamily, None, None]: """Generator of the family in the module + + :return: Families belonging to the module """ yield from self._families_getter.values() class GeneContext: - """Summary + """ The GeneContext class represents a gene context, which is a collection of gene families related to a specific genomic context. Methods @@ -487,7 +604,7 @@ class GeneContext: def __init__(self, gc_id: int, families: set = None): """Constructor method - :param gc_id : identifier of the Gene context + :param gc_id : Identifier of the Gene context :param families: Gene families related to the GeneContext """ if not isinstance(gc_id, int): @@ -495,24 +612,49 @@ def __init__(self, gc_id: int, families: set = None): self.ID = gc_id self._families_getter = {family.name: family for family in families} if families is not None else {} - def __repr__(self): + def __repr__(self) -> str: + """Context representation + """ return f"Context {self.ID} - #Families: {len(self)}" - def __str__(self): + def __str__(self) -> str: + """String representation of the gene context + """ return f"context_{self.ID}" - def __hash__(self): + def __hash__(self) -> int: + """Create a hash value for the region + """ return id(self) - def __len__(self): + def __len__(self) -> int: + """Get the number of families in the context + """ return len(self._families_getter) - def __eq__(self, other: GeneContext): + def __eq__(self, other: GeneContext) -> bool: + """ + Test whether two gene context objects have the same gene families + + :param other: Another gene context to test equality + + :return: Equal or not + + :raises TypeError: Try to compare a gene context with another type object + """ if not isinstance(other, GeneContext): raise TypeError(f"Another context is expected to be compared to the first one. 
You give a {type(other)}") return set(self.families) == set(other.families) def __setitem__(self, name, family): + """Set a gene family in the gene context + + :param name: Name of the family + :param family: Gene family belonging to the context + + :raises TypeError: Family is not instance GeneFamily + :raises KeyError: Another family with the same name already exists in the context + """ if not isinstance(family, GeneFamily): raise TypeError(f"A gene family is expected to be added to gene context. Given type was {type(family)}") if name in self._families_getter and self[name] != family: @@ -520,19 +662,35 @@ def __setitem__(self, name, family): self._families_getter[name] = family def __getitem__(self, name) -> GeneFamily: + """Get the gene family for the given name in the context + + :param name: Name of the gene family + + :return: Gene family with the given name + + :raises KeyError: Family with the given name does not exist in the context + """ try: return self._families_getter[name] except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the gene context") def __delitem__(self, name): + """Remove the gene family for the given name in the context + + :param name: Name of the gene family + + :raises KeyError: Family with the given name does not exist in the context + """ try: del self._families_getter[name] except KeyError: raise KeyError(f"There isn't gene family with the name {name} in the gene context") @property - def families(self): + def families(self) -> Generator[GeneFamily, None, None]: """Generator of the family in the context + + :return: Gene families belonging to the context """ yield from self._families_getter.values() diff --git a/tests/tests_Region.py b/tests/tests_Region.py index 41820e30..aff96a90 100644 --- a/tests/tests_Region.py +++ b/tests/tests_Region.py @@ -1,10 +1,9 @@ #! 
/usr/bin/env python3 -import re +# coding: utf8 import pytest from typing import Generator, Set from random import randint -import gmpy2 from ppanggolin.region import Region, Spot, Module, GeneContext from ppanggolin.geneFamily import GeneFamily @@ -53,11 +52,12 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: families.add(family) yield families + @pytest.fixture def organisms(genes) -> Generator[Set[Organism], None, None]: """Create a set of organism object for test - :return: Generator with set of organism object + :return: Generator with a set of organism object """ orgs = set() genes = list(genes) @@ -73,7 +73,7 @@ def organisms(genes) -> Generator[Set[Organism], None, None]: idx_genes += 1 orgs.add(org) idx_org += 1 - # last organism fill with all the gene left + # The last organism fill with all the gene left org = Organism(f"organism_{idx_org}") idx_genes = (idx_org - 1) * nb_genes_per_organism while idx_genes < len(genes): @@ -96,6 +96,8 @@ def region(self) -> Generator[Region, None, None]: yield Region("RGP") def test_cstr(self, region: Region): + """Tests that region is constructed as expected + """ assert isinstance(region, Region) assert region.name == "RGP" assert isinstance(region._genes_getter, dict) @@ -126,7 +128,7 @@ def test_add_genes_at_position_already_taken(self, region): gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) region[0] = gene - with pytest.raises(ValueError): + with pytest.raises(KeyError): gene = Gene('gene') gene.fill_annotations(start=4, stop=12, strand='-', position=0) region[0] = gene @@ -164,6 +166,8 @@ def test_get_genes(self, region): assert region[0] == gene def test_get_genes_with_position_not_in_region(self, region): + """Tests that getting a gene at position not belonging in the region return a KeyError + """ with pytest.raises(KeyError): _ = region[randint(0, 20)] @@ -201,12 +205,12 @@ def test_get_contig(self, region): """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - gene.fill_parents(None, Contig("contig")) + gene.fill_parents(contig=Contig("contig")) region[0] = gene assert region.contig.name == 'contig' def test_is_whole_contig_true(self, region): - """Tests that the property is_whole_contig return True if region is same length as contig + """Tests that the property is_whole_contig return True if the region has the same length as contig """ starter, stopper = Gene('starter'), Gene('stopper') starter.fill_annotations(start=0, stop=10, strand='+', position=0) @@ -218,7 +222,7 @@ def test_is_whole_contig_true(self, region): assert region.is_whole_contig is True def test_is_whole_contig_false(self, region): - """Tests that the property is_whole_contig return False if region is not same length as contig + """Tests that the property is_whole_contig return False if the region has not the same length as contig """ before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') before.fill_annotations(start=0, stop=10, strand='+', position=0) @@ -255,7 +259,7 @@ def test_is_contig_border_true(self, region): assert region.is_contig_border is True def test_is_contig_border_false(self, region): - """Tests that the property is_contig_border return False if region is not bordering the contig + """Tests that the property is_contig_border return False if the region is not bordering the contig """ before, starter, stopper, after = Gene('before'), Gene('starter'), Gene('stopper'), Gene('after') before.fill_annotations(start=0, 
stop=10, strand='+', position=0) @@ -270,7 +274,15 @@ def test_is_contig_border_false(self, region): region[starter.position], region[stopper.position] = starter, stopper assert region.is_contig_border is False + def test_is_contig_border_assertion_error_if_no_gene(self, region): + """Tests that an AssertionError is returned if there is no gene in the region + """ + with pytest.raises(AssertionError): + _ = region.is_contig_border + def test_len(self, region, genes): + """Tests that the expected number of genes is retrieved in the region + """ for gene in genes: region[gene.position] = gene assert isinstance(len(region), int) @@ -336,12 +348,16 @@ def spot(self) -> Generator[Spot, None, None]: yield Spot(0) def test_cstr(self, spot): + """Tests that spot is constructed as expected + """ assert spot.ID == 0 assert isinstance(spot._region_getter, dict) and len(spot._region_getter) == 0 assert isinstance(spot._uniqOrderedSet, dict) and len(spot._uniqOrderedSet) == 0 assert isinstance(spot._uniqContent, dict) and len(spot._uniqContent) == 0 def test_cstr_type_error(self): + """Tests that TypeError is returned if identifier is not an integer + """ with pytest.raises(TypeError): Spot("spot_0") @@ -368,6 +384,8 @@ def test_add_region(self, spot, region): assert region == spot._region_getter[region.name] def test_add_not_instance_region(self, spot): + """Tests that a TypeError is returned if a non-region type is trying to be added + """ with pytest.raises(TypeError): spot["spot"] = "spot" @@ -403,15 +421,21 @@ def test_get_region(self, spot, region): assert spot[region.name] == region def test_get_region_not_in_spot(self, spot): + """Tests that a KeyError is raised when the name of the region does not exist in the spot + """ with pytest.raises(KeyError): _ = spot["rgp"] def test_delete_region_in_spot(self, spot, region): + """Tests that remove a region from the spot work as expected + """ spot[region.name] = region del spot[region.name] assert region.name not in spot._region_getter def test_len(self, spot, region): + """Tests that getting the number of regions work as expected + """ assert isinstance(len(spot), int) assert len(spot) == 0 spot[region.name] = region @@ -419,6 +443,8 @@ def test_len(self, spot, region): @pytest.fixture def regions(self, genes): + """Create a random number of regions fill with genes + """ regions = set() genes = sorted(list(genes), key=lambda x: x.position) nb_regions = randint(2, len(genes)) @@ -519,6 +545,8 @@ def test_get_uniq_content(self, spot, regions, families, srgps): class TestModule: @pytest.fixture def module(self): + """Create a basic module + """ yield Module(0) def test_cstr(self, module): @@ -556,6 +584,8 @@ def test_len(self, module): assert len(module) == 1 def test_eq(self, families): + """Test equality between modules + """ module1, module2, module3 = Module(1), Module(2), Module(3) for family in families: module1[family.name] = family @@ -564,8 +594,10 @@ def test_eq(self, families): assert module1 != module3 def test_eq_with_is_not_instance_module(self, module): + """Test comparison between a module and another object raise a TypeError + """ with pytest.raises(TypeError): - module == 4 + assert module == 4 @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: @@ -581,7 +613,7 @@ def test_add_family(self, module, family): assert module._families_getter['family'] == family def test_add_different_families_with_same_name(self, module): - """Test that adding a new family with same name than another in the module return a KeyError + 
"""Test that adding a new family with the same name as another in the module return a KeyError """ family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') module[family_1.name] = family_1 @@ -626,6 +658,8 @@ def test_delete_family_which_does_not_exist(self, module): class TestGeneContext: @pytest.fixture def context(self): + """Generate a basic context + """ yield GeneContext(0) def test_cstr(self, context): @@ -663,6 +697,8 @@ def test_len(self, context): assert len(context) == 1 def test_eq(self, families): + """Test equality between two contexts + """ context1, context2, context3 = GeneContext(1), GeneContext(2), GeneContext(3) for family in families: context1[family.name] = family @@ -671,8 +707,10 @@ def test_eq(self, families): assert context1 != context3 def test_eq_with_is_not_instance_context(self, context): + """Test comparison between a context and another object raise a TypeError + """ with pytest.raises(TypeError): - context == 4 + assert context == 4 @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: @@ -688,7 +726,7 @@ def test_add_family(self, context, family): assert context._families_getter['family'] == family def test_add_different_families_with_same_name(self, context): - """Test that adding a new family with same name than another in the context return a KeyError + """Test that adding a new family with the same name as another in the context return a KeyError """ family_1, family_2 = GeneFamily(1, 'family_1'), GeneFamily(1, 'family_1') context[family_1.name] = family_1 From 76679e585efd425f5dc00304fef52535d5ed0fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 30 Aug 2023 11:24:40 +0200 Subject: [PATCH 44/75] Refactor class and test genome --- VERSION | 2 +- ppanggolin/genome.py | 191 ++++++++++++++++++++++++++---------------- tests/tests_Genome.py | 28 ++++--- 3 files changed, 136 insertions(+), 85 deletions(-) diff --git a/VERSION b/VERSION index 30a7171c..8d76db64 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.161 +1.2.162 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 832024a2..9e320c16 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -5,7 +5,7 @@ # installed libraries import logging -from typing import Dict, Iterator, Generator +from typing import Dict, Generator, List import gmpy2 @@ -17,9 +17,9 @@ class Feature(MetaFeatures): """This is a general class representation of Gene, RNA Methods: - - fill_annotations(): fills general annotation for child classes. - - fill_parents(): associates the object to an organism and a contig. - - add_dna(): adds DNA sequence to the feature. + - fill_annotations: fills general annotation for child classes. + - fill_parents: associates the object to an organism and a contig. + - Add_sequence: adds a sequence to the feature. Fields: - ID: Identifier of the feature given by PPanGGOLiN. 
@@ -38,7 +38,7 @@ class Feature(MetaFeatures): def __init__(self, identifier: str): """Constructor Method - :param identifier: identifier of the feature + :param identifier: Identifier of the feature """ assert isinstance(identifier, str), "Expected identifier should be a string" if identifier == '': @@ -85,6 +85,10 @@ def organism(self) -> Organism: @organism.setter def organism(self, organism: Organism): + """Set the organism to the Feature + + :param organism: Organism belonging to the feature + """ if not isinstance(organism, Organism): raise TypeError(f'Expected type Organism, got {type(organism)}') self._organism = organism @@ -99,6 +103,10 @@ def contig(self) -> Contig: @contig.setter def contig(self, contig: Contig): + """Set the contig to the Feature + + :param contig: Contig linked to the feature + """ if not isinstance(contig, Contig): raise TypeError(f'Expected type Contig, got {type(contig)}') self._contig = contig @@ -116,7 +124,7 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = :param product: Associated product :param local_identifier: Identifier provided by the original file - :raises TypeError: If attribute value not correspond to expected type + :raises TypeError: If attribute value does not correspond to the expected type :raises ValueError: If strand is not '+' or '-' """ if not isinstance(start, int): @@ -161,11 +169,11 @@ def fill_parents(self, organism: Organism = None, contig: Contig = None): raise AssertionError("You should provide at least organism or contig") def add_sequence(self, sequence): - """ Add DNA sequence to feature + """Add a sequence to feature - :param sequence: DNA sequence + :param sequence: Sequence corresponding to the feature - :raise TypeError: DNA sequence must be a string + :raise AssertionError: Sequence must be a string """ assert isinstance(sequence, str), f"'str' type was expected but you provided a '{type(sequence)}' type object" self.dna = sequence @@ -182,19 +190,19 @@ def __init__(self, rna_id: str): class Gene(Feature): - """Save gene from genome as an Object with some information for Pangenome + """Save gene from the genome as an Object with some information for Pangenome Methods: - - fill_annotations(): fills general annotation for the gene object and adds additional attributes such as + - fill_annotations: fills general annotation for the gene object and adds additional attributes such as position and genetic code. - - add_protein(): adds the protein sequence corresponding to the translated gene to the object. + - Add_protein: adds the protein sequence corresponding to the translated gene to the object. Fields: - position: the position of the gene in the genome. - family: the family that the gene belongs to. - RGP: a set of resistance gene profiles associated with the gene. - genetic_code: the genetic code associated with the gene. - - protein: the protein sequence corresponding to the translated gene. + - Protein: the protein sequence corresponding to the translated gene. 
""" def __init__(self, gene_id: str): """Constructor method @@ -219,6 +227,10 @@ def family(self): @family.setter def family(self, family): + """Set the GeneFamily blonging to the gene + + :param family: Gene family linked to the gene + """ from ppanggolin.geneFamily import GeneFamily if not isinstance(family, GeneFamily): raise TypeError(f'Expected type Organism, got {type(family)}') @@ -234,18 +246,24 @@ def RGP(self): return self._RGP @RGP.setter - def RGP(self, RGP): + def RGP(self, region): + """Set the Region blonging to the gene + + :param region: Region linked to the gene + """ from ppanggolin.region import Region - if not isinstance(RGP, Region): - raise TypeError(f'Expected type Organism, got {type(RGP)}') - self._RGP = RGP + if not isinstance(region, Region): + raise TypeError(f'Expected type Organism, got {type(region)}') + self._RGP = region def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwargs): """Fill Gene annotation provide by PPanGGOLiN dependencies - :param position: Gene localisation in genome + :param position: Gene localization in genome :param genetic_code: Genetic code associated to gene :param kwargs: look at Feature.fill_annotations methods + + :raises TypeError: If position or genetic code value is not instance integers """ super().fill_annotations(**kwargs) if position is not None and not isinstance(position, int): @@ -256,7 +274,7 @@ def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwarg self.genetic_code = genetic_code def add_protein(self, protein: str): - """Add protein sequence corresponding to translated gene + """Add a protein sequence corresponding to translated gene :param protein: Protein sequence @@ -271,9 +289,9 @@ class Contig: """ Describe the contig content and some information Methods: - - genes(self) -> list: Returns a list of gene objects present in the contig. - - add_rna(self, rna: RNA): Adds an RNA object to the contig. - - add_gene(self, gene: Gene): Adds a gene object to the contig. + - genes: Returns a list of gene objects present in the contig. + - add_rna: Adds an RNA object to the contig. + - add_gene: Adds a gene object to the contig. Fields: - name: Name of the contig. @@ -285,11 +303,11 @@ def __init__(self, name: str, is_circular: bool = False): """Constructor method :param name: Name of the contig - :param is_circular: save if the contig is circular + :param is_circular: saves if the contig is circular """ self.name = name self.is_circular = is_circular - self._rna_getter = set() # saving the rna annotations. We're not using them in the vast majority of cases. + self._rna_getter = set() # Saving the rna annotations. We're not using them in the vast majority of cases. 
self._genes_getter = {} self._genes_position = [] self._organism = None @@ -297,7 +315,7 @@ def __init__(self, name: str, is_circular: bool = False): def __str__(self) -> str: return self.name - def __len__(self): + def __len__(self) -> int: return len(self._genes_position) def __setitem__(self, start: int, gene: Gene): @@ -305,6 +323,10 @@ def __setitem__(self, start: int, gene: Gene): :param start: Start position of the gene :param gene: Gene object to add + + :raises TypeError: If the gene is not instance Gene + :raises ValueError: If a gene in getter already exists at the start + :raises AttributeError: If the gene position in the contig is not fill """ # TODO look at change start for position @@ -314,24 +336,39 @@ def __setitem__(self, start: int, gene: Gene): raise ValueError(f"Gene with start position {start} already exists in the contig") if gene.position is None: raise AttributeError("The gene object needs to have its position in the contig filled before adding it") - # adding empty values. They should be filled by the end of the parsing. + # Adding empty values. + # They should be filled by the end of the parsing. # Doing this because genes are not always met in order. self._genes_position.extend([None] * (gene.position - len(self._genes_position) + 1)) self._genes_position[gene.position] = gene self._genes_getter[gene.start] = gene # retrieve gene by start position - def __getitem__(self, index: int) -> Gene: - if not isinstance(index, int): - raise TypeError(f"Expected type is int, given type was '{type(index)}'") - return self._genes_position[index] + def __getitem__(self, position: int) -> Gene: + """Get the gene for the given position + + :param position: Position of the gene in the contig + + :return: Wanted gene for the position + + :raises TypeError: If position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Expected type is int, given type was '{type(position)}'") + return self._genes_position[position] # TODO define delitem - def get_genes(self, begin: int, end: int): + def get_genes(self, begin: int, end: int) -> List[Gene]: """Gets a list of genes within a range - :param begin: Position of first gene to retrieve - :param end: Position of last gene to not retrieve + + :param begin: Position of the first gene to retrieve + :param end: Position of the last gene to not retrieve + + :return: List of genes between begin and end position + + :raises TypeError: If begin or end is not an integer + :raises ValueError: If begin position is greater than end positon """ if not isinstance(begin, int) or not isinstance(end, int): raise TypeError(f"Expected type is int, given type was '{type(begin)}, {type(end)}'") @@ -341,10 +378,10 @@ def get_genes(self, begin: int, end: int): return self._genes_position[begin: end] @property - def genes(self) -> list: + def genes(self) -> Generator[Gene, None, None]: """ Give the gene content of the contig - :return: list of gene in contig + :return: Generator of genes in contig """ for gene in self._genes_position: if gene is not None: @@ -360,6 +397,12 @@ def organism(self) -> Organism: @organism.setter def organism(self, organism: Organism): + """Set the organism belonging to the contig + + :param organism: Organism to set + + :raises TypeError: Given organism is not an instance Organism + """ if not isinstance(organism, Organism): raise TypeError(f'Expected type Organism, got {type(organism)}') self._organism = organism @@ -368,6 +411,9 @@ def add_rna(self, rna: RNA): """ Add RNA to contig :param rna: RNA 
object to add + + :raises TypeError: RNA is not instance RNA + :raises KeyError: Another RNA with the same ID already exists in the contig """ if not isinstance(rna, RNA): raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") @@ -381,8 +427,7 @@ def RNAs(self) -> Generator[RNA, None, None]: :return: Generator of RNA """ - for rna in self._rna_getter: - yield rna + yield from self._rna_getter class Organism(MetaFeatures): @@ -390,13 +435,13 @@ class Organism(MetaFeatures): Describe the Genome content and some information Methods: - - `families(self) -> set`: Returns a set of gene families present in the organism. - - `genes(self) -> Iterator[Gene]`: Returns a generator to get genes in the organism. - - `number_of_genes(self) -> int`: Returns the number of genes in the organism. - - `contigs(self) -> dict.values`: Returns the values in the contig dictionary from the organism. - - `get_contig(self, contig_id: str, is_circular: bool = False)`: Gets the contig with the given identifier in the organism, adding it if it does not exist. - - `_create_contig(self, contig_id: str, is_circular: bool = False)`: Creates a new contig object and adds it to the contig dictionary. - - `mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all')`: Produces a bitarray representing the presence/absence of gene families in the organism using the provided index. + - `families`: Returns a set of gene families present in the organism. + - `genes`: Returns a generator to get genes in the organism. + - `number_of_genes`: Returns the number of genes in the organism. + - `contigs`: Returns the values in the contig dictionary from the organism. + - `get_contig`: Gets the contig with the given identifier in the organism, adding it if it does not exist. + - `_create_contig`: Creates a new contig object and adds it to the contig dictionary. + - `mk_bitarray`: Produces a bitarray representing the presence/absence of gene families in the organism using the provided index. Fields: - `name`: Name of the organism. 
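
# Illustrative sketch (assumed workflow) of how an Organism aggregates contigs and
# genes through the generators documented above; the names used here are invented.
from ppanggolin.genome import Organism, Contig, Gene

org = Organism("genome_A")
contig = Contig("contig_1")
org.add_contig(contig)                         # also sets contig.organism = org
gene = Gene("genome_A.contig_1.0")
gene.fill_annotations(start=1, stop=300, strand="+", position=0)
contig[gene.start] = gene
assert org.number_of_contigs() == 1
assert org.number_of_genes() == 1              # summed over the contigs
assert next(org.genes) is gene                 # genes are yielded contig by contig
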
@@ -405,6 +450,7 @@ class Organism(MetaFeatures): def __init__(self, name: str): """Constructor Method + :param name: Name of the genome """ assert isinstance(name, str), "Organism name should be a string" @@ -419,56 +465,54 @@ def __init__(self, name: str): def __str__(self): return self.name - def _get_families(self) -> set: - """Get the set of gene families belonging to organism""" + def _set_families(self): + """Set the set of gene families belonging to organism + """ self._families = {gene.family for gene in self.genes} @property def families(self): - """returns the gene families present in the organism + """Return the gene families present in the organism - :return: Generator of gene families in organism + :return: Generator of gene families :rtype: Generator[GeneFamily, None, None] """ if self._families is None: - self._get_families() - for fam in self._families: - yield fam + self._set_families() + yield from self._families def number_of_families(self) -> int: - """Return number of gene families in organism + """Get the number of gene families in the organism - :return: Number of gene families in organism + :return: Number of gene families """ if self._families is None: - self._get_families() + self._set_families() return len(self._families) @property def genes(self) -> Generator[Gene, None, None]: - """ Generator to get genes in organism + """Generator to get genes in the organism - :return: Generator of genes in organism + :return: Generator of genes """ for contig in self.contigs: - for gene in contig.genes: - yield gene + yield from contig.genes def number_of_genes(self) -> int: - """ Get number of genes in organism + """ Get number of genes in the organism - :return: Number of genes in organism + :return: Number of genes """ return sum([len(contig) for contig in self.contigs]) @property def contigs(self) -> Generator[Contig, None, None]: - """ Get contigs in organism + """ Generator of contigs in the organism - :return: values in contig dictionary from organism + :return: Values in contig dictionary from organism """ - for contig in self._contigs_getter.values(): - yield contig + yield from self._contigs_getter.values() def number_of_contigs(self) -> int: """ Get number of contigs in organism @@ -483,23 +527,26 @@ def get_contig(self, name: str) -> Contig: :param name: Contig identifier - :return: the contig with the given identifier + :return: The contig with the given identifier + + :raises KeyError: Contig with the given name does not exist in the organism """ assert isinstance(name, str), f"To get a contig, name with string type is expected. 
Given type: {type(name)}" try: - contig = self._contigs_getter[name] + return self._contigs_getter[name] except KeyError: raise KeyError(f"Contig {name} does not belong to organism {self.name}") - else: - return contig def add_contig(self, contig: Contig): """Add a contig to organism - :param: contig to add in organism + + :param: Contig to add in organism + + :raises KeyError: Contig with the given name already exist in the organism """ assert isinstance(contig, Contig), f"Contig object is expected, given type was {type(contig)}" try: - contig = self.get_contig(contig.name) + _ = self.get_contig(contig.name) except KeyError: self._contigs_getter[contig.name] = contig contig.organism = self @@ -510,8 +557,10 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. - :param partition: Filter partition + :param partition: Filters partition :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` + + :raises Exception: Partition is not recognized """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': diff --git a/tests/tests_Genome.py b/tests/tests_Genome.py index 528e97c4..42f0e930 100644 --- a/tests/tests_Genome.py +++ b/tests/tests_Genome.py @@ -19,7 +19,7 @@ def feature(self) -> Generator[Feature, None, None]: yield Feature('test_id') def test_creation(self, feature): - """Tests that 'Feature' object is created successfully with the given identifier + """Tests that 'Feature' is created successfully with the given identifier """ assert feature.ID == 'test_id' assert not feature.is_fragment @@ -64,7 +64,7 @@ def test_fill_annotations(self, feature): assert feature.local_identifier == 'local_id' def test_fill_annotations_type_error(self, feature): - """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with correct type + """Tests that 'fill_annotations' method raises a TypeError if attribute value is not with the correct type """ with pytest.raises(TypeError): feature.fill_annotations('1', 10, '+', 'gene_type', 'name', 'product', 'local_id') @@ -115,27 +115,27 @@ def test_fill_parents_with_nothing(self, feature): feature.fill_parents() def test_set_organism(self, feature): - """Tests that organism setter sets organism with valid type + """Tests that organism setter sets organism with the valid type """ organism = Organism('organism') feature.organism = organism assert feature.organism == organism def test_set_organism_not_isinstance_organism(self, feature): - """Tests that organism setter return TypeError if sets organism with invalid type + """Tests that organism setter return TypeError if sets organism with the invalid type """ with pytest.raises(TypeError): feature.organism = 4 def test_set_contig(self, feature): - """Tests that contig setter sets contig with valid type + """Tests that contig setter sets contig with the valid type """ contig = Contig('contig') feature.contig = contig assert feature.contig == contig def test_set_contig_not_isinstance_contig(self, feature): - """Tests that contig setter return TypeError if sets contig with invalid type + """Tests that contig setter return TypeError if sets contig with the invalid type """ with pytest.raises(TypeError): feature.contig = 4 @@ -234,7 +234,7 @@ def test_add_protein_non_string(self, gene): gene.add_protein(123) def 
test_set_family(self, gene): - """Tests that family setter sets family with valid type + """Tests that family setter sets family with the valid type """ family = GeneFamily(0, 'family') gene.family = family @@ -247,7 +247,7 @@ def test_set_family_not_instance_gene_family(self, gene): gene.family = 4 def test_set_rgp(self, gene): - """Tests that RGP setter sets family with valid type + """Tests that RGP setter sets family with the valid type """ region = Region(0) gene.RGP = region @@ -279,7 +279,7 @@ def gene(self) -> Generator[Gene, None, None]: @pytest.fixture def genes(self) -> Generator[Tuple[Gene, Gene, Gene], None, None]: - """Generate 3 basic genes for tests + """Generate three basic genes for tests """ gene1 = Gene('test_gene1') gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) @@ -294,7 +294,7 @@ def test_create_contig(self, contig): """ assert contig.name == "contig" assert not contig.is_circular - assert contig._rna_getter == set() # saving the rna annotations. We're not using them in the vast majority of cases. + assert contig._rna_getter == set() # Saving the rna annotations. We're not using them in the vast majority of cases. assert contig._genes_getter == {} assert contig._genes_position == [] assert contig._organism is None @@ -324,13 +324,13 @@ def test_add_gene_at_far_position(self, gene, contig): assert contig._genes_position[1:6] == [None]*5 def test_add_gene_not_instance_gene(self, contig): - """Tests that the contig cannot be fill with a non gene object + """Tests that the contig cannot be fill with a non-gene object """ with pytest.raises(TypeError): contig[1] = "4" def test_add_gene_with_start_already_taken(self, contig, gene): - """Tests that the contig cannot be fill with a non gene object + """Tests that the contig cannot be fill with a non-gene object """ contig[gene.start] = gene with pytest.raises(ValueError): @@ -339,6 +339,8 @@ def test_add_gene_with_start_already_taken(self, contig, gene): contig[new_gene.start] = new_gene def test_add_gene_without_position(self, contig): + """Test that adding a gene not fill with position raise an AttributeError + """ with pytest.raises(AttributeError): gene = Gene('test_gene') contig[gene.start] = gene @@ -422,7 +424,7 @@ def test_set_organism(self, contig): assert contig.organism == organism def test_set_organism_with_not_instance_organism(self, contig): - """Tests that the contig cannot be fill with a non organism object + """Tests that the contig cannot be fill with a non-organism object """ with pytest.raises(TypeError): contig.organism = 4 From eaaed36cba13e7004881136c7b81d466b5b2f3db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 30 Aug 2023 11:57:18 +0200 Subject: [PATCH 45/75] Refactor class and test pangenome --- VERSION | 2 +- ppanggolin/pangenome.py | 79 ++++++++++-------- tests/tests_Pangenome.py | 169 +++++++++++++++++++++------------------ 3 files changed, 134 insertions(+), 116 deletions(-) diff --git a/VERSION b/VERSION index 8d76db64..31b16d88 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.162 +1.2.163 diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index f28b026e..2a81059c 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -106,7 +106,7 @@ def _mk_gene_getter(self): Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. 
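
# Illustrative sketch of the lazy gene index described in this docstring
# (assumed behaviour): the first get_gene() call builds the getter from the
# organisms and families already attached to the pangenome.
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism, Contig, Gene

pangenome = Pangenome()
org, contig = Organism("genome_A"), Contig("contig_1")
org.add_contig(contig)
gene = Gene("genome_A.contig_1.0")
gene.fill_annotations(start=1, stop=300, strand="+", position=0)
contig[gene.start] = gene
pangenome.add_organism(org)
# the first lookup triggers _mk_gene_getter(); later lookups hit the cached dict
assert pangenome.get_gene("genome_A.contig_1.0") is gene
assert pangenome.number_of_genes() == 1
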
- if at some point we want to extract genes from a pangenome we'll create a geneGetter. + If at some point we want to extract genes from a pangenome we'll create a geneGetter. The assumption behind this is that the pangenome has been filled and no more gene will be added. """ self._geneGetter = {} @@ -114,11 +114,11 @@ def _mk_gene_getter(self): self._geneGetter[gene.ID] = gene def get_gene(self, gene_id: str) -> Gene: - """returns the gene that has the given gene ID + """Returns the gene that has the given gene ID :param gene_id: The gene ID to look for - :return: returns the gene that has the ID `gene_id` + :return: Returns the gene that has the ID `gene_id` :raises AssertionError: If the `gene_id` is not an integer :raises KeyError: If the `gene_id` is not in the pangenome @@ -130,14 +130,14 @@ def get_gene(self, gene_id: str) -> Gene: except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it - return self.get_gene(gene_id) # return what was expected. If geneID does not exist it will raise an error. + return self.get_gene(gene_id) # Return what was expected. If geneID does not exist it will raise an error. except KeyError: raise KeyError(f"{gene_id} does not exist in the pangenome.") def number_of_genes(self) -> int: """Returns the number of gene present in the pangenome - :return: the number of genes + :return: The number of genes """ try: return len(self._geneGetter) @@ -148,15 +148,21 @@ def number_of_genes(self) -> int: """Gene families methods""" @property def max_fam_id(self): + """Get the last family identifier + """ return self._max_fam_id @max_fam_id.setter def max_fam_id(self, value): + """Set the last family identifier + + :param value: value of the maximum family identifer + """ self._max_fam_id = value @property def gene_families(self) -> Generator[GeneFamily, None, None]: - """returns all the gene families in the pangenome + """Returns all the gene families in the pangenome :return: Generator of gene families """ @@ -166,16 +172,16 @@ def gene_families(self) -> Generator[GeneFamily, None, None]: def number_of_gene_families(self) -> int: """Returns the number of gene families present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._famGetter) def get_gene_family(self, name: str) -> GeneFamily: - """returns the gene family that has the given `name` + """Returns the gene family that has the given `name` :param name: The gene family name to look for - :return: returns the gene family that has the name `name` + :return: Returns the gene family that has the name `name` :raises AssertionError: If the `name` is not an integer :raises KeyError: If the `name` is not corresponding to any family in the pangenome @@ -213,7 +219,7 @@ def add_gene_family(self, family: GeneFamily): """Graph methods""" @property def edges(self) -> Generator[Edge, None, None]: - """returns all the edges in the pangenome graph + """Returns all the edges in the pangenome graph :return: Generator of edge """ @@ -227,7 +233,7 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: :param gene1: The first gene :param gene2: The second gene - :return: the created Edge + :return: The created Edge :raises AssertionError: Genes object are expected :raises AttributeError: Genes are not associated to any families @@ -250,14 +256,14 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: def number_of_edges(self) -> int: """Returns the number of edge 
present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._edgeGetter) """Organism methods""" @property def organisms(self) -> Generator[Organism, None, None]: - """returns all the organisms in the pangenome + """Returns all the organisms in the pangenome :return: Generator :class:`ppanggolin.genome.Organism` """ @@ -267,7 +273,7 @@ def organisms(self) -> Generator[Organism, None, None]: def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome - :return: the number of organism + :return: The number of organism """ return len(self._orgGetter) @@ -291,7 +297,7 @@ def get_organism(self, name: str) -> Organism: def add_organism(self, organism: Organism): """ - adds an organism that did not exist previously in the pangenome if an Organism object is provided. + Adds an organism that did not exist previously in the pangenome if an Organism object is provided. If an organism with the same name exists it will raise an error. If a str object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist. @@ -330,7 +336,7 @@ def compute_family_bitarrays(self, part: str = 'all') -> Dict[Organism, int]: :param part: Filter the organism in function of the given partition - :return: the index of organisms in pangenome + :return: The index of organisms in pangenome """ if self._org_index is None: # then the bitarrays don't exist yet, since the org index does not exist either. @@ -404,10 +410,10 @@ def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[Gen Returns the multigenic persistent families of the pangenome graph. A family will be considered multigenic if it is duplicated in more than `dup_margin` of the genomes where it is present. - :param dup_margin: the ratio of presence in multicopy above which a gene family is considered multigenic + :param dup_margin: The ratio of presence in multicopy above which a gene family is considered multigenic :param persistent: if we consider only the persistent genes - :return: set of gene families considered multigenic + :return: Set of gene families considered multigenic """ assert isinstance(dup_margin, float), "Dup margin should be a float" assert isinstance(persistent, bool), "persistent should be a boolean" @@ -441,15 +447,17 @@ def add_region(self, region: Region): def number_of_rgp(self) -> int: """Returns the number of gene families present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._regionGetter) """Spot methods""" @property def spots(self) -> Generator[Spot, None, None]: - for spot in self._spotGetter.values(): - yield spot + """Generate spots in the pangenome + + :return: Spot generator""" + yield from self._spotGetter.values() def get_spot(self, spot_id: Union[int, str]) -> Spot: # TODO Change for only str or only int @@ -482,7 +490,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: def add_spot(self, spot: Spot): """Adds the given iterable of spots to the pangenome. 
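
# Worked sketch of the dup_margin criterion described for get_multigenics()
# (simplified: gene fragments and partition filtering are ignored, and the
# exact threshold comparison is assumed, not taken from the implementation).
def is_multigenic(copies_per_genome, dup_margin=0.05):
    """copies_per_genome: gene count of one family in each genome where it occurs."""
    present = [n for n in copies_per_genome if n > 0]
    duplicated = [n for n in present if n > 1]
    return bool(present) and len(duplicated) / len(present) > dup_margin

assert is_multigenic([2, 3, 1, 2], dup_margin=0.05)        # 3/4 genomes in multicopy
assert not is_multigenic([1, 1, 1, 2], dup_margin=0.50)    # only 1/4 in multicopy
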
- :param spot: spot which should be added + :param spot: Spot which should be added :raise AssertionError: Error if spot is not a Spot object :raise KeyError: Error if another Spot exist in pangenome with the same identifier @@ -500,15 +508,16 @@ def add_spot(self, spot: Spot): def number_of_spots(self) -> int: """Returns the number of gene families present in the pangenome - :return: the number of gene families + :return: The number of gene families """ return len(self._spotGetter) """Modules methods""" @property def modules(self) -> Module: - for module in self._moduleGetter.values(): - yield module + """Generate modules in the pangenome + """ + yield from self._moduleGetter.values() def get_module(self, module_id: Union[int, str]) -> Module: # TODO Change for only str or only int @@ -543,7 +552,7 @@ def get_module(self, module_id: Union[int, str]) -> Module: def add_module(self, module: Module): """Add the given module to the pangenome - :param module: module to add in pangenome + :param module: Module to add in pangenome :raise AssertionError: Error if module is not a Module object :raise KeyError: Error if another module exist in pangenome with the same name @@ -580,7 +589,7 @@ def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: def number_of_modules(self) -> int: """Returns the number of modules present in the pangenome - :return: the number of modules + :return: The number of modules """ return len(self._moduleGetter) @@ -588,7 +597,7 @@ def number_of_modules(self) -> int: def select_elem(self, metatype: str): """Get all the element for the given metatype - :param metatype: name of pangenome component that will be get + :param metatype: Name of pangenome component that will be get :return: All elements from pangenome for the metatype @@ -613,11 +622,11 @@ def select_elem(self, metatype: str): raise KeyError("Given metatype is not allowed") def metadata_sources(self, metatype: str) -> Set[str]: - """returns all the metadata source in the pangenomes + """Returns all the metadata source in the pangenomes - :param metatype: select to which pangenome element metadata should be searched + :param metatype: Select to which pangenome element metadata should be searched - :return: set of metadata source + :return: Set of metadata source :raise AssertionError: Error if metatype is not a string """ @@ -630,9 +639,9 @@ def metadata_sources(self, metatype: str) -> Set[str]: def metadata(self, metatype: str) -> Generator[Metadata, None, None]: """Create a generator with all metadatas in the pangenome - :param metatype: select to which pangenome element metadata should be generate + :param metatype: Select to which pangenome element metadata should be generate - :return: set of metadata source + :return: Set of metadata source """ for elem in self.select_elem(metatype): yield elem.metadata @@ -641,10 +650,10 @@ def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ Union[GeneFamily, Gene, Organism, Region, Spot, Module], None, None]: """Get element in pangenome with metadata attribute expected - :param metatype: select to which pangenome element metadata + :param metatype: Select to which pangenome element metadata :param kwargs: attributes to identify metadata - :return: metadata element + :return: Metadata element """ for elem in self.select_elem(metatype): if len(list(elem.get_metadata(**kwargs))) > 0: diff --git a/tests/tests_Pangenome.py b/tests/tests_Pangenome.py index b21fb71b..3558db54 100644 --- a/tests/tests_Pangenome.py +++ 
b/tests/tests_Pangenome.py @@ -1,9 +1,8 @@ #! /usr/bin/env python3 import pytest -from random import choices, randint, sample +from random import choices, randint from typing import Generator, Set, Tuple, Union -from pathlib import Path from ppanggolin.genome import Gene, Organism, Contig from ppanggolin.pangenome import Pangenome @@ -15,7 +14,7 @@ class TestPangenome: """This class tests methods in pangenome class associated to pangenome direclty. - For pangenome components, there are subclass to test each component. + For pangenome components, there are subclasses to test each component. This class also generate a pangenome for all the test """ @@ -23,7 +22,7 @@ class TestPangenome: def pangenome(self) -> Generator[Pangenome, None, None]: """Create a pangenomes object for test - :return: Generator with pangenomes object + :return: Generator with the pangenome object """ pangenome = Pangenome() yield pangenome @@ -38,19 +37,19 @@ def test_cstr(self, pangenome): :return: A pangenome object """ pangenome_attr_type = { - "file": type(None), - "_famGetter": dict, - "_org_index": type(None), - "_fam_index": type(None), - "_max_fam_id": int, - "_orgGetter": dict, - "_edgeGetter": dict, - "_regionGetter": dict, - "_spotGetter": dict, - "_moduleGetter": dict, - "status": dict, - "parameters": dict - } + "file": type(None), + "_famGetter": dict, + "_org_index": type(None), + "_fam_index": type(None), + "_max_fam_id": int, + "_orgGetter": dict, + "_edgeGetter": dict, + "_regionGetter": dict, + "_spotGetter": dict, + "_moduleGetter": dict, + "status": dict, + "parameters": dict + } status_keys = [ 'genomesAnnotated', 'geneSequences', @@ -98,7 +97,7 @@ def test_is_instance_pangenome(self, pangenome): This test is important because it ensures that the class name does not change and that we are working with a Pangenome object, and not some other type of object. - :param pangenome: object to test if is an instance of the pangenome class + :param pangenome: Object to test if is an instance of the pangenome class :raise AssertionError: If pangenome is not an instance of the pangenome class """ @@ -107,7 +106,7 @@ def test_is_instance_pangenome(self, pangenome): def test_add_file_is_not_path(self, pangenome): """Tests that the add_file method raises an AssertionError if a file is not an instance of the Path class - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): pangenome.add_file("pangenome.h5") @@ -119,14 +118,15 @@ class TestPangenomeOrganism(TestPangenome): @pytest.fixture def organism(self) -> Generator[Organism, None, None]: - organism = Organism(name="organism") - yield organism + """Create a basic organism + """ + yield Organism(name="organism") def test_add_organism(self, pangenome, organism): """Tests the add_organism method of the Pangenome class. - :param pangenome: pangenome object to test method - :param organism: organism object to test method + :param pangenome: Pangenome object to test method + :param organism: organism object to test method """ pangenome.add_organism(organism) assert set(pangenome.organisms) == {organism} @@ -134,8 +134,8 @@ def test_add_organism(self, pangenome, organism): def test_add_organism_already_in_pangenome(self, pangenome, organism): """Tests that adding organism that already exist return a KeyError. 
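
# Behaviour sketch matching the surrounding organism tests (assumed contract):
# a duplicated organism name and an unknown organism name both raise KeyError.
import pytest
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism

pangenome = Pangenome()
pangenome.add_organism(Organism("organism"))
with pytest.raises(KeyError):
    pangenome.add_organism(Organism("organism"))    # same name registered twice
with pytest.raises(KeyError):
    pangenome.get_organism("missing_organism")      # never added
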
- :param pangenome: pangenome object to test method - :param organism: organism object to test method + :param pangenome: Pangenome object to test method + :param organism: organism object to test method """ pangenome.add_organism(organism) with pytest.raises(KeyError): @@ -144,7 +144,7 @@ def test_add_organism_already_in_pangenome(self, pangenome, organism): def test_add_organism_not_instance_organism(self, pangenome): """Ensure that it raises an AssertionError when a non-Organism object is passed as an argument. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): pangenome.add_organism("org") @@ -152,8 +152,8 @@ def test_add_organism_not_instance_organism(self, pangenome): def test_get_organism(self, pangenome, organism): """Tests the get_organism method of the Pangenome class. - :param pangenome: pangenome object to test method - :param organism: organism object to test method + :param pangenome: Pangenome object to test method + :param organism: organism object to test method """ pangenome.add_organism(organism) get_org = pangenome.get_organism("organism") @@ -163,7 +163,7 @@ def test_get_organism(self, pangenome, organism): def test_get_organism_not_in_pangenome(self, pangenome): """Ensure that it raises a KeyError when an Organism is not in the pangenome. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(KeyError): pangenome.get_organism('org') @@ -171,7 +171,7 @@ def test_get_organism_not_in_pangenome(self, pangenome): def test_get_organism_with_name_not_instance_string(self, pangenome): """Ensure that it raises an AssertionError when a non-string name is passed as organism name. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): pangenome.get_organism(33) @@ -180,7 +180,7 @@ def test_get_organism_with_name_not_instance_string(self, pangenome): def organisms(self) -> Generator[Set[Organism], None, None]: """Create a set of organism object for test - :return: Generator with set of organism object + :return: Generator with the set of organism object """ orgs = set() for i in range(randint(5, 20)): @@ -190,10 +190,10 @@ def organisms(self) -> Generator[Set[Organism], None, None]: @pytest.fixture def add_organisms(self, pangenome, organisms): - """Add set of organims to pangenome + """Add the set of organims to pangenome - :param pangenome: pangenome object to test method - :param orgs: set of organisms to add to pangenome + :param pangenome: Pangenome object to test method + :param organisms: Set of organisms to add to pangenome """ for org in organisms: pangenome.add_organism(org) @@ -201,9 +201,9 @@ def add_organisms(self, pangenome, organisms): def test_number_of_organisms(self, add_organisms, pangenome, organisms): """Tests the number_of_organisms method of the pangenome class. 
- :param add_organisms: method to add organisms to pangenome - :param pangenome: pangenome object to test method - :param orgs: set of organisms to add to pangenome + :param add_organisms: Method to add organisms + :param pangenome: Pangenome object to test method + :param organisms: Set of organisms to add to pangenome """ assert isinstance(pangenome.number_of_organisms(), int) assert pangenome.number_of_organisms() == len(organisms) @@ -212,6 +212,7 @@ def test_number_of_organisms(self, add_organisms, pangenome, organisms): class TestPangenomeGeneFamilies(TestPangenome): """This class tests methods in pangenome class associated to gene families. """ + @pytest.fixture def family(self) -> Generator[GeneFamily, None, None]: """Create a Gene Family object @@ -224,7 +225,7 @@ def family(self) -> Generator[GeneFamily, None, None]: def test_max_fam_id_is_instance_int_and_egal_zero(self, pangenome): """Tests that the max_fam_id attribute is corretly set - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ assert isinstance(pangenome.max_fam_id, int) assert pangenome.max_fam_id == 0 @@ -232,8 +233,8 @@ def test_max_fam_id_is_instance_int_and_egal_zero(self, pangenome): def test_add_gene_family(self, pangenome, family): """Tests the add_gene_family method of the Pangenome class. - :param pangenome: pangenome object to test method - :param family: gene family object to test method + :param pangenome: Pangenome object to test method + :param family: gene family object to test method """ pangenome.add_gene_family(family) assert 1 == pangenome.max_fam_id @@ -242,7 +243,7 @@ def test_add_gene_family(self, pangenome, family): def test_add_gene_family_already_in_pangenome(self, pangenome, family): """Tests that adding gene family that already exist return a KeyError. 
- :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param family: gene family object to test method """ pangenome.add_gene_family(family) @@ -252,7 +253,7 @@ def test_add_gene_family_already_in_pangenome(self, pangenome, family): def test_get_gene_family(self, pangenome, family): """Tests that get_gene_family return a gene family object corresponding to the requested gene family - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param family: gene family object to test method """ pangenome.add_gene_family(family) @@ -262,7 +263,7 @@ def test_get_gene_family(self, pangenome, family): def test_get_gene_family_not_in_pangenome(self, pangenome, family): """Tests that return a KeyError if family does not exist in pangenome - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param family: gene family object to test method """ with pytest.raises(KeyError): @@ -271,7 +272,7 @@ def test_get_gene_family_not_in_pangenome(self, pangenome, family): def test_get_gene_family_with_name_not_isinstance_string(self, pangenome): """Tests that return an AssertionError if family name used to get family is not string - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): pangenome.get_gene_family(3) @@ -280,7 +281,7 @@ def test_get_gene_family_with_name_not_isinstance_string(self, pangenome): def families(self) -> Generator[Set[GeneFamily], None, None]: """Create a set of Gene Family object for test - :return: Generator with set of organism object + :return: Generator with the set of organism object """ families = set() for i in range(randint(5, 20)): @@ -290,10 +291,10 @@ def families(self) -> Generator[Set[GeneFamily], None, None]: @pytest.fixture def add_families(self, pangenome, families): - """Add set of gene families to pangenome + """Add the set of gene families to pangenome :param pangenome: pangenome object to test method - :param orgs: set of gene families to add to pangenome + :param families: set of gene families to add to pangenome """ for family in families: pangenome.add_gene_family(family) @@ -301,7 +302,7 @@ def add_families(self, pangenome, families): def test_number_of_gene_families_empty(self, add_families, pangenome, families): """Tests the number_of_gene_families method of the pangenome class. - :param add_organisms: method to add gene families to pangenome + :param add_families: Method to add gene families :param pangenome: pangenome object to test method :param families: set of families to add to pangenome """ @@ -312,11 +313,12 @@ def test_number_of_gene_families_empty(self, add_families, pangenome, families): class TestPangenomeGene(TestPangenome): """This class tests methods in pangenome class associated to Gene. 
""" + @pytest.fixture - def genes(self)-> Generator[Set[Gene], None, None]: + def genes(self) -> Generator[Set[Gene], None, None]: """Create a set of Gene object for test - :return: Generator with set of organism object + :return: Generator with the set of organism object """ genes = set() for i in range(randint(5, 20)): @@ -338,7 +340,7 @@ def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, Non gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") gene.position = gene_idx gene.start = gene_idx - contig.add_gene(gene) + contig[gene.start] = gene genes.add(gene) yield organism, genes @@ -359,10 +361,10 @@ def fill_family_with_genes(self, pangenome): yield family, genes def test_genes_generator_from_organism(self, pangenome, organism_genes): - """Tests genes generator from organism in pangenome object + """Tests genes generator from organism in the pangenome object - :param pangenome: pangenome object to test method - :param organism_genes: method to get an organism object fill with genes + :param pangenome: Pangenome object + :param organism_genes: method to get an organism object filled with genes """ organism, genes = organism_genes pangenome.add_organism(organism) @@ -371,8 +373,8 @@ def test_genes_generator_from_organism(self, pangenome, organism_genes): def test_get_gene_with_organism(self, pangenome, organism_genes): """Tests get genes from organism in pangenome object - :param pangenome: pangenome object to test method - :param organism_genes: method to get an organism object fill with genes + :param pangenome: Pangenome object + :param organism_genes: Method to get an organism object filled with genes """ organism, genes = organism_genes pangenome.add_organism(organism) @@ -382,8 +384,8 @@ def test_get_gene_with_organism(self, pangenome, organism_genes): def test_genes_generator_from_gene_families(self, family_genes, pangenome): """Tests genes generator from gene families in pangenome object - :param pangenome: pangenome object to test method - :param family_genes: method to get a gene family object fill with genes + :param pangenome: Pangenome object to test method + :param family_genes: method to get a gene family object filled with genes """ family, genes = family_genes pangenome.add_gene_family(family) @@ -392,8 +394,8 @@ def test_genes_generator_from_gene_families(self, family_genes, pangenome): def test_get_with_gene_family(self, pangenome, family_genes): """Tests genes generator from gene families in pangenome object - :param pangenome: pangenome object to test method - :param family_genes: method to get a gene family object fill with genes + :param pangenome: Pangenome object to test method + :param family_genes: method to get a gene family object filled with genes """ family, genes = family_genes pangenome.add_gene_family(family) @@ -403,7 +405,7 @@ def test_get_with_gene_family(self, pangenome, family_genes): def test_get_gene_not_in_pangenome(self, pangenome): """Tests that return a KeyError if gene does not exist in pangenome - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(KeyError): pangenome.get_gene("12151405613024") @@ -411,7 +413,7 @@ def test_get_gene_not_in_pangenome(self, pangenome): def test_get_gene_with_id_not_string(self, pangenome): """Tests that return an AssertionError if gene identifier is not a string - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): 
pangenome.get_gene(gene_id=4) @@ -440,9 +442,10 @@ def test_get_multigenic(self, pangenome): class TestPangenomeEdge(TestPangenome): """This class tests methods in pangenome class associated to Edge. """ + @staticmethod def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: - """create a pair of genes that belong to the same organism in 2 different families + """Create a pair of genes that belong to the same organism in two different families :return: Two genes linked to contigs, organism and gene families """ @@ -461,7 +464,7 @@ def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: @pytest.fixture def gene_pair(self) -> Generator[Tuple[Gene, Gene], None, None]: - """Call method to create a pair of genes that belong to the same organism in 2 different families + """Call method to create a pair of genes that belong to the same organism in two different families :return: Two genes linked to contigs, organism and gene families """ @@ -470,7 +473,7 @@ def gene_pair(self) -> Generator[Tuple[Gene, Gene], None, None]: def test_add_edge(self, pangenome, gene_pair): """Tests the add_edge method of the Pangenome class. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param gene_pair: Pair of gene coding for the edge """ gene1, gene2 = gene_pair @@ -479,9 +482,9 @@ def test_add_edge(self, pangenome, gene_pair): assert set(pangenome.edges) == {edge} def test_add_edge_already_in_pangenome(self, pangenome, gene_pair): - """Tests that adding the same pair of gene as edge return the edge. + """Tests that adding the same pair of genes as edge return the edge. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param gene_pair: Pair of gene coding for the edge """ gene1, gene2 = gene_pair @@ -491,7 +494,7 @@ def test_add_edge_already_in_pangenome(self, pangenome, gene_pair): def test_add_edge_with_gene_not_isinstance_gene(self, pangenome): """Tests that return an AssertionError if genes are not Gene objects - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method """ with pytest.raises(AssertionError): pangenome.add_edge("gene1", "gene2") @@ -499,7 +502,7 @@ def test_add_edge_with_gene_not_isinstance_gene(self, pangenome): def test_number_of_edges(self, pangenome, gene_pair): """Tests the number_of_edges method of the Pangenome class. - :param pangenome: pangenome object to test method + :param pangenome: Pangenome object to test method :param gene_pair: Pair of gene coding for the edge """ pangenome.add_edge(*gene_pair) @@ -510,7 +513,8 @@ def test_number_of_edges(self, pangenome, gene_pair): class TestPangenomeBinary(TestPangenomeOrganism, TestPangenomeGeneFamilies): """This class tests methods in pangenome class associated to binary methods. """ - #TODO Better test for this part + + # TODO Better test for this part def test_get_org_index(self, add_organisms, pangenome): """Tests the get_org_index function in pangenome class @@ -583,11 +587,12 @@ def test_compute_org_bitarrays_without_index_already_computed(self, add_organism class TestPangenomeRGP(TestPangenome): """This class tests methods in pangenome class associated to Region """ + def test_add_region(self, pangenome): """Tests the add_region method in the Pangenome class. 
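
# Sketch of RGP registration in the pangenome, mirroring the tests around it
# (assumed contract: regions are indexed by their identifier).
from ppanggolin.pangenome import Pangenome
from ppanggolin.region import Region

pangenome = Pangenome()
rgp = Region(region_id="rgp_1")
pangenome.add_region(rgp)
assert pangenome.get_region("rgp_1") is rgp
assert pangenome.number_of_rgp() == 1
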
- :param pangenome: Access the pangenome object - """ + :param pangenome: Access the pangenome object + """ rgp = Region(region_id="rgp") pangenome.add_region(rgp) assert len(pangenome._regionGetter) == 1 @@ -621,7 +626,7 @@ def test_get_region(self, pangenome): assert pangenome.get_region("rgp") == rgp def test_get_region_not_in_pangenome(self, pangenome): - """Tests get region not in pangenome return a KeyError. + """Tests get the region not in pangenome return a KeyError. :param pangenome: Access the pangenome object """ @@ -650,6 +655,7 @@ def test_number_of_rgp(self, pangenome): class TestPangenomeSpot(TestPangenome): """This class tests methods in pangenome class associated to Spot. """ + def test_add_spot(self, pangenome): """Tests the add_spot method in the Pangenome class. @@ -718,6 +724,7 @@ def test_number_of_spots(self, pangenome): class TestPangenomeModule(TestPangenome): """This class tests methods in pangenome class associated to Modules. """ + def test_add_module(self, pangenome): """Tests the add_module method in the Pangenome class. @@ -739,7 +746,7 @@ def test_add_module_already_in_pangenome(self, pangenome): pangenome.add_module(module) def test_add_module_with_isinstance_not_region(self, pangenome): - """Tests that adding an object with not Module type return an AssertionError. + """Tests that adding an object with not Module type return an AssertionError. :param pangenome: Access the pangenome object """ @@ -782,15 +789,17 @@ def test_number_of_modules(self, pangenome): assert isinstance(pangenome.number_of_modules(), int) assert pangenome.number_of_modules() == 1 + class TestPangenomeMetadata(TestPangenome): """This class tests methods in pangenome class associated to Metadata. """ + @pytest.fixture def add_element_to_pangenome(self, pangenome): - """Adds a metadata element to each elements of pangenome + """Adds a metadata element to each element of pangenome - :param pangenome: Access the pangenome object - """ + :param pangenome: Access the pangenome object + """ metadata = Metadata(source="source", attribute="attr") family = GeneFamily(family_id=pangenome.max_fam_id, name="Fam") family.add_metadata(source=metadata.source, metadata=metadata) @@ -801,7 +810,7 @@ def add_element_to_pangenome(self, pangenome): gene = Gene("Gene") gene.position, gene.start = (0, 0) gene.add_metadata(source=metadata.source, metadata=metadata) - ctg.add_gene(gene) + ctg[gene.start] = gene pangenome.add_organism(org) rgp = Region("RGP") rgp.add_metadata(source=metadata.source, metadata=metadata) @@ -817,7 +826,7 @@ def test_select_elem(self, add_element_to_pangenome, pangenome): """Tests the select_elem method of the Pangenome class. :param add_element_to_pangenome: Add elements to the pangenome - :param pangenome: Access the pangenome object + :param pangenome: Access the pangenome object """ assert all(isinstance(elem, GeneFamily) for elem in set(pangenome.select_elem("families"))) assert all(isinstance(elem, Organism) for elem in set(pangenome.select_elem("genomes"))) @@ -832,7 +841,7 @@ def test_metadata_sources(self, add_element_to_pangenome, pangenome): """Tests the metadata_sources method of the Pangenome class. 
:param add_element_to_pangenome: Add elements to the pangenome - :param pangenome: Access the pangenome object + :param pangenome: Access the pangenome object """ for metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]: assert isinstance(pangenome.metadata_sources(metatype), set) From 1bf00bbb95c64698699c6d43b09287a853956be5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 30 Aug 2023 15:32:55 +0200 Subject: [PATCH 46/75] Fix bug with overlap in __setitem__ from metafeature --- VERSION | 2 +- ppanggolin/formats/writeMetadata.py | 4 +- ppanggolin/metadata.py | 90 +++++++++++++++-------------- ppanggolin/pangenome.py | 4 +- tests/tests_GeneFamily.py | 11 ++-- tests/tests_Metadata.py | 36 ++++-------- tests/tests_Pangenome.py | 6 +- 7 files changed, 72 insertions(+), 81 deletions(-) diff --git a/VERSION b/VERSION index 31b16d88..0af33fb9 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.163 +1.2.164 diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 139ef66c..1d0ffa34 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -114,7 +114,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str, else: raise Exception("Unexpected attribute. A recent change could create this error." " Please report the error on our github.") - for metadata in element[source]: + for metadata in element.get_metadata_by_source(source): for attr, value in ((k, v) for k, v in metadata.__dict__.items() if k != "source"): if isinstance(value, bytes): value = value.decode('UTF-8') @@ -160,7 +160,7 @@ def write_metadata_metatype(h5f: tables.File, source: str, metatype: str, source_table = h5f.create_table(metatype_group, source, desc_metadata(*meta_len[:-1]), expectedrows=meta_len[-1]) meta_row = source_table.row for element in tqdm(select_elements, unit=metatype, desc=f'Source = {source}', disable=disable_bar): - for metadata in element[source]: + for metadata in element.get_metadata_by_source(source): for desc in source_table.colnames: if desc == "ID": if hasattr(element, 'name') and len(element.name) > 0: diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 31cb06a5..a6f52cc3 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -2,7 +2,9 @@ # coding: utf8 # default libraries +import logging from typing import Generator, List, Tuple, Union, Any +from collections import defaultdict # installed libraries from pandas import isna @@ -94,9 +96,28 @@ class MetaFeatures: def __init__(self): """Constructor method """ - self._metadata_getter = {} + self._metadata_getter = defaultdict(list) - def __setitem__(self, source: str, metadata: Metadata): + @property + def metadata(self) -> Generator[Metadata, None, None]: + """Generate metadata in gene families + + :return: Metadata from all sources + """ + + for meta_list in self._metadata_getter.values(): + for metadata in meta_list: + yield metadata + + @property + def sources(self) -> Generator[str, None, None]: + """ Get all metadata source in gene family + + :return: Metadata source + """ + yield from self._metadata_getter.keys() + + def add_metadata(self, source, metadata): """Add metadata to metadata getter :param source: Name of the metadata source @@ -107,9 +128,9 @@ def __setitem__(self, source: str, metadata: Metadata): assert isinstance(metadata, Metadata), f"Metadata is not with type Metadata but with {type(metadata)}" assert isinstance(source, str), f"Source is not a string but with 
{type(source)}" - self._metadata_getter[source] = [metadata] + self._metadata_getter[source].append(metadata) - def __getitem__(self, source: str) -> Union[List[Metadata], None]: + def get_metadata_by_source(self, source: str) -> Union[List[Metadata], None]: """Get all the metadata feature corresponding to the source :param source: Name of the source to get @@ -121,7 +142,20 @@ def __getitem__(self, source: str) -> Union[List[Metadata], None]: assert isinstance(source, str), f"Source is not a string but with {type(source)}" return self._metadata_getter.get(source) # if source in _metadata_getter return value else None - def __delitem__(self, source: str): + def get_metadata_by_attribute(self, **kwargs) -> Generator[Metadata, None, None]: + """Get metadata by one or more attribute + + :return: Metadata searched + """ + for metadata in self.metadata: + for attr, value in kwargs.items(): + if hasattr(metadata, attr): + # BUG If value is a list, the join block detection. + # It would be better to keep a list and change in writing and reading metadata to join the list + if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: + yield metadata + + def del_metadata_by_source(self, source: str): """Remove a source from the feature :param source: Name of the source to delete @@ -130,53 +164,21 @@ def __delitem__(self, source: str): :raises KeyError: Source does not belong in the MetaFeature """ assert isinstance(source, str), f"Source is not a string but with {type(source)}" - try: - del self._metadata_getter[source] - except KeyError: - raise KeyError(f"Given source: {source} is not in {type(self)}") + if self._metadata_getter.pop(source, None) is None: + logging.getLogger("PPanGGOLiN").warning("The source to remove does not exist") - def add_metadata(self, source, metadata): - """Add metadata to metadata getter - - :param source: Name of the metadata source - :param metadata: metadata value to add for the source - """ - if self[source] is None: - self[source] = metadata - else: - self[source].append(metadata) - - @property - def metadata(self) -> Generator[Metadata, None, None]: - """Generate metadata in gene families - - :return: Metadata from all sources - """ - - for meta_list in self._metadata_getter.values(): - for metadata in meta_list: - yield metadata - - @property - def sources(self) -> Generator[str, None, None]: - """ Get all metadata source in gene family - - :return: Metadata source - """ - yield from self._metadata_getter.keys() - - def get_metadata(self, **kwargs) -> Generator[Metadata, None, None]: - """Get metadata by one or more attribute + def del_metadata_by_attribute(self, **kwargs): + """Remove a source from the feature - :return: Metadata searched + :param source: Name of the source to delete """ - for metadata in self.metadata: + for source, metadata in self._metadata_getter.items(): for attr, value in kwargs.items(): if hasattr(metadata, attr): # BUG If value is a list, the join block detection. 
# It would be better to keep a list and change in writing and reading metadata to join the list if getattr(metadata, attr, None) in value or getattr(metadata, attr, None) == value: - yield metadata + self._metadata_getter[source].remove(metadata) def max_metadata_by_source(self) -> Tuple[str, int]: """Get the maximum number of metadata for one source diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 2a81059c..4c01eed3 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -656,7 +656,7 @@ def get_elem_by_metadata(self, metatype: str, **kwargs) -> Generator[ :return: Metadata element """ for elem in self.select_elem(metatype): - if len(list(elem.get_metadata(**kwargs))) > 0: + if len(list(elem.get_metadata_by_attribute(**kwargs))) > 0: yield elem def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ @@ -669,5 +669,5 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[ """ assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"] for elem in self.select_elem(metatype): - if elem[source] is not None: + if elem.get_metadata_by_source(source) is not None: yield elem diff --git a/tests/tests_GeneFamily.py b/tests/tests_GeneFamily.py index 54a546df..36948016 100644 --- a/tests/tests_GeneFamily.py +++ b/tests/tests_GeneFamily.py @@ -3,7 +3,7 @@ import pytest from random import randint from typing import Generator, Set -from itertools import combinations +from itertools import combinations_with_replacement from ppanggolin.pangenome import Edge from ppanggolin.geneFamily import GeneFamily @@ -225,11 +225,12 @@ def families(self, genes) -> Generator[Set[GeneFamily], None, None]: yield families @pytest.fixture - def edges(self, families, genes) -> Generator[Set[Edge], None, None]: + def edges(self, families, genes, organisms) -> Generator[Set[Edge], None, None]: """Create a set of edges fill with genes and gene families to test edges """ edges = {} - pair_genes = combinations(genes, 2) + pair_genes = filter(lambda x: x[0] != x[1] and x[0].organism == x[1].organism, + combinations_with_replacement(genes, 2)) for pair in pair_genes: key = frozenset([pair[0].family, pair[1].family]) edge = edges.get(key) @@ -282,7 +283,7 @@ def test_get_number_of_edges(self, families, edges): def test_add_spot_to_gene_family(self, family): """Tests that a Spot object can be added to a GeneFamily object """ - spot = Spot('spot1') + spot = Spot(1) family.add_spot(spot) assert spot in family.spots @@ -295,7 +296,7 @@ def test_add_non_spot_as_spot_in_family(self, family): def test_add_module_to_gene_family(self, family): """Tests that a Module object can be added to a GeneFamily object """ - module = Module('module1') + module = Module(1) family.add_module(module) assert module in family.modules diff --git a/tests/tests_Metadata.py b/tests/tests_Metadata.py index c26131d9..d4bab2e4 100644 --- a/tests/tests_Metadata.py +++ b/tests/tests_Metadata.py @@ -83,10 +83,10 @@ def metafeatures(self, metadata) -> Generator[MetaFeatures, None, None]: """ metafeatures = MetaFeatures() for meta in metadata: - metafeatures[meta.source] = meta + metafeatures.add_metadata(meta.source, meta) yield metafeatures - def test_set_metadata_to_metadata_getter(self, metafeatures, metadata): + def test_add_metadata(self, metafeatures, metadata): """Tests that metadata can be added to the metadata getter """ assert all(metafeatures._metadata_getter[meta.source] == [meta] for meta in metadata) @@ -94,29 +94,15 @@ def 
test_set_metadata_to_metadata_getter(self, metafeatures, metadata): def test_get_metadata_feature_corresponding_to_source(self, metafeatures, metadata): """Tests that all the metadata features corresponding to a source can be retrieved """ - assert all(metafeatures[meta.source] == [meta] for meta in metadata) + assert all(metafeatures.get_metadata_by_source(meta.source) == [meta] for meta in metadata) def test_remove_source_from_feature(self, metafeatures): """Tests that a source can be removed from the feature """ metadata = Metadata("source_del", attribute1="value") - metafeatures["source_del"] = metadata - del metafeatures["source_del"] - assert metafeatures["source_del"] is None - - def test_add_metadata_feature(self, metafeatures): - """Tests that adding metadata works as expected - """ - metadata1 = Metadata("source_add", attribute1="value1") - metadata2 = Metadata("source_add", attribute2="value2") - metafeatures.add_metadata("source_add", metadata1) - metafeatures.add_metadata("source_add", metadata2) - assert metafeatures["source_add"] == [metadata1, metadata2] - - def test_generate_metadata_in_gene_families(self, metafeatures, metadata): - """Tests that metadata can be generated in gene families - """ - assert set(metafeatures.metadata) == metadata + metafeatures.add_metadata("source_del", metadata) + metafeatures.del_metadata_by_source("source_del") + assert metafeatures.get_metadata_by_source("source_del") is None def test_generate_all_metadata_sources(self, metafeatures, metadata): """Tests that all metadata sources can be generated @@ -128,9 +114,9 @@ def test_get_metadata_by_attribute_values(self, metafeatures): """ meta = Metadata("source_test", attribute1="value_to_retrieve") # meta_list = Metadata("source_list", attribute1=["val_1", "val_2"]) - metafeatures[meta.source] = meta + metafeatures.add_metadata(meta.source, meta) # metafeatures[meta_list.source] = meta_list - assert list(metafeatures.get_metadata(attribute1="value_to_retrieve")) == [meta] + assert list(metafeatures.get_metadata_by_attribute(attribute1="value_to_retrieve")) == [meta] # assert list(metafeatures.get_metadata(attribute1="val_1")) == [meta_list] def test_get_maximum_number_of_metadata_for_one_source(self, metafeatures, metadata): @@ -146,7 +132,7 @@ def test_metadata_is_not_with_type_metadata(self, metafeatures): """Tests that an AssertionError is raised when metadata is not with type Metadata """ with pytest.raises(AssertionError): - metafeatures["source1"] = "not_metadata" + metafeatures.add_metadata("source1", "not_metadata") def test_source_is_not_a_string(self, metafeatures): """Tests that an AssertionError is raised when the source is not a string @@ -154,10 +140,10 @@ def test_source_is_not_a_string(self, metafeatures): metadata = Metadata("source1", attribute1="value1") with pytest.raises(AssertionError): - metafeatures[1] = metadata + metafeatures.add_metadata(1, metadata) def test_source_or_metadata_is_not_with_correct_type(self, metafeatures, metadata): """Tests that an AssertionError is raised when the source or metadata is not with the correct type """ with pytest.raises(AssertionError): - metafeatures[1] = "not_metadata" + metafeatures.add_metadata(1, "not_metadata") diff --git a/tests/tests_Pangenome.py b/tests/tests_Pangenome.py index 3558db54..5336d017 100644 --- a/tests/tests_Pangenome.py +++ b/tests/tests_Pangenome.py @@ -335,7 +335,8 @@ def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, Non genes = set() organism = Organism(name="organism") for contig_id 
in range(randint(2, 10)): - contig = organism.get_contig("k_{}".format(contig_id)) + contig = Contig("k_{}".format(contig_id)) + organism.add_contig(contig) for gene_idx in range(randint(2, 10)): gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") gene.position = gene_idx @@ -806,7 +807,8 @@ def add_element_to_pangenome(self, pangenome): pangenome.add_gene_family(family) org = Organism("Org") org.add_metadata(source=metadata.source, metadata=metadata) - ctg = org.get_contig("Ctg") + ctg = Contig("Ctg") + org.add_contig(ctg) gene = Gene("Gene") gene.position, gene.start = (0, 0) gene.add_metadata(source=metadata.source, metadata=metadata) From 1771be780c30a7ec32b0307452b497ddc36b3ea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 30 Aug 2023 15:39:36 +0200 Subject: [PATCH 47/75] Rename file to use pytest automatically --- VERSION | 2 +- tests/{tests_Edge.py => test_edge.py} | 0 tests/{tests_GeneFamily.py => test_genefamily.py} | 0 tests/{tests_Genome.py => test_genome.py} | 0 tests/{tests_Metadata.py => test_metadata.py} | 0 tests/{tests_Pangenome.py => test_pangenome.py} | 0 tests/{tests_Region.py => test_region.py} | 0 7 files changed, 1 insertion(+), 1 deletion(-) rename tests/{tests_Edge.py => test_edge.py} (100%) rename tests/{tests_GeneFamily.py => test_genefamily.py} (100%) rename tests/{tests_Genome.py => test_genome.py} (100%) rename tests/{tests_Metadata.py => test_metadata.py} (100%) rename tests/{tests_Pangenome.py => test_pangenome.py} (100%) rename tests/{tests_Region.py => test_region.py} (100%) diff --git a/VERSION b/VERSION index 0af33fb9..dfcef939 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.164 +1.2.165 diff --git a/tests/tests_Edge.py b/tests/test_edge.py similarity index 100% rename from tests/tests_Edge.py rename to tests/test_edge.py diff --git a/tests/tests_GeneFamily.py b/tests/test_genefamily.py similarity index 100% rename from tests/tests_GeneFamily.py rename to tests/test_genefamily.py diff --git a/tests/tests_Genome.py b/tests/test_genome.py similarity index 100% rename from tests/tests_Genome.py rename to tests/test_genome.py diff --git a/tests/tests_Metadata.py b/tests/test_metadata.py similarity index 100% rename from tests/tests_Metadata.py rename to tests/test_metadata.py diff --git a/tests/tests_Pangenome.py b/tests/test_pangenome.py similarity index 100% rename from tests/tests_Pangenome.py rename to tests/test_pangenome.py diff --git a/tests/tests_Region.py b/tests/test_region.py similarity index 100% rename from tests/tests_Region.py rename to tests/test_region.py From cd1276282b6ca69a8804515ca7d7d61cc2e77597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 31 Aug 2023 15:11:31 +0200 Subject: [PATCH 48/75] Fix merge pb with rgp clustering --- VERSION | 2 +- ppanggolin/RGP/rgp_cluster.py | 230 +++++++++++++------------ tests/region/test_rgp_cluster.py | 280 +++++++++++++++++++++---------- tests/test_edge.py | 1 + tests/test_genefamily.py | 1 + tests/test_genome.py | 1 + tests/test_pangenome.py | 1 + 7 files changed, 325 insertions(+), 191 deletions(-) diff --git a/VERSION b/VERSION index 2e40cf8c..04d08bef 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.166 +1.2.167 diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index 1b81bddc..db09fec4 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -5,21 +5,17 @@ import logging import argparse import os -from itertools import combinations, chain +from itertools import 
combinations from collections.abc import Callable from collections import defaultdict -from multiprocessing.pool import Pool from typing import Dict, List, Tuple, Set, Union, Any - # installed libraries from tqdm import tqdm import networkx as nx import pandas as pd - # local libraries -from ppanggolin.genome import Organism, Contig from ppanggolin.pangenome import Pangenome from ppanggolin.region import Region from ppanggolin.formats import check_pangenome_info @@ -27,7 +23,6 @@ from ppanggolin.geneFamily import GeneFamily - class IdenticalRegions: """ Represents a group of Identical Regions within a pangenome. @@ -39,6 +34,20 @@ class IdenticalRegions: """ def __init__(self, name: str, identical_rgps: Set[Region], families: Set[GeneFamily], is_contig_border: bool): + if not isinstance(identical_rgps, set): + raise TypeError("Expected 'identical_rgps' to be a set") + else: + if len(identical_rgps) == 0: + raise ValueError("Set of identical_rgps must not be empty") + if not all(isinstance(region, Region) for region in identical_rgps): + raise TypeError("All element in identical_rgps must be `Region`") + if not isinstance(families, set): + raise TypeError("Expected 'families' to be a set") + else: + if len(families) == 0: + raise ValueError("Set of families must not be empty") + if not all(isinstance(family, GeneFamily) for family in families): + raise TypeError("All element in families must be `GeneFamilies`") self.name = name self.families = families self.rgps = identical_rgps @@ -55,37 +64,46 @@ def __eq__(self, other: 'IdenticalRegions') -> bool: """ if not isinstance(other, IdenticalRegions): # don't attempt to compare against unrelated types - return NotImplemented + raise TypeError("'IdenticalRegions' type object was expected, " + f"but '{type(other)}' type object was provided.") return self.families == other.families and self.rgps == other.rgps and self.is_contig_border == other.is_contig_border - + + def __repr__(self): + return f"IdenticalRegions(name='{self.name}', num_rgps={len(self.rgps)}, num_families={len(self.families)}, is_contig_border={self.is_contig_border})" + def __str__(self): return self.name def __hash__(self): return id(self) - + def __lt__(self, obj): - return ((self.ID) < (obj.ID)) - + return self.ID < obj.ID + def __gt__(self, obj): - return ((self.ID) > (obj.ID)) - - + return self.ID > obj.ID + + def __le__(self, obj): + return self.ID <= obj.ID + + def __ge__(self, obj): + return self.ID >= obj.ID + + def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily], mode: Callable) -> float: """ Compute gene repertoire relatedness (GRR) between two rgp. - mode can be the function min to compute min GRR or max to compute max_grr + Mode can be the function min to compute min GRR or max to compute max_grr - :param rgp_a: rgp A - :param rgp_b: rgp B + :param rgp_a_families: Rgp A + :param rgp_b_families: rgp B :param mode: min or max function - :return : grr value between 0 and 1 + :return: GRR value between 0 and 1 """ - grr = len((rgp_a_families & rgp_b_families)) / \ - mode(len(rgp_a_families), len(rgp_b_families)) + grr = len((rgp_a_families & rgp_b_families)) / mode(len(rgp_a_families), len(rgp_b_families)) return grr @@ -94,14 +112,13 @@ def compute_jaccard_index(rgp_a_families: set, rgp_b_families: set) -> float: """ Compute jaccard index between two rgp based on their famillies. 
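    A minimal doctest-style sketch (hypothetical family sets shown as plain strings), contrasting the Jaccard index with the GRR metric defined just above:

    >>> fams_a = {"famB", "famC"}
    >>> fams_b = {"famB", "famC", "famD", "famE"}
    >>> len(fams_a & fams_b) / len(fams_a | fams_b)            # Jaccard index
    0.5
    >>> len(fams_a & fams_b) / min(len(fams_a), len(fams_b))   # min GRR, for comparison
    1.0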
- :param rgp_a: rgp A - :param rgp_b: rgp B + :param rgp_a_families: Rgp A + :param rgp_b_families: rgp B - :return : jaccard index + :return : Jaccard index """ - jaccard_index = len((rgp_a_families & rgp_b_families)) / \ - len(rgp_a_families | rgp_b_families) + jaccard_index = len((rgp_a_families & rgp_b_families)) / len(rgp_a_families | rgp_b_families) return jaccard_index @@ -113,14 +130,14 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): This function takes a list of RGPs and a dictionary mapping each RGP to its corresponding spot ID, and formats the RGP information into a dictionary for further processing or addition to a graph. + :param graph: RGPs graph :param regions: A list of RGPs. :param region_to_spot: A dictionary mapping each RGP to its corresponding spot ID. - :return: A dictionary with RGP id as key and a dictionaries containing information on the corresponding RGP as value. + :return: A dictionary with RGP id as the key and a dictionary containing information on the corresponding RGP as value. """ region_attributes = {} for region in regions: - region_info = {"contig": region.contig.name, 'organism': region.organism.name, "name": region.name, @@ -128,17 +145,16 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): "is_contig_border": region.is_contig_border, "is_whole_contig": region.is_whole_contig, "spot_id": get_spot_id(region, region_to_spot), - 'families_count' : len(region.families)} + 'families_count': len(region.families)} region_attributes[region.ID] = region_info - + node_attributes = graph.nodes[region.ID] node_attributes.update(region_info) return region_attributes - def join_dicts(dicts: List[Dict[str, Any]], delimiter: str = ';') -> Dict[str, Any]: """ Join dictionaries by concatenating the values with a custom delimiter for common keys. @@ -157,7 +173,6 @@ def join_dicts(dicts: List[Dict[str, Any]], delimiter: str = ';') -> Dict[str, A return {k: delimiter.join(v) for k, v in final_dict.items()} - def format_rgp_metadata(rgp: Region) -> Dict[str, str]: """ Format RGP metadata by combining source and field values. @@ -177,7 +192,6 @@ def format_rgp_metadata(rgp: Region) -> Dict[str, str]: return {col_name: '|'.join(values) for col_name, values in source_field_2_value.items()} - def add_rgp_metadata_to_graph(graph: nx.Graph, rgps: Set[Union[Region, IdenticalRegions]]) -> None: """ Add metadata from Region or IdenticalRegions objects to the graph. @@ -194,56 +208,60 @@ def add_rgp_metadata_to_graph(graph: nx.Graph, rgps: Set[Union[Region, Identical rgp_metadata = join_dicts(rgp_metadata_dicts) else: raise TypeError(f'Expect Region or IdenticalRegions object, not {type(rgp)}') - + for metadata_name, value in rgp_metadata.items(): graph.nodes[rgp.ID][metadata_name] = value - -def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions], rgp_to_spot: Dict[Region, int]): + +def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions], + rgp_to_spot: Dict[Region, int]): """ Add identical rgps info in the graph as node attributes. 
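    As a hedged illustration, a merged node standing for three identical RGPs found in two genomes would carry attributes such as identical_rgp_count=3 and identical_rgp_organisms='orgA;orgB' (values here are purely illustrative).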
:params rgp_graph: Graph with rgp id as node and grr value as edges - :params rgp_to_identical_rgps: dict with uniq RGP as key and set of identical rgps as value + :params rgp_to_identical_rgps: dict with uniq RGP as the key and set of identical rgps as value """ for identical_rgp_obj in identical_rgps_objects: - spots_of_identical_rgp_obj = {get_spot_id(i_rgp, rgp_to_spot) for i_rgp in identical_rgp_obj.rgps} rgp_graph.add_node(identical_rgp_obj.ID, - identical_rgp_group = True, - name = identical_rgp_obj.name, - families_count = len(identical_rgp_obj.families), + identical_rgp_group=True, + name=identical_rgp_obj.name, + families_count=len(identical_rgp_obj.families), identical_rgp_count=len(identical_rgp_obj.rgps), identical_rgp_names=';'.join([i_rgp.name for i_rgp in identical_rgp_obj.rgps]), - identical_rgp_organisms = ';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}), - identical_rgp_contig_border_count = len([True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]), - identical_rgp_whole_contig_count = len([True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]), - identical_rgp_spots = ";".join(spots_of_identical_rgp_obj), - spot_id = spots_of_identical_rgp_obj.pop() if len(spots_of_identical_rgp_obj) == 1 else "Mulitple spots" + identical_rgp_organisms=';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}), + identical_rgp_contig_border_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]), + identical_rgp_whole_contig_count=len( + [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]), + identical_rgp_spots=";".join(spots_of_identical_rgp_obj), + spot_id=spots_of_identical_rgp_obj.pop() if len( + spots_of_identical_rgp_obj) == 1 else "Mulitple spots" ) - + def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions]): """ - Replace identical rgp object by all identical rgp it contains. + Replace the identical rgp object by all identical rgp it contains. :param rgp_graph: The RGP graph to add edges to. - :param rgp_to_identical_rgps: A dictionary mapping RGPs to sets of identical RGPs. + :param identical_rgps_objects: A dictionary mapping RGPs to sets of identical RGPs. """ identical_edge_data = {'grr': 1.0, 'max_grr': 1.0, 'min_grr': 1.0, "identical_famillies": True} - + added_identical_rgps = [] for identical_rgp_obj in identical_rgps_objects: - rgp_graph.add_nodes_from([ident_rgp.ID for ident_rgp in identical_rgp_obj.rgps], identical_rgp_group = identical_rgp_obj.name) + rgp_graph.add_nodes_from([ident_rgp.ID for ident_rgp in identical_rgp_obj.rgps], + identical_rgp_group=identical_rgp_obj.name) - # add edge between identical rgp with metrics at 1 (perfect score) + # add edge between identical rgp with metrics at one (perfect score) edges_to_add = [(rgp_a.ID, rgp_b.ID, identical_edge_data) for rgp_a, rgp_b in combinations(identical_rgp_obj.rgps, 2)] @@ -263,14 +281,15 @@ def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: Lis return added_identical_rgps -def dereplicate_rgp(rgps: List[Union[Region, IdenticalRegions]], disable_bar: bool = False) -> List[Union[Region, IdenticalRegions]]: +def dereplicate_rgp(rgps: Set[Union[Region, IdenticalRegions]], + disable_bar: bool = False) -> List[Union[Region, IdenticalRegions]]: """ Dereplicate RGPs that have the same families. 
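    As a hedged illustration with hypothetical regions: if r1 and r2 carry exactly the same set of gene families while r3 differs, the returned list contains one IdenticalRegions object grouping r1 and r2, plus r3 kept as is.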
Given a list of Region or IdenticalRegions objects representing RGPs, this function groups together RGPs with the same families into IdenticalRegions objects and returns a list of dereplicated RGPs. - :param rgps: A list of Region or IdenticalRegions objects representing the RGPs to be dereplicated. + :param rgps: A set of Region or IdenticalRegions objects representing the RGPs to be dereplicated. :param disable_bar: If True, disable the progress bar. :return: A list of dereplicated RGPs (Region or IdenticalRegions objects). For RGPs with the same families, @@ -288,16 +307,16 @@ def dereplicate_rgp(rgps: List[Union[Region, IdenticalRegions]], disable_bar: bo if len(rgps) == 1: dereplicated_rgps.append(rgps[0]) else: - families = rgps[0].families + families = set(rgps[0].families) # identical regions object is considered on a contig border if all rgp are contig border is_contig_border = all([rgp.is_contig_border for rgp in rgps]) # create a new object that will represent the identical rgps identical_rgp = IdenticalRegions(name=f"identical_rgps_{identical_region_count}", - identical_rgps=rgps, + identical_rgps=set(rgps), families=families, - is_contig_border = is_contig_border) + is_contig_border=is_contig_border) identical_region_count += 1 dereplicated_rgps.append(identical_rgp) @@ -308,12 +327,13 @@ def dereplicate_rgp(rgps: List[Union[Region, IdenticalRegions]], disable_bar: bo def compute_rgp_metric(rgp_a: Region, rgp_b: Region, grr_cutoff: float, - grr_metric: str) -> Tuple[int, int, dict]: + grr_metric: str) -> Union[Tuple[int, int, dict], None]: """ Compute GRR metric between two RGPs. - :param rgp_a: a rgp + :param rgp_a: A rgp :param rgp_b: another rgp + :param grr_cutoff: Cutoff filter :param grr_metric: grr mode between min_grr, max_grr and incomplete_aware_grr :returns: Tuple containing the IDs of the two RGPs and the computed metrics as a dictionary @@ -323,51 +343,50 @@ def compute_rgp_metric(rgp_a: Region, # RGP at a contig border are seen as incomplete and min GRR is used instead of max GRR if rgp_a.is_contig_border or rgp_b.is_contig_border: - edge_metrics["incomplete_aware_grr"] = compute_grr( - rgp_a.families, rgp_b.families, min) + edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) else: - edge_metrics["incomplete_aware_grr"] = compute_grr( - rgp_a.families, rgp_b.families, max) + edge_metrics["incomplete_aware_grr"] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) # Compute max and min GRR metrics - edge_metrics['max_grr'] = compute_grr(rgp_a.families, rgp_b.families, max) - edge_metrics['min_grr'] = compute_grr(rgp_a.families, rgp_b.families, min) + edge_metrics['max_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), max) + edge_metrics['min_grr'] = compute_grr(set(rgp_a.families), set(rgp_b.families), min) - # Number of shared families can be useful when visualising the graph - edge_metrics['shared_family'] = len(rgp_a.families & rgp_b.families) + # The number of shared families can be useful when visualizing the graph + edge_metrics['shared_family'] = len(set(rgp_a.families).intersection(set(rgp_b.families))) # Only return the metrics if the GRR value is above the cutoff if edge_metrics[grr_metric] >= grr_cutoff: - return (rgp_a.ID, rgp_b.ID, edge_metrics) + return rgp_a.ID, rgp_b.ID, edge_metrics -def cluster_rgp_on_grr(G: nx.Graph, clustering_attribute: str = "grr"): +def cluster_rgp_on_grr(graph: nx.Graph, clustering_attribute: str = "grr"): """ Cluster rgp based on grr using louvain 
communities clustering. - :param G: NetworkX graph object representing the RGPs and their relationships + :param graph: NetworkX graph object representing the RGPs and their relationship :param clustering_attribute: Attribute of the graph to use for clustering (default is "grr") """ partitions = nx.algorithms.community.louvain_communities( - G, weight=clustering_attribute) + graph, weight=clustering_attribute) # Add partition index in node attributes for i, cluster_nodes in enumerate(partitions): nx.set_node_attributes( - G, {node: f"cluster_{i}" for node in cluster_nodes}, name=f"{clustering_attribute}_cluster") + graph, {node: f"cluster_{i}" for node in cluster_nodes}, name=f"{clustering_attribute}_cluster") logging.info( f"Graph has {len(partitions)} clusters using {clustering_attribute}") - -def get_spot_id(rgp:Region, rgp_to_spot:Dict[Region, int]) -> str: + + +def get_spot_id(rgp: Region, rgp_to_spot: Dict[Region, int]) -> str: """ Return Spot ID associated to an RGP. It adds the prefix "spot_" to the spot ID. - When no spot is associated to the RGP, then the string "No spot" is return + When no spot is associated with the RGP, then the string "No spot" is return :params rgp: RGP id - :params rgp_to_spot: A dictionary mapping an RGP to its spot . + :params rgp_to_spot: A dictionary mapping an RGP to its spot. :return: Spot ID of the given RGP with the prefix spot_ or "No spot". """ @@ -376,6 +395,7 @@ def get_spot_id(rgp:Region, rgp_to_spot:Dict[Region, int]) -> str: else: return "No spot" + def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, rgps_in_graph: List[Union[Region, IdenticalRegions]], grr_metric: str, @@ -383,9 +403,9 @@ def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, """ Writes RGP cluster info to a TSV file using pandas. - :param outfile: name of the tsv file + :param outfile: Name of the tsv file :param grr_graph: The GRR graph. - :param rgp_to_identical_rgps: A dictionary mapping an RGP to a set of identical RGPs. + :param rgps_in_graph: A dictionary mapping an RGP to a set of identical RGPs. :param grr_metric: The GRR metric used for clustering. :param rgp_to_spot: A dictionary mapping an RGP to its spot. 
:return: None @@ -393,11 +413,10 @@ def write_rgp_cluster_table(outfile: str, grr_graph: nx.Graph, all_rgps_infos = [] for rgp_in_graph in rgps_in_graph: - cluster = grr_graph.nodes[rgp_in_graph.ID][f'{grr_metric}_cluster'] identical_rgps = [rgp_in_graph] if isinstance(rgp_in_graph, Region) else rgp_in_graph.rgps - + all_rgps_infos += [{"RGPs": r.name, "cluster": cluster, "spot_id": get_spot_id(r, rgp_to_spot)} for r in identical_rgps] @@ -424,13 +443,14 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, if pangenome.status["metadata"]["RGPs"] == "inFile": need_metadata = True - logging.info(f'Some RGPs metadata have been found in pangenome, they will be included in rgp graph.') + logging.info('Some RGPs metadata have been found in pangenome, they will be included in rgp graph.') else: need_metadata = False # check statuses and load info check_pangenome_info(pangenome, need_families=True, need_annotations=True, - disable_bar=disable_bar, need_rgp=True, need_spots=True, need_metadata=need_metadata, metatype="RGPs") + disable_bar=disable_bar, need_rgp=True, need_spots=True, need_metadata=need_metadata, + metatype="RGPs") if pangenome.regions == 0: raise Exception( @@ -440,13 +460,14 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, if ignore_incomplete_rgp: valid_rgps = [ rgp for rgp in pangenome.regions if not rgp.is_contig_border] - + ignored_rgp_count = len(pangenome.regions) - len(valid_rgps) total_rgp_count = len(pangenome.regions) - logging.info(f'Ignoring {ignored_rgp_count}/{total_rgp_count} ({100*(ignored_rgp_count)/total_rgp_count:.2f}%) ' - 'RGPs that are located at a contig border and are likely incomplete.') - + logging.info( + f'Ignoring {ignored_rgp_count}/{total_rgp_count} ({100 * ignored_rgp_count / total_rgp_count:.2f}%) ' + 'RGPs that are located at a contig border and are likely incomplete.') + if len(valid_rgps) == 0: raise Exception( "The pangenome has no complete RGPs. The clustering of RGP is then not possible.") @@ -464,11 +485,11 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, for rgp in dereplicated_rgps: for fam in rgp.families: family2rgp[fam].add(rgp) - + rgp_pairs = set() for rgps in family2rgp.values(): rgp_pairs |= {tuple(sorted(rgp_pair)) for rgp_pair in combinations(rgps, 2)} - + pairs_count = len(rgp_pairs) logging.info( @@ -485,7 +506,6 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, grr_graph.add_edges_from(pairs_of_rgps_metrics) - identical_rgps_objects = [rgp for rgp in dereplicated_rgps if isinstance(rgp, IdenticalRegions)] rgp_objects_in_graph = [rgp for rgp in dereplicated_rgps if isinstance(rgp, Region)] @@ -502,24 +522,24 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, for spot in pangenome.spots for region in spot.regions} if not unmerge_identical_rgps: - logging.info(f"Add info on identical RGPs merged in the graph") + logging.info("Add info on identical RGPs merged in the graph") add_info_to_identical_rgps(grr_graph, identical_rgps_objects, rgp_to_spot) rgps_in_graph = rgp_objects_in_graph if unmerge_identical_rgps else dereplicated_rgps # add some attribute to the graph nodes. 
- logging.info(f"Add RGP information to the graph") + logging.info("Add RGP information to the graph") add_info_to_rgp_nodes(grr_graph, rgp_objects_in_graph, rgp_to_spot) - + if need_metadata: - add_rgp_metadata_to_graph(grr_graph, rgps_in_graph) + add_rgp_metadata_to_graph(grr_graph, rgps_in_graph) if "gexf" in graph_formats: # writting graph in gexf format graph_file_name = os.path.join(output, f"{basename}.gexf") logging.info(f"Writting graph in gexf format in {graph_file_name}.") nx.readwrite.gexf.write_gexf(grr_graph, graph_file_name) - + if "graphml" in graph_formats: graph_file_name = os.path.join(output, f"{basename}.graphml") logging.info(f"Writting graph in graphml format in {graph_file_name}.") @@ -536,7 +556,7 @@ def launch(args: argparse.Namespace): """ Command launcher - :param args: All arguments provide by user + :param args: All arguments provided by user """ pangenome = Pangenome() @@ -554,9 +574,9 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser """ Subparser to launch PPanGGOLiN in Command line - :param sub_parser : sub_parser for cluster_rgp command + :param sub_parser : Sub_parser for cluster_rgp command - :return : parser arguments for cluster_rgp command + :return : Parser arguments for cluster_rgp command """ parser = sub_parser.add_parser( "rgp_cluster", formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -568,7 +588,7 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): """ Parser for specific argument of rgp command - :param parser: parser for cluster_rgp argument + :param parser: Parser for cluster_rgp argument """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") @@ -581,11 +601,11 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): help="Min gene repertoire relatedness metric used in the rgp clustering") optional.add_argument('--grr_metric', required=False, type=str, default="incomplete_aware_grr", help="The grr (Gene Repertoire Relatedness) is used to assess the similarity between two RGPs based on their gene families. " - "There are three different modes for calculating the grr value: 'min_grr', 'max_grr' or 'incomplete_aware_grr'." - " 'min_grr': Computes the number of gene families shared between the two RGPs and divides it by the smaller number of gene families among the two RGPs. " - " 'max_grr': Calculates the number of gene families shared between the two RGPs and divides it by the larger number of gene families among the two RGPs. " - " 'incomplete_aware_grr' (default): If at least one RGP is considered incomplete, which occurs when it is located at the border of a contig, " - "the 'min_grr' mode is used. Otherwise, the 'max_grr' mode is applied.", + "There are three different modes for calculating the grr value: 'min_grr', 'max_grr' or 'incomplete_aware_grr'." + " 'min_grr': Computes the number of gene families shared between the two RGPs and divides it by the smaller number of gene families among the two RGPs. " + " 'max_grr': Calculates the number of gene families shared between the two RGPs and divides it by the larger number of gene families among the two RGPs. " + " 'incomplete_aware_grr' (default): If at least one RGP is considered incomplete, which occurs when it is located at the border of a contig, " + "the 'min_grr' mode is used. 
Otherwise, the 'max_grr' mode is applied.", choices=['incomplete_aware_grr', "min_grr", "max_grr"]) optional.add_argument('--ignore_incomplete_rgp', required=False, action="store_true", @@ -594,13 +614,11 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): optional.add_argument('--no_identical_rgp_merging', required=False, action="store_true", help="Do not merge in one node identical RGP (i.e. having the same family content) before clustering.") - optional.add_argument("--basename", required=False, default="rgp_cluster", help="basename for the output file") optional.add_argument('-o', '--output', required=False, type=str, default="rgp_clustering", help="Output directory") - + optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", - default=['gexf'], help="Format of the output graph.") - + default=['gexf'], help="Format of the output graph.") diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py index 2e2579c2..96b25816 100644 --- a/tests/region/test_rgp_cluster.py +++ b/tests/region/test_rgp_cluster.py @@ -1,113 +1,225 @@ #! /usr/bin/env python3 +# coding: utf8 import pytest - +from random import randint +from typing import Generator, Set from ppanggolin.RGP import rgp_cluster +from ppanggolin.genome import Gene, Contig, Organism +from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region - -from test_Region import l_genes, o_org, o_contig -import networkx as nx +from ppanggolin.RGP.rgp_cluster import IdenticalRegions @pytest.fixture -def identical_rgps(l_genes): - rgp1 = Region("one") - rgp2 = Region('two') - rgp3 = Region('three') - - # the three rgp have the gene content. - # in terms of family they are identical - for g in l_genes: - rgp1.append(g) - rgp2.append(g) - rgp3.append(g) - - return [rgp1, rgp2, rgp3] +def genes() -> Generator[Set[Gene], None, None]: + """Create a set of genes to fill gene families + """ + organism = Organism("organism") + contig = Contig("contig") + genes = set() + for i in range(0, randint(11, 20)): + gene = Gene(f"gene_{str(i)}") + gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) + gene.fill_parents(organism, contig) + contig[gene.start] = gene + genes.add(gene) + yield genes -@pytest.fixture -def RGP_a(l_genes): - # rgp_a has 8 families and is at contig border - rgp = Region("A") - - for g in l_genes[:8]: - rgp.append(g) - - return rgp @pytest.fixture -def RGP_b(l_genes): - # rgp_b has 2 families and is not at contig border - rgp = Region("B") - - for g in l_genes[5:7]: - rgp.append(g) +def families(genes) -> Generator[Set[GeneFamily], None, None]: + """Create a set of gene families fill with genes to test edges + """ + families = set() + genes = list(genes) + nb_families = randint(9, 20) + nb_genes_per_family = len(genes) // nb_families + idx_fam = 1 + while idx_fam < nb_families: + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = 0 + while idx_genes < nb_genes_per_family: + gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + idx_fam += 1 + # last family fill with all the gene left + family = GeneFamily(idx_fam, f"family_{idx_fam}") + idx_genes = (idx_fam - 1) * nb_genes_per_family + while idx_genes < len(genes): + gene = genes[idx_genes] + family.add_gene(gene) + gene.family = family + idx_genes += 1 + families.add(family) + yield families - return rgp +@pytest.fixture +def 
identical_rgps(genes, families) -> Generator[Set[Region], None, None]: + """Create a set of identical rgps + """ + identical_rgps = set() + for i in range(1, randint(6, 21)): + rgp = Region(f"RGP_{i}") + # the three rgp have the gene content. + # in terms of family they are identical + for gene in genes: + rgp[gene.position] = gene + identical_rgps.add(rgp) + yield identical_rgps + + +class TestIdenticalRegions: + def test_init_with_valid_inputs(self, identical_rgps, families): + """Tests that the IdenticalRegions object is initialized correctly with valid inputs. + """ + is_contig_border = True + identical_regions = IdenticalRegions("IdenticalRegions", identical_rgps, families, is_contig_border) + + assert identical_regions.name == "IdenticalRegions" + assert identical_regions.rgps == identical_rgps + assert identical_regions.families == families + assert identical_regions.is_contig_border == is_contig_border + + @pytest.mark.parametrize("wrong_type", + ["string", + 1, + 0.8, + list(), + dict()]) + def test_init_with_identical_rgps_not_isintance_set(self, wrong_type, families): + """Tests that the IdenticalRegions object cannot be initialized with a not instance set for identical_rgps. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", wrong_type, families, True) + + def test_init_with_rgp_is_not_instance_region_in_identical_rgps(self, identical_rgps, families): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps.union({1}), families, True) + + def test_init_with_empty_identical_rgps(self, families): + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. + """ + with pytest.raises(ValueError): + IdenticalRegions("IdenticalRegions", set(), families, True) + + @pytest.mark.parametrize("wrong_type", + ["string", + 1, + 0.8, + list(), + dict()]) + def test_init_with_families_not_isintance_set(self, wrong_type, identical_rgps): + """Tests that the IdenticalRegions object cannot be initialized with a not instance set. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps, wrong_type, True) + + def test_init_with_family_is_not_instance_genefamilies_in_families(self, identical_rgps, families): + """Tests that the IdenticalRegions object raise TypeError if one element is not instance Region. + """ + with pytest.raises(TypeError): + IdenticalRegions("IdenticalRegions", identical_rgps, families.union({1}), True) + + def test_init_with_empty_families(self, identical_rgps): + """Tests that the IdenticalRegions object cannot be initialized with an empty set of identical regions. + """ + with pytest.raises(ValueError): + IdenticalRegions("IdenticalRegions", identical_rgps, set(), True) + + def test_eq_with_equal_identical_regions(self): + """Tests that the __eq__ method returns Trueè when comparing two IdenticalRegions objects that have the same families, + identical regions, and contig border status. 
+ """ + rgp1 = Region("RGP1") + rgp2 = Region("RGP2") + family1 = GeneFamily(1, "Family1") + family2 = GeneFamily(2, "Family2") + identical_rgps1 = {rgp1, rgp2} + identical_rgps2 = {rgp1, rgp2} + families1 = {family1, family2} + families2 = {family1, family2} + is_contig_border = True + + identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) + identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + + assert identical_regions1 == identical_regions2 + + def test_eq_with_non_identical_regions(self): + """Tests that the __eq__ method returns False when comparing + two IdenticalRegions objects that have different families. + """ + rgp1 = Region("RGP1") + rgp2 = Region("RGP2") + family1 = GeneFamily(1, "Family1") + family2 = GeneFamily(2, "Family2") + identical_rgps1 = {rgp1, rgp2} + identical_rgps2 = {rgp1, rgp2} + families1 = {family1, family2} + families2 = {family1} + is_contig_border = True + + identical_regions1 = IdenticalRegions("IdenticalRegions", identical_rgps1, families1, is_contig_border) + identical_regions2 = IdenticalRegions("IdenticalRegions", identical_rgps2, families2, is_contig_border) + + assert identical_regions1 != identical_regions2 -def o_region(): - return Region(4) def test_compute_grr(): + """Tests that compute_grr returns the correct value when there is a non-zero intersection between families + """ set1 = {1, 2, 3, 4, 5} set2 = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10} assert rgp_cluster.compute_grr(set1, set2, min) == 1.0 assert rgp_cluster.compute_grr(set1, set2, max) == 0.5 - -def test_dereplicate_rgp(identical_rgps): - - rgp1 = identical_rgps[0] - assert rgp_cluster.dereplicate_rgp([rgp1]) == [rgp1] - - identical_region_obj = rgp_cluster.IdenticalRegions(name=f"identical_rgps_0", - identical_rgps=identical_rgps, - families=identical_rgps[0].families, - is_contig_border = True) - - assert rgp_cluster.dereplicate_rgp(identical_rgps)[0] == identical_region_obj - - -def test_compute_rgp_metric(RGP_a, RGP_b): - - - assert RGP_a.is_contig_border == True - assert RGP_b.is_contig_border == False +def test_dereplicate_rgp(identical_rgps): + list_identical_rgps = list(identical_rgps) + rgp1 = list_identical_rgps[0] + assert rgp_cluster.dereplicate_rgp({rgp1}) == [rgp1] + + identical_region_obj = rgp_cluster.IdenticalRegions(name="identical_rgps_0", + identical_rgps=identical_rgps, + families=set(list_identical_rgps[0].families), + is_contig_border=True) + assert rgp_cluster.dereplicate_rgp(rgps=identical_rgps)[0] == identical_region_obj + + +def test_compute_rgp_metric(genes, families): + RGP_a = Region("A") + RGP_b = Region("B") + list_genes = sorted(genes, key=lambda x: x.position) + + for g in list_genes[:8]: + RGP_a[g.position] = g + for g in list_genes[3:7]: + RGP_b[g.position] = g + + assert RGP_a.is_contig_border + assert not RGP_b.is_contig_border + + shared_families = len(set(RGP_a.families).intersection(set(RGP_b.families))) + expected_grr = (RGP_a.ID, RGP_b.ID, {'incomplete_aware_grr': shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + "min_grr": shared_families / min(len(set(RGP_a.families)), len(set(RGP_b.families))), + 'max_grr': shared_families / max(len(set(RGP_a.families)), len(set(RGP_b.families))), + 'shared_family': shared_families}) # min_grr - min_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0.8, "min_grr") - expected_min_grr = (RGP_a.ID, RGP_b.ID, {'incomplete_aware_grr':2/2, - "min_grr":2/2, - 'max_grr':2/8, - 
'shared_family':2}) - assert min_result == expected_min_grr + min_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "min_grr") + assert min_result == expected_grr # incomplete_aware_grr: same as min grr as rgp1 is incomplete - incomplete_aware_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0.8, "incomplete_aware_grr") + incomplete_aware_result = rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0, "incomplete_aware_grr") - assert incomplete_aware_result == expected_min_grr + assert incomplete_aware_result == expected_grr # max grr is below cutoff so None is returned - assert rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 0.8, "max_grr") == None - - -def add_info_to_rgp_nodes(RGP_a): - rgp_id = RGP_a.ID - graph = nx.Graph() - graph.add_node(rgp_id) - - region_to_spot={RGP_a:6} - region_info = {"contig": 1, - 'organism': "toto", - "name": "A", - "genes_count": 8, - "is_contig_border": True, - "is_whole_contig": True, - "spot_id": '6', - "families_count":8 - } - - assert rgp_cluster.get_rgp_info_dict([RGP_a], region_to_spot) == {rgp_id:region_info} - - + assert rgp_cluster.compute_rgp_metric(RGP_a, RGP_b, 1000, "max_grr") is None diff --git a/tests/test_edge.py b/tests/test_edge.py index 818a6864..147cd418 100644 --- a/tests/test_edge.py +++ b/tests/test_edge.py @@ -1,4 +1,5 @@ #! /usr/bin/env python3 +# coding: utf8 import pytest from typing import Generator, Tuple diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py index 36948016..656b0380 100644 --- a/tests/test_genefamily.py +++ b/tests/test_genefamily.py @@ -1,4 +1,5 @@ #! /usr/bin/env python3 +# coding: utf8 import pytest from random import randint diff --git a/tests/test_genome.py b/tests/test_genome.py index 42f0e930..e734aa6b 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -1,4 +1,5 @@ #! /usr/bin/env python3 +# coding: utf8 import pytest from typing import Generator, Tuple diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 68d62063..9de9d482 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -1,4 +1,5 @@ #! 
/usr/bin/env python3 +# coding: utf8 import pytest from random import choices, randint From fe0ea3be5b9a9a1d258d53d75600522e22f5c008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 4 Sep 2023 10:01:02 +0200 Subject: [PATCH 49/75] Fix bug in module class --- VERSION | 2 +- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/region.py | 33 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 04d08bef..fd85adc3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.167 +1.2.168 diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 4d87d8bf..18821b6b 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -825,7 +825,7 @@ def part_spec(part: str) -> list: pangenome.compute_mod_bitarrays(part) return [popcount(module.bitarray) for module in pangenome.modules] - mod_fam = [len(module.families) for module in pangenome.modules] + mod_fam = [len(module) for module in pangenome.modules] info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam), "max": getmax(mod_fam), "sd": getstdev(mod_fam), diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 036a1303..f214c4bb 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -3,9 +3,11 @@ # default libraries from __future__ import annotations +import logging # installed libraries from typing import Dict, Generator, List, Set +import gmpy2 # local libraries from ppanggolin.genome import Gene, Organism, Contig @@ -514,6 +516,7 @@ def __init__(self, module_id: int, families: set = None): super().__init__() self.ID = module_id self._families_getter = {family.name: family for family in families} if families is not None else {} + self.bitarray = None def __repr__(self) -> str: """Module representation @@ -602,6 +605,36 @@ def families(self) -> Generator[GeneFamily, None, None]: """ yield from self._families_getter.values() + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): + """Produces a bitarray representing the presence / absence of families in the organism using the provided index + The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
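    A small illustration of the encoding, assuming a hypothetical index {famA: 0, famB: 1, famC: 2}: for a module whose families are famA and famC, bits 0 and 2 of the bitarray are set, i.e. its integer value is 0b101.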
+ + :param partition: filter module by partition + :param index: The index computed by :func:`ppanggolin.pangenome.Pangenome.getIndex` + """ + self.bitarray = gmpy2.xmpz() # pylint: disable=no-member + if partition == 'all': + logging.getLogger("PPanGGOLiN").debug("all") + for fam in self.families: + self.bitarray[index[fam]] = 1 + elif partition == 'persistent': + logging.getLogger("PPanGGOLiN").debug("persistent") + for fam in self.families: + if fam.named_partition in ['persistent']: + self.bitarray[index[fam]] = 1 + elif partition in ['shell', 'cloud']: + logging.getLogger("PPanGGOLiN").debug("shell, cloud") + for fam in self.families: + if fam.named_partition == partition: + self.bitarray[index[fam]] = 1 + elif partition == 'accessory': + logging.getLogger("PPanGGOLiN").debug("accessory") + for fam in self.families: + if fam.named_partition in ['shell', 'cloud']: + self.bitarray[index[fam]] = 1 + else: + raise Exception("There is not any partition corresponding please report a github issue") + class GeneContext: """ From 392eec6fae350dabdf0c3dd1f7a0e9d9dcdf731a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 4 Sep 2023 10:48:35 +0200 Subject: [PATCH 50/75] Fix ,merge RGP clustering --- VERSION | 2 +- ppanggolin/RGP/rgp_cluster.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/VERSION b/VERSION index fd85adc3..16544749 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.168 +1.2.169 diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index db09fec4..96540eca 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -9,6 +9,7 @@ from collections.abc import Callable from collections import defaultdict from typing import Dict, List, Tuple, Set, Union, Any +from pathlib import Path # installed libraries from tqdm import tqdm @@ -141,11 +142,11 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): region_info = {"contig": region.contig.name, 'organism': region.organism.name, "name": region.name, - "genes_count": len(region.genes), + "genes_count": len(region), "is_contig_border": region.is_contig_border, "is_whole_contig": region.is_whole_contig, "spot_id": get_spot_id(region, region_to_spot), - 'families_count': len(region.families)} + 'families_count': region.number_of_families()} region_attributes[region.ID] = region_info @@ -461,8 +462,8 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, valid_rgps = [ rgp for rgp in pangenome.regions if not rgp.is_contig_border] - ignored_rgp_count = len(pangenome.regions) - len(valid_rgps) - total_rgp_count = len(pangenome.regions) + ignored_rgp_count = pangenome.number_of_rgp - len(valid_rgps) + total_rgp_count = pangenome.number_of_rgp logging.info( f'Ignoring {ignored_rgp_count}/{total_rgp_count} ({100 * ignored_rgp_count / total_rgp_count:.2f}%) ' @@ -472,7 +473,7 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, raise Exception( "The pangenome has no complete RGPs. 
The clustering of RGP is then not possible.") else: - valid_rgps = pangenome.regions + valid_rgps = set(pangenome.regions) dereplicated_rgps = dereplicate_rgp(valid_rgps, disable_bar=disable_bar) @@ -593,7 +594,7 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, - type=str, help="The pangenome .h5 file") + type=Path, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") @@ -617,7 +618,7 @@ def parser_cluster_rgp(parser: argparse.ArgumentParser): optional.add_argument("--basename", required=False, default="rgp_cluster", help="basename for the output file") - optional.add_argument('-o', '--output', required=False, type=str, + optional.add_argument('-o', '--output', required=False, type=Path, default="rgp_clustering", help="Output directory") optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", From 1a1388c254f6cddbcee92b047685f494ae242f8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 4 Sep 2023 11:38:09 +0200 Subject: [PATCH 51/75] Fix merge with metadata and gexf --- VERSION | 2 +- ppanggolin/formats/writeFlat.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 16544749..14016729 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.169 +1.2.170 diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 38256148..a2fa1c29 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -274,7 +274,7 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): for m in fam.metadata: if m.source == source_metadata_families: for field in m.fields: - to_concat[field].append(str(m.get(field))) + to_concat[field].append(str(getattr(m, field))) for field in source_fields[source_metadata_families]: concatenated_fields = '|'.join(to_concat[field]) gexf.write(f' \n') @@ -973,6 +973,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core needPartitions = True if gexf or light_gexf or json: needGraph = True + needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False if pangenome.status["metadata"]["families"] == "inFile": From edb49aef932ccc2bf30b7c16af6805d25288abe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 4 Sep 2023 12:06:25 +0200 Subject: [PATCH 52/75] Fix merge bug with config --- VERSION | 2 +- ppanggolin/utility/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index 14016729..5a5fd84a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.170 +1.2.171 diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index 8e4a6b99..bbb43ac0 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -132,7 +132,7 @@ def launch_default_config(args: argparse.Namespace): """ initial_command = args.default_config - if os.path.exists(args.output) and not args.force: + if args.output.exists() and not args.force: raise FileExistsError(f"{args.output} already exists. 
Use -f if you want to overwrite it.") ignored_params = ['config', 'help'] @@ -211,7 +211,7 @@ def launch_default_config(args: argparse.Namespace): arg_lines.append(f"\n{sub_command}:") arg_lines += get_default_argument_lines(specific_actions) - mk_outdir(args.output.parent, args.force) + mk_outdir(args.output.parent, True) # Everytime it is True because the config file is already tested logging.getLogger("PPanGGOLiN").info(f'Writting default config in {args.output}') with open(args.output, 'w') as fl: fl.write('\n'.join(arg_lines) + '\n') From 9da890733682beaef9d0ac1e590e71f265012c2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 4 Sep 2023 16:00:40 +0200 Subject: [PATCH 53/75] Append methods add, get and remove in regions classes to be more readable --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/RGP/rgp_cluster.py | 2 +- ppanggolin/RGP/spot.py | 2 +- ppanggolin/formats/readBinaries.py | 6 +- ppanggolin/region.py | 122 ++++++++++++++++++++++++++--- tests/test_pangenome.py | 8 +- tests/test_region.py | 89 ++++++++++++--------- 8 files changed, 174 insertions(+), 59 deletions(-) diff --git a/VERSION b/VERSION index 5a5fd84a..dfbbc667 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.171 +1.2.172 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index dc6e9d8c..b763591b 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -41,7 +41,7 @@ def extract_rgp(contig, node, rgp_id, naming) -> Region: elif naming == "organism": new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id)) while node.state: - new_region[node.gene.position] = node.gene + new_region.add(node.gene) node.state = 0 node.score = 0 node = node.prev diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index 96540eca..f19fec21 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -146,7 +146,7 @@ def add_info_to_rgp_nodes(graph, regions: List[Region], region_to_spot: dict): "is_contig_border": region.is_contig_border, "is_whole_contig": region.is_whole_contig, "spot_id": get_spot_id(region, region_to_spot), - 'families_count': region.number_of_families()} + 'families_count': region.number_of_families} region_attributes[region.ID] = region_info diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index 87977d13..44828d02 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -135,7 +135,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list): spots.append(curr_spot) for node in comp: for region in graph_spot.nodes[node]["rgp"]: - curr_spot[region.name] = region + curr_spot.add(region) spot_id += 1 if spot_graph: diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index a56d7e83..8b60a4ad 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -387,7 +387,7 @@ def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): region = Region(row["RGP"].decode()) pangenome.add_region(region) gene = pangenome.get_gene(row["gene"].decode()) - region[gene.position] = gene + region.add(gene) pangenome.status["predictedRGP"] = "Loaded" @@ -407,7 +407,7 @@ def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False curr_spot = Spot(int(row["spot"])) spots[row["spot"]] = curr_spot region = pangenome.get_region(row["RGP"].decode()) - curr_spot[region.name] = region + curr_spot.add(region) 
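For reference, a minimal usage sketch of the add/get aliases the loaders now rely on (curr_spot.add, curr_module.add); the spot id and region name below are invented for illustration:

from ppanggolin.region import Spot, Region

spot = Spot(0)
region = Region("RGP_0")
spot.add(region)                 # stored under region.name through the name-keyed __setitem__

assert spot.get("RGP_0") is region
assert len(spot) == 1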
curr_spot.spot_2_families() for spot in spots.values(): pangenome.add_spot(spot) @@ -432,7 +432,7 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal curr_module = Module(int(row['module'])) modules[row["module"]] = curr_module family = pangenome.get_gene_family(row['geneFam'].decode()) - curr_module[family.name] = family + curr_module.add(family) for module in modules.values(): pangenome.add_module(module) pangenome.status["modules"] = "Loaded" diff --git a/ppanggolin/region.py b/ppanggolin/region.py index f214c4bb..08eabd86 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -37,14 +37,14 @@ class Region(MetaFeatures): """ id_counter = 0 - def __init__(self, region_id: str): + def __init__(self, name: str): """Constructor method - :param region_id: Identifier of the region + :param name: Name of the region """ super().__init__() self._genes_getter = {} - self.name = region_id + self.name = name self.score = 0 self.starter = None self.stopper = None @@ -103,9 +103,6 @@ def __setitem__(self, position: int, gene: Gene): :raises Exception: Organism or contig of the gene is different from the region :raises KeyError: Another gene already exists at the position """ - if not isinstance(gene, Gene): - raise TypeError(f"Unexpected class / type for {type(gene)} " - f"when adding it to a region of genomic plasticity") if len(self) > 0: if gene.organism != self.organism: raise Exception(f"Gene {gene.name} is from a different organism than the first defined in RGP. " @@ -145,6 +142,42 @@ def __delitem__(self, position): except KeyError: raise KeyError(f"There is no gene at position {position} in RGP {self.name}") + def add(self, gene: Gene): + """Add a gene to the region + + :param gene: Gene to add + """ + if not isinstance(gene, Gene): + raise TypeError(f"Unexpected class / type for {type(gene)} " + f"when adding it to a region of genomic plasticity") + if gene.position is None: + raise AttributeError(f'Gene {gene.name} is not fill with position') + self[gene.position] = gene + + def get(self, position: int) -> Gene: + """Get a gene by its position + + :param position: Position of the gene in the contig + + :return: Wanted gene + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + return self[position] + + def remove(self, position): + """Remove a gene by its position + + :param position: Position of the gene in the contig + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. 
The provided type was {type(position)}") + del self[position] + @property def genes(self) -> Generator[Gene, None, None]: """Generate the gene as they are ordered in contigs @@ -163,6 +196,7 @@ def families(self) -> Generator[GeneFamily, None, None]: for gene in self.genes: yield gene.family + @property def number_of_families(self) -> int: """Get the number of different gene families in the region @@ -307,17 +341,14 @@ def __str__(self): """ return f"spot_{self.ID}" - def __setitem__(self, name, region): + def __setitem__(self, name: str, region: Region): """Set the region belonging to the spot :param name: Name of the region :param region: Region to add in the spot - :raises TypeError: Region is not an instance Region :raises KeyError: Name of the region is already in the spot for a different region """ - if not isinstance(region, Region): - raise TypeError(f"A Region object is expected to be added to the spot. find type is {type(region)}") if name in self._region_getter and self[name] != region: raise KeyError("A Region with the same name already exist in spot") self._region_getter[name] = region @@ -330,7 +361,10 @@ def __getitem__(self, name) -> Region: :return: Region in the spot for the given name :raises KeyError: Name does not exist in the spot + :raises TypeError: Name is not a string """ + if not isinstance(name, str): + raise TypeError(f"Name of the region must be a string. The provided type was {type(name)}") try: return self._region_getter[name] except KeyError: @@ -342,7 +376,10 @@ def __delitem__(self, name): :param name: Name of the wanted region :raises KeyError: Name does not exist in the spot + :raises TypeError: Name is not a string """ + if not isinstance(name, str): + raise TypeError(f"Name of the region must be a string. The provided type was {type(name)}") try: del self._region_getter[name] except KeyError: @@ -353,6 +390,36 @@ def __len__(self) -> int: """ return len(self._region_getter) + def add(self, region: Region): + """Add a region to the spot. + Alias more readable for setitem + + :param region: Region to add in the spot + + :raises TypeError: Region is not an instance Region + """ + if not isinstance(region, Region): + raise TypeError(f"A Region object is expected to be added to the spot. find type is {type(region)}") + self[region.name] = region + + def get(self, name: str) -> Region: + """Get a region by its name. + Alias more readable for getitem + + :param name: Name of the region + + :return: Wanted region + """ + return self[name] + + def remove(self, name: str): + """Remove a region by its name. + Alias more readable for delitem + + :param name: Name of the region + """ + del self[name] + @property def regions(self) -> Generator[Region, None, None]: """Generates the regions in the spot @@ -375,6 +442,7 @@ def families(self) -> Generator[GeneFamily, None, None]: families.add(family) yield family + @property def number_of_families(self) -> int: """Get the number of different families in the spot @@ -552,7 +620,7 @@ def __eq__(self, other: Module) -> bool: raise TypeError(f"Another module is expected to be compared to the first one. 
You give a {type(other)}") return set(self.families) == set(other.families) - def __setitem__(self, name, family): + def __setitem__(self, name: str, family: GeneFamily): """Set a gene family in the module :param name: Name of the family @@ -561,8 +629,6 @@ def __setitem__(self, name, family): :raises TypeError: Family is not instance GeneFamily :raises KeyError: Another family with the same name already exists in the module """ - if not isinstance(family, GeneFamily): - raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") if name in self._families_getter and self[name] != family: raise KeyError("A different gene family with the same name already exist in the module") self._families_getter[name] = family @@ -597,6 +663,36 @@ def __delitem__(self, name): del self._families_getter[name] fam._modules.remove(self) + def add(self, family: GeneFamily): + """Add a family to the module. + Alias more readable for setitem + + :param family: Region to add in the spot + + :raises TypeError: Region is not an instance Region + """ + if not isinstance(family, GeneFamily): + raise TypeError(f"A gene family is expected to be added to module. Given type was {type(family)}") + self[family.name] = family + + def get(self, name: str) -> GeneFamily: + """Get a family by its name. + Alias more readable for getitem + + :param name: Name of the family + + :return: Wanted family + """ + return self[name] + + def remove(self, name: str): + """Remove a family by its name. + Alias more readable for delitem + + :param name: Name of the family + """ + del self[name] + @property def families(self) -> Generator[GeneFamily, None, None]: """Generator of the family in the module diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 9de9d482..cd1bd73a 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -595,7 +595,7 @@ def test_add_region(self, pangenome): :param pangenome: Access the pangenome object """ - rgp = Region(region_id="rgp") + rgp = Region(name="rgp") pangenome.add_region(rgp) assert len(pangenome._regionGetter) == 1 assert pangenome._regionGetter["rgp"] == rgp @@ -605,7 +605,7 @@ def test_add_region_already_in_pangenome(self, pangenome): :param pangenome: Access the pangenome object """ - rgp = Region(region_id="rgp") + rgp = Region(name="rgp") pangenome.add_region(rgp) with pytest.raises(KeyError): pangenome.add_region(rgp) @@ -623,7 +623,7 @@ def test_get_region(self, pangenome): :param pangenome: Access the pangenome object """ - rgp = Region(region_id="rgp") + rgp = Region(name="rgp") pangenome.add_region(rgp) assert pangenome.get_region("rgp") == rgp @@ -648,7 +648,7 @@ def test_number_of_rgp(self, pangenome): :param pangenome: Pass the pangenome object to the function """ - rgp = Region(region_id="rgp") + rgp = Region(name="rgp") pangenome.add_region(rgp) assert isinstance(pangenome.number_of_rgp, int) assert pangenome.number_of_rgp == 1 diff --git a/tests/test_region.py b/tests/test_region.py index b84c88b9..ec605481 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -109,7 +109,7 @@ def test_add_gene(self, region): """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - region[0] = gene + region.add(gene) assert len(region._genes_getter) == 1 assert region._genes_getter[0] == gene assert region.starter == gene @@ -120,18 +120,24 @@ def test_add_gene_not_is_instance_gene(self, region): """Test that adding object with instance not Gene return a TypeError """ with 
pytest.raises(TypeError): - region[0] = 0 + region.add(0) + + def test_add_gene_not_fill_with_position(self, region): + """Test that adding gene not fill with position return an AttributeError + """ + with pytest.raises(AttributeError): + region.add(Gene('gene')) def test_add_genes_at_position_already_taken(self, region): """Test that adding genes with same position return a ValueError """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - region[0] = gene + region.add(gene) with pytest.raises(KeyError): gene = Gene('gene') gene.fill_annotations(start=4, stop=12, strand='-', position=0) - region[0] = gene + region.add(gene) def test_add_genes_from_different_contigs(self, region): """Test that adding genes from different contigs return an Exception @@ -140,10 +146,10 @@ def test_add_genes_from_different_contigs(self, region): gene1.fill_annotations(start=0, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) gene1.fill_parents(None, Contig('contig_1')) - region[0] = gene1 + region.add(gene1) gene2.fill_parents(None, Contig('contig_2')) with pytest.raises(Exception): - region[1] = gene2 + region.add(gene2) def test_add_genes_from_different_organisms(self, region): """Test that adding genes from different organisms return an Exception @@ -152,43 +158,55 @@ def test_add_genes_from_different_organisms(self, region): gene1.fill_annotations(start=0, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) gene1.fill_parents(Organism("org_1")) - region[0] = gene1 + region.add(gene1) gene2.fill_parents(Organism("org_2")) with pytest.raises(Exception): - region[1] = gene2 + region.add(gene2) def test_get_genes(self, region): """Tests that genes can be retrieved from the region """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - region[0] = gene - assert region[0] == gene + region.add(gene) + assert region.get(0) == gene + + def test_get_genes_with_position_not_integer(self, region): + """Tests that getting a gene with wrong type for position raise a TypeError + """ + with pytest.raises(TypeError): + region.get("0") def test_get_genes_with_position_not_in_region(self, region): """Tests that getting a gene at position not belonging in the region return a KeyError """ with pytest.raises(KeyError): - _ = region[randint(0, 20)] + region.get(randint(0, 20)) def test_del_gene(self, region): """Tests that genes can be deleted from the region """ gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) - region[0] = gene - assert region[0] == gene - del region[0] + region.add(gene) + assert region.get(0) == gene + region.remove(0) assert 0 not in region._genes_getter + def test_del_genes_with_position_not_integer(self, region): + """Tests that removing a gene with wrong type for position raise a TypeError + """ + with pytest.raises(TypeError): + region.remove("0") + def test_get_length(self, region): """Tests that the length of the region can be retrieved """ gene1, gene2 = Gene('gene_1'), Gene('gene_2') gene1.fill_annotations(start=0, stop=10, strand='+', position=0) gene2.fill_annotations(start=11, stop=20, strand='+', position=1) - region[0] = gene1 - region[1] = gene2 + region.add(gene1) + region.add(gene2) assert region.length == 20 def test_get_organism(self, region): @@ -197,7 +215,7 @@ def test_get_organism(self, region): gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) 
gene.fill_parents(Organism("org")) - region[0] = gene + region.add(gene) assert region.organism.name == 'org' def test_get_contig(self, region): @@ -206,7 +224,7 @@ def test_get_contig(self, region): gene = Gene('gene') gene.fill_annotations(start=0, stop=10, strand='+', position=0) gene.fill_parents(contig=Contig("contig")) - region[0] = gene + region.add(gene) assert region.contig.name == 'contig' def test_is_whole_contig_true(self, region): @@ -218,7 +236,7 @@ def test_is_whole_contig_true(self, region): contig = Contig("contig") contig[starter.start], contig[stopper.start] = starter, stopper starter.fill_parents(None, contig), stopper.fill_parents(None, contig) - region[starter.position], region[stopper.position] = starter, stopper + region.add(starter), region.add(stopper) assert region.is_whole_contig is True def test_is_whole_contig_false(self, region): @@ -234,7 +252,7 @@ def test_is_whole_contig_false(self, region): contig[starter.start], contig[stopper.start] = starter, stopper before.fill_parents(None, contig), after.fill_parents(None, contig) starter.fill_parents(None, contig), stopper.fill_parents(None, contig) - region[starter.position], region[stopper.position] = starter, stopper + region.add(starter), region.add(stopper) assert region.is_whole_contig is False def test_is_contig_border_true(self, region): @@ -250,7 +268,7 @@ def test_is_contig_border_true(self, region): starter.fill_parents(None, contig), stopper.fill_parents(None, contig) # Test bordering right contig[before.start], contig[starter.start], contig[stopper.start] = before, starter, stopper - region[starter.position], region[stopper.position] = starter, stopper + region.add(starter), region.add(stopper) assert region.is_contig_border is True # Test bordering left del contig._genes_position[before.position] @@ -271,7 +289,7 @@ def test_is_contig_border_false(self, region): contig[starter.start], contig[stopper.start] = starter, stopper before.fill_parents(None, contig), after.fill_parents(None, contig) starter.fill_parents(None, contig), stopper.fill_parents(None, contig) - region[starter.position], region[stopper.position] = starter, stopper + region.add(starter), region.add(stopper) assert region.is_contig_border is False def test_is_contig_border_assertion_error_if_no_gene(self, region): @@ -284,7 +302,7 @@ def test_len(self, region, genes): """Tests that the expected number of genes is retrieved in the region """ for gene in genes: - region[gene.position] = gene + region.add(gene) assert isinstance(len(region), int) assert len(region) == len(genes) @@ -294,8 +312,8 @@ def test_equality(self, genes): region_1, region_2, region_3 = Region("RGP_1"), Region("RGP_2"), Region("RGP_3") max_pos = max(genes, key=lambda gene: gene.position).position for gene in genes: - region_1[gene.position] = gene - region_2[gene.position] = gene + region_1.add(gene) + region_2.add(gene) region_3[max_pos - gene.position + 1] = gene assert region_1 == region_2 assert region_1 == region_3 @@ -304,7 +322,7 @@ def test_not_equal(self, region, genes): """Test difference between two regions """ for gene in genes: - region[gene.position] = gene + region.add(gene) assert region != Region("other_RGP") def test_equality_with_not_instance_region(self, region): @@ -317,7 +335,7 @@ def test_get_gene_families(self, region, genes, families): """Tests that gene families can be retrieved from the region """ for gene in genes: - region[gene.position] = gene + region.add(gene) assert all(isinstance(family, GeneFamily) for family in region.families) 
assert set(region.families) == families @@ -325,9 +343,9 @@ def test_get_number_of_gene_families(self, region, genes, families): """Tests that gene families can be retrieved from the region """ for gene in genes: - region[gene.position] = gene - assert isinstance(region.number_of_families(), int) - assert region.number_of_families() == len(families) + region.add(gene) + assert isinstance(region.number_of_families, int) + assert region.number_of_families == len(families) # def test_get_bordering_genes(self, region, genes): # # TODO test multigenic @@ -380,14 +398,14 @@ def region(self) -> Generator[Region, None, None]: def test_add_region(self, spot, region): """Tests that adding a Region object to the Spot object works as expected """ - spot[region.name] = region + spot.add(region) assert region == spot._region_getter[region.name] def test_add_not_instance_region(self, spot): """Tests that a TypeError is returned if a non-region type is trying to be added """ with pytest.raises(TypeError): - spot["spot"] = "spot" + spot.add("region") def test_add_different_region_with_same_name(self, spot): """Test that adding a new Region same name than another in the spot return a KeyError @@ -417,8 +435,8 @@ def test_add_two_time_the_same_region(self, spot, region): def test_get_region(self, spot, region): """Tests that getting the region in the Spot object works as expected """ - spot[region.name] = region - assert spot[region.name] == region + spot.add(region) + assert spot.get(region.name) == region def test_get_region_not_in_spot(self, spot): """Tests that a KeyError is raised when the name of the region does not exist in the spot @@ -490,7 +508,8 @@ def test_number_of_families(self, spot, regions, families): """ for region in regions: spot[region.name] = region - assert spot.number_of_families() == len(families) + assert isinstance(spot.number_of_families, int) + assert spot.number_of_families == len(families) def test_add_spot_to_families(self, spot, regions, families): """Tests that adding spot to families works as expected From 3d7b94462558892a14e85b80edc52f674328c115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 5 Sep 2023 10:18:54 +0200 Subject: [PATCH 54/75] Add setitem getitem with add and get alliases --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 16 +-- ppanggolin/annotate/synta.py | 8 +- ppanggolin/cluster/cluster.py | 6 +- ppanggolin/figures/draw_spot.py | 2 +- ppanggolin/formats/readBinaries.py | 6 +- ppanggolin/formats/writeFlat.py | 10 +- ppanggolin/geneFamily.py | 118 ++++++++++++++++++--- ppanggolin/genome.py | 158 +++++++++++++++++++++++++---- tests/region/test_rgp_cluster.py | 4 +- tests/test_genefamily.py | 34 +++---- tests/test_genome.py | 94 ++++++++--------- tests/test_pangenome.py | 10 +- tests/test_region.py | 4 +- 14 files changed, 337 insertions(+), 135 deletions(-) diff --git a/VERSION b/VERSION index dfbbc667..52cd5461 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.172 +1.2.173 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index d735f22d..9758a07d 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -122,10 +122,10 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p contig_id = line[12:].strip() if contig_id != "": try: - contig = org.get_contig(contig_id) + contig = org.get(contig_id) except KeyError: contig = Contig(contig_id, True if contig_id in circular_contigs else False) - org.add_contig(contig) + org.add(contig) set_contig 
= True line = lines.pop() if not set_contig: @@ -133,10 +133,10 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # Should be unique in a dataset, but if there's an update the contig ID # might still be the same even though it should not(?) try: - contig = org.get_contig(contig_locus_id) + contig = org.get(contig_locus_id) except KeyError: contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) - org.add_contig(contig) + org.add(contig) # start of the feature object. dbxref = set() gene_name = "" @@ -297,10 +297,10 @@ def get_id_attribute(attributes_dict: dict) -> str: elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] try: - contig = org.get_contig(fields[1]) + contig = org.get(fields[1]) except KeyError: contig = Contig(fields[1], True if fields[1] in circular_contigs else False) - org.add_contig(contig) + org.add(contig) continue elif line.startswith('#'): # comment lines to be ignores by parsers @@ -341,11 +341,11 @@ def get_id_attribute(attributes_dict: dict) -> str: if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig try: - contig = org.get_contig(fields_gff[gff_seqname]) + contig = org.get(fields_gff[gff_seqname]) except KeyError: contig = Contig(fields_gff[gff_seqname], True if fields_gff[gff_seqname] in circular_contigs else False) - org.add_contig(contig) + org.add(contig) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 58f451be..9c2f10e9 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -171,10 +171,10 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in all_contig_len += len(contig_seq) contig_seq = "" try: - contig = org.get_contig(line.split()[0][1:]) + contig = org.get(line.split()[0][1:]) except KeyError: contig = Contig(line.split()[0][1:]) - org.add_contig(contig) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig @@ -326,10 +326,10 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: for contig_name, genes in genes.items(): try: - contig = org.get_contig(contig_name) + contig = org.get(contig_name) except KeyError: contig = Contig(contig_name, True if contig_name in circular_contigs else False) - org.add_contig(contig) + org.add(contig) for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 7b71876e..8d2763cb 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -275,7 +275,7 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F else: gene_obj = Gene(gene) gene_obj.is_fragment = is_frag - fam.add_gene(gene_obj) + fam.add(gene_obj) def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = True, code: int = 11, @@ -360,7 +360,7 @@ def infer_singletons(pangenome: Pangenome): for gene in pangenome.genes: if gene.family is None: fam = GeneFamily(family_id=pangenome.max_fam_id, name=gene.ID) - fam.add_gene(gene) + fam.add(gene) pangenome.add_gene_family(fam) singleton_counter += 1 logging.getLogger("PPanGGOLiN").info(f"Inferred {singleton_counter} singleton families") @@ -409,7 +409,7 @@ def 
read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet fam = GeneFamily(pangenome.max_fam_id, fam_id) pangenome.add_gene_family(fam) gene_obj.is_fragment = True if is_frag == "F" else False # F for Fragment - fam.add_gene(gene_obj) + fam.add(gene_obj) if is_frag == "F": frag = True except Exception: diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index cf3b9660..2b02dbf5 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -615,7 +615,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: uniq_gene_lists = [] ordered_counts = [] for genelist in gene_lists: - curr_genelist_count = count_uniq.get(genelist[2], None) + curr_genelist_count = count_uniq.get(genelist[2]) if curr_genelist_count is not None: uniq_gene_lists.append(genelist) ordered_counts.append(curr_genelist_count) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 8b60a4ad..900a8c4e 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -241,10 +241,10 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul gene, gene_type = (None, None) for contig_name, gene_list in contig_dict.items(): try: - contig = org.get_contig(contig_name) + contig = org.get(contig_name) except KeyError: contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) - org.add_contig(contig) + org.add(contig) for row in gene_list: if link: # if the gene families are already computed/loaded the gene exists. gene = pangenome.get_gene(row["ID"].decode()) @@ -322,7 +322,7 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_obj = pangenome.get_gene(row["gene"].decode()) else: # else, no gene_obj = Gene(row["gene"].decode()) - fam.add_gene(gene_obj) + fam.add(gene_obj) pangenome.status["genesClustered"] = "Loaded" diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index a2fa1c29..c9a12ef3 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -62,7 +62,7 @@ def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): :param gene_fam: file-like object, compressed or not :param json: file-like object, compressed or not """ - json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {gene_fam.number_of_genes}, ' + json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam)}, ' f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + '}') org_dict = {} name_counts = Counter() @@ -247,7 +247,7 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') gexf.write(f' \n') gexf.write(' \n') - gexf.write(f' \n') + gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') gexf.write(f' \n') @@ -406,8 +406,8 @@ def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool '"' + alt + '"', # 2 '"' + str(product.most_common(1)[0][0]) + '"', # 3 '"' + str(fam.number_of_organisms) + '"', # 4 - '"' + str(fam.number_of_genes) + '"', # 5 - '"' + str(round(fam.number_of_genes / fam.number_of_organisms, 2)) + '"', # 6 + '"' + str(len(fam)) + '"', # 5 + '"' + str(round(len(fam) / fam.number_of_organisms, 2)) + '"', # 6 '"NA"', # 7 '"NA"', # 8 '""', # 9 @@ -468,7 +468,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, "\n") for fam in pan.gene_families: if fam.named_partition == "persistent": - mean_pres = fam.number_of_genes / 
fam.number_of_organisms + mean_pres = len(fam) / fam.number_of_organisms nb_multi = 0 for gene_list in fam.get_org_dict().values(): if len(gene_list) > 1: diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index e8b0efb0..0bf13aef 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -67,7 +67,7 @@ def __init__(self, family_id: int, name: str): self.ID = family_id self._edges = {} self._genePerOrg = defaultdict(set) - self._genes = set() + self._genes_getter = {} self.removed = False # for the repeated family not added in the main graph self.sequence = "" self.partition = "" @@ -80,6 +80,106 @@ def __repr__(self) -> str: """ return f"{self.ID}: {self.name}" + + + def __len__(self) -> int: + return len(self._genes_getter) + + def __setitem__(self, identifier: str, gene: Gene): + """ Set gene to Gene Family + + :param identifier: ID of the gene + :param gene: Gene object to add + + :raises TypeError: If the gene is not instance Gene + :raises TypeError: If the identifier is not instance string + :raises ValueError: If a gene in getter already exists at the name + """ + # TODO look at change start for position + + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + if identifier in self._genes_getter: + raise KeyError(f"Gene with name {identifier} already exists in the gene family") + self._genes_getter[identifier] = gene + + # TODO define eq function + + # retrieve gene by start position + def __getitem__(self, identifier: str) -> Gene: + """Get the gene for the given name + + :param identifier: ID of the gene in the gene family + + :return: Wanted gene + + :raises TypeError: If the identifier is not instance string + :raises KeyError: Gene with the given identifier does not exist in the contig + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + try: + return self._genes_getter[identifier] + except KeyError: + raise KeyError(f"Gene with the ID: {identifier} does not exist in the family") + + def __delitem__(self, identifier: str): + """Remove the gene for the given name in the gene family + + :param position: ID of the gene in the family + + :raises TypeError: If the identifier is not instance string + :raises KeyError: Gene with the given identifier does not exist in the contig + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + try: + del self._genes_getter[identifier] + except KeyError: + raise KeyError(f"Gene with the name: {identifier} does not exist in the family") + + def add(self, gene: Gene): + """Add a gene to the gene family, and sets the gene's :attr:family accordingly. 
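A hedged usage sketch of this reworked storage: genes are now keyed by their ID in a dict, so len(family) replaces the old number_of_genes counter used in writeFlat.py and writeBinaries.py. The identifiers below are invented for illustration:

from ppanggolin.geneFamily import GeneFamily
from ppanggolin.genome import Gene

family = GeneFamily(family_id=1, name="fam_1")
gene = Gene("gene_1")
family.add(gene)                 # keyed by gene.ID; also sets gene.family

assert family.get("gene_1") is gene
assert gene.family is family
assert len(family) == 1          # replaces family.number_of_genes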
+ + :param gene: The gene to add + + :raises TypeError: If the provided `gene` is of the wrong type + """ + if not isinstance(gene, Gene): + raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") + self[gene.ID] = gene + gene.family = self + if gene.organism is not None: + self._genePerOrg[gene.organism].add(gene) + + def get(self, identifier: str) -> Gene: + """Get a gene by its name + + :param identifier: ID of the gene + + :return: Wanted gene + + :raises TypeError: If the identifier is not instance string + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + return self[identifier] + + def remove(self, identifier): + """Remove a gene by its name + + :param identifier: Name of the gene + + :return: Wanted gene + + :raises TypeError: If the identifier is not instance string + """ + if not isinstance(identifier, str): + raise TypeError(f"Gene ID should be a string. You provided a '{type(identifier)}' type object") + del self[identifier] + #TODO define __eq__ @property @@ -125,7 +225,7 @@ def genes(self): :return: Generator of genes """ - for gene in self._genes: + for gene in self._genes_getter.values(): yield gene @property @@ -211,20 +311,6 @@ def add_sequence(self, seq: str): self.sequence = seq - def add_gene(self, gene: Gene): - """Add a gene to the gene family, and sets the gene's :attr:family accordingly. - - :param gene: The gene to add - - :raises TypeError: If the provided `gene` is of the wrong type - """ - if not isinstance(gene, Gene): - raise TypeError(f"'Gene' type object was expected, but '{type(gene)}' type object was provided.") - self._genes.add(gene) - gene.family = self - if gene.organism is not None: - self._genePerOrg[gene.organism].add(gene) - def add_spot(self, spot: Spot): """Add the given spot to the family diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 9e320c16..d63818b6 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -343,6 +343,8 @@ def __setitem__(self, start: int, gene: Gene): self._genes_position[gene.position] = gene self._genes_getter[gene.start] = gene + # TODO define eq function + # retrieve gene by start position def __getitem__(self, position: int) -> Gene: """Get the gene for the given position @@ -355,9 +357,66 @@ def __getitem__(self, position: int) -> Gene: """ if not isinstance(position, int): raise TypeError(f"Expected type is int, given type was '{type(position)}'") - return self._genes_position[position] + try: + return self._genes_position[position] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + + def __delitem__(self, position): + """Remove the gene for the given position in the contig + + :param position: Position of the gene in the contig + + :raises KeyError: Gene at the given position does not exist in the contig + """ + if not isinstance(position, int): + raise TypeError(f"Expected type is int, given type was '{type(position)}'") + try: + del self._genes_position[position] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + + def add(self, gene: Gene): + """Add a gene to the contig + + :param gene: Gene to add + + :raises TypeError: Region is not an instance Region + """ + if not isinstance(gene, Gene): + raise TypeError(f"Unexpected class / type for {type(gene)} when adding it to a contig") + if gene.start is None: + raise AttributeError(f'Gene {gene.name} is not fill with start') + if 
gene.position is None: + raise AttributeError(f'Gene {gene.name} is not fill with position') + self[gene.start] = gene + + def get(self, position: int) -> Gene: + """Get a gene by its position + + :param position: Position of the gene in the contig - # TODO define delitem + :return: Wanted gene + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + gene = self[position] + if gene is None: + logging.getLogger("PPanGGOLiN").debug("Given position result with a None Gene") + return gene + + def remove(self, position): + """Remove a gene by its position + + :param position: Position of the gene in the contig + + :raises TypeError: Position is not an integer + """ + if not isinstance(position, int): + raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}") + del self[position] def get_genes(self, begin: int, end: int) -> List[Gene]: """Gets a list of genes within a range @@ -470,6 +529,58 @@ def _set_families(self): """ self._families = {gene.family for gene in self.genes} + def __setitem__(self, name: str, contig: Contig): + """ Set contig to the organism + + :param name: Name of the contig + :param contig: Contig object to add in the organism + + :raises TypeError: If the contig is not instance Contig + :raises TypeError: If the name is not instance string + :raises KeyError: Contig with the given name already exist in the organism + """ + + if not isinstance(name, str): + raise TypeError(f"Contig name should be a string. You provided a '{type(name)}' type object") + if not isinstance(contig, Contig): + raise TypeError(f"'Contig' type was expected but you provided a '{type(contig)}' type object") + if name in self._contigs_getter: # Add test if contig are equivalent when __eq__ method will be defined in Contig + raise KeyError(f"Contig {contig.name} already in organism {self.name}") + self._contigs_getter[contig.name] = contig + contig.organism = self + + def __getitem__(self, name: str) -> Contig: + """Get the contig for the given position + + :param name: Name of the contig + + :return: Wanted contig for the given name + + :raises TypeError: If name is not a string + :raises KeyError: Name does not exist in the organism + """ + if not isinstance(name, str): + raise TypeError(f"Expected type is string, given type was '{type(name)}'") + try: + return self._contigs_getter[name] + except KeyError: + raise KeyError(f"Contig with the name: {name} does not exist in the organism") + + def __delitem__(self, name): + """Remove the contig for the given name + + :param name: Name of the contig + + :raises TypeError: If name is not a string + :raises KeyError: Name does not exist in the organism + """ + if not isinstance(name, int): + raise TypeError(f"Expected type is int, given type was '{type(name)}'") + try: + del self._contigs_getter[name] + except KeyError: + raise KeyError("Position of the gene in the contig does not exist") + @property def families(self): """Return the gene families present in the organism @@ -521,23 +632,7 @@ def number_of_contigs(self) -> int: """ return len(self._contigs_getter) - def get_contig(self, name: str) -> Contig: - """ - Get contig with the given identifier in the organim - - :param name: Contig identifier - - :return: The contig with the given identifier - - :raises KeyError: Contig with the given name does not exist in the organism - """ - assert isinstance(name, str), f"To get a 
contig, name with string type is expected. Given type: {type(name)}" - try: - return self._contigs_getter[name] - except KeyError: - raise KeyError(f"Contig {name} does not belong to organism {self.name}") - - def add_contig(self, contig: Contig): + def add(self, contig: Contig): """Add a contig to organism :param: Contig to add in organism @@ -546,13 +641,32 @@ def add_contig(self, contig: Contig): """ assert isinstance(contig, Contig), f"Contig object is expected, given type was {type(contig)}" try: - _ = self.get_contig(contig.name) + _ = self.get(contig.name) except KeyError: - self._contigs_getter[contig.name] = contig - contig.organism = self + self[contig.name] = contig else: raise KeyError(f"Contig {contig.name} already in organism {self.name}") + def get(self, name: str) -> Contig: + """ + Get contig with the given identifier in the organism + + :param name: Contig identifier + + :return: The contig with the given identifier + """ + return self[name] + + def remove(self, name: str) -> Contig: + """ + Remove a contig with the given identifier in the organism + + :param name: Contig identifier + + :return: The contig with the given identifier + """ + del self[name] + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py index 96b25816..d6913d30 100644 --- a/tests/region/test_rgp_cluster.py +++ b/tests/region/test_rgp_cluster.py @@ -41,7 +41,7 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = 0 while idx_genes < nb_genes_per_family: gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 families.add(family) @@ -51,7 +51,7 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = (idx_fam - 1) * nb_genes_per_family while idx_genes < len(genes): gene = genes[idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 families.add(family) diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py index 656b0380..9baddb00 100644 --- a/tests/test_genefamily.py +++ b/tests/test_genefamily.py @@ -25,17 +25,17 @@ def test_construct_gene_family(self, family): """Tests that a GeneFamily object can be created with valid family_id and name """ assert isinstance(family, GeneFamily) - assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", "sequence", "partition", + assert all(attr in ["ID", "name", "_edges", "_genePerOrg", "_genes_getter", "removed", "sequence", "partition", "_spots", "_modules", "bitarray", "_metadata_getter"] for attr in family.__dict__) # Check that no attribute was added else it should be tested - assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges", "_genePerOrg", "_genes", "removed", + assert all(hasattr(family, attr) for attr in ["ID", "name", "_edges", "_genePerOrg", "_genes_getter", "removed", "sequence", "partition", "_spots", "_modules", "bitarray"]) # Check that no attribute was removed else it should be tested assert family.ID == 1 assert family.name == 'test' assert family._edges == {} assert family._genePerOrg == {} - assert family._genes == set() + assert family._genes_getter == dict() assert not family.removed # for the repeated family not added in the main 
graph assert family.sequence == "" assert family.partition == "" @@ -82,7 +82,7 @@ def test_add_gene_to_gene_family(self, family): """Tests that a Gene object can be added to a GeneFamily object """ gene = Gene('gene1') - family.add_gene(gene) + family.add(gene) assert gene in family.genes assert gene.family == family @@ -90,7 +90,7 @@ def test_add_gene_error(self, family): """Tests that a non-gene object can't be added to a GeneFamily as gene """ with pytest.raises(TypeError): - family.add_gene(33) + family.add(33) @pytest.fixture def genes(self) -> Generator[Set[Gene], None, None]: @@ -107,9 +107,9 @@ def test_get_number_of_genes(self, family, genes): """Tests that the number of genes can be retrieved """ for gene in genes: - family.add_gene(gene) - assert isinstance(family.number_of_genes, int) - assert family.number_of_genes == len(genes) + family.add(gene) + assert isinstance(len(family), int) + assert len(family) == len(genes) @pytest.fixture def organisms(self, genes) -> Generator[Set[Organism], None, None]: @@ -123,7 +123,7 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]: while idx_org < nb_organisms: organism = Organism(f"organism_{idx_org}") contig = Contig(f"contig_{idx_org}") - organism.add_contig(contig) + organism.add(contig) idx_genes = 0 while idx_genes < nb_genes_per_organisms: gene = genes[(idx_org - 1) * nb_genes_per_organisms + idx_genes] @@ -135,7 +135,7 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]: # last family fill with all the gene left organism = Organism(f"organism_{idx_org}") contig = Contig(f"contig_{idx_org}") - organism.add_contig(contig) + organism.add(contig) idx_genes = (idx_org - 1) * nb_genes_per_organisms while idx_genes < len(genes): gene = genes[idx_genes] @@ -149,7 +149,7 @@ def test_get_org_dict(self, family, genes, organisms): """Tests that all organisms and genes are retrieved as expected """ for gene in genes: - family.add_gene(gene) + family.add(gene) org_dict = family.get_org_dict() assert isinstance(org_dict, dict) assert all(isinstance(org, Organism) for org in org_dict.keys()) @@ -161,7 +161,7 @@ def test_get_org_dict_with_no_organism_fill_to_genes(self, family, genes): """Tests that if genes are not fill with organism an AttributeError is returned """ for gene in genes: - family.add_gene(gene) + family.add(gene) with pytest.raises(AttributeError): _ = family.get_org_dict() @@ -169,14 +169,14 @@ def test_organisms(self, family, organisms, genes): """Tests that all organisms are retrieved as expected """ for gene in genes: - family.add_gene(gene) + family.add(gene) assert set(family.organisms) == organisms def test_number_of_organism(self, family, organisms, genes): """Tests that the expected number of organisms is found """ for gene in genes: - family.add_gene(gene) + family.add(gene) assert isinstance(family.number_of_organisms, int) assert family.number_of_organisms == len(organisms) @@ -184,7 +184,7 @@ def test_get_genes_per_org(self, family, organisms, genes): """Tests that for a giver organism, all the genes are retrieved as expected """ for gene in genes: - family.add_gene(gene) + family.add(gene) for organism in organisms: assert set(family.get_genes_per_org(organism)) == set(organism.genes) @@ -209,7 +209,7 @@ def families(self, genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = 0 while idx_genes < nb_genes_per_family: gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 
families.add(family) @@ -219,7 +219,7 @@ def families(self, genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = (idx_fam - 1) * nb_genes_per_family while idx_genes < len(genes): gene = genes[idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 families.add(family) diff --git a/tests/test_genome.py b/tests/test_genome.py index e734aa6b..c58b296c 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -308,7 +308,7 @@ def tests_write_contig(self, contig): def test_add_gene(self, gene, contig): """Tests that a gene can be added to the contig """ - contig[gene.start] = gene + contig.add(gene) assert len(contig._genes_getter) == 1 assert len(contig._genes_position) == 1 assert contig._genes_getter[gene.start] == gene @@ -317,10 +317,10 @@ def test_add_gene(self, gene, contig): def test_add_gene_at_far_position(self, gene, contig): """Tests that a gene can be added at each position and between position are fill with None """ - contig[gene.start] = gene + contig.add(gene) new_gene = Gene("Gene2") new_gene.fill_annotations(start=50, stop=72, strand='+', position=6, genetic_code=4) - contig[new_gene.start] = new_gene + contig.add(new_gene) assert len(contig._genes_position) == 7 assert contig._genes_position[1:6] == [None]*5 @@ -328,47 +328,49 @@ def test_add_gene_not_instance_gene(self, contig): """Tests that the contig cannot be fill with a non-gene object """ with pytest.raises(TypeError): - contig[1] = "4" + contig.add(1) + with pytest.raises(TypeError): + contig[1] = '4' def test_add_gene_with_start_already_taken(self, contig, gene): """Tests that the contig cannot be fill with a non-gene object """ - contig[gene.start] = gene + contig.add(gene) with pytest.raises(ValueError): new_gene = Gene('test_gene') new_gene.fill_annotations(start=1, stop=12, strand='+', position=2, genetic_code=4) - contig[new_gene.start] = new_gene + contig.add(new_gene) def test_add_gene_without_position(self, contig): """Test that adding a gene not fill with position raise an AttributeError """ with pytest.raises(AttributeError): gene = Gene('test_gene') - contig[gene.start] = gene + contig.add(gene) def test_get_len(self, genes, contig): """Tests len method """ gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) assert isinstance(len(contig), int) assert len(contig) == 3 def test_get_gene(self, gene, contig): """Tests that a gene can be retrieved by its position """ - contig[gene.start] = gene + contig.add(gene) assert contig[0] == gene def test_get_genes(self, genes, contig): """Tests that a list of genes within a range can be retrieved """ gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) assert set(contig.get_genes(0, 3)) == set(genes) def test_get_gene_with_non_integer_index(self, contig): @@ -381,9 +383,9 @@ def test_get_genes_with_non_integer_begin_and_end_positions(self, genes, contig) """Tests that genes cannot be retrieved with non-integer begin and end positions """ gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) with pytest.raises(TypeError): contig.get_genes('a', 4) with pytest.raises(TypeError): @@ -395,9 +397,9 @@ def 
test_get_genes_with_end_position_lower_than_begin_position(self, genes, cont """Tests that genes cannot be retrieved with end position lower than begin position """ gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) with pytest.raises(ValueError): contig.get_genes(2, 0) @@ -405,9 +407,9 @@ def test_iterate_over_genes(self, genes, contig): """Tests that all genes in the contig can be iterated over """ gene1, gene2, gene3 = genes - contig[gene1.start] = gene1 - contig[gene2.start] = gene2 - contig[gene3.start] = gene3 + contig.add(gene1) + contig.add(gene2) + contig.add(gene3) assert list(contig.genes) == sorted([gene1, gene2, gene3], key=lambda x: x.position) def test_add_rna(self, contig): @@ -482,54 +484,54 @@ def tests_write_organism(self, organism): def test_add_contig(self, organism, contig): """Tests that a contig can be added to an Organism instance """ - organism.add_contig(contig) + organism.add(contig) assert organism._contigs_getter['contig'] == contig def test_add_contig_not_instance_contig(self, organism): """Tests that a non Contig object cannot be added to an Organism instance """ with pytest.raises(AssertionError): - organism.add_contig(4) + organism.add(4) def test_add_contig_existing_name(self, organism, contig): """Tests that a contig with an existing name cannot be added to an Organism instance """ - organism.add_contig(contig) + organism.add(contig) with pytest.raises(KeyError): - organism.add_contig(Contig('contig')) + organism.add(Contig('contig')) def test_get_contig(self, organism, contig): """Tests that a contig can be retrieved from an Organism instance """ - organism.add_contig(contig) - assert organism.get_contig('contig') == contig + organism.add(contig) + assert organism.get('contig') == contig def test_get_contig_not_instance_string(self, organism): """Tests that a non Contig object cannot be added to an Organism instance """ - with pytest.raises(AssertionError): - organism.get_contig(4) + with pytest.raises(TypeError): + organism.get(4) def test_get_nonexistent_contig(self, organism): """Tests that a non-existent contig cannot be retrieved from an Organism instance """ with pytest.raises(KeyError): - organism.get_contig('contig1') + organism.get('contig1') def test_number_of_contigs(self, organism): """Tests that the number of contigs in an organism instance can be retrieved """ - organism.add_contig(Contig('contig1')) - organism.add_contig(Contig('contig2')) + organism.add(Contig('contig1')) + organism.add(Contig('contig2')) assert organism.number_of_contigs() == 2 def test_get_families(self, organism, contig, gene): """Tests that gene families in an organism can be retrieved """ family = GeneFamily(0, "fam") - family.add_gene(gene) + family.add(gene) gene.fill_parents(organism, contig) - organism.add_contig(contig) + organism.add(contig) contig[gene.start] = gene assert set(organism.families) == {family} @@ -537,26 +539,26 @@ def test_number_of_families(self, organism, contig, gene): """Tests that the number of gene families in an organism instance can be retrieved """ family = GeneFamily(0, "fam") - family.add_gene(gene) + family.add(gene) gene.fill_parents(organism, contig) - organism.add_contig(contig) - contig[gene.start] = gene + organism.add(contig) + contig.add(gene) assert organism.number_of_families() == 1 def tests_get_genes(self, organism, contig, gene): """Tests that genes in an organism can be retrieved """ 
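The genome classes follow the same convention in this patch: Organism.add/get replace add_contig/get_contig, and Contig.add validates start and position before storing the gene. A small hedged sketch, with names invented for illustration:

from ppanggolin.genome import Organism, Contig, Gene

org = Organism("org_1")
contig = Contig("contig_1")
org.add(contig)                  # replaces org.add_contig(contig)

gene = Gene("gene_1")
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
contig.add(gene)                 # replaces contig[gene.start] = gene

assert org.get("contig_1") is contig
assert contig.get(0) is gene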
gene.fill_parents(organism, contig) - organism.add_contig(contig) - contig[gene.start] = gene + organism.add(contig) + contig.add(gene) assert set(organism.genes) == {gene} def test_number_of_genes(self, organism, contig, gene): """Tests that the number of genes in an organism instance can be retrieved """ gene.fill_parents(organism, contig) - organism.add_contig(contig) - contig[gene.start] = gene + organism.add(contig) + contig.add(gene) assert organism.number_of_genes() == 1 def test_mk_bitarray(self, organism, contig): @@ -568,11 +570,11 @@ def test_mk_bitarray(self, organism, contig): gene2 = Gene('gene2') gene1.fill_annotations(start=1, stop=10, strand='+', position=0, genetic_code=4) gene2.fill_annotations(start=11, stop=19, strand='+', position=1, genetic_code=4) - fam1.add_gene(gene1) - fam2.add_gene(gene2) + fam1.add(gene1) + fam2.add(gene2) contig[gene1.start] = gene1 contig[gene2.start] = gene2 - organism.add_contig(contig) + organism.add(contig) index = {fam1: 1, fam2: 2} organism.mk_bitarray(index) assert organism.bitarray == gmpy2.xmpz(6) diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index cd1bd73a..6a4d7e72 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -337,7 +337,7 @@ def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, Non organism = Organism(name="organism") for contig_id in range(randint(2, 10)): contig = Contig("k_{}".format(contig_id)) - organism.add_contig(contig) + organism.add(contig) for gene_idx in range(randint(2, 10)): gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}") gene.position = gene_idx @@ -358,7 +358,7 @@ def fill_family_with_genes(self, pangenome): gene = Gene(gene_id=f"{family.name}_{gene_idx}") gene.position = gene_idx gene.start = gene_idx - family.add_gene(gene) + family.add(gene) genes.add(gene) yield family, genes @@ -457,8 +457,8 @@ def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]: fam2 = GeneFamily(family_id=2, name=f"fam_{gene_id_2}") ctg1 = Contig(name=f"ctg_{gene_id_1}") ctg2 = Contig(name=f"ctg_{gene_id_2}") - fam1.add_gene(gene1) - fam2.add_gene(gene2) + fam1.add(gene1) + fam2.add(gene2) organism = Organism(name=f"org_{choices([gene_id_1, gene_id_2], k=1)}") gene1.fill_parents(organism, ctg1) gene2.fill_parents(organism, ctg2) @@ -809,7 +809,7 @@ def add_element_to_pangenome(self, pangenome): org = Organism("Org") org.add_metadata(source=metadata.source, metadata=metadata) ctg = Contig("Ctg") - org.add_contig(ctg) + org.add(ctg) gene = Gene("Gene") gene.position, gene.start = (0, 0) gene.add_metadata(source=metadata.source, metadata=metadata) diff --git a/tests/test_region.py b/tests/test_region.py index ec605481..16a4d68f 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -36,7 +36,7 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = 0 while idx_genes < nb_genes_per_family: gene = genes[(idx_fam - 1) * nb_genes_per_family + idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 families.add(family) @@ -46,7 +46,7 @@ def families(genes) -> Generator[Set[GeneFamily], None, None]: idx_genes = (idx_fam - 1) * nb_genes_per_family while idx_genes < len(genes): gene = genes[idx_genes] - family.add_gene(gene) + family.add(gene) gene.family = family idx_genes += 1 families.add(family) From dd9c726f2a4b05a46daa4c62fdf35996646ebefc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 5 Sep 2023 14:44:40 +0200 Subject: [PATCH 55/75] commit 
before merge with unitTest branch --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/formats/writeBinaries.py | 6 +++++- ppanggolin/formats/writeFlat.py | 4 ++-- ppanggolin/formats/writeMSA.py | 2 +- ppanggolin/formats/writeSequences.py | 4 ++-- ppanggolin/metrics/fluidity.py | 2 +- ppanggolin/pangenome.py | 3 ++- 8 files changed, 15 insertions(+), 10 deletions(-) diff --git a/VERSION b/VERSION index e829013f..e144fab6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.132 +1.2.133 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index c03ba66f..5bc8d935 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -251,7 +251,7 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger().info("Compute Regions of Genomic Plasticity ...") name_scheme = naming_scheme(pangenome) - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar): pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme)) logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP") diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index b01a3a35..6a5a8a5d 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -145,6 +145,10 @@ def get_genedata(gene:Feature) -> Tuple[int, str, str, int, str, str, int]: gene.product, genetic_code) +def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False): + organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)), + expectedrows=pangenome.number_of_organisms) + def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Function writing all the pangenome annotations @@ -163,7 +167,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool genedata_counter = 0 gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): for contig in org.contigs: for gene in contig.genes + list(contig.RNAs): gene_row["organism"] = org.name diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index aa0078bc..7a3bde02 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -487,9 +487,9 @@ def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, soft = set() # could use bitarrays if speed is needed core = set() for fam in pan.gene_families: - if len(fam.organisms) >= pan.number_of_organisms() * soft_core: + if len(fam.organisms) >= pan.number_of_organisms * soft_core: soft.add(fam) - if len(fam.organisms) == pan.number_of_organisms(): + if len(fam.organisms) == pan.number_of_organisms: core.add(fam) with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile: diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index f8af1135..72c2f4bf 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -51,7 +51,7 @@ def getFamiliesToWrite(pangenome, 
partition_filter, soft_core=0.95, dup_margin=0 :return: set of families unique to one partition """ fams = set() - nb_org = pangenome.number_of_organisms() + nb_org = pangenome.number_of_organisms if partition_filter == "all": return set(pangenome.gene_families) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 725b1e20..29b5ce86 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -106,14 +106,14 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c elif partition == "softcore": logging.getLogger().info( f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") - threshold = pangenome.number_of_organisms() * soft_core + threshold = pangenome.number_of_organisms * soft_core for fam in pangenome.gene_families: if len(fam.organisms) >= threshold: genefams.add(fam) elif partition == "core": logging.getLogger().info(f"Writing the representative {type_name} of the {partition} gene families...") for fam in pangenome.gene_families: - if len(fam.organisms) == pangenome.number_of_organisms(): + if len(fam.organisms) == pangenome.number_of_organisms: genefams.add(fam) elif "module_" in partition: logging.getLogger().info(f"Writing the representation {type_name} of {partition} gene families...") diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index f7fd1282..2d1a2d97 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -40,7 +40,7 @@ def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: common_fam = popcount(c_organisms[0].bitarray & c_organisms[1].bitarray) - 1 if tot_fam > 0 and common_fam > 0: g_sum += (tot_fam - 2 * common_fam) / tot_fam - fluidity_dict[subset] = (2 / (pangenome.number_of_organisms() * (pangenome.number_of_organisms() - 1))) * g_sum + fluidity_dict[subset] = (2 / (pangenome.number_of_organisms * (pangenome.number_of_organisms - 1))) * g_sum return fluidity_dict diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 92836995..93848cda 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -94,7 +94,7 @@ def _yield_genes(self) -> Iterator[Gene]: :return: an iterator of Gene """ - if self.number_of_organisms() > 0: # if we have organisms, they're supposed to have genes + if self.number_of_organisms > 0: # if we have organisms, they're supposed to have genes for org in self.organisms: for contig in org.contigs: for gene in contig.genes: @@ -245,6 +245,7 @@ def organisms(self) -> List[Organism]: """ return list(self._orgGetter.values()) + @property def number_of_organisms(self) -> int: """Returns the number of organisms present in the pangenome From 5eb33bcafbc3b4d2c64cc0966ecb8a849db1fa36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Tue, 5 Sep 2023 17:48:10 +0200 Subject: [PATCH 56/75] Change HDF5 to write contig and organism table --- VERSION | 2 +- ppanggolin/formats/writeAnnotations.py | 356 +++++++++++++++++++++++++ ppanggolin/formats/writeBinaries.py | 282 +------------------- ppanggolin/genome.py | 14 +- ppanggolin/pangenome.py | 14 +- 5 files changed, 379 insertions(+), 289 deletions(-) create mode 100644 ppanggolin/formats/writeAnnotations.py diff --git a/VERSION b/VERSION index c262507c..8464960b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.174 +1.2.175 diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py new file mode 100644 
index 00000000..907034de --- /dev/null +++ b/ppanggolin/formats/writeAnnotations.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import logging +from typing import Dict, Tuple, Union + +# installed libraries +from tqdm import tqdm +import tables + +# local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.genome import Feature, Gene +from ppanggolin.formats.readBinaries import Genedata + + +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]: + """ + Get the maximum size of each annotation information to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1 + for org in pangenome.organisms: + if len(org.name) > max_org_len: + max_org_len = len(org.name) + for contig in org.contigs: + if len(contig.name) > max_contig_len: + max_contig_len = len(contig.name) + for gene in contig.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.local_identifier) > max_gene_local_id: + max_gene_local_id = len(gene.local_identifier) + for rna in contig.RNAs: + if len(rna.ID) > max_rna_id_len: + max_rna_id_len = len(gene.ID) + if len(rna.local_identifier) > max_rna_local_id: + max_rna_local_id = len(gene.local_identifier) + + return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id + + +def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: + """ + Create a table to save organism-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=org_len), + "contig": tables.StringCol(itemsize=contig_len)} + + +def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + organism_desc: Dict[str, tables.StringCol], disable_bar=False): + organism_table = h5f.create_table(annotation, "genome", organism_desc, + expectedrows=pangenome.number_of_organisms) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") + organism_row = organism_table.row + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for contig in org.contigs: + organism_row["name"] = org.name + organism_row["contig"] = contig.name + organism_row.append() + organism_table.flush() + + +def contig_desc(contig_len: int, max_gene_id_len: int, + max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]: + """ + Create a table to save organism-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + + :return: Formatted table + """ + return {'name': tables.StringCol(itemsize=contig_len), + "is_circular": tables.BoolCol(dflt=False), + "gene": tables.StringCol(itemsize=max_gene_id_len), + "rna": tables.StringCol(itemsize=max_rna_id_len)} + + +def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], + disable_bar=False): + contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") + contig_row = contig_table.row + for contig in 
tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + rna_list = list(contig.RNAs) + for index, gene in enumerate(contig.genes): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["gene"] = gene.ID + if index < len(rna_list): + contig_row["rna"] = rna_list[index].ID + contig_row.append() + contig_table.flush() + + +def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: + """ + Create a table to save gene-related information + + :param org_len: Maximum size of organism + :param contig_len: Maximum size of contigs + :param id_len: Maximum size of gene ID + + :param max_local_id: Maximum size of gene local identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col(), + 'local': tables.StringCol(itemsize=max_local_id), + 'is_fragment': tables.BoolCol(dflt=False)} + + +def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], + disable_bar=False) -> Dict[Genedata, int]: + genedata2gene = {} + genedata_counter = 0 + gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + gene_row = gene_table.row + for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar): + gene_row["ID"] = gene.ID + gene_row["is_fragment"] = gene.is_fragment + if gene.type == "CDS": + gene_row["local"] = gene.local_identifier + genedata = get_genedata(gene) + genedata_id = genedata2gene.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2gene[genedata] = genedata_id + genedata_counter += 1 + gene_row["genedata_id"] = genedata_id + gene_row.append() + gene_table.flush() + return genedata2gene + + +def genedata_desc(type_len, name_len, product_len): + """ + Creates a table for gene-related data + + :param type_len: Maximum size of gene Type + :param name_len: Maximum size of gene name + :param product_len: Maximum size of gene product + :return: Formatted table for gene metadata + """ + return { + 'genedata_id': tables.UInt32Col(), + 'start': tables.UInt32Col(), + 'stop': tables.UInt32Col(), + 'strand': tables.StringCol(itemsize=1), + 'gene_type': tables.StringCol(itemsize=type_len), + 'position': tables.UInt32Col(), + 'name': tables.StringCol(itemsize=name_len), + 'product': tables.StringCol(itemsize=product_len), + 'genetic_code': tables.UInt32Col(dflt=11), + } + + +def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: + """ + Get the maximum size of each gene data information to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_name_len = 1 + max_product_len = 1 + max_type_len = 1 + for org in pangenome.organisms: + for contig in org.contigs: + for gene in contig.genes: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = len(gene.type) + for gene in contig.RNAs: + if len(gene.name) > max_name_len: + max_name_len = len(gene.name) + if len(gene.product) > max_product_len: + max_product_len = len(gene.product) + if len(gene.type) > max_type_len: + max_type_len = 
len(gene.type) + + return max_type_len, max_name_len, max_product_len + + +def get_genedata(gene: Feature) -> Genedata: + """ + Gets the genedata type of Feature + + :param gene: a Feature + :return: Tuple with a Feature associated data + """ + position = None + genetic_code = 11 + if gene.type == "CDS": + gene: Gene + position = gene.position + genetic_code = gene.genetic_code + return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, + gene.product, genetic_code) + + +def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + genedata2gene: Dict[Genedata, int], disable_bar=False): + genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene)) + logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " + "(can be lower than the number of genes)") + genedata_row = genedata_table.row + for genedata, genedata_id in genedata2gene.items(): + genedata_row["genedata_id"] = genedata_id + genedata_row["start"] = genedata.start + genedata_row["stop"] = genedata.stop + genedata_row["strand"] = genedata.strand + genedata_row["gene_type"] = genedata.gene_type + if genedata.gene_type == "CDS": + genedata_row["position"] = genedata.position + genedata_row["genetic_code"] = genedata.genetic_code + genedata_row["name"] = genedata.name + genedata_row["product"] = genedata.product + genedata_row.append() + genedata_table.flush() + + +def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, + rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False): + """ + Function writing all the pangenome annotations + + :param pangenome: Annotated pangenome + :param h5f: Pangenome HDF5 file + :param disable_bar: Alow to disable progress bar + """ + annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") + + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations( + pangenome) + + if rec_organisms: + desc = organism_desc(max_org_len, max_contig_len) + write_organisms(pangenome, h5f, annotation, desc, disable_bar) + if rec_contigs: + desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len) + write_contigs(pangenome, h5f, annotation, desc, disable_bar) + if rec_genes: + desc = gene_desc(max_gene_id_len, max_gene_local_id) + genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) + + +def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: + """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_gene_id_len = 1 + max_gene_type = 1 + for gene in pangenome.genes: + if len(gene.ID) > max_gene_id_len: + max_gene_id_len = len(gene.ID) + if len(gene.type) > max_gene_type: + max_gene_type = len(gene.type) + return max_gene_id_len, max_gene_type + + +def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: + """ + Create table to save gene sequences + :param gene_id_len: Maximum size of gene sequence identifier + :param gene_type_len: Maximum size of gene type + :return: Formated table + """ + return { + "gene": tables.StringCol(itemsize=gene_id_len), + "seqid": tables.UInt32Col(), + "type": tables.StringCol(itemsize=gene_type_len) + } + + +def get_sequence_len(pangenome: Pangenome) -> int: 
+ """ + Get the maximum size of gene sequences to optimize disk space + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_seq_len = 1 + for gene in pangenome.genes: + if len(gene.dna) > max_seq_len: + max_seq_len = len(gene.dna) + return max_seq_len + + +def sequence_desc(max_seq_len: int) -> dict: + """ + Table description to save sequences + :param max_seq_len: Maximum size of gene type + :return: Formated table + """ + return { + "seqid": tables.UInt32Col(), + "dna": tables.StringCol(itemsize=max_seq_len) + } + + +def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Function writing all the pangenome gene sequences + :param pangenome: Pangenome with gene sequences + :param h5f: Pangenome HDF5 file without sequences + :param disable_bar: Disable progress bar + """ + gene_seq = h5f.create_table("/annotations", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), + expectedrows=pangenome.number_of_genes) + # process sequences to save them only once + seq2seqid = {} + id_counter = 0 + gene_row = gene_seq.row + for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", + disable=disable_bar): + curr_seq_id = seq2seqid.get(gene.dna) + if curr_seq_id is None: + curr_seq_id = id_counter + seq2seqid[gene.dna] = id_counter + id_counter += 1 + gene_row["gene"] = gene.ID + gene_row["seqid"] = curr_seq_id + gene_row["type"] = gene.type + gene_row.append() + gene_seq.flush() + + seq_table = h5f.create_table("/annotations", "sequences", sequence_desc(get_sequence_len(pangenome)), + expectedrows=len(seq2seqid)) + + seq_row = seq_table.row + for seq, seqid in seq2seqid.items(): + seq_row["dna"] = seq + seq_row["seqid"] = seqid + seq_row.append() + seq_table.flush() diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 3a9df929..1a3fcc56 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -5,7 +5,7 @@ import logging from collections import Counter, defaultdict import statistics -from typing import Tuple +from typing import Tuple, Union import pkg_resources # installed libraries @@ -15,290 +15,12 @@ # local libraries from ppanggolin.pangenome import Pangenome +from ppanggolin.formats.writeAnnotations import write_annotations, write_gene_sequences from ppanggolin.formats.writeMetadata import write_metadata, erase_metadata, write_metadata_status from ppanggolin.genome import Feature, Gene from ppanggolin.formats.readBinaries import read_genedata, Genedata -def gene_desc(org_len, contig_len, id_len, max_local_id) -> dict: - """ - Create a table to save gene-related information - - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs - :param id_len: Maximum size of gene ID - - :param max_local_id: Maximum size of gene local identifier - - :return: Formatted table - """ - return { - 'organism': tables.StringCol(itemsize=org_len), - "contig": { - 'name': tables.StringCol(itemsize=contig_len), - "is_circular": tables.BoolCol(dflt=False) - }, - "gene": { - 'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col(), - 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False) - } - } - - -def genedata_desc(type_len, name_len, product_len): - """ - Creates a table for gene-related data - - :param type_len: Maximum size of gene Type - :param name_len: Maximum size of gene name - :param 
product_len: Maximum size of gene product - :return: Formatted table for gene metadata - """ - return { - 'genedata_id': tables.UInt32Col(), - 'start': tables.UInt32Col(), - 'stop': tables.UInt32Col(), - 'strand': tables.StringCol(itemsize=1), - 'gene_type': tables.StringCol(itemsize=type_len), - 'position': tables.UInt32Col(), - 'name': tables.StringCol(itemsize=name_len), - 'product': tables.StringCol(itemsize=product_len), - 'genetic_code': tables.UInt32Col(dflt=11), - } - - -def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int]: - """ - Get the maximum size of each annotation information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_org_len = 1 - max_contig_len = 1 - max_gene_id_len = 1 - max_local_id = 1 - for org in pangenome.organisms: - if len(org.name) > max_org_len: - max_org_len = len(org.name) - for contig in org.contigs: - if len(contig.name) > max_contig_len: - max_contig_len = len(contig.name) - for gene in contig.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - for gene in contig.RNAs: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.local_identifier) > max_local_id: - max_local_id = len(gene.local_identifier) - - return max_org_len, max_contig_len, max_gene_id_len, max_local_id - - -def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: - """ - Get the maximum size of each gene data information to optimize disk space - - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_name_len = 1 - max_product_len = 1 - max_type_len = 1 - for org in pangenome.organisms: - for contig in org.contigs: - for gene in contig.genes: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - for gene in contig.RNAs: - if len(gene.name) > max_name_len: - max_name_len = len(gene.name) - if len(gene.product) > max_product_len: - max_product_len = len(gene.product) - if len(gene.type) > max_type_len: - max_type_len = len(gene.type) - - return max_type_len, max_name_len, max_product_len - - -def get_genedata(gene: Feature) -> Genedata: - """ - Gets the genedata type of Feature - - :param gene: a Feature - :return: Tuple with a Feature associated data - """ - position = None - genetic_code = 11 - if gene.type == "CDS": - gene: Gene - position = gene.position - genetic_code = gene.genetic_code - return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, - gene.product, genetic_code) - - -def write_organism(pangenome: Pangenome, h5f: tables.File, disable_bar = False): - organism_table = h5f.create_table(annotation, "organism", gene_desc(*get_max_len_annotations(pangenome)), - expectedrows=pangenome.number_of_organisms) - -def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome annotations - - :param pangenome: Annotated pangenome - :param h5f: Pangenome HDF5 file - :param disable_bar: Alow to disable progress bar - """ - annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") - gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), - 
expectedrows=pangenome.number_of_genes) - - logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") - - genedata2gene = {} - genedata_counter = 0 - - gene_row = gene_table.row - for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - for contig in org.contigs: - for gene in list(contig.genes) + list(contig.RNAs): - gene_row["organism"] = org.name - gene_row["contig/name"] = contig.name - gene_row["contig/is_circular"] = contig.is_circular - gene_row["gene/ID"] = gene.ID - gene_row["gene/is_fragment"] = gene.is_fragment - if gene.type == "CDS": - gene_row["gene/local"] = gene.local_identifier - genedata = get_genedata(gene) - genedata_id = genedata2gene.get(genedata) - if genedata_id is None: - genedata_id = genedata_counter - genedata2gene[genedata] = genedata_id - genedata_counter += 1 - gene_row["gene/genedata_id"] = genedata_id - gene_row.append() - gene_table.flush() - - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) - logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data (can be lower than the number of genes)") - genedata_row = genedata_table.row - for genedata, genedata_id in genedata2gene.items(): - genedata_row["genedata_id"] = genedata_id - genedata_row["start"] = genedata.start - genedata_row["stop"] = genedata.stop - genedata_row["strand"] = genedata.strand - genedata_row["gene_type"] = genedata.gene_type - if genedata.gene_type == "CDS": - genedata_row["position"] = genedata.position - genedata_row["genetic_code"] = genedata.genetic_code - genedata_row["name"] = genedata.name - genedata_row["product"] = genedata.product - genedata_row.append() - genedata_table.flush() - - -def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_gene_id_len = 1 - max_gene_type = 1 - for gene in pangenome.genes: - if len(gene.ID) > max_gene_id_len: - max_gene_id_len = len(gene.ID) - if len(gene.type) > max_gene_type: - max_gene_type = len(gene.type) - return max_gene_id_len, max_gene_type - - -def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: - """ - Create table to save gene sequences - :param gene_id_len: Maximum size of gene sequence identifier - :param gene_type_len: Maximum size of gene type - :return: Formated table - """ - return { - "gene": tables.StringCol(itemsize=gene_id_len), - "seqid": tables.UInt32Col(), - "type": tables.StringCol(itemsize=gene_type_len) - } - - -def get_sequence_len(pangenome: Pangenome) -> int: - """ - Get the maximum size of gene sequences to optimize disk space - :param pangenome: Annotated pangenome - :return: maximum size of each annotation - """ - max_seq_len = 1 - for gene in pangenome.genes: - if len(gene.dna) > max_seq_len: - max_seq_len = len(gene.dna) - return max_seq_len - - -def sequence_desc(max_seq_len: int) -> dict: - """ - Table description to save sequences - :param max_seq_len: Maximum size of gene type - :return: Formated table - """ - return { - "seqid": tables.UInt32Col(), - "dna": tables.StringCol(itemsize=max_seq_len) - } - - -def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): - """ - Function writing all the pangenome gene sequences - :param pangenome: Pangenome with gene sequences - :param h5f: 
Pangenome HDF5 file without sequences - :param disable_bar: Disable progress bar - """ - gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), - expectedrows=pangenome.number_of_genes) - # process sequences to save them only once - seq2seqid = {} - id_counter = 0 - gene_row = gene_seq.row - for gene in tqdm(sorted(pangenome.genes, key=lambda x: x.ID), total=pangenome.number_of_genes, unit="gene", disable=disable_bar): - curr_seq_id = seq2seqid.get(gene.dna) - if curr_seq_id is None: - curr_seq_id = id_counter - seq2seqid[gene.dna] = id_counter - id_counter += 1 - gene_row["gene"] = gene.ID - gene_row["seqid"] = curr_seq_id - gene_row["type"] = gene.type - gene_row.append() - gene_seq.flush() - - seq_table = h5f.create_table("/", "sequences", sequence_desc(get_sequence_len(pangenome)), - expectedrows=len(seq2seqid)) - - seq_row = seq_table.row - for seq, seqid in seq2seqid.items(): - seq_row["dna"] = seq - seq_row["seqid"] = seqid - seq_row.append() - seq_table.flush() - - def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: """ Create a formated table for gene families description diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index d63818b6..84462d77 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -581,6 +581,13 @@ def __delitem__(self, name): except KeyError: raise KeyError("Position of the gene in the contig does not exist") + def __len__(self): + """ Get number of contigs in organism + + :return: Number of contigs in organism + """ + return len(self._contigs_getter) + @property def families(self): """Return the gene families present in the organism @@ -625,13 +632,6 @@ def contigs(self) -> Generator[Contig, None, None]: """ yield from self._contigs_getter.values() - def number_of_contigs(self) -> int: - """ Get number of contigs in organism - - :return: Number of contigs in organism - """ - return len(self._contigs_getter) - def add(self, contig: Contig): """Add a contig to organism diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ba35e4c1..c63bc3ec 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -8,7 +8,7 @@ from pathlib import Path # local libraries -from ppanggolin.genome import Organism, Gene +from ppanggolin.genome import Organism, Contig, Gene from ppanggolin.region import Region, Spot, Module from ppanggolin.geneFamily import GeneFamily from ppanggolin.edge import Edge @@ -281,6 +281,18 @@ def number_of_organisms(self) -> int: """ return len(self._orgGetter) + @property + def contigs(self) -> Generator[Contig, None, None]: + for organism in self.organisms: + yield from organism.contigs + @property + def number_of_contigs(self) -> int: + """Returns the number of contigs present in the pangenome + + :return: The number of contigs + """ + return sum(len(org) for org in self.organisms) + def get_organism(self, name: str) -> Organism: """ Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. 
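To illustrate the property-based counting pattern introduced in the patch above, here is a minimal, self-contained sketch; the Organism and Pangenome classes below are simplified stand-ins written for illustration only, not the real PPanGGOLiN implementation:

# Hypothetical, simplified sketch of exposing counts as read-only properties
# instead of methods, mirroring number_of_organisms / number_of_contigs above.
class Organism:
    def __init__(self, name):
        self.name = name
        self._contigs_getter = {}        # contig name -> contig object

    def add(self, contig_name):
        self._contigs_getter[contig_name] = object()

    def __len__(self):
        # number of contigs in the organism, as in the genome.py change above
        return len(self._contigs_getter)


class Pangenome:
    def __init__(self):
        self._org_getter = {}            # organism name -> Organism

    def add_organism(self, org):
        self._org_getter[org.name] = org

    @property
    def number_of_organisms(self):
        return len(self._org_getter)

    @property
    def number_of_contigs(self):
        # aggregate the per-organism contig counts instead of storing a counter
        return sum(len(org) for org in self._org_getter.values())


pan = Pangenome()
org = Organism("org1")
org.add("contig_1")
org.add("contig_2")
pan.add_organism(org)
assert pan.number_of_organisms == 1 and pan.number_of_contigs == 2

Exposing the counts as properties keeps call sites such as pangenome.number_of_organisms free of parentheses and lets number_of_contigs stay consistent with the per-organism length without any extra bookkeeping.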
From 30788c3fe08eb23fb00012ffe7127cb3fda21cfe Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 10:16:29 +0200 Subject: [PATCH 57/75] adjust docstring --- ppanggolin/RGP/rgp_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index f19fec21..63b9f2a8 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -245,7 +245,7 @@ def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List def add_edges_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List[IdenticalRegions]): """ - Replace the identical rgp object by all identical rgp it contains. + Replace identical rgp objects by all identical RGPs it contains. :param rgp_graph: The RGP graph to add edges to. :param identical_rgps_objects: A dictionary mapping RGPs to sets of identical RGPs. From 29c0da760ecaf07ba45b2e3b66fc632f69fe2284 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 10:17:11 +0200 Subject: [PATCH 58/75] check input annotate files in annotate rather than in main --- ppanggolin/annotate/annotate.py | 16 +++++++++++++--- ppanggolin/main.py | 4 ---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 9758a07d..92a0aea5 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -17,7 +17,7 @@ from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence from ppanggolin.pangenome import Pangenome from ppanggolin.genome import Organism, Gene, RNA, Contig -from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype +from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files from ppanggolin.formats import write_pangenome @@ -32,6 +32,12 @@ def check_annotate_args(args): raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " "or a file with the --gff option to load annotations from.") + + if hasattr(args, "fasta") and args.fasta is not None: + check_input_files(args.fasta, True) + + if hasattr(args, "anno") and args.anno is not None: + check_input_files(args.anno, True) def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", @@ -560,16 +566,20 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): + elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: # TODO remove ? 
Already tested by check TSV sanity - raise Exception("No tabulation separator found in organisms file") org_path = Path(elements[1]) + if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, norna, kingdom, overlap, procedure)) + if len(arguments) == 0: raise Exception("There are no genomes in the provided file") + + logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", diff --git a/ppanggolin/main.py b/ppanggolin/main.py index 210d56b6..a2de8535 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -153,10 +153,6 @@ def main(): if hasattr(args, "pangenome") and args.pangenome is not None: check_input_files(args.pangenome) - if hasattr(args, "fasta") and args.fasta is not None: - check_input_files(args.fasta, True) - if hasattr(args, "anno") and args.anno is not None: - check_input_files(args.anno, True) if args.subcommand == "annotate": ppanggolin.annotate.launch(args) From 594204ea3418750bfe71aa81f5f68dba619bd5ae Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 10:59:56 +0200 Subject: [PATCH 59/75] change ambiguous overlap by allow_overlap --- ppanggolin/annotate/annotate.py | 10 +++++----- ppanggolin/annotate/synta.py | 12 ++++++------ ppanggolin/workflow/all.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 92a0aea5..e2362eb0 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -545,7 +545,7 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, overlap: bool = False, procedure: str = None, + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome @@ -557,7 +557,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: :param translation_table: Translation table (genetic code) to use. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. - :param overlap: Use to not remove genes overlapping with RNA features + :param allow_overlap: Use to not remove genes overlapping with RNA features :param procedure: prodigal procedure used :param disable_bar: Disable the progresse bar """ @@ -574,7 +574,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: org_path = fasta_list.parent.joinpath(org_path) arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, - norna, kingdom, overlap, procedure)) + norna, kingdom, allow_overlap, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") @@ -592,7 +592,7 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences. 
pangenome.parameters["annotation"] = {} - pangenome.parameters["annotation"]["remove_Overlapping_CDS"] = overlap + pangenome.parameters["annotation"]["remove_Overlapping_CDS"] = allow_overlap pangenome.parameters["annotation"]["annotate_RNA"] = True if not norna else False pangenome.parameters["annotation"]["kingdom"] = kingdom pangenome.parameters["annotation"]["translation_table"] = translation_table @@ -611,7 +611,7 @@ def launch(args: argparse.Namespace): if args.fasta is not None and args.anno is None: annotate_pangenome(pangenome, args.fasta, tmpdir=args.tmpdir, cpu=args.cpu, procedure=args.prodigal_procedure, translation_table=args.translation_table, kingdom=args.kingdom, norna=args.norna, - overlap=args.allow_overlap, disable_bar=args.disable_prog_bar) + allow_overlap=args.allow_overlap, disable_bar=args.disable_prog_bar) elif args.anno is not None: read_annotations(pangenome, args.anno, cpu=args.cpu, pseudo=args.use_pseudo, disable_bar=args.disable_prog_bar) if pangenome.status["geneSequences"] == "No": diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 9c2f10e9..afad1cca 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -241,12 +241,12 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, n return genes -def overlap_filter(all_genes: defaultdict, overlap: bool = False) -> defaultdict: +def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defaultdict: """ Removes the CDS that overlap with RNA genes. :param all_genes: Dictionary with complete list of genes - :param overlap: Use to not remove genes overlapping with RNA features + :param allow_overlap: Use to not remove genes overlapping with RNA features :return: Dictionary with genes filtered """ @@ -255,7 +255,7 @@ def overlap_filter(all_genes: defaultdict, overlap: bool = False) -> defaultdict for key, genes in all_genes.items(): tmp_genes = sorted(genes, key=lambda x: x.start) rm_genes = set() - if not overlap: + if not allow_overlap: for i, gene_i in enumerate(tmp_genes): if i + 1 < len(tmp_genes): gene_j = tmp_genes[i + 1] @@ -292,7 +292,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", - overlap: bool = False, procedure: str = None) -> Organism: + allow_overlap: bool = False, procedure: str = None) -> Organism: """ Function to annotate a single organism @@ -303,7 +303,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. 
:param tmpdir: Path to temporary directory - :param overlap: Use to not remove genes overlapping with RNA features + :param allow_overlap: Use to not remove genes overlapping with RNA features :param procedure: prodigal procedure used :return: Complete organism object for pangenome @@ -322,7 +322,7 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: else: procedure = "single" genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, procedure) - genes = overlap_filter(genes, overlap) + genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): try: diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index b67b97a0..2d496f5f 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -87,7 +87,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, procedure=args.annotate.prodigal_procedure, translation_table=args.annotate.translation_table, kingdom=args.annotate.kingdom, norna=args.annotate.norna, - overlap=args.annotate.allow_overlap) + allow_overlap=args.annotate.allow_overlap) anno_time = time.time() - start_anno start_writing = time.time() From 6f746293099bc0e5b561fff4eb117a4768f69449 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 12:00:11 +0200 Subject: [PATCH 60/75] check tsv sanity in workflow --- ppanggolin/workflow/all.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index 2d496f5f..4946abdf 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -12,7 +12,7 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.utils import mk_file_name, mk_outdir, check_option_workflow, restricted_float -from ppanggolin.annotate.annotate import annotate_pangenome, read_annotations, get_gene_sequences_from_fastas +from ppanggolin.annotate.annotate import annotate_pangenome, read_annotations, get_gene_sequences_from_fastas, check_annotate_args from ppanggolin.cluster.cluster import clustering, read_clustering from ppanggolin.graph.makeGraph import compute_neighbors_graph from ppanggolin.nem.rarefaction import make_rarefaction_curve @@ -41,6 +41,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, """ check_option_workflow(args) + check_annotate_args(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) From bb071fbcc88d0d8eba7ead0161376f9a3b4d2cc7 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 12:09:25 +0200 Subject: [PATCH 61/75] replace the ambiguous len(contig) by contig.number_of_genes --- ppanggolin/RGP/genomicIsland.py | 2 +- ppanggolin/annotate/annotate.py | 8 +++----- ppanggolin/context/searchGeneContext.py | 2 +- ppanggolin/genome.py | 9 +++++---- ppanggolin/graph/makeGraph.py | 2 +- ppanggolin/mod/module.py | 2 +- ppanggolin/region.py | 10 +++++----- tests/test_genome.py | 6 +++--- 8 files changed, 20 insertions(+), 21 deletions(-) diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index b763591b..79f7e128 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -195,7 +195,7 @@ def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: in min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]: org_regions = set() for contig in organism.contigs: - if len(contig) != 0: # some contigs have no coding genes... 
+ if contig.number_of_genes != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty, diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index e2362eb0..ad0dd348 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -161,7 +161,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p if curr_type != "": if useful_info: create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - len(contig), gene_name, product, genetic_code, protein_id) + contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -221,7 +221,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # end of contig if useful_info: # saving the last element... create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - len(contig), gene_name, product, genetic_code, protein_id) + contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -358,7 +358,7 @@ def get_id_attribute(attributes_dict: dict) -> str: # here contig is filled in order, so position is the number of genes already stored in the contig. gene.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]), strand=fields_gff[gff_strand], gene_type=fields_gff[gff_type], name=name, - position=len(contig), product=product, local_identifier=gene_id, + position=contig.number_of_genes, product=product, local_identifier=gene_id, genetic_code=genetic_code) gene.fill_parents(org, contig) gene_counter += 1 @@ -463,8 +463,6 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p args = [] for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] - if len(elements) <= 1: - raise Exception(f"No tabulation separator found in given --fasta file: '{organisms_file}'") org_path = Path(elements[1]) if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = organisms_file.parent.joinpath(org_path) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 803e5c48..f8fa02ff 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -149,7 +149,7 @@ def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) - """ pos_left, pos_right = (max(0, gene.position - t), - min(gene.position + t, len(contig) - 1)) # Gene positions to compare family + min(gene.position + t, contig.number_of_genes - 1)) # Gene positions to compare family in_context_left, in_context_right = (False, False) while pos_left < gene.position and not in_context_left: if contig[pos_left].family in families.values(): diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index d63818b6..8017266e 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -315,9 +315,6 @@ def __init__(self, name: str, is_circular: bool = False): def __str__(self) -> str: return self.name - def __len__(self) -> int: - return len(self._genes_position) - def __setitem__(self, start: int, gene: Gene): """ Set gene to Contig @@ -436,6 +433,10 @@ def 
get_genes(self, begin: int, end: int) -> List[Gene]: else: return self._genes_position[begin: end] + @property + def number_of_genes(self) -> int: + return len(self._genes_position) + @property def genes(self) -> Generator[Gene, None, None]: """ Give the gene content of the contig @@ -615,7 +616,7 @@ def number_of_genes(self) -> int: :return: Number of genes """ - return sum([len(contig) for contig in self.contigs]) + return sum([contig.number_of_genes for contig in self.contigs]) @property def contigs(self) -> Generator[Contig, None, None]: diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 74f97129..73e0affd 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -103,7 +103,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, raise AttributeError("a Gene does not have a GeneFamily object associated") except Exception: raise Exception("Unexpected error. Please report on our github.") - if prev is not None and contig.is_circular and len(contig) > 0: + if prev is not None and contig.is_circular and contig.number_of_genes > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added pangenome.add_edge(contig[0], prev) logging.getLogger("PPanGGOLiN").info("Done making the neighbors graph.") diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index d4d4f638..23f2f94b 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -44,7 +44,7 @@ def compute_mod_graph(pangenome: Pangenome, t: int = 1, disable_bar: bool = Fals g = nx.Graph() for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): for contig in org.contigs: - if len(contig) > 0: + if contig.number_of_genes > 0: start_gene = contig[0] g.add_node(start_gene.family) add_gene(g.nodes[start_gene.family], start_gene, fam_split=False) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 08eabd86..5581e55c 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -234,7 +234,7 @@ def is_whole_contig(self) -> bool: :return: True if whole contig else False """ - if self.starter.position == 0 and self.stopper.position == len(self.contig) - 1: + if self.starter.position == 0 and self.stopper.position == self.contig.number_of_genes - 1: return True return False @@ -279,14 +279,14 @@ def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List border[0].append(curr_gene) pos -= 1 if pos == -1 and self.contig.is_circular: - pos = len(self.contig) + pos = self.contig.number_of_genes if pos == init: break # looped around the contig pos = self.stopper.position init = pos - while len(border[1]) < n and (pos != len(self.contig) - 1 or self.contig.is_circular): + while len(border[1]) < n and (pos != self.contig.number_of_genes - 1 or self.contig.is_circular): curr_gene = None - if pos == len(self.contig) - 1: + if pos == self.contig.number_of_genes - 1: if self.contig.is_circular: curr_gene = self.contig[0] else: @@ -294,7 +294,7 @@ def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List if curr_gene is not None and curr_gene.family not in multigenics: border[1].append(curr_gene) pos += 1 - if pos == len(self.contig) and self.contig.is_circular: + if pos == self.contig.number_of_genes and self.contig.is_circular: pos = -1 if pos == init: break # looped around the contig diff --git a/tests/test_genome.py b/tests/test_genome.py index c58b296c..4b511b98 100644 --- a/tests/test_genome.py +++ 
b/tests/test_genome.py @@ -348,15 +348,15 @@ def test_add_gene_without_position(self, contig): gene = Gene('test_gene') contig.add(gene) - def test_get_len(self, genes, contig): + def test_number_of_genes(self, genes, contig): """Tests len method """ gene1, gene2, gene3 = genes contig.add(gene1) contig.add(gene2) contig.add(gene3) - assert isinstance(len(contig), int) - assert len(contig) == 3 + assert isinstance(contig.number_of_genes, int) + assert contig.number_of_genes == 3 def test_get_gene(self, gene, contig): """Tests that a gene can be retrieved by its position From 8fe2bd09d84011c9b02eab0a39c4e68e96e13a02 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 6 Sep 2023 13:30:51 +0200 Subject: [PATCH 62/75] fix contig leng in context --- ppanggolin/context/searchGeneContext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index f8fa02ff..803e5c48 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -149,7 +149,7 @@ def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) - """ pos_left, pos_right = (max(0, gene.position - t), - min(gene.position + t, contig.number_of_genes - 1)) # Gene positions to compare family + min(gene.position + t, len(contig) - 1)) # Gene positions to compare family in_context_left, in_context_right = (False, False) while pos_left < gene.position and not in_context_left: if contig[pos_left].family in families.values(): From 084ce284d42fe9082536cecec15d6a417ee1925f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 6 Sep 2023 16:09:59 +0200 Subject: [PATCH 63/75] Read new version of annotation Tables --- VERSION | 2 +- ppanggolin/cluster/cluster.py | 2 +- ppanggolin/formats/readBinaries.py | 210 ++++++++++++++----------- ppanggolin/formats/writeAnnotations.py | 210 ++++++++++++++++++------- ppanggolin/formats/writeBinaries.py | 4 +- ppanggolin/genome.py | 6 + ppanggolin/pangenome.py | 20 +++ tests/test_genome.py | 3 +- 8 files changed, 304 insertions(+), 153 deletions(-) diff --git a/VERSION b/VERSION index 8464960b..bfdb369d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.175 +1.2.176 diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 8d2763cb..77637015 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -284,7 +284,7 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = """ Main function to cluster pangenome gene sequences into families - :param pangenome: Annoatated Pangenome + :param pangenome: Annotated Pangenome :param tmpdir: Path to temporary directory :param cpu: number of CPU cores to use :param defrag: Allow to remove fragment diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 900a8c4e..215ad7a3 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -5,10 +5,9 @@ import logging import sys from pathlib import Path -# installed libraries -from typing import TextIO -from typing import List +from typing import TextIO, List, Dict, Tuple +# installed libraries from tables import Table from tqdm import tqdm import tables @@ -27,10 +26,10 @@ class Genedata: This is a general class storing unique gene-related data to be written in a specific genedata table - :param start: Start position of a gene - :param stop: Stop position of a gene - :param strand: associated strand - :param gene_type: 
Type of gene + :param start: Gene start position + :param stop: Gene stop position + :param strand: Associated strand + :param gene_type: Gene type :param position: Position of the gene on its contig :param name: Name of the feature :param product: Associated product @@ -164,7 +163,7 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000): yield row -def read_genedata(h5f: tables.File) -> dict: +def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: """ Reads the genedata table and returns a genedata_id2genedata dictionnary :param h5f: the hdf5 file handler @@ -192,7 +191,7 @@ def read_sequences(h5f: tables.File) -> dict: :param h5f: the hdf5 file handler :return: dictionnary linking sequences to the seq identifier """ - table = h5f.root.sequences + table = h5f.root.annotations.sequences seqid2seq = {} for row in read_chunks(table, chunk=20000): seqid2seq[row["seqid"]] = row['dna'].decode() @@ -204,6 +203,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. + :param filename: Name of the pangenome file :param file_obj: Name of the output file :param list_cds: An iterable object of CDS @@ -212,7 +212,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter """ logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences list_cds = set(list_cds) if list_cds is not None else None seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -225,60 +225,6 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f.close() -def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, genedata_dict: dict, - link: bool = False): - """ - Read information from pangenome to assign to organism object - - :param pangenome: Input pangenome - :param org_name: Name of the organism - :param contig_dict: Dictionary with all contig and associate genes - :param circular_contigs: Dictionary of contigs - :param genedata_dict: dictionnary linking genedata to the genedata identifier - :param link: get the gene object if the genes are clustered - """ - org = Organism(org_name) - gene, gene_type = (None, None) - for contig_name, gene_list in contig_dict.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, is_circular=circular_contigs[contig_name]) - org.add(contig) - for row in gene_list: - if link: # if the gene families are already computed/loaded the gene exists. - gene = pangenome.get_gene(row["ID"].decode()) - else: # else creating the gene. 
- curr_genedata = genedata_dict[row["genedata_id"]] - gene_type = curr_genedata.gene_type - if gene_type == "CDS": - gene = Gene(row["ID"].decode()) - elif "RNA" in gene_type: - gene = RNA(row["ID"].decode()) - try: - local = row["local"].decode() - except ValueError: - local = "" - if isinstance(gene, Gene): - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, position=curr_genedata.position, - genetic_code=curr_genedata.genetic_code, product=curr_genedata.product, - local_identifier=local) - else: - gene.fill_annotations(start=curr_genedata.start, stop=curr_genedata.stop, strand=curr_genedata.strand, - gene_type=gene_type, name=curr_genedata.name, - product=curr_genedata.product, local_identifier=local) - gene.is_fragment = row["is_fragment"] - gene.fill_parents(org, contig) - if gene_type == "CDS": - contig[gene.start] = gene - elif "RNA" in gene_type: - contig.add_rna(gene) - else: - raise Exception(f"A strange type '{gene_type}', which we do not know what to do with, was met.") - pangenome.add_organism(org) - - def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read information about graph in pangenome hdf5 file to add in pangenome object @@ -350,6 +296,7 @@ def read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read gene sequences in pangenome hdf5 file to add in pangenome object + :param pangenome: Pangenome object without gene sequence associate to gene :param h5f: Pangenome HDF5 file with gene sequence associate to gene :param disable_bar: Disable the progress bar @@ -357,7 +304,7 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo if pangenome.status["genomesAnnotated"] not in ["Computed", "Loaded"]: raise Exception("It's not possible to read the pangenome gene dna sequences " "if the annotations have not been loaded.") - table = h5f.root.geneSequences + table = h5f.root.annotations.geneSequences seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): @@ -437,8 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" - -def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): +def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000, + disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]: + table = annotations.genomes + contig2organism = {} + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): + try: + organism = pangenome.get_organism(row["name"].decode()) + except: + organism = Organism(row["name"].decode()) + pangenome.add_organism(organism) + contig = Contig(name=row["contig"].decode()) + organism.add(contig) + contig2organism[contig.name] = organism.name + table = annotations.contigs + contig_name = None + genes_dict = {} + rna_dict = {} + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): + if contig_name != row["name"].decode(): + contig_name = row["name"].decode() + organism = pangenome.get_organism(contig2organism[contig_name]) + contig = organism.get(contig_name) + contig.is_circular = row["is_circular"] + try: 
+ gene = Gene(row["gene"].decode()) + except ValueError: + pass + else: + gene.fill_parents(organism, contig) + if row["gene"].decode() in genes_dict: + logging.getLogger().warning("A gene with the same ID already pass. " + "It could be a problem in the number of genes") + genes_dict[gene.ID] = gene + try: + rna = RNA(row["rna"].decode()) + except ValueError: + pass + else: + rna_dict[rna.ID] = rna + rna.fill_parents(organism, contig) + return genes_dict, rna_dict + +def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False): + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + gene = gene_dict[row["ID"].decode()] + genedata = genedata_dict[row["genedata_id"]] + try: + local = row["local"].decode() + except ValueError: + local = "" + gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, + genetic_code=genedata.genetic_code, product=genedata.product, + local_identifier=local) + gene.is_fragment = row["is_fragment"] + if gene.contig is not None: + gene.contig.add(gene) + + +def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], + rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False): + for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): + rna = rna_dict[row["ID"].decode()] + genedata = genedata_dict[row["genedata_id"]] + rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, + gene_type=genedata.gene_type, name=genedata.name, + product=genedata.product) + if rna.contig is not None: + rna.contig.add_rna(rna) + + +def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True, + load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -448,35 +467,33 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = """ annotations = h5f.root.annotations - table = annotations.genes - pangenome_dict = {} - circular_contigs = {} + if load_organisms: + if load_genes: + genedata_dict = read_genedata(h5f) + if load_rnas: + gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - genedata_dict = read_genedata(h5f) - - for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): - decode_org = row["organism"].decode() - try: - # new gene, seen contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()].append(row["gene"]) - except KeyError: - try: - # new contig, seen org - pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]] - circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"] - except KeyError: - # new org - pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]} - circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]} - - link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False - - for org_name, contig_dict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), - unit="organism", disable=disable_bar): - read_organism(pangenome, org_name, contig_dict, 
circular_contigs[org_name], genedata_dict, link) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) + else: + gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + else: + if load_rnas: + genedata_dict = read_genedata(h5f) + _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) + else: + if load_genes: + if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]: + raise Exception("Genes must be linked to gene families or organisms, but none are laoded") + gene_dict = {gene.ID: gene for gene in pangenome.genes} # Dictionary with genes in families + gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) + read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" + def read_info(h5f: tables.File): """ Read the pangenome content @@ -630,12 +647,14 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa fix_partitioned(pangenome.file) h5f = tables.open_file(filename, "r") - if annotation: + + if annotation: # I place annotation here, to link gene to gene families if organism are not loaded if h5f.root.status._v_attrs.genomesAnnotated: logging.getLogger("PPanGGOLiN").info("Reading pangenome annotations...") read_annotation(pangenome, h5f, disable_bar=disable_bar) else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") + if gene_sequences: if h5f.root.status._v_attrs.geneSequences: logging.getLogger("PPanGGOLiN").info("Reading pangenome gene dna sequences...") @@ -652,6 +671,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception( f"The pangenome in file '{filename}' does not have gene families, or has been improperly filled") + if graph: if h5f.root.status._v_attrs.NeighborsGraph: logging.getLogger("PPanGGOLiN").info("Reading the neighbors graph edges...") @@ -659,6 +679,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have graph information, " f"or has been improperly filled") + if rgp: if h5f.root.status._v_attrs.predictedRGP: logging.getLogger("PPanGGOLiN").info("Reading the RGP...") @@ -666,6 +687,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " f"or has been improperly filled") + if spots: if h5f.root.status._v_attrs.spots: logging.getLogger("PPanGGOLiN").info("Reading the spots...") @@ -673,6 +695,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have spots information, " f"or has been improperly filled") + if modules: if h5f.root.status._v_attrs.modules: logging.getLogger("PPanGGOLiN").info("Reading the modules...") @@ -680,6 +703,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' does not have modules information, " f"or has been improperly filled") + 
if metadata: assert metatype is not None if sources is None: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index 907034de..fc4e2e70 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -11,18 +11,22 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.genome import Feature, Gene +from ppanggolin.genome import Gene, RNA from ppanggolin.formats.readBinaries import Genedata -def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int]: +genedata_counter = 0 + + +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int]: """ Get the maximum size of each annotation information to optimize disk space :param pangenome: Annotated pangenome - :return: maximum size of each annotation + + :return: Maximum size of each annotation """ - max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = 1, 1, 1, 1, 1, 1 + max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id = 1, 1, 1, 1, 1 for org in pangenome.organisms: if len(org.name) > max_org_len: max_org_len = len(org.name) @@ -36,19 +40,17 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i max_gene_local_id = len(gene.local_identifier) for rna in contig.RNAs: if len(rna.ID) > max_rna_id_len: - max_rna_id_len = len(gene.ID) - if len(rna.local_identifier) > max_rna_local_id: - max_rna_local_id = len(gene.local_identifier) + max_rna_id_len = len(rna.ID) - return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id + return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: """ - Create a table to save organism-related information + Table description to save organism-related information - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs + :param org_len: Maximum size of organism name. + :param contig_len: Maximum size of contigs name :return: Formatted table """ @@ -58,7 +60,15 @@ def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, organism_desc: Dict[str, tables.StringCol], disable_bar=False): - organism_table = h5f.create_table(annotation, "genome", organism_desc, + """Write organisms information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param organism_desc: Organisms table description. 
+ :param disable_bar: Allow disabling progress bar + """ + organism_table = h5f.create_table(annotation, "genomes", organism_desc, expectedrows=pangenome.number_of_organisms) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") organism_row = organism_table.row @@ -72,11 +82,11 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def contig_desc(contig_len: int, max_gene_id_len: int, max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol]]: - """ - Create a table to save organism-related information + """Table description to save contig-related information - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs + :param contig_len: Maximum size of contig name + :param max_gene_id_len: Maximum size of gene name + :param max_rna_id_len: Maximum size of rna name :return: Formatted table """ @@ -89,29 +99,44 @@ def contig_desc(contig_len: int, max_gene_id_len: int, def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], disable_bar=False): - contig_table = h5f.create_table(annotation, "contig", contig_desc, expectedrows=pangenome.number_of_contigs) + """Write contigs information in the pangenome file + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param contig_desc: Contigs table description + :param disable_bar: Allow disabling progress bar + """ + contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - rna_list = list(contig.RNAs) - for index, gene in enumerate(contig.genes): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["gene"] = gene.ID - if index < len(rna_list): - contig_row["rna"] = rna_list[index].ID - contig_row.append() + if len(contig) >= contig.number_of_rnas: + rna_list = list(contig.RNAs) + for index, gene in enumerate(contig.genes): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["gene"] = gene.ID + if index < len(rna_list): + if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005': + print("pika") + contig_row["rna"] = rna_list[index].ID + contig_row.append() + else: + gene_list = list(contig.genes) + for index, rna in enumerate(contig.RNAs): + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["rna"] = rna.ID + if index < len(gene_list): + contig_row["gene"] = gene_list[index].ID + contig_row.append() contig_table.flush() def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: - """ - Create a table to save gene-related information - - :param org_len: Maximum size of organism - :param contig_len: Maximum size of contigs - :param id_len: Maximum size of gene ID + """Table description to save gene-related information + :param id_len: Maximum size of gene name :param max_local_id: Maximum size of gene local identifier :return: Formatted table @@ -125,16 +150,26 @@ def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables. 
def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, gene_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]], disable_bar=False) -> Dict[Genedata, int]: + """Write genes information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param gene_desc: Genes table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to gene identifier + """ + global genedata_counter + print(genedata_counter) genedata2gene = {} - genedata_counter = 0 gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") gene_row = gene_table.row - for gene in tqdm(pangenome.genes, total=pangenome.number_of_organisms, unit="gene", disable=disable_bar): + for gene in tqdm(pangenome.genes, total=pangenome.number_of_genes, unit="gene", disable=disable_bar): gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment - if gene.type == "CDS": - gene_row["local"] = gene.local_identifier + gene_row["local"] = gene.local_identifier genedata = get_genedata(gene) genedata_id = genedata2gene.get(genedata) if genedata_id is None: @@ -144,14 +179,60 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["genedata_id"] = genedata_id gene_row.append() gene_table.flush() + print(genedata_counter) return genedata2gene +def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: + """Table description to save rna-related information + + :param id_len: Maximum size of RNA identifier + + :return: Formatted table + """ + return {'ID': tables.StringCol(itemsize=id_len), + 'genedata_id': tables.UInt32Col()} + + +def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, + rna_desc: Dict[str, Union[tables.StringCol, tables.UInt32Col]], + disable_bar=False) -> Dict[Genedata, int]: + """Write RNAs information in the pangenome file + + :param pangenome: Annotated pangenome object + :param h5f: Pangenome file + :param annotation: Annotation table group + :param rna_desc: RNAs table description + :param disable_bar: Allow to disable progress bar + + :returns: Dictionnary linking genedata to RNA identifier + """ + global genedata_counter + print(genedata_counter) + genedata2rna = {} + rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) + logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") + rna_row = rna_table.row + for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): + rna_row["ID"] = rna.ID + genedata = get_genedata(rna) + genedata_id = genedata2rna.get(genedata) + if genedata_id is None: + genedata_id = genedata_counter + genedata2rna[genedata] = genedata_id + genedata_counter += 1 + rna_row["genedata_id"] = genedata_id + rna_row.append() + rna_table.flush() + print(genedata_counter) + return genedata2rna + + def genedata_desc(type_len, name_len, product_len): """ Creates a table for gene-related data - :param type_len: Maximum size of gene Type + :param type_len: Maximum size of gene Type. 
:param name_len: Maximum size of gene name :param product_len: Maximum size of gene product :return: Formatted table for gene metadata @@ -199,31 +280,42 @@ def get_max_len_genedata(pangenome: Pangenome) -> Tuple[int, int, int]: return max_type_len, max_name_len, max_product_len -def get_genedata(gene: Feature) -> Genedata: +def get_genedata(feature: Union[Gene, RNA]) -> Genedata: """ Gets the genedata type of Feature - :param gene: a Feature + :param feature: Gene or RNA object + :return: Tuple with a Feature associated data """ position = None genetic_code = 11 - if gene.type == "CDS": - gene: Gene - position = gene.position - genetic_code = gene.genetic_code - return Genedata(gene.start, gene.stop, gene.strand, gene.type, position, gene.name, - gene.product, genetic_code) + if isinstance(feature, Gene): + position = feature.position + genetic_code = feature.genetic_code + return Genedata(feature.start, feature.stop, feature.strand, feature.type, position, feature.name, + feature.product, genetic_code) def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, genedata2gene: Dict[Genedata, int], disable_bar=False): - genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), - expectedrows=len(genedata2gene)) + """Writting genedata information in pangenome file + + :param pangenome: Pangenome object filled with annotation. + :param h5f: Pangenome file + :param annotation: Annotation group in Table + :param genedata2gene: Dictionnary linking genedata to gene identifier. + :param disable_bar: Allow disabling progress bar + """ + try: + genedata_table = annotation.genedata + except tables.exceptions.NoSuchNodeError: + genedata_table = h5f.create_table(annotation, "genedata", genedata_desc(*get_max_len_genedata(pangenome)), + expectedrows=len(genedata2gene)) logging.getLogger("PPanGGOLiN").debug(f"Writing {len(genedata2gene)} gene-related data " "(can be lower than the number of genes)") genedata_row = genedata_table.row - for genedata, genedata_id in genedata2gene.items(): + for genedata, genedata_id in tqdm(genedata2gene.items(), unit="genedata", disable=disable_bar): genedata_row["genedata_id"] = genedata_id genedata_row["start"] = genedata.start genedata_row["stop"] = genedata.stop @@ -239,29 +331,37 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, - rec_contigs: bool = True, rec_genes: bool = True, disable_bar: bool = False): - """ - Function writing all the pangenome annotations + rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): + """Function writing all the pangenome annotations :param pangenome: Annotated pangenome :param h5f: Pangenome HDF5 file + :param rec_organisms: Allow writing organisms in pangenomes + :param rec_contigs: Allow writing contigs in pangenomes + :param rec_genes: Allow writing genes in pangenomes + :param rec_rnas: Allow writing RNAs in pangenomes :param disable_bar: Alow to disable progress bar """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") - max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id, max_rna_local_id = get_max_len_annotations( - pangenome) + org_len, contig_len, gene_id_len, rna_id_len, gene_local_id = get_max_len_annotations(pangenome) + + # I add these boolean in case we would one day only load organism, contig or genes, without 
the other. if rec_organisms: - desc = organism_desc(max_org_len, max_contig_len) + desc = organism_desc(org_len, contig_len) write_organisms(pangenome, h5f, annotation, desc, disable_bar) if rec_contigs: - desc = contig_desc(max_contig_len, max_gene_id_len, max_rna_id_len) + desc = contig_desc(contig_len, gene_id_len, rna_id_len) write_contigs(pangenome, h5f, annotation, desc, disable_bar) if rec_genes: - desc = gene_desc(max_gene_id_len, max_gene_local_id) + desc = gene_desc(gene_id_len, gene_local_id) genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) + if rec_rnas: + desc = rna_desc(rna_id_len) + genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) + write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 1a3fcc56..dd0123ca 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -606,9 +606,9 @@ def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: b table = h5f.root.annotations.genes for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar): - genedata_id = row['gene/genedata_id'] + genedata_id = row['genedata_id'] if genedataid2genedata[genedata_id].gene_type == 'CDS': - row['gene/is_fragment'] = pangenome.get_gene(row['gene/ID'].decode()).is_fragment + row['is_fragment'] = pangenome.get_gene(row['ID'].decode()).is_fragment row.update() table.flush() diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 84462d77..48b846b0 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -488,6 +488,12 @@ def RNAs(self) -> Generator[RNA, None, None]: """ yield from self._rna_getter + @property + def number_of_rnas(self) -> int: + """Get the number of RNA in the contig + """ + return len(self._rna_getter) + class Organism(MetaFeatures): """ diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index c63bc3ec..28ab459b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -146,6 +146,26 @@ def number_of_genes(self) -> int: self._mk_gene_getter() # make it return len(self._geneGetter) + """RNAs methods""" + + @property + def RNAs(self) -> Generator[Gene, None, None]: + """Generator of genes in the pangenome. 
+ + :return: gene generator + """ + for org in self.organisms: + for contig in org.contigs: + yield from contig.RNAs + + @property + def number_of_rnas(self) -> int: + """Returns the number of gene present in the pangenome + + :return: The number of genes + """ + return sum(ctg.number_of_rnas for ctg in self.contigs) + """Gene families methods""" @property def max_fam_id(self): diff --git a/tests/test_genome.py b/tests/test_genome.py index c58b296c..b19eda01 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -523,7 +523,8 @@ def test_number_of_contigs(self, organism): """ organism.add(Contig('contig1')) organism.add(Contig('contig2')) - assert organism.number_of_contigs() == 2 + assert isinstance(len(organism), int) + assert len(organism) == 2 def test_get_families(self, organism, contig, gene): """Tests that gene families in an organism can be retrieved From 6f7229ed060ce933044d19a61bd114d41bec9137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 6 Sep 2023 16:20:42 +0200 Subject: [PATCH 64/75] Fix compatibility problem with merge --- VERSION | 2 +- ppanggolin/formats/writeAnnotations.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index bfdb369d..27a69d0e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.176 +1.2.177 diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index fc4e2e70..e1dd1640 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -110,7 +110,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - if len(contig) >= contig.number_of_rnas: + if contig.number_of_genes >= contig.number_of_rnas: rna_list = list(contig.RNAs) for index, gene in enumerate(contig.genes): contig_row["name"] = contig.name From 9499d2abde7ba3cec1efeb0ba9498052396afb8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 7 Sep 2023 14:28:26 +0200 Subject: [PATCH 65/75] Get length of contig for GFF and GBFF files --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 123 ++++++++++--------------- ppanggolin/formats/writeAnnotations.py | 8 +- ppanggolin/genome.py | 20 +++- ppanggolin/pangenome.py | 1 - testingDataset/organisms.gbff.list | 3 +- 6 files changed, 71 insertions(+), 86 deletions(-) diff --git a/VERSION b/VERSION index 27a69d0e..8c12c1db 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.177 +1.2.178 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index ad0dd348..e4e9053d 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -32,13 +32,13 @@ def check_annotate_args(args): raise Exception("You must provide at least a file with the --fasta option to annotate from sequences, " "or a file with the --gff option to load annotations from.") - if hasattr(args, "fasta") and args.fasta is not None: check_input_files(args.fasta, True) if hasattr(args, "anno") and args.anno is not None: check_input_files(args.anno, True) + def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = ""): 
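The point of this patch is to record each contig's length at parsing time; the hunks that follow take it from the span of the GBFF source feature and from the GFF3 ##sequence-region pragma. A rough sketch of the arithmetic both are aiming for (field handling simplified; the exact parsing lives in the hunks below):

    # Inclusive span: a feature covering positions 1..4215606 is 4215606 bp long.
    def span_length(start: int, end: int) -> int:
        return end - start + 1

    # e.g. from a GBFF line '     source          1..4215606'
    start, end = map(int, "1..4215606".split(".."))
    assert span_length(start, end) == 4215606
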
@@ -91,20 +91,19 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( +def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( Organism, bool): """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file - :param organism: Organism name - :param gbff_file_path: Path to corresponding GFF file + :param organism_name: Organism name + :param gbff_file_path: Path to corresponding GBFF file :param circular_contigs: list of contigs :param pseudo: Allow to read pseudogène :return: Organism complete and true for sequence in file """ - org = Organism(organism) - + organism = Organism(organism_name) logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}") # revert the order of the file, to read the first line first. lines = read_compressed_or_not(gbff_file_path).readlines()[::-1] @@ -114,35 +113,29 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # beginning of contig contig = None - set_contig = False is_circ = False - contig_locus_id = None + contig_id = None if line.startswith('LOCUS'): if "CIRCULAR" in line.upper(): # this line contains linear/circular word telling if the dna sequence is circularized or not is_circ = True - contig_locus_id = line.split()[1] + # TODO maybe it could be a good thing to add a elif for linear + # and if circular or linear are not found raise a warning + + contig_id = line.split()[1] # If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS while not line.startswith('FEATURES'): - if line.startswith('VERSION'): + if line.startswith('VERSION') and line[12:].strip() != "": contig_id = line[12:].strip() - if contig_id != "": - try: - contig = org.get(contig_id) - except KeyError: - contig = Contig(contig_id, True if contig_id in circular_contigs else False) - org.add(contig) - set_contig = True line = lines.pop() - if not set_contig: - # if no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. - # Should be unique in a dataset, but if there's an update the contig ID - # might still be the same even though it should not(?) - try: - contig = org.get(contig_locus_id) - except KeyError: - contig = Contig(contig_locus_id, True if contig_locus_id in circular_contigs else False) - org.add(contig) + # If no contig ids were filled after VERSION, we use what was found in LOCUS for the contig ID. + # Should be unique in a dataset, but if there's an update + # the contig ID might still be the same even though it should not(?) + try: + contig = organism.get(contig_id) + except KeyError: + contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False) + organism.add(contig) # start of the feature object. 
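Put plainly, the header handling above takes the accession from LOCUS, prefers a non-empty VERSION field when one appears before FEATURES, and marks the contig circular when the LOCUS line says CIRCULAR or when the contig was listed as circular by the user. A condensed, illustrative version of that rule (not the parser's actual code path):

    def resolve_contig_header(locus_line: str, version_line: str, circular_contigs: list):
        # Circularity is declared on the LOCUS line.
        is_circ = "CIRCULAR" in locus_line.upper()
        # The accession on LOCUS is the fallback contig identifier.
        contig_id = locus_line.split()[1]
        # A non-empty VERSION field overrides it.
        if version_line.startswith('VERSION') and version_line[12:].strip() != "":
            contig_id = version_line[12:].strip()
        return contig_id, is_circ or contig_id in circular_contigs
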
dbxref = set() gene_name = "" @@ -160,8 +153,8 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p curr_type = line[5:21].strip() if curr_type != "": if useful_info: - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, - contig.number_of_genes, gene_name, product, genetic_code, protein_id) + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, + obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 else: @@ -192,6 +185,9 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p pass # don't know what to do with that, ignoring for now. # there is a protein with a frameshift mecanism. + elif curr_type == 'source': # Get Contig length + start, end = map(int, map(str.strip, line[21:].split('..'))) + contig.length = end - start + 1 elif useful_info: # current info goes to current objtype, if it's useful. if line[21:].startswith("/db_xref"): dbxref.add(line.split("=")[1].replace('"', '').strip()) @@ -220,7 +216,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p line = lines.pop() # end of contig if useful_info: # saving the last element... - create_gene(org, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, + create_gene(organism, contig, gene_counter, rna_counter, locus_tag, dbxref, start, stop, strand, obj_type, contig.number_of_genes, gene_name, product, genetic_code, protein_id) if obj_type == "CDS": gene_counter += 1 @@ -237,7 +233,7 @@ def read_org_gbff(organism: str, gbff_file_path: Path, circular_contigs: list, p # get each gene's sequence. for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) - return org, True + return organism, True def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): @@ -245,22 +241,22 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: b Read annotation from GFF file :param organism: Organism name - :param gff_file_path: Path to corresponding GFF file - :param circular_contigs: list of contigs + :param gff_file_path: Path corresponding to GFF file + :param circular_contigs: List of circular contigs :param pseudo: Allow to read pseudogène - :return: Organism object and if there are sequences associate or not + :return: Organism object and if there are sequences associated or not """ (gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9) - # missing values : source, score, frame. They are unused. + # Missing values: source, score, frame. They are unused. def get_gff_attributes(gff_fields: list) -> dict: - """ - Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff. + """Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :return: attributes get + :param gff_fields: A gff line stored as a list. Each element of the list is a column of the gff. + + :return: Attributes get """ attributes_field = [f for f in gff_fields[gff_attribute].strip().split(';') if len(f) > 0] attributes_get = {} @@ -276,7 +272,8 @@ def get_id_attribute(attributes_dict: dict) -> str: """ Gets the ID of the element from which the provided attributes_get were extracted. Raises an error if no ID is found. 
- :param attributes_dict: attributes from one gff line + + :param attributes_dict: Attributes from one gff line :return: CDS identifier """ @@ -302,11 +299,9 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - try: - contig = org.get(fields[1]) - except KeyError: - contig = Contig(fields[1], True if fields[1] in circular_contigs else False) - org.add(contig) + contig = Contig(fields[1], True if fields[1] in circular_contigs else False) + org.add(contig) + contig.length = int(fields[-1]) - int(fields[3]) + 1 continue elif line.startswith('#'): # comment lines to be ignores by parsers @@ -326,32 +321,14 @@ def get_id_attribute(attributes_dict: dict) -> str: # if it's not found, we get the one under the 'ID' field which must exist # (otherwise not a gff3 compliant file) gene_id = get_id_attribute(attributes) - try: - name = attributes.pop('NAME') - except KeyError: - try: - name = attributes.pop('GENE') - except KeyError: - name = "" + name = attributes.pop('NAME', attributes.pop('GENE', "")) if "pseudo" in attributes or "pseudogene" in attributes: pseudogene = True - try: - product = attributes.pop('PRODUCT') - except KeyError: - product = "" - - try: - genetic_code = int(attributes.pop("TRANSL_TABLE")) - except KeyError: - genetic_code = 11 + product = attributes.pop('PRODUCT', "") + genetic_code = int(attributes.pop("TRANSL_TABLE", 11)) if contig is None or contig.name != fields_gff[gff_seqname]: # get the current contig - try: - contig = org.get(fields_gff[gff_seqname]) - except KeyError: - contig = Contig(fields_gff[gff_seqname], - True if fields_gff[gff_seqname] in circular_contigs else False) - org.add(contig) + contig = org.get(fields_gff[gff_seqname]) if fields_gff[gff_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) @@ -391,8 +368,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, + pseudo: bool = False) -> (Organism, bool): """ Read a GBFF file for one organism @@ -543,8 +520,8 @@ def launch_annotate_organism(pack: tuple) -> Organism: def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11, - kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, - disable_bar: bool = False): + kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, + procedure: str = None, disable_bar: bool = False): """ Main function to annotate a pangenome @@ -570,14 +547,13 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: if not org_path.exists(): # Check tsv sanity test if it's not one it's the other org_path = fasta_list.parent.joinpath(org_path) - + arguments.append((elements[0], org_path, elements[2:], tmpdir, translation_table, norna, kingdom, allow_overlap, procedure)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") - logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", @@ -622,7 +598,6 @@ def 
launch(args: argparse.Namespace): logging.getLogger("PPanGGOLiN").warning( "You will be able to proceed with your analysis ONLY if you provide " "the clustering results in the next step.") - write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) @@ -684,8 +659,6 @@ def parser_annot(parser: argparse.ArgumentParser): if __name__ == '__main__': """To test local change and allow using debugger""" - import tempfile - from ppanggolin.utils import set_verbosity_level, add_common_arguments main_parser = argparse.ArgumentParser( diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index e1dd1640..d74a97a3 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -109,7 +109,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_table = h5f.create_table(annotation, "contigs", contig_desc, expectedrows=pangenome.number_of_contigs) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row - for contig in tqdm(pangenome.contigs, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): + for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): if contig.number_of_genes >= contig.number_of_rnas: rna_list = list(contig.RNAs) for index, gene in enumerate(contig.genes): @@ -117,8 +117,6 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro contig_row["is_circular"] = contig.is_circular contig_row["gene"] = gene.ID if index < len(rna_list): - if rna_list[index].ID == 'GCF_001293965.1_ASM129396v1_genomic_tRNA_005': - print("pika") contig_row["rna"] = rna_list[index].ID contig_row.append() else: @@ -161,7 +159,6 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou :returns: Dictionnary linking genedata to gene identifier """ global genedata_counter - print(genedata_counter) genedata2gene = {} gene_table = h5f.create_table(annotation, "genes", gene_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") @@ -179,7 +176,6 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["genedata_id"] = genedata_id gene_row.append() gene_table.flush() - print(genedata_counter) return genedata2gene @@ -208,7 +204,6 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group :returns: Dictionnary linking genedata to RNA identifier """ global genedata_counter - print(genedata_counter) genedata2rna = {} rna_table = h5f.create_table(annotation, "RNAs", rna_desc, expectedrows=pangenome.number_of_genes) logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_genes} genes") @@ -224,7 +219,6 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group rna_row["genedata_id"] = genedata_id rna_row.append() rna_table.flush() - print(genedata_counter) return genedata2rna diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 499abd37..205b08dc 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -311,6 +311,7 @@ def __init__(self, name: str, is_circular: bool = False): self._genes_getter = {} self._genes_position = [] self._organism = None + self._length = None def __str__(self) -> str: return self.name @@ -342,6 +343,23 @@ def __setitem__(self, start: int, gene: Gene): # TODO define eq function + @property + def 
length(self): + if self._length is None: + logging.getLogger("PPanGGOLiN").warning("Contig length is unknown") + return self._length + + @length.setter + def length(self, contig_len: int): + if not isinstance(contig_len, int): + raise TypeError("Contig length is expected to be an integer") + if contig_len < 0: + raise ValueError("Contig length must be positive") + self._length = contig_len + + def __len__(self): + return self.length + # retrieve gene by start position def __getitem__(self, position: int) -> Gene: """Get the gene for the given position @@ -593,7 +611,7 @@ def __len__(self): :return: Number of contigs in organism """ - return len(self._contigs_getter) + return len(self._contigs_getter.keys()) @property def families(self): diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 28ab459b..ef37bb4b 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -147,7 +147,6 @@ def number_of_genes(self) -> int: return len(self._geneGetter) """RNAs methods""" - @property def RNAs(self) -> Generator[Gene, None, None]: """Generator of genes in the pangenome. diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list index 1537925a..093d4beb 100644 --- a/testingDataset/organisms.gbff.list +++ b/testingDataset/organisms.gbff.list @@ -21,6 +21,7 @@ GCF_000472205.1 GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz GCF_000590575.1 GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz GCF_000590635.1 GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz GCF_000590695.1 GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz +GCF_001183765.1 GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz GCF_001183805.1 GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz GCF_001183825.1 GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz GCF_001183845.1 GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz @@ -50,4 +51,4 @@ GCF_006508265.1 GBFF/GCF_006508265.1_ASM650826v1_genomic.gbff.gz GCF_000068585.1 GBFF/NC_010287.1.gff.gz GCF_000008725.1 GBFF/NC_000117.1.gbk.gz GCF_001183765.1_gff GBFF/PROKKA_12132021.gff.gz -GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz +GCF_001183765.1_gbk GBFF/PROKKA_12132021.gbk.gz \ No newline at end of file From 1452b239587b719fa0716eebb0e7d97f7e10e0db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 7 Sep 2023 15:16:40 +0200 Subject: [PATCH 66/75] Get contig length for fasta files --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 54 ++++++++++++++++++--------------- ppanggolin/annotate/synta.py | 16 ++++++++-- 3 files changed, 45 insertions(+), 27 deletions(-) diff --git a/VERSION b/VERSION index 8c12c1db..e9a3656e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.178 +1.2.179 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index e4e9053d..baad0c20 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -9,6 +9,7 @@ from pathlib import Path import tempfile import time +from typing import List, Set, Tuple # installed libraries from tqdm import tqdm @@ -21,7 +22,7 @@ from ppanggolin.formats import write_pangenome -def check_annotate_args(args): +def check_annotate_args(args: argparse.Namespace): """Check That the given arguments are usable :param args: All arguments provide by user @@ -39,7 +40,7 @@ def check_annotate_args(args): check_input_files(args.anno, True) -def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, +def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: 
str, dbxref: Set[str], start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = ""): """ @@ -82,7 +83,7 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, position=position, product=product, local_identifier=gene_id, genetic_code=genetic_code) - contig[new_gene.start] = new_gene + contig.add(new_gene) else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, @@ -91,8 +92,8 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i new_gene.fill_parents(org, contig) -def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: list, pseudo: bool = False) -> ( - Organism, bool): +def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file @@ -236,7 +237,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: li return organism, True -def read_org_gff(organism: str, gff_file_path: Path, circular_contigs, pseudo: bool = False) -> (Organism, bool): +def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read annotation from GFF file @@ -358,7 +360,7 @@ def get_id_attribute(attributes_dict: dict) -> str: return org, has_fasta -def launch_read_anno(args: tuple) -> (Organism, bool): +def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]: """ Allow to launch in multiprocessing the read of genome annotation :param args: Pack of argument for annotate_organism function @@ -368,8 +370,8 @@ def launch_read_anno(args: tuple) -> (Organism, bool): return read_anno_file(*args) -def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, - pseudo: bool = False) -> (Organism, bool): +def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str], + pseudo: bool = False) -> Tuple[Organism, bool]: """ Read a GBFF file for one organism @@ -396,7 +398,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list, "You may be able to use --fasta instead.") -def chose_gene_identifiers(pangenome) -> bool: +def chose_gene_identifiers(pangenome: Pangenome) -> bool: """ Parses the pangenome genes to decide whether to use local_identifiers or ppanggolin generated gene identifiers. If the local identifiers are unique within the pangenome they are picked, otherwise ppanggolin ones are used. 
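The rule stated in that docstring boils down to a uniqueness test over local identifiers. Something along these lines, as a sketch rather than the function's actual body:

    def local_identifiers_are_usable(genes) -> bool:
        # Local identifiers are kept only if every gene has one and none is repeated.
        seen = set()
        for gene in genes:
            if gene.local_identifier == "" or gene.local_identifier in seen:
                return False
            seen.add(gene.local_identifier)
        return True
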
@@ -467,15 +469,15 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p pangenome.parameters["annotation"]["read_annotations_from_file"] = True -def get_gene_sequences_from_fastas(pangenome, fasta_file): +def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]): """ Get gene sequences from fastas - :param pangenome: input pangenome - :param fasta_file: list of fasta file + :param pangenome: Input pangenome + :param fasta_files: list of fasta file """ fasta_dict = {} - for line in read_compressed_or_not(fasta_file): + for line in read_compressed_or_not(fasta_files): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: logging.getLogger("PPanGGOLiN").error("No tabulation separator found in organisms file") @@ -483,15 +485,15 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): try: org = pangenome.get_organism(elements[0]) except KeyError: - raise KeyError(f"One of the genome in your '{fasta_file}' was not found in the pan." + raise KeyError(f"One of the genome in your '{fasta_files}' was not found in the pan." f" This might mean that the genome names between your annotation file and " f"your fasta file are different.") with read_compressed_or_not(elements[1]) as currFastaFile: fasta_dict[org], _ = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): - missing = pangenome.number_of_organisms() - len(set(pangenome.organisms) & set(fasta_dict.keys())) + missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. " - f"{missing} are missing (out of {pangenome.number_of_organisms()}).") + f"{missing} are missing (out of {pangenome.number_of_organisms}).") for org in pangenome.organisms: for contig in org.contigs: @@ -509,7 +511,7 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): pangenome.status["geneSequences"] = "Computed" -def launch_annotate_organism(pack: tuple) -> Organism: +def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism: """ Allow to launch in multiprocessing the genome annotation :param pack: Pack of argument for annotate_organism function @@ -592,12 +594,16 @@ def launch(args: argparse.Namespace): if args.fasta: get_gene_sequences_from_fastas(pangenome, args.fasta) else: - logging.getLogger("PPanGGOLiN").warning( - "You provided gff files without sequences, and you did not provide " - "fasta sequences. Thus it was not possible to get the gene sequences.") - logging.getLogger("PPanGGOLiN").warning( - "You will be able to proceed with your analysis ONLY if you provide " - "the clustering results in the next step.") + logging.getLogger("PPanGGOLiN").warning("You provided gff files without sequences, " + "and you did not provide fasta sequences. " + "Thus it was not possible to get the gene sequences.") + logging.getLogger("PPanGGOLiN").warning("You will be able to proceed with your analysis " + "ONLY if you provide the clustering results in the next step.") + else: + if args.fasta: + logging.getLogger("PPanGGOLiN").warning("You provided fasta sequences " + "but your gff files were already with sequences." 
+ "PPanGGOLiN will use sequences in GFF and not from your fasta.") write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index afad1cca..6220db85 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,6 +6,7 @@ import os import tempfile from io import TextIOWrapper +from multiprocessing import Manager from subprocess import Popen, PIPE import ast from collections import defaultdict @@ -17,6 +18,10 @@ from ppanggolin.utils import is_compressed, read_compressed_or_not +manager = Manager() +contig_len = manager.dict() + + def reverse_complement(seq: str): """reverse complement the given dna sequence @@ -79,6 +84,8 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str :return: Annotated genes in a list of gene objects """ + global contig_len + locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}") @@ -89,9 +96,13 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str header = "" for line in p.communicate()[0].decode().split("\n"): if line.startswith("# Sequence Data: "): + length = None for data in line.split(";"): - if data.startswith("seqhdr"): + if data.startswith("seqlen"): + length = int(data.split("=")[1]) + elif data.startswith("seqhdr"): header = data.split("=")[1].replace('"', "").split()[0] + contig_len[header] = length elif line.startswith(">"): c += 1 @@ -330,11 +341,12 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: except KeyError: contig = Contig(contig_name, True if contig_name in circular_contigs else False) org.add(contig) + contig.length = contig_len[contig.name] for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) if isinstance(gene, Gene): - contig[gene.start] = gene + contig.add(gene) elif isinstance(gene, RNA): contig.add_rna(gene) return org From 3e4f6a0f6a4c89d5594cd02898a6529dd17cd11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 09:53:16 +0200 Subject: [PATCH 67/75] Write and read contig length --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 1 + ppanggolin/formats/writeAnnotations.py | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index e9a3656e..ea25a270 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.179 +1.2.180 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 215ad7a3..1b323599 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -407,6 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: organism = pangenome.get_organism(contig2organism[contig_name]) contig = organism.get(contig_name) contig.is_circular = row["is_circular"] + contig.length = row["length"] try: gene = Gene(row["gene"].decode()) except ValueError: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index d74a97a3..a8c0c6f1 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -81,7 +81,7 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G def contig_desc(contig_len: int, max_gene_id_len: int, - max_rna_id_len: int) -> Dict[str, 
Union[tables.StringCol, tables.BoolCol]]: + max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: """Table description to save contig-related information :param contig_len: Maximum size of contig name @@ -92,12 +92,13 @@ def contig_desc(contig_len: int, max_gene_id_len: int, """ return {'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), + 'length': tables.UInt32Col(), "gene": tables.StringCol(itemsize=max_gene_id_len), "rna": tables.StringCol(itemsize=max_rna_id_len)} def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, - contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol]], + contig_desc: Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]], disable_bar=False): """Write contigs information in the pangenome file :param pangenome: Annotated pangenome object @@ -115,6 +116,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro for index, gene in enumerate(contig.genes): contig_row["name"] = contig.name contig_row["is_circular"] = contig.is_circular + contig_row["length"] = len(contig) contig_row["gene"] = gene.ID if index < len(rna_list): contig_row["rna"] = rna_list[index].ID From aa4e308754bca3594af647d1c5c79f72cd622467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 10:54:03 +0200 Subject: [PATCH 68/75] Remove manager dict to save contig length --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 5 ++-- ppanggolin/annotate/synta.py | 44 +++++++++++---------------------- 3 files changed, 17 insertions(+), 34 deletions(-) diff --git a/VERSION b/VERSION index ea25a270..177b0d1a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.180 +1.2.181 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index baad0c20..6c5ca044 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -113,7 +113,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li while len(lines) != 0: line = lines.pop() # beginning of contig - contig = None is_circ = False contig_id = None if line.startswith('LOCUS'): @@ -351,7 +350,7 @@ def get_id_attribute(attributes_dict: dict) -> str: # GET THE FASTA SEQUENCES OF THE GENES if has_fasta and fasta_string != "": - contig_sequences, _ = read_fasta(org, fasta_string.split('\n')) # _ is total contig length + contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) @@ -489,7 +488,7 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path] f" This might mean that the genome names between your annotation file and " f"your fasta file are different.") with read_compressed_or_not(elements[1]) as currFastaFile: - fasta_dict[org], _ = read_fasta(org, currFastaFile) + fasta_dict[org] = read_fasta(org, currFastaFile) if set(pangenome.organisms) > set(fasta_dict.keys()): missing = pangenome.number_of_organisms - len(set(pangenome.organisms) & set(fasta_dict.keys())) raise Exception(f"Not all of your pangenome organisms are present within the provided fasta file. 
" diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 6220db85..677da87d 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -6,11 +6,10 @@ import os import tempfile from io import TextIOWrapper -from multiprocessing import Manager from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Union +from typing import Dict, List, Union from pathlib import Path # local libraries @@ -18,10 +17,6 @@ from ppanggolin.utils import is_compressed, read_compressed_or_not -manager = Manager() -contig_len = manager.dict() - - def reverse_complement(seq: str): """reverse complement the given dna sequence @@ -84,7 +79,6 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str :return: Annotated genes in a list of gene objects """ - global contig_len locustag = org.name cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"])) @@ -96,13 +90,9 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str header = "" for line in p.communicate()[0].decode().split("\n"): if line.startswith("# Sequence Data: "): - length = None for data in line.split(";"): - if data.startswith("seqlen"): - length = int(data.split("=")[1]) - elif data.startswith("seqhdr"): + if data.startswith("seqhdr"): header = data.split("=")[1].replace('"', "").split()[0] - contig_len[header] = length elif line.startswith(">"): c += 1 @@ -162,7 +152,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " return gene_objs -def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, int): +def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, str]: """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. :param org: Organism corresponding to fasta file @@ -173,24 +163,21 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in try: contigs = {} contig_seq = "" - all_contig_len = 0 contig = None for line in fna_file: if line.startswith('>'): if len(contig_seq) >= 1: # contig filter = 1 contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) contig_seq = "" - try: - contig = org.get(line.split()[0][1:]) - except KeyError: - contig = Contig(line.split()[0][1:]) - org.add(contig) + contig = Contig(line.split()[0][1:]) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig contigs[contig.name] = contig_seq.upper() - all_contig_len += len(contig_seq) + contig.length = len(contig_seq) + except AttributeError as e: raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " @@ -198,7 +185,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> (dict, in except Exception: # To manage other exception which can occur raise Exception("Unexpected error. 
Please check your input file and if everything looks fine, " "please post an issue on our github") - return contigs, all_contig_len + return contigs def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper: @@ -301,7 +288,7 @@ def get_dna_sequence(contig_seq: str, gene: Gene) -> str: return reverse_complement(contig_seq[gene.start - 1:gene.stop]) -def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: str, +def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", allow_overlap: bool = False, procedure: str = None) -> Organism: """ @@ -323,10 +310,11 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: fasta_file = read_compressed_or_not(file_name) - contig_sequences, all_contig_len = read_fasta(org, fasta_file) + contig_sequences = read_fasta(org, fasta_file) if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user + all_contig_len = sum(len(contig) for contig in org.contigs) logging.getLogger("PPanGGOLiN").debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" @@ -336,12 +324,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs, tmpdir: genes = overlap_filter(genes, allow_overlap=allow_overlap) for contig_name, genes in genes.items(): - try: - contig = org.get(contig_name) - except KeyError: - contig = Contig(contig_name, True if contig_name in circular_contigs else False) - org.add(contig) - contig.length = contig_len[contig.name] + contig = org.get(contig_name) + contig.is_circular = True if contig.name in circular_contigs else False for gene in genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) gene.fill_parents(org, contig) From cb8fecaaf55970a2b7ec51b8ebfdad524489ca19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 10:58:02 +0200 Subject: [PATCH 69/75] Fix bug to read annotation --- VERSION | 2 +- ppanggolin/annotate/synta.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/VERSION b/VERSION index 177b0d1a..12b3f290 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.181 +1.2.182 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 677da87d..c21dbdfd 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -170,8 +170,11 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, contigs[contig.name] = contig_seq.upper() contig.length = len(contig_seq) contig_seq = "" - contig = Contig(line.split()[0][1:]) - org.add(contig) + try: + contig = org.get(line.split()[0][1:]) + except KeyError: + contig = Contig(line.split()[0][1:]) + org.add(contig) else: contig_seq += line.strip() if len(contig_seq) >= 1: # processing the last contig From 40c3687db48333d7edb155a10fea621939f9e435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 11:32:30 +0200 Subject: [PATCH 70/75] Fix bug to read contig len --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 12b3f290..2870b30c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.182 +1.2.183 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py 
index 1b323599..46cf1e1f 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -407,7 +407,7 @@ def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: organism = pangenome.get_organism(contig2organism[contig_name]) contig = organism.get(contig_name) contig.is_circular = row["is_circular"] - contig.length = row["length"] + contig.length = int(row["length"]) try: gene = Gene(row["gene"].decode()) except ValueError: From 1a04c77ae61f3dd7d9e5aebb21bf84cb4e67773d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 8 Sep 2023 15:32:03 +0200 Subject: [PATCH 71/75] Remove GCF_001183765 organism in GBFF list to prevent gene overlap in contig --- VERSION | 2 +- testingDataset/organisms.gbff.list | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 2870b30c..9df9b3c2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.183 +1.2.184 diff --git a/testingDataset/organisms.gbff.list b/testingDataset/organisms.gbff.list index 093d4beb..d7fb3bf5 100644 --- a/testingDataset/organisms.gbff.list +++ b/testingDataset/organisms.gbff.list @@ -21,7 +21,6 @@ GCF_000472205.1 GBFF/GCF_000472205.1_E_CS88_f__genomic.gbff.gz GCF_000590575.1 GBFF/GCF_000590575.1_ASM59057v1_genomic.gbff.gz GCF_000590635.1 GBFF/GCF_000590635.1_ASM59063v1_genomic.gbff.gz GCF_000590695.1 GBFF/GCF_000590695.1_ASM59069v1_genomic.gbff.gz -GCF_001183765.1 GBFF/GCF_001183765.1_ASM118376v1_genomic.gbff.gz GCF_001183805.1 GBFF/GCF_001183805.1_ASM118380v1_genomic.gbff.gz GCF_001183825.1 GBFF/GCF_001183825.1_ASM118382v1_genomic.gbff.gz GCF_001183845.1 GBFF/GCF_001183845.1_ASM118384v1_genomic.gbff.gz From b4768ae4fbb415d1b767844dd707437d8b0f0858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 14 Sep 2023 17:08:33 +0200 Subject: [PATCH 72/75] Change format of table more relational DB --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 133 ++++++++++++------------- ppanggolin/formats/writeAnnotations.py | 67 +++++-------- ppanggolin/pangenome.py | 34 ++++++- 4 files changed, 121 insertions(+), 115 deletions(-) diff --git a/VERSION b/VERSION index 9df9b3c2..7844733d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.184 +1.2.185 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 46cf1e1f..976499c1 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -384,81 +384,80 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal pangenome.add_module(module) pangenome.status["modules"] = "Loaded" -def read_organisms(pangenome: Pangenome, annotations: tables.Group, chunk_size: int = 20000, - disable_bar: bool = False) -> Tuple[Dict[str, Gene], Dict[str, RNA]]: - table = annotations.genomes +def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): contig2organism = {} for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): - try: - organism = pangenome.get_organism(row["name"].decode()) - except: - organism = Organism(row["name"].decode()) - pangenome.add_organism(organism) - contig = Contig(name=row["contig"].decode()) - organism.add(contig) - contig2organism[contig.name] = organism.name - table = annotations.contigs - contig_name = None - genes_dict = {} - rna_dict = {} + organism = Organism(row["name"].decode()) + pangenome.add_organism(organism) + + +def read_contigs(pangenome: 
Pangenome, table: tables.Table, chunk_size: int = 20000, + disable_bar: bool = False): for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): - if contig_name != row["name"].decode(): - contig_name = row["name"].decode() - organism = pangenome.get_organism(contig2organism[contig_name]) - contig = organism.get(contig_name) - contig.is_circular = row["is_circular"] - contig.length = int(row["length"]) + contig = Contig(name=row["name"].decode()) + contig.is_circular = row["is_circular"] + contig.length = int(row["length"]) try: - gene = Gene(row["gene"].decode()) - except ValueError: - pass - else: - gene.fill_parents(organism, contig) - if row["gene"].decode() in genes_dict: - logging.getLogger().warning("A gene with the same ID already pass. " - "It could be a problem in the number of genes") - genes_dict[gene.ID] = gene - try: - rna = RNA(row["rna"].decode()) - except ValueError: + organism = pangenome.get_organism(row["organism"].decode()) + except KeyError: pass else: - rna_dict[rna.ID] = rna - rna.fill_parents(organism, contig) - return genes_dict, rna_dict + organism.add(contig) def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - gene_dict: Dict[str, Gene], chunk_size: int = 20000, disable_bar: bool = False): + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read genes in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Genes table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): - gene = gene_dict[row["ID"].decode()] + gene = Gene(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] try: local = row["local"].decode() except ValueError: local = "" gene.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, - gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, - genetic_code=genedata.genetic_code, product=genedata.product, - local_identifier=local) + gene_type=genedata.gene_type, name=genedata.name, position=genedata.position, + genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local) gene.is_fragment = row["is_fragment"] - if gene.contig is not None: - gene.contig.add(gene) + if link: + contig = pangenome.get_contig(row["contig"].decode()) + gene.fill_parents(contig.organism, contig) + contig.add(gene) def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata], - rna_dict: Dict[str, RNA], chunk_size: int = 20000, disable_bar: bool = False): + link: bool = True, chunk_size: int = 20000, disable_bar: bool = False): + """Read RNAs in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: RNAs table + :param genedata_dict: Dictionary to link genedata with gene + :param link: Allow to link gene to organism and contig + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="gene", disable=disable_bar): - rna = rna_dict[row["ID"].decode()] + rna = RNA(row["ID"].decode()) genedata = genedata_dict[row["genedata_id"]] 
rna.fill_annotations(start=genedata.start, stop=genedata.stop, strand=genedata.strand, gene_type=genedata.gene_type, name=genedata.name, product=genedata.product) - if rna.contig is not None: - rna.contig.add_rna(rna) + if link: + contig = pangenome.get_contig(row["contig"].decode()) + rna.fill_parents(contig.organism, contig) + contig.add_rna(rna) -def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_genes: bool = True, - load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): +def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True, + load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False): """ Read annotation in pangenome hdf5 file to add in pangenome object @@ -467,30 +466,20 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool :param disable_bar: Disable the progress bar """ annotations = h5f.root.annotations - + genedata_dict = None if load_organisms: - if load_genes: - genedata_dict = read_genedata(h5f) - if load_rnas: - gene_dict, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) - read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) - else: - gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) - else: - if load_rnas: - genedata_dict = read_genedata(h5f) - _, rna_dict = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_rnas(pangenome, annotations.RNAs, genedata_dict, rna_dict, disable_bar=disable_bar) - else: - if load_genes: - if pangenome.status["genesClustered"] not in ["Loaded", "Computed"]: - raise Exception("Genes must be linked to gene families or organisms, but none are laoded") - gene_dict = {gene.ID: gene for gene in pangenome.genes} # Dictionary with genes in families - gene_dict, _ = read_organisms(pangenome, annotations, disable_bar=disable_bar) - read_genes(pangenome, annotations.genes, genedata_dict, gene_dict, disable_bar=disable_bar) + read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar) + + if load_contigs: + read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar) + + if load_genes: + genedata_dict = read_genedata(h5f) + read_genes(pangenome, annotations.genes, genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) + if load_rnas: + read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict, + all([load_organisms, load_contigs]), disable_bar=disable_bar) pangenome.status["genomesAnnotated"] = "Loaded" diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index a8c0c6f1..821fd513 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -45,17 +45,15 @@ def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, i return max_org_len, max_contig_len, max_gene_id_len, max_rna_id_len, max_gene_local_id -def organism_desc(org_len: int, contig_len: int) -> Dict[str, tables.StringCol]: +def organism_desc(org_len: int) -> Dict[str, tables.StringCol]: """ Table description to save organism-related information :param org_len: Maximum size of organism name. 
- :param contig_len: Maximum size of contigs name :return: Formatted table """ - return {'name': tables.StringCol(itemsize=org_len), - "contig": tables.StringCol(itemsize=contig_len)} + return {'name': tables.StringCol(itemsize=org_len)} def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -73,28 +71,23 @@ def write_organisms(pangenome: Pangenome, h5f: tables.File, annotation: tables.G logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_organisms} genomes") organism_row = organism_table.row for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genome", disable=disable_bar): - for contig in org.contigs: - organism_row["name"] = org.name - organism_row["contig"] = contig.name - organism_row.append() + organism_row["name"] = org.name + organism_row.append() organism_table.flush() -def contig_desc(contig_len: int, max_gene_id_len: int, - max_rna_id_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: +def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringCol, tables.BoolCol, tables.UInt32Col]]: """Table description to save contig-related information :param contig_len: Maximum size of contig name - :param max_gene_id_len: Maximum size of gene name - :param max_rna_id_len: Maximum size of rna name + :param org_len: Maximum size of organism name. :return: Formatted table """ return {'name': tables.StringCol(itemsize=contig_len), "is_circular": tables.BoolCol(dflt=False), 'length': tables.UInt32Col(), - "gene": tables.StringCol(itemsize=max_gene_id_len), - "rna": tables.StringCol(itemsize=max_rna_id_len)} + "organism": tables.StringCol(itemsize=org_len)} def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -111,40 +104,28 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs") contig_row = contig_table.row for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar): - if contig.number_of_genes >= contig.number_of_rnas: - rna_list = list(contig.RNAs) - for index, gene in enumerate(contig.genes): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["length"] = len(contig) - contig_row["gene"] = gene.ID - if index < len(rna_list): - contig_row["rna"] = rna_list[index].ID - contig_row.append() - else: - gene_list = list(contig.genes) - for index, rna in enumerate(contig.RNAs): - contig_row["name"] = contig.name - contig_row["is_circular"] = contig.is_circular - contig_row["rna"] = rna.ID - if index < len(gene_list): - contig_row["gene"] = gene_list[index].ID - contig_row.append() + contig_row["name"] = contig.name + contig_row["is_circular"] = contig.is_circular + contig_row["length"] = len(contig) + contig_row["organism"] = contig.organism.name + contig_row.append() contig_table.flush() -def gene_desc(id_len, max_local_id) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: +def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]: """Table description to save gene-related information :param id_len: Maximum size of gene name :param max_local_id: Maximum size of gene local identifier + :param max_contig_len: Maximum size of contig identifier :return: Formatted table """ return {'ID': tables.StringCol(itemsize=id_len), 
'genedata_id': tables.UInt32Col(), 'local': tables.StringCol(itemsize=max_local_id), - 'is_fragment': tables.BoolCol(dflt=False)} + 'is_fragment': tables.BoolCol(dflt=False), + 'contig': tables.StringCol(itemsize=max_contig_len)} def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -169,6 +150,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou gene_row["ID"] = gene.ID gene_row["is_fragment"] = gene.is_fragment gene_row["local"] = gene.local_identifier + gene_row["contig"] = gene.contig.name genedata = get_genedata(gene) genedata_id = genedata2gene.get(genedata) if genedata_id is None: @@ -181,15 +163,17 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou return genedata2gene -def rna_desc(id_len) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: +def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]: """Table description to save rna-related information :param id_len: Maximum size of RNA identifier + :param max_contig_len: Maximum size of contig identifier :return: Formatted table """ return {'ID': tables.StringCol(itemsize=id_len), - 'genedata_id': tables.UInt32Col()} + 'genedata_id': tables.UInt32Col(), + 'contig': tables.StringCol(itemsize=max_contig_len)} def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, @@ -212,6 +196,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group rna_row = rna_table.row for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar): rna_row["ID"] = rna.ID + rna_row["contig"] = rna.contig.name genedata = get_genedata(rna) genedata_id = genedata2rna.get(genedata) if genedata_id is None: @@ -345,17 +330,17 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo # I add these boolean in case we would one day only load organism, contig or genes, without the other. if rec_organisms: - desc = organism_desc(org_len, contig_len) + desc = organism_desc(org_len) write_organisms(pangenome, h5f, annotation, desc, disable_bar) if rec_contigs: - desc = contig_desc(contig_len, gene_id_len, rna_id_len) + desc = contig_desc(contig_len, org_len) write_contigs(pangenome, h5f, annotation, desc, disable_bar) if rec_genes: - desc = gene_desc(gene_id_len, gene_local_id) + desc = gene_desc(gene_id_len, gene_local_id, contig_len) genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar) if rec_rnas: - desc = rna_desc(rna_id_len) + desc = rna_desc(rna_id_len, contig_len) genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar) write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ef37bb4b..b5bf7c88 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -38,7 +38,6 @@ def __init__(self): self._regionGetter = {} self._spotGetter = {} self._moduleGetter = {} - self.status = { 'genomesAnnotated': "No", 'geneSequences': "No", @@ -312,6 +311,39 @@ def number_of_contigs(self) -> int: """ return sum(len(org) for org in self.organisms) + def _mk_contig_getter(self): + """ + Builds the attribute _contig_getter of the pangenome + + Since the genes are never explicitly 'added' to a pangenome (but rather to an organism), + the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. 
+        If at some point we want to extract a contig from a pangenome, we'll create a contig_getter.
+        The assumption behind this is that the pangenome has been filled and no more contigs will be added.
+        """
+        self._contig_getter = {}
+        for contig in self.contigs:
+            self._contig_getter[contig.name] = contig
+
+    def get_contig(self, name: str) -> Contig:
+        """Returns the contig that has the given name
+
+        :param name: The name of the contig to look for
+
+        :return: Returns the wanted contig
+
+        :raises AssertionError: If the contig `name` is not a string
+        :raises KeyError: If the `name` is not in the pangenome
+        """
+        assert isinstance(name, str), "Contig name should be a string"
+
+        try:
+            return self._contig_getter[name]
+        except AttributeError:
+            # in that case, either the contig getter has not been computed, or the contig name is not in the pangenome.
+            self._mk_contig_getter()  # make it
+            return self.get_contig(name)  # Return what was expected. If the name does not exist it will raise an error.
+        except KeyError:
+            raise KeyError(f"Contig: {name}, does not exist in the pangenome.")
+
     def get_organism(self, name: str) -> Organism:
         """
         Get an organism that is expected to be in the pangenome using its name, which is supposedly unique.

From f4e6648a20e01edbd847adbc7d5f6f74e8e82d75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?=
Date: Fri, 15 Sep 2023 10:37:56 +0200
Subject: [PATCH 73/75] Add documentation

---
 VERSION                                |  2 +-
 ppanggolin/formats/readBinaries.py     | 49 ++++++++++++++++++++------
 ppanggolin/formats/writeAnnotations.py | 12 ++++---
 3 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/VERSION b/VERSION
index 7844733d..aaf4dca1 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.185
+1.2.186
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 976499c1..dda50046 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -25,19 +25,21 @@ class Genedata:
     """
     This is a general class storing unique gene-related data to be written in a specific genedata table
-
-    :param start: Gene start position
-    :param stop: Gene stop position
-    :param strand: Associated strand
-    :param gene_type: Gene type
-    :param position: Position of the gene on its contig
-    :param name: Name of the feature
-    :param product: Associated product
-    :param genetic_code: associated genetic code, if any
     """
 
     def __init__(self, start: int, stop: int, strand: str, gene_type: str, position: int, name: str, product: str,
                  genetic_code: int):
+        """Constructor method
+
+        :param start: Gene start position
+        :param stop: Gene stop position
+        :param strand: Associated strand
+        :param gene_type: Gene type
+        :param position: Position of the gene on its contig
+        :param name: Name of the feature
+        :param product: Associated product
+        :param genetic_code: associated genetic code, if any
+        """
         self.start = start
         self.stop = stop
         self.strand = strand
@@ -47,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position:
         self.product = product
         self.genetic_code = genetic_code
 
-    def __eq__(self, other):
+    def __eq__(self, other: Genedata):
         return self.start == other.start \
                and self.stop == other.stop \
                and self.strand == other.strand \
@@ -166,7 +168,9 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000):
 
 def read_genedata(h5f: tables.File) -> Dict[int, Genedata]:
     """
     Reads the genedata table and returns a genedata_id2genedata dictionnary
+    :param h5f: the hdf5 file handler
+    :return: dictionnary linking
genedata to the genedata identifier """ table = h5f.root.annotations.genedata @@ -386,6 +390,13 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, disable_bar: bool = False): + """Read organism table in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Organism table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ contig2organism = {} for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar): organism = Organism(row["name"].decode()) @@ -394,6 +405,13 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000, disable_bar: bool = False): + """Read contig table in pangenome file to add them to the pangenome object + + :param pangenome: Pangenome object + :param table: Contig table + :param chunk_size: Size of the chunck reading + :param disable_bar: Disable progress bar + """ for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar): contig = Contig(name=row["name"].decode()) contig.is_circular = row["is_circular"] @@ -563,7 +581,16 @@ def read_modules_info(h5f: tables.File): f"\t\t\t- mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}") -def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, sources: List[str] = None, disable_bar: bool = False): +def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str, + sources: List[str] = None, disable_bar: bool = False): + """Read metadata to add them to the pangenome object + + :param pangenome: Pangenome object + :param h5f: Pangenome file + :param metatype: Object type to associate metadata + :param sources: Source name of metadata + :param disable_bar: Disable progress bar + """ metadata_group = h5f.root.metadata._f_get_child(metatype) for source in sources: source_table = metadata_group._f_get_child(source) diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index 821fd513..b09e4166 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -209,7 +209,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group return genedata2rna -def genedata_desc(type_len, name_len, product_len): +def genedata_desc(type_len: int, name_len: int, product_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]: """ Creates a table for gene-related data @@ -311,8 +311,8 @@ def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.G genedata_table.flush() -def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, - rec_contigs: bool = True, rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): +def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: bool = True, rec_contigs: bool = True, + rec_genes: bool = True, rec_rnas: bool = True, disable_bar: bool = False): """Function writing all the pangenome annotations :param pangenome: Annotated pangenome @@ -361,11 +361,13 @@ def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: return max_gene_id_len, max_gene_type -def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: +def gene_sequences_desc(gene_id_len: int, 
gene_type_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]:
     """
     Create table to save gene sequences
 
+    :param gene_id_len: Maximum size of gene sequence identifier
     :param gene_type_len: Maximum size of gene type
+    :return: Formated table
     """
     return {
@@ -388,7 +390,7 @@ def get_sequence_len(pangenome: Pangenome) -> int:
     return max_seq_len
 
 
-def sequence_desc(max_seq_len: int) -> dict:
+def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.StringCol]]:
     """
     Table description to save sequences
 
     :param max_seq_len: Maximum size of gene type

From e992a17308e1a4e9d2353a8cdf62109ce1fe9a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?=
Date: Fri, 15 Sep 2023 10:57:36 +0200
Subject: [PATCH 74/75] Remove Genedata in type for __eq__

---
 VERSION                            | 2 +-
 ppanggolin/formats/readBinaries.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/VERSION b/VERSION
index aaf4dca1..0da01cf8 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.186
+1.2.187
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index dda50046..968c1f2c 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -49,7 +49,7 @@ def __init__(self, start: int, stop: int, strand: str, gene_type: str, position:
         self.product = product
         self.genetic_code = genetic_code
 
-    def __eq__(self, other: Genedata):
+    def __eq__(self, other):
         return self.start == other.start \
                and self.stop == other.stop \
                and self.strand == other.strand \

From c2e8ef747b40a2cbd85d787f1fdc757e6c8fb11b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= <39793176+jpjarnoux@users.noreply.github.com>
Date: Tue, 19 Sep 2023 11:00:18 +0200
Subject: [PATCH 75/75] Update writeFlat.py

Fix completness computation
---
 ppanggolin/formats/writeFlat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py
index c9a12ef3..4a9926c6 100644
--- a/ppanggolin/formats/writeFlat.py
+++ b/ppanggolin/formats/writeFlat.py
@@ -530,7 +530,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05,
                 nb_gene_core += 1
         completeness = "NA"
         if len(single_copy_markers) > 0:
-            completeness = round(((org.number_of_families() + len(single_copy_markers)) /
+            completeness = round((len(set(org.families) & single_copy_markers) /
                                   len(single_copy_markers)) * 100, 2)
         outfile.write("\t".join(map(str, [org.name, org.number_of_families(),