diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e0f59001..9297851c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,7 +67,7 @@ jobs: ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05 - ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 + ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log @@ -134,10 +134,10 @@ jobs: run: | cd testingDataset head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list - ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list + ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \ --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \ - --spot_graph --graph_formats graphml --fast --keep_tmp -f + --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 28a6bc83..e95852ac 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -278,10 +278,10 @@ def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq partition_proj = output.absolute() / "sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for input_seq, pangFam in seqid_to_gene_family.items(): - partProjFile.write(input_seq + "\t" + pangFam.named_partition + "\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): - partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. + for input_seq, gene_fam in seqid_to_gene_family.items(): + partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n") + for remaining_seq in seq_set - seqid_to_gene_family.keys(): + partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. return partition_proj @@ -298,11 +298,11 @@ def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_s gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv" with open(gene_fam_map_file, "w") as partProjFile: - for input_seq, pangFam in seqid_to_gene_family.items(): - partProjFile.write(f"{input_seq}\t{pangFam.name}\n") + for input_seq, gene_fam in seqid_to_gene_family.items(): + partProjFile.write(f"{input_seq}\t{gene_fam.name}\n") - for remainingSeq in seq_set - seqid_to_gene_family.keys(): - partProjFile.write(f"{remainingSeq}\t{remainingSeq}\n") # if there is no hit, gene family is itself. + for remaining_seq in seq_set - seqid_to_gene_family.keys(): + partProjFile.write(f"{remaining_seq}\t{remaining_seq}\n") # if there is no hit, gene family is itself. return gene_fam_map_file @@ -517,9 +517,8 @@ def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Itera def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8, coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, - use_representatives: bool = False, - draw_related: bool = False, translation_table: int = 11, tmpdir: Path = None, - disable_bar: bool = False, keep_tmp=False): + use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11, + tmpdir: Path = None, disable_bar: bool = False, keep_tmp=False): """ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2. diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 5d4c5624..3a2b7680 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -238,6 +238,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li while not line.startswith('//'): sequence += line[10:].replace(" ", "").strip().upper() line = lines.pop() + + if contig.length != len(sequence): + raise ValueError("The contig lenght defined is different than the sequence length") # get each gene's sequence. for gene in contig.genes: gene.add_sequence(get_dna_sequence(sequence, gene)) @@ -315,7 +318,7 @@ def get_id_attribute(attributes_dict: dict) -> str: True if fields[1] in circular_contigs else False) contig_counter.value += 1 org.add(contig) - contig.length = int(fields[-1]) - int(fields[3]) + 1 + contig.length = int(fields[-1]) - int(fields[2]) + 1 continue elif line.startswith('#'): # comment lines to be ignores by parsers @@ -368,6 +371,9 @@ def get_id_attribute(attributes_dict: dict) -> str: if has_fasta and fasta_string != "": contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: + if contig.length != len(contig_sequences[contig.name]): + raise ValueError("The contig lenght defined is different than the sequence length") + for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) for rna in contig.RNAs: diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 8dbe7f2d..18e512ab 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -169,7 +169,6 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, :return: Dictionnary with contig_name as keys and contig sequence in values """ global contig_counter - try: contigs = {} contig_seq = "" diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 4382ba43..4ed3ad2e 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -309,8 +309,6 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) tmp_path = Path(newtmpdir.name) - # newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - # tmp_path = Path(newtmpdir.name) with open(tmp_path/'nucleotid_sequences', "w") as sequence_file: check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...") diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index fc3ded05..535bc9be 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -9,7 +9,7 @@ import logging import os from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set -from itertools import zip_longest, chain +from itertools import chain from collections import defaultdict from pathlib import Path @@ -21,7 +21,7 @@ # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.genome import Gene, Contig -from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not +from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \ get_input_seq_to_family_with_all, get_seq_ids @@ -445,63 +445,6 @@ def get_n_next_genes_index(current_index: int, next_genes_count: int, yield next_gene_index -def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, - is_circular: bool = False): - """ - Extracts contiguous windows around positions of interest within a contig. - - :param contig_size: Number of genes in contig. - :param positions_of_interest: An iterable containing the positions of interest. - :param window_size: The size of the window to extract around each position of interest. - :param is_circular: Indicates if the contig is circular. - :return: Yields tuples representing the start and end positions of each contiguous window. - """ - windows_coordinates = [] - - # Sort the positions of interest - sorted_positions = sorted(positions_of_interest) - - # Check if any position of interest is out of range - if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: - raise IndexError(f'Positions of interest are out of range. ' - f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") - - if is_circular: - first_position = sorted_positions[0] - last_position = sorted_positions[-1] - # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig - # an out of scope position is added to the sorted positions to take into account those positions - # the returned window are always checked that its positions are not out of range... - # so there's no chance to find an out of scope position in final list - if first_position - window_size < 0: - out_of_scope_position = contig_size + first_position - sorted_positions.append(out_of_scope_position) - - if last_position + window_size >= contig_size: - out_of_scope_position = last_position - contig_size - sorted_positions.insert(0, out_of_scope_position) - - start_po = max(sorted_positions[0] - window_size, 0) - - for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): - - if next_po is None: - # If there are no more positions, add the final window - end_po = min(position + window_size, contig_size - 1) - windows_coordinates.append((start_po, end_po)) - - elif position + window_size + 1 < next_po - window_size: - # If there is a gap between positions, add the current window - # and update the start position for the next window - end_po = min(position + window_size, contig_size - 1) - - windows_coordinates.append((start_po, end_po)) - - start_po = max(next_po - window_size, 0) - - return windows_coordinates - - def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]: """ Group genes from specified gene families by contig. diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index f675682b..d40b38c3 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -647,10 +647,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar: need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar) if spot_list == 'all' or any(x == 'all' for x in spot_list): - logging.getLogger("PPanGGOLiN").debug(f"'all' value is found in spot list, all spots are drawn.") + logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.") selected_spots = list(pangenome.spots) elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list): - logging.getLogger().debug(f"'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") + logging.getLogger().debug("'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.") selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1] else: curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list} diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 762d978d..e0ff3f14 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -173,7 +173,7 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di shapes=shapes, plot_bgcolor='#ffffff') logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...") - #fig = go.Figure(data=[heatmap], layout=layout) + fig = go.Figure(data=[heatmap]) fig.add_trace(go.Scatter(x=dendro_org['icoord'], diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index f653ccf9..e12170dd 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -5,20 +5,32 @@ import argparse import logging from multiprocessing import get_context +from itertools import combinations from collections import Counter, defaultdict +import logging +from typing import TextIO,List, Dict from pathlib import Path from typing import TextIO from importlib.metadata import distribution from statistics import median, mean, stdev import os +import random + + +import networkx as nx +from plotly.express.colors import qualitative + # local libraries from ppanggolin.edge import Edge from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Organism +from ppanggolin.genome import Organism, Gene, Contig, RNA +from ppanggolin.region import Region, Spot, Module from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float +from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, extract_contig_window, parse_input_paths_file from ppanggolin.formats.readBinaries import check_pangenome_info +from ppanggolin.formats.write_proksee import write_proksee_organism +from ppanggolin.formats.writeSequences import read_genome_file, write_spaced_fasta # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? @@ -622,6 +634,292 @@ def write_projections(output: Path, compress: bool = False): logging.getLogger("PPanGGOLiN").info("Done writing the projection files") +def write_proksee(output: Path, fasta: Path = None, anno: Path = None): + """ + Generate ProkSee data for multiple organisms and write it to the specified output directory. + + :param output: The directory where the ProkSee data will be written. + :param fasta: The path to a FASTA file containing genome sequences (optional). + :param anno: The path to an annotation file (optional). + + This function generates ProkSee data for multiple organisms and writes it to the specified output directory. + If genome sequences are provided in a FASTA file or annotations in a separate file, they will be used in the generation + of ProkSee data for each organism to add sequences data to proksee files. + """ + + proksee_outdir = output / "proksee" + mk_outdir(proksee_outdir, True) + + organisms_file = fasta if fasta is not None else anno + + if organisms_file: + org_dict = parse_input_paths_file(organisms_file) + + org_to_modules = defaultdict(set) + + # Create a mapping of organisms to the modules they belong to + for mod in pan.modules: + for org in mod.organisms: + org_to_modules[org].add(mod) + + # Generate a color mapping for modules + module_to_colors = manage_module_colors(list(pan.modules)) + + features = ["all"] + + for organism in pan.organisms: + if organisms_file: + genome_sequences = read_genome_file(org_dict[organism.name]['path'], organism) + else: + genome_sequences = None + + # Generate a color mapping for modules specific to the organism + org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]} + + output_file = proksee_outdir / f"{organism.name}.json" + + # Write ProkSee data for the organism + write_proksee_organism(organism, output_file, features=features, module_to_colors=org_module_to_color, rgps=pan.regions, + genome_sequences=genome_sequences) + + + logging.getLogger().info("Done writing the proksee files") + +def manage_module_colors(modules: List[Module], window_size:int=50) -> Dict[Module, str]: + """ + Manages colors for a list of modules based on gene positions and a specified window size. + + :param modules: A list of module objects for which you want to determine colors. + :param window_size: Minimum number of genes between two modules to color them with the same color. + A higher value results in more module colors. + :return: A dictionary that maps each module to its assigned color. + """ + + color_mod_graph = nx.Graph() + color_mod_graph.add_nodes_from((module for module in modules)) + + contig_to_mod_genes = defaultdict(set) + gene_to_module = {} + + for module in modules: + for fam in module.families: + for gene in fam.genes: + contig_to_mod_genes[gene.contig].add(gene) + gene_to_module[gene] = module + + for contig, mod_genes in contig_to_mod_genes.items(): + gene_positions = (gene.position for gene in mod_genes) + contig_windows = extract_contig_window( + contig.number_of_genes, gene_positions, window_size=window_size, is_circular=contig.is_circular + ) + contig_windows = list(contig_windows) + + for (start, end) in contig_windows: + module_in_window = {gene_to_module[gene] for gene in mod_genes if start <= gene.position <= end} + + # Add edges between closely located modules + module_edges = [(mod_a, mod_b) for mod_a, mod_b in combinations(module_in_window, 2)] + color_mod_graph.add_edges_from(module_edges) + + module_to_color_int = nx.coloring.greedy_color(color_mod_graph) + + # If you want to export the graph to see the coloring: + # nx.set_node_attributes(color_mod_graph, color_dict, name="color") + # nx.readwrite.graphml.write_graphml(color_mod_graph, f"module_graph_window_size{window_size}.graphml") + + nb_colors = len(set(module_to_color_int.values())) + logging.getLogger().debug(f"We have found that {nb_colors} colors were necessary to color Modules.") + colors = palette(nb_colors) + module_to_color = {mod: colors[col_i] for mod, col_i in module_to_color_int.items()} + + return module_to_color + +def palette(nb_colors: int) -> List[str]: + """ + Generates a palette of colors for visual representation. + + :param nb_colors: The number of colors needed in the palette. + + :return: A list of color codes in hexadecimal format. + """ + + # Combine two sets of predefined colors for variety + colors = qualitative.Vivid + qualitative.Safe + + if len(colors) < nb_colors: + # Generate random colors if not enough predefined colors are available + random.seed(1) + random_colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(nb_colors - len(colors))] + colors += random_colors + else: + colors = colors[:nb_colors] + + return colors + + +def write_gff(output: str, compress: bool = False, fasta: Path = None, anno: Path = None): + + """ + Write the gff files for all organisms + + :param output: Path to output directory + :param compress: Compress the file in .gz + :param fasta: The path to a FASTA file containing genome sequences (optional). + :param anno: The path to an annotation file (optional). + + """ + logging.getLogger().info("Writing the gff files...") + + organisms_file = fasta if fasta is not None else anno + + if organisms_file: + org_dict = parse_input_paths_file(organisms_file) + + outdir = output / "gff" + mk_outdir(outdir, True) + + if pan.parameters["annotate"]["# read_annotations_from_file"]: + annotation_sources = {"rRNA": "external", + "tRNA": "external", + "CDS":"external"} + else: + annotation_sources = {} + + contig_to_rgp = defaultdict(list) + for rgp in pan.regions: + contig_to_rgp[rgp.contig].append(rgp) + + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in pan.spots for rgp in spot.regions} + + for org in pan.organisms: + if organisms_file: + genome_sequences = read_genome_file(org_dict[org.name]['path'], org) + else: + genome_sequences = None + + write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources, genome_sequences) + + logging.getLogger().info("Done writing the gff files") + + +def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], + rgp_to_spotid: Dict[Region, str], outdir: str, compress: bool, + annotation_sources: Dict[str, str], genome_sequences:Dict[str,str]): + """ + Write the GFF file of the provided organism. + + :param org: Organism object for which the GFF file is being written. + :param contig_to_rgp: Dictionary mapping Contig objects to their corresponding Region objects. + :param rgp_to_spotid: Dictionary mapping Region objects to their corresponding spot IDs. + :param outdir: Path to the output directory where the GFF file will be written. + :param compress: If True, compress the output GFF file using .gz format. + :param annotation_sources: A dictionary that maps types of features to their source information. + :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None). + """ + + # sort contig by their name + sorted_contigs = sorted(org.contigs, key= lambda x : x.name) + + with write_compressed_or_not(outdir / F"{org.name}.gff", compress) as outfile: + # write gff header + outfile.write('##gff-version 3\n') + for contig in sorted_contigs: + if contig.length is None: + raise AttributeError(f'Contig {contig.name} has no length defined.') + + outfile.write(f'##sequence-region {contig.name} 1 {contig.length}\n') + + for contig in sorted_contigs: + contig_elements = sorted(contig_to_rgp[contig] + list(contig.genes) + list(contig.RNAs), key=lambda x: (x.start)) + + for feature in contig_elements: + + if type(feature) in [Gene, RNA]: + feat_type = feature.type + + strand = feature.strand + + source = annotation_sources.get(feat_type, "external") + + # before the CDS or RNA line a gene line is created. with the following id + parent_gene_id=f"gene-{feature.ID}" + + attributes = [("ID", feature.ID), + ("Name", feature.name), + ('Parent', parent_gene_id), + ("product", feature.product), + ] + + score = '.' + + if isinstance(feature, Gene): + rgp = feature.RGP.name if feature.RGP else "" + attributes += [ + ("Family", feature.family.name), + ("Partition", feature.family.named_partition), + ('RGP', rgp), + ('Module', ','.join((f"module_{module.ID}" for module in feature.family.modules)) ) + ] + + + + # add an extra line of type gene + + gene_line = [contig.name, + source, + 'gene', + feature.start, + feature.stop, + '.', + strand, + ".", + f'ID={parent_gene_id}' + ] + + line_str = '\t'.join(map(str, gene_line)) + outfile.write(line_str + "\n") + + elif isinstance(feature, Region): + feat_type = "region" + source = "ppanggolin" + strand = "." + score = feature.score # TODO does RGP score make sens and do we want it in gff file? + attributes = [ + ("Name", feature.name), + ("Spot", rgp_to_spotid.get(feature, "No_spot")), + ("Note", "Region of Genomic Plasticity (RGP)") + ] + + + else: + raise TypeError(f'The feature to write in gff file does not have an expected types. {type(feature)}') + + + attributes_str = ';'.join([f"{k}={v}" for k,v in attributes if v != "" and v is not None]) + + line = [contig.name, + source, # Source + feat_type, + feature.start, + feature.stop, + score, + strand, + ".", + attributes_str, + ] + + line_str = '\t'.join(map(str, line)) + outfile.write(line_str + "\n") + + if genome_sequences: + logging.getLogger("PPanGGOLiN").debug("Writing fasta section of gff file...") + outfile.write("##FASTA\n") + for contig in sorted_contigs: + outfile.write(f">{contig.name}\n") + + outfile.write(write_spaced_fasta(genome_sequences[contig.name], space=60)) + + def write_parts(output: Path, soft_core: float = 0.95): """ Write the list of gene families for each partition @@ -839,8 +1137,8 @@ def write_org_modules(output: Path, compress: bool = False): for fam in mod.families: mod_orgs |= set(fam.organisms) for org in mod_orgs: - completion = round((org.number_of_families() + len(mod)) / len(mod), 2) - fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n") + completion = len(set(org.families) & set(mod.families)) / len(mod) + fout.write(f"module_{mod.ID}\t{org.name}\t{completion:.2}\n") fout.close() logging.getLogger("PPanGGOLiN").info( f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'") @@ -926,10 +1224,10 @@ def write_rgp_modules(output: Path, compress: bool = False): def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, - light_gexf: bool = False, projection: bool = False, stats: bool = False, json: bool = False, + light_gexf: bool = False, projection: bool = False, gff: bool = False, proksee: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, - disable_bar: bool = False): + disable_bar: bool = False, fasta=None, anno=None): """ Main function to write flat files from pangenome @@ -943,6 +1241,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core :param gexf: write pangenome graph in gexf format :param light_gexf: write pangenome graph with only gene families :param projection: write projection of pangenome for organisms + :param gff: write a gff file with pangenome annotation for each organisms :param stats: write statistics about pangenome :param json: write pangenome graph in json file :param partitions: write the gene families for each partition @@ -956,7 +1255,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core :param disable_bar: Disable progress bar """ # TODO Add force parameter to check if output already exist - if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders, + if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, proksee, stats, json, partitions, regions, spots, borders, families_tsv, modules, spot_modules]): raise Exception("You did not indicate what file you wanted to write.") @@ -976,10 +1275,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core pan = pangenome if csv or gene_pa or gexf or light_gexf or projection or stats or json or partitions or regions or spots or \ - families_tsv or borders or modules or spot_modules: + families_tsv or borders or modules or spot_modules or gff or proksee: needAnnotations = True needFamilies = True - if projection or stats or partitions or regions or spots or borders: + if projection or stats or partitions or regions or spots or borders or gff or proksee: needPartitions = True if gexf or light_gexf or json: needGraph = True @@ -997,7 +1296,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core needSpots = True if modules or spot_modules: # or projection: needModules = True - if projection: + if projection or gff or proksee: needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False @@ -1018,6 +1317,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core processes.append(p.apply_async(func=write_gexf, args=(output, True, compress))) if projection: processes.append(p.apply_async(func=write_projections, args=(output, compress))) + if proksee: + processes.append(p.apply_async(func=write_proksee, args=(output, fasta, anno))) + if gff: + processes.append(p.apply_async(func=write_gff, args=(output, compress, fasta, anno))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: @@ -1054,10 +1357,10 @@ def launch(args: argparse.Namespace): global pan pan.add_file(args.pangenome) write_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv, - gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, + gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, proksee=args.proksee, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, - spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) + spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar, fasta=args.fasta, anno=args.anno) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -1103,11 +1406,19 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("--projection", required=False, action="store_true", help="a csv file for each organism providing information on the projection of the graph " "on the organism") + optional.add_argument("--gff", required=False, action="store_true", + help="Generate a gff file for each organism containing pangenome annotations.") + + optional.add_argument("--proksee", required=False, action="store_true", + help="Generate JSON map files for PROKSEE for each organism containing pangenome annotations to be used to in proksee.") + optional.add_argument("--stats", required=False, action="store_true", help="tsv files with some statistics for each organism and for each gene family") + optional.add_argument("--partitions", required=False, action="store_true", help="list of families belonging to each partition, with one file per partitions and " "one family per line") + optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") optional.add_argument("--json", required=False, action="store_true", help="Writes the graph in a json file format") optional.add_argument("--regions", required=False, action="store_true", @@ -1124,6 +1435,19 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") + context = parser.add_argument_group(title="Contextually required arguments", + description="With --proksee and -gff, the following arguments can be " + "used to add sequence information to the output file:") + + context.add_argument('--fasta', required=False, type=Path, + help="A tab-separated file listing the organism names, and the fasta filepath of its genomic " + "sequence(s) (the fastas can be compressed with gzip). One line per organism.") + + context.add_argument('--anno', required=False, type=Path, + help="A tab-separated file listing the organism names, and the gff/gbff filepath of its " + "annotations (the files can be compressed with gzip). One line per organism. " + "If this is provided, those annotations will be used.") + if __name__ == '__main__': """To test local change and allow using debugger""" from ppanggolin.utils import set_verbosity_level, add_common_arguments diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 6c7363f5..af189edd 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -237,14 +237,12 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, genome_id = "" seq = "" curr_len = 0 - dup_gene = 0 # TODO Remove ? curr_phylo_dict = {} for line in fin: if line.startswith('>'): if genome_id != "": if genome_id not in missing_genomes: - dup_gene += 1 # duplicated genes. Replacing them with gaps. curr_phylo_dict[genome_id] = "-" * curr_len else: diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index 59c3d7c4..0cb7e2cb 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -124,7 +124,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str, if isinstance(value, float) or isinstance(value, int): if attr in type_dict: if type_dict[attr] != type(value): - if type(value) == float and type_dict[attr] == int: + if isinstance(value, float) and isinstance(type_dict[attr], int): type_dict[attr] = tables.Float64Col() else: if isinstance(value, float): diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index ed5e590a..6ad32c29 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -14,12 +14,11 @@ # local libraries from ppanggolin.pangenome import Pangenome from ppanggolin.geneFamily import GeneFamily -from ppanggolin.genome import Gene +from ppanggolin.genome import Gene, Organism from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file - -module_regex = re.compile(r'^module_[0-9]+') +module_regex = re.compile(r'^module_\d+') #\d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id." @@ -191,15 +190,16 @@ def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: sequence_dict = {} seqname = "" seq = "" - z = False + in_fasta_part = False with read_compressed_or_not(file_path) as f: for line in f: if line.startswith(">"): - z = True - if z: + in_fasta_part = True + if in_fasta_part: if line.startswith('>'): if seq != "": sequence_dict[seqname] = seq + seq = "" seqname = line[1:].strip().split()[0] else: seq += line.strip() @@ -246,26 +246,33 @@ def read_fasta_gbk(file_path: Path) -> Dict[str, str]: return sequence_dict -def read_genome_file(file_dict: Dict[str, Path], genome_name: str) -> Dict[str, str]: +def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]: """ - Read the genome file associated to organism + Read the genome file associated to organism to extract sequences - :param file_dict: Dictionary given association between organism and fasta file - :param genome_name: organism name + :param genome_file: Path to a fasta file or gbff/gff file + :param genome: organism object :return: Dictionary with all sequences associated to contig """ - filetype = detect_filetype(file_dict[genome_name]) + filetype = detect_filetype(genome_file) if filetype in ["fasta", "gff"]: - return read_fasta_or_gff(file_dict[genome_name]) + contig_to_sequence = read_fasta_or_gff(genome_file) elif filetype == "gbff": - return read_fasta_gbk(file_dict[genome_name]) + contig_to_sequence = read_fasta_gbk(genome_file) else: - raise Exception(f"Unknown filetype detected: '{file_dict[genome_name]}'") + raise Exception(f"Unknown filetype detected: '{genome_file}'") + + # check_contig_names + if set(contig_to_sequence) != {contig.name for contig in organism.contigs}: + raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the " + f"information stored in the pangenome file and the contigs found in '{genome_file}'.") + return contig_to_sequence def write_spaced_fasta(sequence: str, space: int = 60) -> str: - """Write a maximum of element per line + """ + Write a maximum of element per line :param sequence: sequence to write :param space: maximum of size for one line @@ -322,8 +329,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa loaded_genome = "" for region in tqdm(regions_to_write, unit="rgp", disable=disable_bar): if region.organism.name != loaded_genome: - loaded_genome = region.organism.name - genome_sequence = read_genome_file(org_dict, loaded_genome) + organism = region.organism + genome_sequence = read_genome_file(org_dict[organism.name], organism) fasta.write(f">{region.name}\n") fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.starter.start:region.stopper.stop], 60)) logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'") diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py new file mode 100644 index 00000000..9df61e6a --- /dev/null +++ b/ppanggolin/formats/write_proksee.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +import json +import logging +from pathlib import Path +from tqdm import tqdm +from typing import Dict, List, Tuple +from collections import defaultdict + +# installed libraries + + +# local libraries +from ppanggolin.genome import Organism, Gene +from ppanggolin.region import Module, Region +from ppanggolin.pangenome import Pangenome + + + +def write_legend_items(features: List[str], module_to_color: Dict[Module, str]): + """ + Generates legend items based on the selected features and module-to-color mapping. + + :param features: A list of features to include in the legend. + :param module_to_color: A dictionary mapping modules to their assigned colors. + + :return: A data structure containing legend items based on the selected features and module colors. + """ + # use https://medialab.github.io/iwanthue/ to find nice colors + # that associate well with established partition colors (orange, light green, light blue) + main_colors = { + "orange": "#e59c04", + "light green": "#00d860" , + "light blue": "#79deff", + "purple": "#a567bb", + "dark green": "#7a9a4c", + "dark red": "#ca5c55", + } + + legend_data = {"items" : [ + {"name": "persistent", "swatchColor": main_colors['orange'], "decoration": "arrow"}, + {"name": "shell", "swatchColor": main_colors['light green'], "decoration": "arrow"}, + {"name": "cloud", "swatchColor": main_colors['light blue'], "decoration": "arrow"}, + {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"}, + ] + } + if "rgp" in features or "all" in features: + legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}), + + if "modules" in features or "all" in features: + for mod, color in sorted(module_to_color.items(), key=lambda x: x[0].ID): + legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": color, "visible":False}) + + return legend_data + +def write_tracks(features: List[str]): + """ + Generates track information based on the selected features. + + :param features: A list of features to include in the ProkSee data. + + :return: A list of track configurations based on the selected features. + """ + tracks = [ + { + "name": "Gene", + "separateFeaturesBy": "strand", + "position": "outside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "Gene" + } + ] + + if "rgp" in features or "all" in features: + tracks.append({ + "name": "RGP", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "RGP" + }) + + if "modules" in features or "all" in features: + tracks.append({ + "name": "Module", + "separateFeaturesBy": "None", + "position": "inside", + "thicknessRatio": 1, + "dataType": "feature", + "dataMethod": "source", + "dataKeys": "Module" + }) + + return tracks + + +def initiate_proksee_data(features: List[str], org_name: str, module_to_color: Dict[Module, str]): + """ + Initializes ProkSee data structure with legends, tracks, and captions. + + :param features: A list of features to include in the ProkSee data. + :param org_name: The name of the organism for which the ProkSee data is being generated. + :param module_to_color: A dictionary mapping modules to their assigned colors. + + :return: ProkSee data structure containing legends, tracks, and captions. + """ + proksee_legends = write_legend_items(features, module_to_color) + proksee_tracks = write_tracks(features) + + proksee_captions = { + "name": f"{org_name} annotated with PPanGGOLiN", + "position": "bottom-center", + "font": "sans-serif,plain,18", + "backgroundColor": "rgba(255,255,255,0.4)" + } + + cgview_data = { + "name": "PPanGGOLiN annotation at genome level", + "version": "1.5.0", + 'settings': {}, + "legend": proksee_legends, + "tracks": proksee_tracks, + "sequence": {}, + 'captions': [proksee_captions], + } + + return {"cgview": cgview_data} + + +def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> List[Dict]: + """ + Writes contig data for a given organism in proksee format. + + :param organism: The organism for which contig data will be written. + :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None). + + :return: A list of contig data in a structured format. + """ + contigs_data_list = [] + + for contig in tqdm(organism.contigs, unit="contig", disable=True): + contig_info = { + "name": contig.name, + "length": contig.length, + "orientation": "+", + } + + if genome_sequences: + contig_info['seq'] = genome_sequences.get(contig.name, "") + + contigs_data_list.append(contig_info) + + return contigs_data_list + + + +def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]: + """ + Writes gene data for a given organism, including both protein-coding genes and RNA genes. + + :param organism: The organism for which gene data will be written. + :param disable_bar: A flag to disable the progress bar when processing genes (default: True). + + :return: A tuple containing a list of gene data in a structured format and a dictionary mapping gene families to genes. + """ + genes_data_list = [] + gf2gene = defaultdict(list) + + # Process protein-coding genes + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=disable_bar): + gf = gene.family + gf2gene[gf.name].append(gene) + + genes_data_list.append({ + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [gene.family.named_partition, gene.family.name], + "source": "Gene", + "legend": gene.family.named_partition, + "meta": "" # annotations + }) + + # Process RNA genes + for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=disable_bar): + genes_data_list.append({ + "name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [], + "source": "Gene", + "legend": "RNA", + "meta": "" # annotations + }) + + return genes_data_list, gf2gene + + +def write_rgp(rgps: Pangenome, organism: Organism): + """ + Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format. + + :param pangenome: The pangenome containing information about RGPs. + :param organism: The specific organism for which RGP data will be written. + + :return: A list of RGP data in a structured format. + """ + rgp_data_list = [] + + # Iterate through each RGP in the pangenome + for rgp in tqdm(rgps, unit="RGP", disable=True): + if rgp.organism == organism: + # Create an entry for the RGP in the data list + rgp_data_list.append({ + "name": rgp.name, + "contig": rgp.contig.name, + "start": rgp.start, + "stop": rgp.stop, + "legend": "RGP", + "source": "RGP", + "tags": [] + }) + + return rgp_data_list + + +def write_modules(modules: List[Module], organism: Organism, gf2genes: Dict[str, List[Gene]]): + """ + Writes module data in proksee format for a list of modules associated with a given organism. + + :param modules: A list of modules for which data will be written. + :param organism: The organism to which the modules are associated. + :param gf2genes: A dictionary that maps gene families to the genes they contain. + + :return: A list of module data in a structured format. + """ + modules_data_list = [] + + # Iterate through each module and find intersecting gene families + for module in modules: + gf_intersection = set(organism.families) & set(module.families) + + if gf_intersection: + # Calculate the completion percentage + completion = round(len(gf_intersection) / len(set(module.families)), 2) + + # Create module data entries for genes within intersecting gene families + for gf in gf_intersection: + for gene in gf2genes[gf.name]: + modules_data_list.append({ + "name": f"Module_{module.ID}", + "presence": "Module", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": f"module_{module.ID}", + "source": "Module", + "tags": [], + "meta": { + "completion": completion + } + }) + + return modules_data_list + + +def write_proksee_organism(organism: Organism, output_file: Path, + features: List[str] = None, + module_to_colors: Dict[Module, str] = None, + rgps:List[Region] = None, + genome_sequences: Dict[str,str] = None): + """ + Write ProkSee data for a given organism. + + :param organism: The organism for which ProkSee data will be written. + :param output_file: The output file where ProkSee data will be written. + :param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"]. + :param module_to_colors: A dictionary mapping modules to their assigned colors. + :patram rgps: list of RGPs that belong to the organisms + :param genome_sequences: The genome sequences for the organism. + + This function writes ProkSee data for a given organism, including contig information, genes colored by partition, RGPs, + and modules. The resulting data is saved as a JSON file in the specified output file. + """ + proksee_data = initiate_proksee_data(features, organism.name, module_to_colors) + + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) + + genes_features, gf2genes = write_genes(organism) + + proksee_data["cgview"]["features"] = genes_features + + if "rgp" in features or "all" in features: + proksee_data["cgview"]["features"] += write_rgp(rgps, organism=organism) + + if "modules" in features or "all" in features: + proksee_data["cgview"]["features"] += write_modules(modules=module_to_colors, organism=organism, gf2genes=gf2genes) + + logging.debug(f"Write ProkSee for {organism.name}") + with open(output_file, "w") as out_json: + json.dump(proksee_data, out_json, indent=2) diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index a6e05198..a8a8f3ff 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -341,16 +341,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': - logging.getLogger("PPanGGOLiN").debug(f"all") + logging.getLogger("PPanGGOLiN").debug("all") for org in self.organisms: self.bitarray[index[org]] = 1 elif partition in ['shell', 'cloud']: - logging.getLogger("PPanGGOLiN").debug(f"shell, cloud") + logging.getLogger("PPanGGOLiN").debug("shell, cloud") if self.named_partition == partition: for org in self.organisms: self.bitarray[index[org]] = 1 elif partition == 'accessory': - logging.getLogger("PPanGGOLiN").debug(f"accessory") + logging.getLogger("PPanGGOLiN").debug("accessory") if self.named_partition in ['shell', 'cloud']: for org in self.organisms: self.bitarray[index[org]] = 1 diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index ef2e13b0..d74e8779 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -200,7 +200,7 @@ class Gene(Feature): Fields: - position: the position of the gene in the genome. - family: the family that the gene belongs to. - - RGP: a set of resistance gene profiles associated with the gene. + - RGP: A putative Region of Plasticity that contains the gene. - genetic_code: the genetic code associated with the gene. - Protein: the protein sequence corresponding to the translated gene. """ @@ -359,7 +359,13 @@ def length(self, contig_len: int): raise TypeError("Contig length is expected to be an integer") if contig_len < 0: raise ValueError("Contig length must be positive") - self._length = contig_len + + if self._length is None: + self._length = contig_len + elif self.length != contig_len: + logging.getLogger("PPanGGOLiN").debug(f"Known contig length = {self.length}, new length = {contig_len}") + raise ValueError('Attempting to define a contig length different from the previously defined value.') + def __len__(self): return self.length @@ -654,12 +660,28 @@ def genes(self) -> Generator[Gene, None, None]: for contig in self.contigs: yield from contig.genes + @property + def rna_genes(self) -> Generator[RNA, None, None]: + """Generator to get genes in the organism + + :return: Generator of genes + """ + for contig in self.contigs: + yield from contig.RNAs + def number_of_genes(self) -> int: """ Get number of genes in the organism :return: Number of genes """ - return sum([contig.number_of_genes for contig in self.contigs]) + return sum((contig.number_of_genes for contig in self.contigs)) + + def number_of_rnas(self) -> int: + """ Get number of genes in the organism + + :return: Number of genes + """ + return sum((contig.number_of_rnas for contig in self.contigs)) @property def contigs(self) -> Generator[Contig, None, None]: @@ -743,3 +765,4 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): self.bitarray[index[fam]] = 1 else: raise Exception("There is not any partition corresponding please report a github issue") + diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index c9272a60..69557705 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -110,9 +110,7 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, pangenome.status["neighborsGraph"] = "Computed" pangenome.parameters["graph"] = {} - # pangenome.parameters["graph"]["removed_high_copy_number_families"] = False if remove_copy_number > 0: - # pangenome.parameters["graph"]["removed_high_copy_number_families"] = True pangenome.parameters["graph"]["remove_high_copy_number"] = remove_copy_number diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index 44205b4a..f7506599 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -65,7 +65,7 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame: :return: Dataframe with metadata loaded """ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"] - colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$') + colname_check = re.compile('^[a-zA-Z_]\w*$') # \w = [A-Za-z0-9_] metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE, dtype={metatype: str}) metadata_df.replace(to_replace='-', value=pd.NA, inplace=True) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index bf72e1f8..7727193f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -29,15 +29,15 @@ def __init__(self): self.file = None # basic parameters - self._famGetter = {} + self._fam_getter = {} self._org_index = None self._fam_index = None self._max_fam_id = 0 - self._orgGetter = {} - self._edgeGetter = {} - self._regionGetter = {} - self._spotGetter = {} - self._moduleGetter = {} + self._org_getter = {} + self._edge_getter = {} + self._region_getter = {} + self._spot_getter = {} + self._module_getter = {} self.status = { 'genomesAnnotated': "No", 'geneSequences': "No", @@ -103,16 +103,16 @@ def genes(self) -> Generator[Gene, None, None]: def _mk_gene_getter(self): """ - Builds the attribute _geneGetter of the pangenome + Builds the attribute _gene_getter of the pangenome Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. - If at some point we want to extract genes from a pangenome we'll create a geneGetter. + If at some point we want to extract genes from a pangenome we'll create a gene_getter. The assumption behind this is that the pangenome has been filled and no more gene will be added. """ - self._geneGetter = {} + self._gene_getter = {} for gene in self.genes: - self._geneGetter[gene.ID] = gene + self._gene_getter[gene.ID] = gene def get_gene(self, gene_id: str) -> Gene: """Returns the gene that has the given gene ID @@ -127,7 +127,7 @@ def get_gene(self, gene_id: str) -> Gene: assert isinstance(gene_id, str), "Gene id should be an integer" try: - return self._geneGetter[gene_id] + return self._gene_getter[gene_id] except AttributeError: # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it @@ -142,10 +142,10 @@ def number_of_genes(self) -> int: :return: The number of genes """ try: - return len(self._geneGetter) + return len(self._gene_getter) except AttributeError: # in that case the gene getter has not been computed self._mk_gene_getter() # make it - return len(self._geneGetter) + return len(self._gene_getter) """RNAs methods""" @property @@ -187,7 +187,7 @@ def gene_families(self) -> Generator[GeneFamily, None, None]: :return: Generator of gene families """ - for family in self._famGetter.values(): + for family in self._fam_getter.values(): yield family @property @@ -196,7 +196,7 @@ def number_of_gene_families(self) -> int: :return: The number of gene families """ - return len(self._famGetter) + return len(self._fam_getter) def get_gene_family(self, name: str) -> GeneFamily: """Returns the gene family that has the given `name` @@ -210,7 +210,7 @@ def get_gene_family(self, name: str) -> GeneFamily: """ assert isinstance(name, str), "Name of gene family should be a string" try: - fam = self._famGetter[name] + fam = self._fam_getter[name] except KeyError: raise KeyError(f"Gene family with name={name} is not in pangenome") except Exception as error: @@ -231,7 +231,7 @@ def add_gene_family(self, family: GeneFamily): try: _ = self.get_gene_family(family.name) except KeyError: - self._famGetter[family.name] = family + self._fam_getter[family.name] = family self.max_fam_id += 1 except Exception as error: raise Exception(error) @@ -245,7 +245,7 @@ def edges(self) -> Generator[Edge, None, None]: :return: Generator of edge """ - for edge in self._edgeGetter.values(): + for edge in self._edge_getter.values(): yield edge def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: @@ -267,10 +267,10 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: raise AttributeError("Genes are not linked to families. Check that you compute the gene families and post an" " issue on our GitHub") key = frozenset([family_1, family_2 ]) - edge = self._edgeGetter.get(key) + edge = self._edge_getter.get(key) if edge is None: edge = Edge(gene1, gene2) - self._edgeGetter[key] = edge + self._edge_getter[key] = edge else: edge.add_genes(gene1, gene2) return edge @@ -281,7 +281,7 @@ def number_of_edges(self) -> int: :return: The number of gene families """ - return len(self._edgeGetter) + return len(self._edge_getter) """Organism methods""" @property @@ -290,7 +290,7 @@ def organisms(self) -> Generator[Organism, None, None]: :return: Generator :class:`ppanggolin.genome.Organism` """ - for organism in self._orgGetter.values(): + for organism in self._org_getter.values(): yield organism @property @@ -299,7 +299,7 @@ def number_of_organisms(self) -> int: :return: The number of organism """ - return len(self._orgGetter) + return len(self._org_getter) @property def contigs(self) -> Generator[Contig, None, None]: @@ -377,7 +377,7 @@ def get_organism(self, name: str) -> Organism: """ assert isinstance(name, str), "Organism name should be a string" try: - return self._orgGetter[name] + return self._org_getter[name] except KeyError: raise KeyError(f"{name} does not seem to be in your pangenome") @@ -397,7 +397,7 @@ def add_organism(self, organism: Organism): try: self.get_organism(organism.name) except KeyError: - self._orgGetter[organism.name] = organism + self._org_getter[organism.name] = organism else: raise KeyError(f"Redondant organism name was found ({organism.name})." f"All of your organisms must have unique names.") @@ -469,7 +469,7 @@ def regions(self) -> Generator[Region, None, None]: :return: list of RGP """ - for region in self._regionGetter.values(): + for region in self._region_getter.values(): yield region def get_region(self, name: str) -> Region: @@ -485,7 +485,7 @@ def get_region(self, name: str) -> Region: assert isinstance(name, str), "RGP name should be a string" try: - rgp = self._regionGetter[name] + rgp = self._region_getter[name] except KeyError: # then the region is not stored in this pangenome. raise KeyError(f"There is no RGP with name={name}") else: @@ -526,7 +526,7 @@ def add_region(self, region: Region): try: self.get_region(region.name) except KeyError: - self._regionGetter[region.name] = region + self._region_getter[region.name] = region else: raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome") @@ -536,7 +536,7 @@ def number_of_rgp(self) -> int: :return: The number of gene families """ - return len(self._regionGetter) + return len(self._region_getter) """Spot methods""" @property @@ -544,7 +544,7 @@ def spots(self) -> Generator[Spot, None, None]: """Generate spots in the pangenome :return: Spot generator""" - yield from self._spotGetter.values() + yield from self._spot_getter.values() def get_spot(self, spot_id: Union[int, str]) -> Spot: # TODO Change for only str or only int @@ -568,7 +568,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot: raise ValueError(f"The provided spot ID '{spot_id}' does not have the expected format." "It should be an integer or in the format 'spot_'.") try: - spot = self._spotGetter[spot_id] + spot = self._spot_getter[spot_id] except KeyError: raise KeyError(f"Spot {spot_id} does not exist in the pangenome.") else: @@ -586,7 +586,7 @@ def add_spot(self, spot: Spot): try: self.get_spot(spot.ID) except KeyError: - self._spotGetter[spot.ID] = spot + self._spot_getter[spot.ID] = spot except Exception as error: raise Exception(error) else: @@ -598,14 +598,14 @@ def number_of_spots(self) -> int: :return: The number of gene families """ - return len(self._spotGetter) + return len(self._spot_getter) """Modules methods""" @property def modules(self) -> Generator[Module, None, None]: """Generate modules in the pangenome """ - yield from self._moduleGetter.values() + yield from self._module_getter.values() def get_module(self, module_id: Union[int, str]) -> Module: # TODO Change for only str or only int @@ -631,7 +631,7 @@ def get_module(self, module_id: Union[int, str]) -> Module: "It should be an integer or in the format 'module_'.") try: - module = self._moduleGetter[module_id] + module = self._module_getter[module_id] except KeyError: raise KeyError(f"Module {module_id} does not exist in the pangenome.") else: @@ -649,7 +649,7 @@ def add_module(self, module: Module): try: self.get_module(module.ID) except KeyError: - self._moduleGetter[module.ID] = module + self._module_getter[module.ID] = module except Exception as error: raise Exception(error) else: @@ -680,7 +680,7 @@ def number_of_modules(self) -> int: :return: The number of modules """ - return len(self._moduleGetter) + return len(self._module_getter) """Metadata""" def select_elem(self, metatype: str): diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index 335f89ec..8b0d6161 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -28,7 +28,9 @@ from ppanggolin.annotate.annotate import init_contig_counter, read_anno_file, annotate_organism, local_identifiers_are_unique from ppanggolin.annotate import subparser as annotate_subparser from ppanggolin.pangenome import Pangenome -from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, check_input_files +from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, \ + restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, \ + check_input_files, parse_input_paths_file from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations from ppanggolin.formats.readBinaries import check_pangenome_info @@ -37,8 +39,8 @@ from ppanggolin.genome import Organism from ppanggolin.geneFamily import GeneFamily from ppanggolin.region import Region, Spot, Module -from ppanggolin.formats.writeFlat import summarize_spots - +from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors, write_gff_file +from ppanggolin.formats.writeSequences import read_genome_file class NewSpot(Spot): """ @@ -49,26 +51,32 @@ class NewSpot(Spot): def __str__(self): return f'new_spot_{str(self.ID)}' -def launch(args: argparse.Namespace): +def check_pangenome_for_projection(pangenome: Pangenome, fast_aln:bool): """ - Command launcher + Check the status of a pangenome and determine whether projection is possible. - :param args: All arguments provide by user - """ + :param pangenome: The pangenome to be checked. + :param fast_aln: Whether to use the fast alignment option for gene projection. - output_dir = Path(args.output) - mk_outdir(output_dir, args.force) + This function checks various attributes of a pangenome to determine whether it is suitable for projecting + features into a provided genome. + + Returns: + A tuple indicating whether RGP prediction, spot projection, and module projection + are possible (True) or not (False) based on the pangenome's status. + + Raises: + NameError: If the pangenome has not been partitioned. + Exception: If the pangenome lacks gene sequences or gene family sequences, and fast alignment is not enabled. + """ - # For the moment these elements of the pangenome are predicted by default project_modules = True predict_rgp = True project_spots = True - pangenome = Pangenome() - pangenome.add_file(args.pangenome) if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]: - raise NameError(f"The provided pangenome has not been partitioned. " + raise NameError("The provided pangenome has not been partitioned. " "Annotation of an external genome is therefore not possible. " "See the 'partition' subcommands.") @@ -89,7 +97,7 @@ def launch(args: argparse.Namespace): project_modules = False - if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not args.fast: + if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln: raise Exception("The provided pangenome has no gene sequences. " "Projection is still possible with the --fast option to use representative " "sequences rather than all genes to annotate input genes.") @@ -98,130 +106,163 @@ def launch(args: argparse.Namespace): raise Exception("The provided pangenome has no gene families sequences. " "This is not possible to annotate an input organism to this pangenome.") - - check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, - need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, - need_spots=project_spots) - - print("number_of_organisms", pangenome.number_of_organisms) - logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') - pangenome_params = argparse.Namespace( - **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) - - # dup margin value here is specified in argument and is used to compute completeness. - # Thats mean it can be different than dup margin used in spot and RGPS. - - # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats - single_copy_fams = set() - - for fam in pangenome.gene_families: - if fam.named_partition == "persistent": - dup = len([genes for genes in fam.get_org_dict().values() if - len([gene for gene in genes if not gene.is_fragment]) > 1]) - - if (dup / fam.number_of_organisms) < args.dup_margin: - single_copy_fams.add(fam) + return predict_rgp, project_spots, project_modules - genome_name_to_fasta_path, genome_name_to_annot_path = None, None +def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta, + organism_name, circular_contigs, pangenome_params, + cpu, use_pseudo, disable_bar, tmpdir, config): + """ + """ + genome_name_to_path = None - if args.input_mode == "multiple": - if args.anno: - genome_name_to_annot_path = parse_input_paths_file(args.anno) + if input_mode == "multiple": + if anno: + input_type = "annotation" + genome_name_to_path = parse_input_paths_file(anno) - if args.fasta: - genome_name_to_fasta_path = parse_input_paths_file(args.fasta) + elif fasta: + input_type = "fasta" + genome_name_to_path = parse_input_paths_file(fasta) else: # args.input_mode == "single: - circular_contigs = args.circular_contigs if args.circular_contigs else [] - if args.anno: - genome_name_to_annot_path = {args.organism_name: {"path": args.annot, + circular_contigs = circular_contigs if circular_contigs else [] + if anno: + input_type = "annotation" + genome_name_to_path = {organism_name: {"path": anno, "circular_contigs": circular_contigs}} - if args.fasta: - genome_name_to_fasta_path = {args.organism_name: {"path": args.fasta, + elif fasta: + input_type = "fasta" + genome_name_to_path = {organism_name: {"path": fasta, "circular_contigs": circular_contigs}} - - if genome_name_to_annot_path: - check_input_names(pangenome, genome_name_to_annot_path) - organisms, org_2_has_fasta = read_annotation_files(genome_name_to_annot_path, cpu=args.cpu, pseudo=args.use_pseudo, - disable_bar=args.disable_prog_bar) + if input_type == "annotation": + check_input_names(pangenome, genome_name_to_path) + + organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo, + disable_bar=disable_bar) if not all((has_fasta for has_fasta in org_2_has_fasta.values())): organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta} - if args.fasta: - get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path) + if fasta: + get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_path) + else: raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) " - "organisms without associated sequence data, and you did not provide " + "organisms without associated sequence data, and you did not provide " "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. " f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}") - elif genome_name_to_fasta_path: + elif input_type == "fasta": annotate_param_names = ["norna", "kingdom", "allow_overlap", "prodigal_procedure"] - annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, args.config) + annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config) - check_input_names(pangenome, genome_name_to_fasta_path) - organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_fasta_path, tmpdir=args.tmpdir, cpu=args.cpu, - translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, - allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=args.disable_prog_bar ) - - - input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, - output=output_dir, cpu=args.cpu, use_representatives=args.fast, - no_defrag=args.no_defrag, identity=args.identity, - coverage=args.coverage, tmpdir=args.tmpdir, - translation_table=int(pangenome_params.cluster.translation_table), - keep_tmp=args.keep_tmp, - disable_bar=args.disable_prog_bar) - - - input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} - - if predict_rgp: - logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') - - multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) - - input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, - min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, - disable_bar=args.disable_prog_bar) + check_input_names(pangenome, genome_name_to_path) + organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu, + translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom, + allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar) + return organisms, genome_name_to_path, input_type + + +def write_projection_results(pangenome:Pangenome, organisms:Set[Organism], input_org_2_rgps:Dict[Organism, Set[Region]], + input_org_to_spots:Dict[Organism, Set[Spot]], + input_orgs_to_modules:Dict[Organism, Set[Module]] , + input_org_to_lonely_genes_count:Dict[Organism, int], + write_proksee:bool, write_gff:bool, add_sequences:bool, + genome_name_to_path:Dict[str,dict], input_type:str, + output_dir:Path, dup_margin:float, ): + """ + Write the results of the projection of pangneome onto input genomes. + + :param pangenome: The pangenome onto which the projection is performed. + :param organisms: A set of input organisms for projection. + :param input_org_2_rgps: A dictionary mapping input organisms to sets of regions of genomic plasticity (RGPs). + :param input_org_to_spots: A dictionary mapping input organisms to sets of spots. + :param input_orgs_to_modules: A dictionary mapping input organisms to sets of modules. + :param input_org_to_lonely_genes_count: A dictionary mapping input organisms to the count of lonely genes. + :param write_proksee: Whether to write ProkSee JSON files. + :param write_gff: Whether to write GFF files. + :param add_sequences: Whether to add sequences to the output files. + :param genome_name_to_path: A dictionary mapping genome names to file paths. + :param input_type: The type of input data (e.g., "annotation"). + :param output_dir: The directory where the output files will be written. + :param dup_margin: The duplication margin used to compute completeness. + + Note: + - If `write_proksee` is True and input organisms have modules, module colors for ProkSee are obtained. + - The function calls other functions such as `summarize_projection`, `read_genome_file`, `write_proksee_organism`, + `write_gff_file`, and `write_summaries` to generate various output files and summaries. + """ + if write_proksee and input_orgs_to_modules: + # get module color for proksee + module_to_colors = manage_module_colors(set(pangenome.modules)) - if project_spots: - logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') - input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), - initial_regions=pangenome.regions, - input_org_2_rgps=input_org_2_rgps, - multigenics=multigenics, - output=output_dir, - write_graph_flag=args.spot_graph, - graph_formats=args.graph_formats, - overlapping_match=pangenome_params.spot.overlapping_match, - set_size=pangenome_params.spot.set_size, - exact_match=pangenome_params.spot.exact_match_size) - - if project_modules: - input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) + single_copy_families = get_single_copy_families(pangenome, dup_margin) organism_2_summary = {} for organism in organisms: + + org_outdir = output_dir / organism.name + # summarize projection for all input organisms - organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_fams, + organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_families, input_org_2_rgps.get(organism, None), input_org_to_spots.get(organism, None), input_orgs_to_modules.get(organism, None), input_org_to_lonely_genes_count[organism]) - write_summaries(organism_2_summary, output_dir) + if (write_proksee or write_gff) and add_sequences: + genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism) + genome_name_to_path[organism.name]['path'] + else: + genome_sequences = None + + if write_proksee: + org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])} + + output_file = output_dir / organism.name / f"{organism.name}_proksee.json" + + + write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color, + rgps=input_org_2_rgps.get(organism, None), + genome_sequences=genome_sequences) + + + if write_gff: + if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN + annotation_sources = {"rRNA": "external", + "tRNA": "external", + "CDS":"external"} + else: + annotation_sources = {} + + contig_to_rgp, rgp_to_spot_id = {}, {} + + if organism in input_org_2_rgps: + contig_to_rgp = defaultdict(list) + for rgp in input_org_2_rgps[organism]: + contig_to_rgp[rgp.contig].append(rgp) + + if organism in input_org_to_spots: + rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] } + + + write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False, + annotation_sources=annotation_sources, genome_sequences=genome_sequences) + + + write_summaries(organism_2_summary, output_dir) + + def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11, kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None, disable_bar: bool = False): @@ -231,7 +272,7 @@ def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str :param genome_name_to_fasta_path: :param fasta_list: List of fasta file containing sequences that will be base of pangenome :param tmpdir: Path to temporary directory - :param cpu: number of CPU cores to use + :param cpu: number of CPU cores to use :param translation_table: Translation table (genetic code) to use. :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. :param norna: Use to avoid annotating RNA features. @@ -370,48 +411,6 @@ def check_input_names(pangenome, input_names): raise NameError(f"{len(duplicated_names)} provided organism names already exist in the given pangenome: {' '.join(duplicated_names)}") -def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]: - """ - Parse an input paths file to extract genome information. - - This function reads an input paths file, which is in TSV format, and extracts genome information - including file paths and putative circular contigs. - - :param path_list_file: The path to the input paths file. - :return: A dictionary where keys are genome names and values are dictionaries containing path information and - putative circular contigs. - :raises FileNotFoundError: If a specified genome file path does not exist. - :raises Exception: If there are no genomes in the provided file. - """ - logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") - genome_name_to_genome_path = {} - - for line in read_compressed_or_not(path_list_file): - elements = [el.strip() for el in line.split("\t")] - genome_file_path = Path(elements[1]) - genome_name = elements[0] - putative_circular_contigs = elements[2:] - - if not genome_file_path.exists(): - # Check if the file path doesn't exist and try an alternative path. - genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) - - if not genome_file_path_alt.exists(): - raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") - else: - genome_file_path = genome_file_path_alt - - genome_name_to_genome_path[genome_name] = { - "path": genome_file_path, - "circular_contigs": putative_circular_contigs - } - - if len(genome_name_to_genome_path) == 0: - raise Exception(f"There are no genomes in the provided file: {path_list_file} ") - - return genome_name_to_genome_path - - def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_dir: Path): """ @@ -435,7 +434,7 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_d flat_summary = {} for key, val in summary_info.items(): - if type(val) == dict: + if isinstance(val, dict): for nest_k, nest_v in val.items(): flat_summary[f"{key} {nest_k}"] = nest_v else: @@ -447,15 +446,47 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_d df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False) +def get_single_copy_families(pangenome: Pangenome, dup_margin:float): + """ + Get single copy families + + :param pangenome: The pangenome onto which the projection is performed. + :param dup_margin: The duplication margin used to compute single copy families. + + """ + + # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats + single_copy_families = set() + + for fam in pangenome.gene_families: + if fam.named_partition == "persistent": + dup = len([genes for genes in fam.get_org_dict().values() if + len([gene for gene in genes if not gene.is_fragment]) > 1]) + + if (dup / fam.number_of_organisms) < dup_margin: + single_copy_families.add(fam) + + return single_copy_families + def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_copy_families:Set, input_org_rgps:Region, input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int): """ + Summarize the projection of an input organism onto a pangenome. - :param singleton_gene_count: Number of genes that do not cluster with any of the gene families of the pangenome. + :param input_organism: The input organism for projection. + :param input_org_rgps: The regions of genomic plasticity (RGPs) in the input organism. + :param input_org_spots: The spots in the input organism. + :param input_org_modules: The modules in the input organism. + :param singleton_gene_count: Number of genes that do not cluster with any gene families in the pangenome. + + Returns: + A dictionary containing summary information about the projection, including organism details, + gene and family counts, completeness, and counts of RGPs, spots, new spots, and modules. """ + partition_to_gene = defaultdict(set) contigs_count = 0 for contig in input_organism.contigs: @@ -469,7 +500,7 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_c completeness = "NA" - single_copy_markers_count = len(set(input_organism.families) & single_copy_families ) + single_copy_markers_count = len(set(input_organism.families) & single_copy_families) if len(single_copy_families) > 0: completeness = round((single_copy_markers_count / len(single_copy_families)) * 100, 2) @@ -529,17 +560,17 @@ def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_org """ seq_fasta_files = [] - logging.getLogger('PPanGGOLiN').info(f'Writting gene sequences of input genomes.') + logging.getLogger('PPanGGOLiN').info('Writting gene sequences of input genomes.') for input_organism in input_organisms: seq_outdir = output / input_organism.name mk_outdir(seq_outdir, force=True) - seq_fasta_file = seq_outdir / f"cds_sequences.fasta" + seq_fasta_file = seq_outdir / "cds_sequences.fasta" with open(seq_fasta_file, "w") as fh_out_faa: - write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add=f"ppanggolin_") + write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_") seq_fasta_files.append(seq_fasta_file) @@ -838,7 +869,7 @@ def predict_spots_in_input_organisms( initial_regions: List[Region], input_org_2_rgps: Dict[Organism, Set[Region]], multigenics: Set[GeneFamily], - output: str, + output: Path, write_graph_flag: bool = False, graph_formats: List[str] = ['gexf'], overlapping_match: int = 2, @@ -861,7 +892,7 @@ def predict_spots_in_input_organisms( :return: A dictionary mapping input organism RGPs to their predicted spots. """ - logging.getLogger("PPanGGOLiN").debug(f"Rebuilding original spot graph.") + logging.getLogger("PPanGGOLiN").debug("Rebuilding original spot graph.") graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics, overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match) @@ -949,7 +980,7 @@ def predict_spot_in_one_organism( "as they are on a contig border (or have " f"less than {set_size} persistent gene families until the contig border). " "Projection of spots stops here") - return {} + return set() # remove node that were already in the graph new_nodes = set(input_org_node_to_rgps) - original_nodes @@ -1033,7 +1064,7 @@ def predict_spot_in_one_organism( input_org_spots = {spot for spots in input_rgp_to_spots.values() for spot in spots } - new_spots = {spot for spot in input_org_spots if type(spot) == NewSpot} + new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)} logging.getLogger('PPanGGOLiN').debug( @@ -1157,7 +1188,7 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume if args.circular_contigs: parser.error("You provided a TSV file listing the files of genomes you wish to annotate. " - f"Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") + "Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.") if args.fasta: check_input_files(args.fasta, True) @@ -1168,6 +1199,91 @@ def check_projection_arguments(args: argparse.Namespace, parser: argparse.Argume return input_mode + + +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + + output_dir = Path(args.output) + mk_outdir(output_dir, args.force) + + # For the moment these elements of the pangenome are predicted by default + + pangenome = Pangenome() + pangenome.add_file(args.pangenome) + + predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast) + + check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar, + need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False, + need_spots=project_spots) + + logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.') + pangenome_params = argparse.Namespace( + **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()}) + + organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome, + input_mode=args.input_mode, + anno=args.anno, fasta=args.fasta, + organism_name=args.organism_name, + circular_contigs=args.circular_contigs, + pangenome_params=pangenome_params, + cpu=args.cpu, use_pseudo=args.use_pseudo, + disable_bar=args.disable_prog_bar, + tmpdir= args.tmpdir, config=args.config) + + + + input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms, + output=output_dir, cpu=args.cpu, use_representatives=args.fast, + no_defrag=args.no_defrag, identity=args.identity, + coverage=args.coverage, tmpdir=args.tmpdir, + translation_table=int(pangenome_params.cluster.translation_table), + keep_tmp=args.keep_tmp, + disable_bar=args.disable_prog_bar) + + + input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {} + + if predict_rgp: + + logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.') + + multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) + + input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain, + min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir, + disable_bar=args.disable_prog_bar) + + if project_spots: + logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.') + input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots), + initial_regions=pangenome.regions, + input_org_2_rgps=input_org_2_rgps, + multigenics=multigenics, + output=output_dir, + write_graph_flag=args.spot_graph, + graph_formats=args.graph_formats, + overlapping_match=pangenome_params.spot.overlapping_match, + set_size=pangenome_params.spot.set_size, + exact_match=pangenome_params.spot.exact_match_size) + + if project_modules: + input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir) + + write_projection_results(pangenome, organisms, input_org_2_rgps, + input_org_to_spots, + input_orgs_to_modules, + input_org_to_lonely_genes_count, + write_proksee=args.proksee, write_gff=args.gff, add_sequences=args.add_sequences, + genome_name_to_path=genome_name_to_path, input_type=input_type, + output_dir=output_dir, dup_margin=args.dup_margin) + + def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ Subparser to launch PPanGGOLiN in Command line @@ -1249,6 +1365,15 @@ def parser_projection(parser: argparse.ArgumentParser): optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+", default=['gexf'], help="Format of the output graph.") + optional.add_argument("--gff", required=False, action="store_true", + help="Generate GFF files with projected pangenome annotations for each input organism.") + + optional.add_argument("--proksee", required=False, action="store_true", + help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input organism.") + + optional.add_argument("--add_sequences", required=False, action="store_true", + help="Include input genome DNA sequences in GFF and Proksee output.") + optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 17ccfff3..a2abcc3c 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -58,7 +58,8 @@ def __str__(self): return self.name def __repr__(self) -> str: - """Region representation + """ + Region representation """ return f"RGP name:{self.name}" @@ -242,6 +243,25 @@ def contig(self) -> Contig: :return: Contig corresponding to the region """ return self.starter.contig + + @property + def start(self) -> int: + """ + Get the starter start link to RGP + + :return: start position in the contig of the first gene of the RGP + """ + return self.starter.start + + @property + def stop(self) -> int: + """ + Get the stopper stop link to RGP + + :return: start position in the contig of the last gene of the RGP + """ + return self.stopper.stop + @property def is_whole_contig(self) -> bool: @@ -716,6 +736,18 @@ def families(self) -> Generator[GeneFamily, None, None]: :return: Families belonging to the module """ yield from self._families_getter.values() + + @property + def organisms(self) -> Generator[Organism, None, None]: + """Returns all the Organisms that have this module + + :return: Organisms that have this module + """ + organisms = set() + for fam in self.families: + organisms |= set(fam.organisms) + yield from organisms + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 7dc81e67..e6bc20cb 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -9,10 +9,11 @@ import argparse from io import TextIOWrapper from pathlib import Path -from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable +from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable, Dict from contextlib import contextmanager import tempfile import time +from itertools import zip_longest import networkx as nx from importlib.metadata import distribution @@ -482,7 +483,7 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper :return: Either a string or a TextIOWrapper object, depending on the type of the input argument. """ - if type(arg_val) == TextIOWrapper: + if isinstance(arg_val, TextIOWrapper): return arg_val.name return arg_val @@ -732,7 +733,7 @@ def count_different_values(values: Iterable[Union[int, str, Tuple, List]]) -> in """ hashable_values = set() for value in values: - hashable_value = tuple(value) if type(value) == list else value + hashable_value = tuple(value) if isinstance(value, list) else value hashable_values.add(hashable_value) return len(hashable_values) @@ -764,7 +765,7 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: arguments_to_parse = [] for param, val in config_param_val.items(): - if type(val) == bool or val is None or val == "None": + if isinstance(val, bool) or val is None or val == "None": # param is a flag if val is True: arguments_to_parse.append(f"--{param}") @@ -772,7 +773,7 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: else: arguments_to_parse.append(f"--{param}") - if type(val) == list: + if isinstance(val, list): # range of values need to be added one by one arguments_to_parse += [str(v) for v in val] else: @@ -905,8 +906,6 @@ def get_cli_args(subparser_fct: Callable) -> argparse.Namespace: # remove argument that have not been specified delete_unspecified_args(cli_args) delattr(cli_args, 'subcommand') - # if 'config' in cli_args: - # delattr(cli_args, 'config') return cli_args @@ -935,3 +934,104 @@ def delete_unspecified_args(args: argparse.Namespace): for arg_name, arg_val in args._get_kwargs(): if arg_val is None: delattr(args, arg_name) + + +def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int, + is_circular: bool = False): + """ + Extracts contiguous windows around positions of interest within a contig. + + :param contig_size: Number of genes in contig. + :param positions_of_interest: An iterable containing the positions of interest. + :param window_size: The size of the window to extract around each position of interest. + :param is_circular: Indicates if the contig is circular. + :return: Yields tuples representing the start and end positions of each contiguous window. + """ + windows_coordinates = [] + + # Sort the positions of interest + sorted_positions = sorted(positions_of_interest) + + # Check if any position of interest is out of range + if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size: + raise IndexError(f'Positions of interest are out of range. ' + f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions") + + if is_circular: + first_position = sorted_positions[0] + last_position = sorted_positions[-1] + # in a circular contig, if the window of a gene of interest overlaps the end/start of the contig + # an out of scope position is added to the sorted positions to take into account those positions + # the returned window are always checked that its positions are not out of range... + # so there's no chance to find an out of scope position in final list + if first_position - window_size < 0: + out_of_scope_position = contig_size + first_position + sorted_positions.append(out_of_scope_position) + + if last_position + window_size >= contig_size: + out_of_scope_position = last_position - contig_size + sorted_positions.insert(0, out_of_scope_position) + + start_po = max(sorted_positions[0] - window_size, 0) + + for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]): + + if next_po is None: + # If there are no more positions, add the final window + end_po = min(position + window_size, contig_size - 1) + windows_coordinates.append((start_po, end_po)) + + elif position + window_size + 1 < next_po - window_size: + # If there is a gap between positions, add the current window + # and update the start position for the next window + end_po = min(position + window_size, contig_size - 1) + + windows_coordinates.append((start_po, end_po)) + + start_po = max(next_po - window_size, 0) + + return windows_coordinates + + + +def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]: + """ + Parse an input paths file to extract genome information. + + This function reads an input paths file, which is in TSV format, and extracts genome information + including file paths and putative circular contigs. + + :param path_list_file: The path to the input paths file. + :return: A dictionary where keys are genome names and values are dictionaries containing path information and + putative circular contigs. + :raises FileNotFoundError: If a specified genome file path does not exist. + :raises Exception: If there are no genomes in the provided file. + """ + logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files") + genome_name_to_genome_path = {} + + for line in read_compressed_or_not(path_list_file): + elements = [el.strip() for el in line.split("\t")] + genome_file_path = Path(elements[1]) + genome_name = elements[0] + putative_circular_contigs = elements[2:] + + if not genome_file_path.exists(): + # Check if the file path doesn't exist and try an alternative path. + genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path) + + if not genome_file_path_alt.exists(): + raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.") + else: + genome_file_path = genome_file_path_alt + + genome_name_to_genome_path[genome_name] = { + "path": genome_file_path, + "circular_contigs": putative_circular_contigs + } + + if len(genome_name_to_genome_path) == 0: + raise Exception(f"There are no genomes in the provided file: {path_list_file} ") + + return genome_name_to_genome_path + diff --git a/requirements.txt b/requirements.txt index 45c799ce..6396883f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ pyrodigal>=3.0.1 aragorn>=1.2.41 infernal>=1.1.4 mmseqs2>=13.45111 -networkx>=2.7 +networkx>=2.7,<=3.1 dataclasses>=0.8 scipy>=1.7.3 plotly>=4.14.3 diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 8b1123ad..d67eb1ce 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -39,15 +39,15 @@ def test_cstr(self, pangenome): """ pangenome_attr_type = { "file": type(None), - "_famGetter": dict, + "_fam_getter": dict, "_org_index": type(None), "_fam_index": type(None), "_max_fam_id": int, - "_orgGetter": dict, - "_edgeGetter": dict, - "_regionGetter": dict, - "_spotGetter": dict, - "_moduleGetter": dict, + "_org_getter": dict, + "_edge_getter": dict, + "_region_getter": dict, + "_spot_getter": dict, + "_module_getter": dict, "status": dict, "parameters": dict } @@ -597,8 +597,8 @@ def test_add_region(self, pangenome): """ rgp = Region(name="rgp") pangenome.add_region(rgp) - assert len(pangenome._regionGetter) == 1 - assert pangenome._regionGetter["rgp"] == rgp + assert len(pangenome._region_getter) == 1 + assert pangenome._region_getter["rgp"] == rgp def test_add_region_already_in_pangenome(self, pangenome): """Tests that adding region already in pangenome return a KeyError. @@ -665,8 +665,8 @@ def test_add_spot(self, pangenome): """ spot = Spot(spot_id=0) pangenome.add_spot(spot) - assert len(pangenome._spotGetter) == 1 - assert pangenome._spotGetter[0] == spot + assert len(pangenome._spot_getter) == 1 + assert pangenome._spot_getter[0] == spot def test_add_spot_already_in_pangenome(self, pangenome): """Tests that adding spot already in pangenome return a KeyError. @@ -734,8 +734,8 @@ def test_add_module(self, pangenome): """ module = Module(module_id=0) pangenome.add_module(module) - assert len(pangenome._moduleGetter) == 1 - assert pangenome._moduleGetter[0] == module + assert len(pangenome._module_getter) == 1 + assert pangenome._module_getter[0] == module def test_add_module_already_in_pangenome(self, pangenome): """Tests that adding module already in pangenome return a KeyError.