diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 126cfc2d..9297851c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -67,11 +67,11 @@ jobs:
ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
- ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
+ ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1 --fasta organisms.fasta.list --gff --proksee
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
- cd -
+ cd -
- name: gbff parsing and MSA computing
shell: bash -l {0}
run: |
@@ -100,14 +100,18 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
- ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_align --draw_related --getinfo
+ ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta \
+ --output test_align --draw_related --getinfo --fast
cd -
- name: testing context command
shell: bash -l {0}
run: |
cd testingDataset
- ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context
- ppanggolin context --pangenome readclusterpang/pangenome.h5 --family some_chlam_families.txt --output test_context -f
+ ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context --fast
+
+        # Test from gene family IDs, here with one family of module 1. The context should find all families of module 1.
+ echo AP288_RS05055 > one_family_of_module_1.txt
+ ppanggolin context --pangenome myannopang/pangenome.h5 --family one_family_of_module_1.txt --output test_context_from_id
cd -
- name: testing metadata command
shell: bash -l {0}
@@ -124,3 +128,16 @@ jobs:
cd testingDataset
ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
+ cd -
+ - name: testing projection cmd
+ shell: bash -l {0}
+ run: |
+ cd testingDataset
+ head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
+ ppanggolin projection --pangenome stepbystep/pangenome.h5 -o projection_from_list_of_gbff --anno organisms.gbff.head.list --gff --proksee
+
+
+ ppanggolin projection --pangenome mybasicpangenome/pangenome.h5 -o projection_from_single_fasta \
+ --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
+ --spot_graph --graph_formats graphml --fast --keep_tmp -f --add_sequences --gff --proksee
+
diff --git a/VERSION b/VERSION
index 3cff2901..897c4fa2 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.2.189
+1.2.194
diff --git a/docs/index.md b/docs/index.md
index ea7064f6..995dfce0 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -163,6 +163,7 @@ user/Regions-of-Genome-Plasticity
user/Conserved-modules
user/Align
user/Genomic-context
+user/projection
user/metadata
user/Outputs
```
diff --git a/docs/user/Flat/orgStat.md b/docs/user/Flat/orgStat.md
index 1bae7ac4..1310d36a 100644
--- a/docs/user/Flat/orgStat.md
+++ b/docs/user/Flat/orgStat.md
@@ -19,7 +19,7 @@ This file is made of 15 columns described in the following table
| nb_cloud_genes | The number of genes whose family is cloud in that genome |
| nb_exact_core_genes | The number of genes whose family is exact core in that genome |
| nb_soft_core_genes | The number of genes whose family is soft core in that genome |
-| completeness | This is an indicator of the proportion of single copy markers in the persistent that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provide a *de novo* estimation of the completess based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group |
+| completeness | This is an indicator of the proportion of single copy markers in the persistent partition that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes, as this provides a *de novo* estimation of the completeness based on the expectation that single copy markers within the persistent partition should be mostly present in all individuals of the studied taxonomic group |
| nb_single_copy_markers | This indicates the number of present single copy markers in the genomes. They are computed using the parameter duplication_margin indicated at the beginning of the file. They correspond to all of the persistent gene families that are not present in more than one copy in 5% (or more) of the genomes by default. |
It can be generated using the 'write' subcommand as such :
diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md
index 69021b00..96b2e6d0 100644
--- a/docs/user/Genomic-context.md
+++ b/docs/user/Genomic-context.md
@@ -30,21 +30,25 @@ In this case, you can give a pangenome without gene families representatives seq
In case of you are using families ID, you will only have as output the `gene_context.tsv` file. In the other case, you use sequences, you will have another output file to report the alignment between sequences and pangenome families (see detail in align subcommand).
-There are 4 columns in `gene_context.tsv`.
+There are 6 columns in `gene_context.tsv`.
-1. **geneContext ID**: identifier of the found context. It is incrementally generated, beginning with 1
+1. **geneContext ID**: Identifier of the found context. It is incrementally generated, beginning with 1
2. **Gene family name**: Identifier of the gene family, from the pangenome, correspond to the found context
3. **Sequence ID**: Identifier of the searched sequence in the pangenome
4. **Nb Genomes**: Number of genomes where the genomic context is found
5. **Partition**: Partition of the gene family corresponding to the found context
+6. **Target family**: Whether the family is a target family, i.e. it matches an input sequence or corresponds to a family provided as input.
In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context.
## Detailed options
-| option name | Description |
-|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 |
-| --identity | Minimum identity percentage threshold |
-| --coverage | Minimum coverage percentage threshold |
-| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. |
-| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. |
\ No newline at end of file
+
+| option name | Description |
+|-----------------------------|---------------------------------------------------------------------------|
+| --fast                     | Use representative sequences of gene families for input gene alignment. This option is recommended for faster processing but may be less sensitive. By default, all pangenome genes are used for alignment. This argument makes sense only when --sequences is provided. (default: False) |
+| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 (default: False) |
+| --identity | Minimum identity percentage threshold (default: 0.8)|
+| --coverage | Minimum coverage percentage threshold (default: 0.8)|
+| -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. (default: 4) |
+| -s, --jaccard               | Minimum Jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. (default: 0.85) |
+| -w, --window_size | Number of neighboring genes that are considered on each side of a gene of interest when searching for conserved genomic contexts. (default: 5) |
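+
+For example (the pangenome and FASTA paths below are placeholders), a search from input protein sequences could combine the options above as follows:
+
+```bash
+# Search genomic contexts of input proteins, aligning against family representatives for speed
+ppanggolin context --pangenome pangenome.h5 --sequences proteins.fasta --output context_out \
+                   --fast --jaccard 0.85 --window_size 5
+```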
diff --git a/docs/user/projection.md b/docs/user/projection.md
new file mode 100644
index 00000000..5c6322b4
--- /dev/null
+++ b/docs/user/projection.md
@@ -0,0 +1,64 @@
+# Projection
+The `ppanggolin projection` command allows you to annotate external genomes using an existing pangenome. This process eliminates the need to recompute all components, streamlining the annotation process. Input genomes are expected to belong to the same species as the pangenome.
+
+Genes within the input genome are aligned with genes in the pangenome to determine their gene families and partitions. Genes that do not align with any existing gene in the pangenome are considered specific to the input genome and are assigned to the "Cloud" partition. Based on the alignment and partition assignment, Regions of Genomic Plasticity (RGPs) within the input genome are predicted. Each RGP that is not located on a contig border is assigned to a spot of insertion. Finally, conserved modules of the pangenome found in the input genome are reported in the output files.
+
+## Input files:
+
+This command supports two input modes depending on whether you want to project a single genome or multiple genomes at once:
+
+Multiple Files in One TSV:
+- **Options**: `--fasta` or `--anno`
+- **Description**: You can provide a tab-separated file listing organism names alongside their respective FASTA genomic sequences or annotation filepaths, with one line per organism. This mode is suitable when you want to annotate multiple genomes in a single operation. The format of this file is identical to the format used in the annotate and workflow commands; for more details, refer to the documentation of the annotate command.
+
+Single File:
+- **Options**: `--organism_name` with `--fasta` or `--anno` and `--circular_contigs` (optional)
+- **Description**: When annotating a single genome, you can directly provide a single FASTA genomic sequence file or an annotation file in GFF/GBFF format. Additionally, specify the name of the organism using the `--organism_name` option. You can also indicate circular contigs using the `--circular_contigs` option when necessary.
+
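+As an illustration (the pangenome path, list file, and organism name below are placeholders), the two modes could be invoked as follows:
+
+```bash
+# Multiple genomes: a TSV listing one organism name and its sequence file per line
+ppanggolin projection --pangenome pangenome.h5 --fasta organisms.fasta.list -o projection_multi
+
+# Single genome: give the sequence file and the organism name directly
+ppanggolin projection --pangenome pangenome.h5 --organism_name my_organism \
+                      --fasta my_genome.fna.gz -o projection_single
+```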
+
+## Output files:
+
+The output directory contains `summary_projection.tsv`, which gives an overview of the projection, with one line per organism.
+
+
+| Column | Description|
+|--------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Organism name                        | The name or identifier of the organism being analyzed.|
+| Pangenome file | The path to the pangenome file (pangenome.h5) used for the analysis.|
+| Contigs | The number of contigs in the projected genome.|
+| Genes | The total number of genes identified in the input genome.|
+| Families | The total number of gene families to which genes in the genome of the input organism are assigned.|
+| Persistent genes | The number of genes in the "Persistent" partition.|
+| Persistent families | The number of gene families in the "Persistent" partition.|
+| Shell genes | The number of genes in the "Shell" partition.|
+| Shell families | The number of gene families in the "Shell" partition.|
+| Cloud genes | The number of genes in the "Cloud" partition.|
+| Cloud families                       | The number of gene families in the "Cloud" partition.|
+| Cloud specific families              | The number of gene families that are specific to the input organism. These families are unique to the input organism, have no homologs in any other genome within the pangenome, and are assigned to the "Cloud" partition.|
+| completeness                         | This indicates the proportion of single copy markers from the persistent partition that are present in the genome. While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes, as this provides a *de novo* estimation of the completeness based on the expectation that single copy markers within the persistent partition should be mostly present in all individuals of the studied taxonomic group. |
+| RGPs (Regions of Genomic Plasticity) | The number of Regions of Genomic Plasticity (RGPs) predicted within the input genome.|
+| Spots | The total number of spots of insertion associated with RGPs in the input genome.|
+| New spots | The number of new insertion spots that have been identified in the input genome. These spots represent novel genomic regions compared to other genomes in the pangenome.|
+| Modules | The number of modules that have been projected onto the input genome.|
+
+
+Additionally, within the Output directory, there is a subdirectory for each input genome, named after the input genome itself. Each of these subdirectories contains several files:
+
+For Gene Family and Partition of Input Genes:
+
+- `cds_sequences.fasta`: This file contains the sequences of coding regions (CDS) from the input genome.
+- `gene_to_gene_family.tsv`: It provides the mapping of genes to gene families of the pangenome. Its format follows [this output](Outputs.md#gene-families-and-genes).
+- `sequences_partition_projection.tsv`: This file maps each input gene to its partition (Persistent, Shell or Cloud).
+- `specific_genes.tsv`: This file lists the genes of the input genome that do not align to any gene of the pangenome. These genes are assigned to the "Cloud" partition.
+
+For RGPs and Spots:
+
+- `plastic_regions.tsv`: This file contains information about Regions of Genomic Plasticity (RGPs) within the input genome. Its format follows [this output](Outputs.md#plastic-regions).
+- `input_organism_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this output](Outputs.md#spots).
+
+Optionally, you can produce a graph of the RGPs using the `--spot_graph` option. This graph is similar to the one produced by the `ppanggolin spot` command.
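+
+For instance (paths are illustrative), the graph can be requested in one or both of the supported formats via `--graph_formats`:
+
+```bash
+ppanggolin projection --pangenome pangenome.h5 --anno organisms.gbff.list -o projection_out \
+                      --spot_graph --graph_formats gexf graphml
+```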
+
+For Modules:
+
+- `modules_in_input_organism.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this output](Outputs.md#modules-in-organisms).
+
diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py
index 79f7e128..96cf54f6 100644
--- a/ppanggolin/RGP/genomicIsland.py
+++ b/ppanggolin/RGP/genomicIsland.py
@@ -5,7 +5,7 @@
import logging
import argparse
from pathlib import Path
-from typing import Set
+from typing import Set, Iterable
# installed libraries
from tqdm import tqdm
@@ -191,27 +191,56 @@ def max_index_node(lst):
return contig_regions
-def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1,
- min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]:
+def compute_org_rgp(organism: Organism, multigenics: set,
+                    persistent_penalty: int = 3, variable_gain: int = 1, min_length: int = 3000, min_score: int = 4,
+                    naming: str = "contig", disable_bar: bool = True) -> Set[Region]:
+ """
+ Compute regions of genomic plasticity (RGP) on the given organism based on the provided parameters.
+
+ :param organism: The Organism object representing the organism.
+ :param multigenics: A set of multigenic persistent families of the pangenome graph.
+ :param persistent_penalty: Penalty score to apply to persistent multigenic families (default: 3).
+ :param variable_gain: Gain score to apply to variable multigenic families (default: 1).
+ :param min_length: Minimum length threshold (in base pairs) for the regions to be considered RGP (default: 3000).
+ :param min_score: Minimum score threshold for considering a region as RGP (default: 4).
+ :param naming: Naming scheme for the regions, either "contig" or "organism" (default: "contig").
+ :param disable_bar: Whether to disable the progress bar. It is recommended to disable it when calling this function in a loop on multiple organisms (default: True).
+
+ :return: A set of RGPs of the provided organism.
+ """
org_regions = set()
- for contig in organism.contigs:
+ for contig in tqdm(organism.contigs, total=organism.number_of_contigs, unit="contig", disable=disable_bar):
if contig.number_of_genes != 0: # some contigs have no coding genes...
# can definitely multiprocess this part, as not THAT much information is needed...
matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain)
- org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty,
- variable_gain, naming=naming)
+ org_regions |= mk_regions(
+ contig,
+ matrix,
+ multigenics,
+ min_length,
+ min_score,
+ persistent_penalty,
+ variable_gain,
+ naming=naming
+ )
return org_regions
-def naming_scheme(pangenome: Pangenome):
+def naming_scheme(organisms: Iterable[Organism]) -> str:
+ """
+ Determine the naming scheme for the contigs in the pangenome.
+
+    :param organisms: Iterable of Organism objects
+ :return: Naming scheme for the contigs ("contig" or "organism").
+ """
contigsids = set()
- for org in pangenome.organisms:
+ for org in organisms:
for contig in org.contigs:
oldlen = len(contigsids)
contigsids.add(contig.name)
if oldlen == len(contigsids):
logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your "
- "assemblies. identifiers will be supplemented with your "
+ "assemblies. Identifiers will be supplemented with your "
"provided organism names.")
return "organism"
return "contig"
@@ -248,25 +277,25 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain
# check statuses and load info
check_pangenome_former_rgp(pangenome, force)
check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True,
- disable_bar=disable_bar)
+ disable_bar=disable_bar)
logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...")
multigenics = pangenome.get_multigenics(dup_margin)
logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...")
- name_scheme = naming_scheme(pangenome)
+ name_scheme = naming_scheme(pangenome.organisms)
for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar):
for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
- min_score, naming=name_scheme):
- pangenome.add_region(region)
+ min_score, naming=name_scheme):
+ pangenome.add_region(region)
logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp} RGP")
# save parameters and save status
- pangenome.parameters["RGP"] = {}
- pangenome.parameters["RGP"]["persistent_penalty"] = persistent_penalty
- pangenome.parameters["RGP"]["variable_gain"] = variable_gain
- pangenome.parameters["RGP"]["min_length"] = min_length
- pangenome.parameters["RGP"]["min_score"] = min_score
- pangenome.parameters["RGP"]["dup_margin"] = dup_margin
+ pangenome.parameters["rgp"] = {}
+ pangenome.parameters["rgp"]["persistent_penalty"] = persistent_penalty
+ pangenome.parameters["rgp"]["variable_gain"] = variable_gain
+ pangenome.parameters["rgp"]["min_length"] = min_length
+ pangenome.parameters["rgp"]["min_score"] = min_score
+ pangenome.parameters["rgp"]["dup_margin"] = dup_margin
pangenome.status['predictedRGP'] = "Computed"
diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py
index 44828d02..00b6e7a6 100644
--- a/ppanggolin/RGP/spot.py
+++ b/ppanggolin/RGP/spot.py
@@ -7,6 +7,7 @@
import time
import os
from pathlib import Path
+from typing import List
# installed libraries
import networkx as nx
@@ -69,43 +70,43 @@ def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2
return True
return False
+def add_new_node_in_spot_graph(g: nx.Graph, region: Region, borders: list) -> str:
+ """
+ Add bordering region as node to graph
-def make_spot_graph(rgps: list, multigenics: set, output: Path, spot_graph: bool = False, overlapping_match: int = 2,
- set_size: int = 3, exact_match: int = 1) -> list:
+ :param g: spot graph
+ :param region: region in spot
+ :param borders: bordering families in spot
+    :return: name of the node that has been added
+ """
+ blocks = str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]],
+ key=lambda x: x[0]))
+ g.add_node(blocks)
+ try:
+ g.nodes[blocks]["nb_rgp"] += 1
+ g.nodes[blocks]["rgp"].add(region)
+ except KeyError:
+ g.nodes[blocks]["nb_rgp"] = 1
+ g.nodes[blocks]["border1"] = [gene.family for gene in borders[1]]
+ g.nodes[blocks]["border0"] = [gene.family for gene in borders[0]]
+ g.nodes[blocks]["rgp"] = {region}
+
+ return blocks
+
+def make_spot_graph(rgps: list, multigenics: set, overlapping_match: int = 2,
+ set_size: int = 3, exact_match: int = 1) -> nx.Graph:
"""
Create a spot graph from pangenome RGP
:param rgps: list of pangenome RGP
:param multigenics: pangenome graph multigenic persistent families
- :param output: Output directory to save the spot graph
- :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot
:param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes
:param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation
:param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs
- :return: list of computed spot
+ :return: spot graph
"""
- def add_new_node(g: nx.Graph, region: Region, borders: list):
- """
- Add bordering region as node to graph
-
- :param g: spot graph
- :param region: region in spot
- :param borders: bordering families in spot
- """
- blocks = str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]],
- key=lambda x: x[0]))
- g.add_node(blocks)
- try:
- g.nodes[blocks]["nb_rgp"] += 1
- g.nodes[blocks]["rgp"].add(region)
- except KeyError:
- g.nodes[blocks]["nb_rgp"] = 1
- g.nodes[blocks]["border1"] = [gene.family for gene in borders[1]]
- g.nodes[blocks]["border0"] = [gene.family for gene in borders[0]]
- g.nodes[blocks]["rgp"] = {region}
-
graph_spot = nx.Graph()
lost = 0
used = 0
@@ -115,7 +116,7 @@ def add_new_node(g: nx.Graph, region: Region, borders: list):
lost += 1
else:
used += 1
- add_new_node(graph_spot, rgp, border)
+ add_new_node_in_spot_graph(graph_spot, rgp, border)
logging.getLogger("PPanGGOLiN").info(f"{lost} RGPs were not used as they are on a contig border (or have "
f"less than {set_size} persistent gene families until the contig border)")
logging.getLogger("PPanGGOLiN").info(f"{used} RGPs are being used to predict spots of insertion")
@@ -128,24 +129,26 @@ def add_new_node(g: nx.Graph, region: Region, borders: list):
if check_sim([node_obj_i["border0"], node_obj_i["border1"]], [node_obj_j["border0"], node_obj_j["border1"]],
overlapping_match, set_size, exact_match):
graph_spot.add_edge(nodei, nodej)
- spots = []
- spot_id = 0
- for comp in nx.algorithms.components.connected_components(graph_spot):
- curr_spot = Spot(spot_id)
- spots.append(curr_spot)
- for node in comp:
- for region in graph_spot.nodes[node]["rgp"]:
- curr_spot.add(region)
- spot_id += 1
- if spot_graph:
+ return graph_spot
+
+def write_spot_graph(graph_spot: nx.Graph, outdir: Path, graph_formats: List[str], file_basename: str = "spotGraph"):
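+    """
+    Write the spot graph to file in the requested format(s).
+
+    :param graph_spot: spot graph to write
+    :param outdir: directory where the graph file(s) will be written
+    :param graph_formats: formats of the output graph, among 'gexf' and 'graphml'
+    :param file_basename: basename of the output graph file(s)
+    """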
for node in graph_spot.nodes:
- del graph_spot.nodes[node]["border0"]
- del graph_spot.nodes[node]["border1"]
- del graph_spot.nodes[node]["rgp"]
+ graph_spot.nodes[node]["border0"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border0"]])
+ graph_spot.nodes[node]["border1"] = ';'.join([fam.name for fam in graph_spot.nodes[node]["border1"]])
- nx.readwrite.gexf.write_gexf(graph_spot, output.as_posix() + "/spotGraph.gexf")
- return spots
+ graph_spot.nodes[node]["organisms"] = ';'.join({rgp.organism.name for rgp in graph_spot.nodes[node]["rgp"]})
+ graph_spot.nodes[node]["rgp"] = ';'.join([rgp.name for rgp in graph_spot.nodes[node]["rgp"]])
+
+ if "gexf" in graph_formats:
+ outfile = outdir / f"{file_basename}.gexf"
+ logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}')
+ nx.readwrite.gexf.write_gexf(graph_spot, outfile)
+
+ if "graphml" in graph_formats:
+ outfile = outdir / f"{file_basename}.graphml"
+ logging.getLogger("PPanGGOLiN").info(f'Writing spot graph in {outfile}')
+ nx.readwrite.graphml.write_graphml(graph_spot, outfile)
def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False):
@@ -162,14 +165,15 @@ def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False):
erase_pangenome(pangenome, spots=True)
-def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False, overlapping_match: int = 2,
+def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = False,
+                     graph_formats: List[str] = ['gexf'], overlapping_match: int = 2,
set_size: int = 3, exact_match: int = 1, force: bool = False, disable_bar: bool = False):
"""
Main function to predict hotspot
:param pangenome: Blank pangenome object
:param output: Output directory to save the spot graph
- :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot
+ :param spot_graph: Writes graph of pairs of blocks of single copy markers flanking RGPs from same hotspot
+    :param graph_formats: List of graph file formats in which to save the output
:param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes
:param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation
:param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs
@@ -191,13 +195,27 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals
# get multigenic gene families
logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...")
- multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])
+ multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"])
logging.getLogger("PPanGGOLiN").info("Detecting hotspots in the pangenome...")
- # predict spots
- spots = make_spot_graph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size,
+ # make spots
+ graph_spot = make_spot_graph(pangenome.regions, multigenics, overlapping_match, set_size,
exact_match)
+
+ spots = []
+ for spot_id, comp in enumerate(nx.algorithms.components.connected_components(graph_spot)):
+ curr_spot = Spot(spot_id)
+ spots.append(curr_spot)
+
+ for node in comp:
+ for region in graph_spot.nodes[node]["rgp"]:
+ curr_spot.add(region)
+ if spot_graph:
+ graph_spot.nodes[node]["spot_id"] = str(curr_spot)
+
+ if spot_graph:
+ write_spot_graph(graph_spot, output, graph_formats)
if len(spots) == 0:
logging.getLogger("PPanGGOLiN").warning("No spots were detected.")
@@ -206,10 +224,10 @@ def predict_hotspots(pangenome: Pangenome, output: Path, spot_graph: bool = Fals
for spot in spots:
pangenome.add_spot(spot)
pangenome.status["spots"] = "Computed"
- pangenome.parameters["spots"] = {}
- pangenome.parameters["spots"]["set_size"] = set_size
- pangenome.parameters["spots"]["overlapping_match"] = overlapping_match
- pangenome.parameters["spots"]["exact_match"] = exact_match
+ pangenome.parameters["spot"] = {}
+ pangenome.parameters["spot"]["set_size"] = set_size
+ pangenome.parameters["spot"]["overlapping_match"] = overlapping_match
+ pangenome.parameters["spot"]["exact_match_size"] = exact_match
def launch(args: argparse.Namespace):
@@ -222,9 +240,10 @@ def launch(args: argparse.Namespace):
pangenome.add_file(args.pangenome)
if args.spot_graph:
mk_outdir(args.output, args.force)
- predict_hotspots(pangenome, args.output, force=args.force, spot_graph=args.spot_graph,
+ predict_hotspots(pangenome, args.output, force=args.force,
+ spot_graph=args.spot_graph, graph_formats=args.graph_formats,
overlapping_match=args.overlapping_match, set_size=args.set_size,
- exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar)
+                     exact_match=args.exact_match_size, disable_bar=args.disable_prog_bar)
write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar)
@@ -257,7 +276,7 @@ def parser_spot(parser: argparse.ArgumentParser):
f"_PID{str(os.getpid())}"),
help="Output directory")
optional.add_argument("--spot_graph", required=False, action="store_true",
- help="Writes a graph in .gexf format of pairs of blocks of single copy markers flanking RGPs,"
+ help="Writes a graph of pairs of blocks of single copy markers flanking RGPs,"
" supposedly belonging to the same hotspot")
optional.add_argument("--overlapping_match", required=False, type=int, default=2,
help="The number of 'missing' persistent genes allowed when comparing flanking genes during "
@@ -269,7 +288,8 @@ def parser_spot(parser: argparse.ArgumentParser):
help="Number of perfectly matching flanking single copy markers required to associate RGPs "
"during hotspot computation (Ex: If set to 1, two RGPs are in the same hotspot "
"if both their 1st flanking genes are the same)")
-
+ optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+",
+ default=['gexf'], help="Format of the output graph.")
if __name__ == '__main__':
"""To test local change and allow using debugger"""
diff --git a/ppanggolin/__init__.py b/ppanggolin/__init__.py
index 6033ba1d..f8626afa 100755
--- a/ppanggolin/__init__.py
+++ b/ppanggolin/__init__.py
@@ -12,6 +12,7 @@
import ppanggolin.mod
import ppanggolin.context
import ppanggolin.workflow
+import ppanggolin.projection
# import ppanggolin.utility
import ppanggolin.meta
@@ -35,6 +36,7 @@
"spot": ppanggolin.RGP.spot.subparser,
"module": ppanggolin.mod.subparser,
"context": ppanggolin.context.subparser,
+ "projection":ppanggolin.projection.subparser,
"rgp_cluster":ppanggolin.RGP.rgp_cluster.subparser,
"metadata": ppanggolin.meta.subparser
}
diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
index af00f4cd..e95852ac 100644
--- a/ppanggolin/align/alignOnPang.py
+++ b/ppanggolin/align/alignOnPang.py
@@ -2,138 +2,274 @@
# coding:utf-8
# default libraries
+import time
from _io import TextIOWrapper
import logging
import tempfile
import subprocess
import argparse
-from collections import defaultdict
-from typing import List, Tuple, Set, Dict, IO
+from collections import defaultdict, Counter
+from typing import List, Tuple, Set, Dict, IO, Iterable
from pathlib import Path
+from tqdm import tqdm
+
# local libraries
from ppanggolin.formats import check_pangenome_info
from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.utils import mk_outdir, read_compressed_or_not
+from ppanggolin.utils import mk_outdir, read_compressed_or_not, create_tmpdir
from ppanggolin.pangenome import Pangenome
from ppanggolin.region import Spot
from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph
+from ppanggolin.formats.readBinaries import get_non_redundant_gene_sequences_from_file
+
+
+def create_mmseqs_db(seq_files: Iterable[Path], tmpdir: Path, basename="sequences") -> Path:
+ """
+ Create a MMseqs2 sequence database with the given fasta files.
+
+ :param seq_files: An iterable of path of FASTA files.
+ :param tmpdir: Path to the temporary directory where the database will be created.
+ :param basename: Prefix for the database file (default: "sequences").
+
+ :return: Path to the created MMseqs2 database file.
+ """
+
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, suffix=".DB", prefix=basename) as seqdb:
+ cmd = ["mmseqs", "createdb"] + [seq_file.as_posix() for seq_file in seq_files] + [seqdb.name, '--dbtype', '0']
+
+ logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
+        subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+ return Path(seqdb.name)
-def createdb(file_obj: TextIOWrapper, tmpdir: Path) -> IO:
+
+def translate_with_mmseqs(seqdb: Path, translation_table: int, cpu: int, tmpdir: Path) -> Path:
"""
- Create a MMseqs2 sequence database with the given fasta file
+ Translate nucleotide sequences in an MMseqs2 sequence database to amino acid sequences.
- :param file_obj: Fasta file
- :param tmpdir: temporary directory
+ :param seqdb: Path to the input MMseqs2 sequence database containing nucleotide sequences.
+ :param translation_table: The translation table to use for conversion.
+ :param cpu: Number of CPU cores to use for translation.
+ :param tmpdir: Path to the temporary directory for intermediate files.
- :return: DB file
+ :return: Path to the new MMseqs2 sequence database containing translated amino acid sequences.
"""
- seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir)
- cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0']
- logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
- subprocess.run(cmd, stdout=subprocess.DEVNULL)
- return seqdb
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, delete=False, prefix=seqdb.stem,
+ suffix=".aa.DB") as seqdb_aa:
+ cmd = ["mmseqs", "translatenucs", seqdb.as_posix(), seqdb_aa.name, "--translation-table",
+ f"{translation_table}", "--threads", str(cpu)]
+
+ logging.getLogger().debug(" ".join(cmd))
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+
+ return Path(seqdb_aa.name)
-def align_seq_to_pang(pang_file: IO, seq_file: TextIOWrapper, output: Path,
+
+def align_seq_to_pang(target_seq_file: Path, query_seq_files: Iterable[Path],
tmpdir: Path, cpu: int = 1, no_defrag: bool = False,
- identity: float = 0.8, coverage: float = 0.8) -> Path:
+ identity: float = 0.8, coverage: float = 0.8,
+ is_query_nt: bool = False, is_target_nt: bool = False, translation_table: int = None) -> Path:
"""
- Align pangenome sequences against fasta sequence
+ Align fasta sequence to pangenome sequences.
- :param pang_file: File with sequences in pangenome
- :param seq_file: File with sequences from input file
- :param output: Path of the output directory
+ :param target_seq_file: File with sequences of pangenome (target)
+ :param query_seq_files: Iterable of files with sequences from input file (query)
:param tmpdir: Temporary directory to align sequences
:param cpu: Number of available cpu
- :param no_defrag: Allow to pass the defragmentation step
+ :param no_defrag: Do not apply defragmentation
:param identity: minimal identity threshold for the alignment
:param coverage: minimal identity threshold for the alignment
+    :param is_query_nt: Whether the input (query) sequences are nucleotide sequences. If True, they are translated by MMseqs2.
+    :param is_target_nt: Whether the pangenome (target) sequences are nucleotide sequences. If True, they are translated by MMseqs2.
+ :param translation_table: Translation table to use, if sequences are nucleotide and need to be translated.
:return: Alignement result file
"""
- pang_db = createdb(pang_file, tmpdir)
- seq_db = createdb(seq_file, tmpdir)
- cov_mode = "0" # coverage of query and target
- if not no_defrag:
- cov_mode = "1" # coverage of target
- aln_db = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir)
- cmd = list(map(str, ["mmseqs", "search", seq_db.name, pang_db.name, aln_db.name, tmpdir, "-a",
- "--min-seq-id", identity, "-c", coverage, "--cov-mode", cov_mode, "--threads", cpu]))
- logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
- logging.getLogger("PPanGGOLiN").info("Aligning sequences to cluster representatives...")
- subprocess.run(cmd, stdout=subprocess.DEVNULL)
- outfile = output.absolute() / "input_to_pangenome_associations.blast-tab_tmp" # write a tmp file of the results
- cmd = list(map(str, ["mmseqs", "convertalis", seq_db.name, pang_db.name,
- aln_db.name, outfile, "--format-mode", "2"]))
- logging.getLogger("PPanGGOLiN").debug(" ".join(cmd))
- logging.getLogger("PPanGGOLiN").info("Extracting alignments...")
- subprocess.run(cmd, stdout=subprocess.DEVNULL)
- pang_db.close()
- seq_db.close()
- aln_db.close()
+ target_db = create_mmseqs_db([target_seq_file], tmpdir, basename="target_sequences")
+ query_db = create_mmseqs_db(query_seq_files, tmpdir, basename="query_sequences")
+
+ if is_target_nt:
+ logging.getLogger().debug(
+ f"Target sequences will be translated by mmseqs with translation table {translation_table}")
+ target_db = translate_with_mmseqs(target_db, translation_table, cpu, tmpdir)
+
+ if is_query_nt:
+ logging.getLogger().debug(
+ f"Query sequences will be translated by mmseqs with translation table {translation_table}")
+ query_db = translate_with_mmseqs(query_db, translation_table, cpu, tmpdir)
+
+ cov_mode = "2" # coverage of query
+ if no_defrag:
+ cov_mode = "0" # coverage of query and target
+
+ # mmseqs search command
+    # see https://github.com/soedinglab/MMseqs2/issues/373: a combination of parameters is used to avoid missing short proteins
- return outfile
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), prefix="aln_result_db_file", suffix=".aln.DB",
+ delete=False) as aln_db:
+ cmd = ["mmseqs", "search", query_db.as_posix(), target_db.as_posix(), aln_db.name, tmpdir.as_posix(), "-a",
+ "--min-seq-id", str(identity),
+ "-c", str(coverage), "--cov-mode", cov_mode, "--threads", str(cpu),
+ "--seed-sub-mat", "VTML40.out", "-s", "2", '--comp-bias-corr', "0", "--mask", "0", "-e", "1"]
+ logging.getLogger().info("Aligning sequences")
+ logging.getLogger().debug(" ".join(cmd))
-def read_alignments(aln_res: Path, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]:
+ start = time.time()
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+ align_time = time.time() - start
+ logging.getLogger().info(f"Done aligning sequences in {round(align_time, 2)} seconds")
+
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir, prefix="aln_result_db_file", suffix=".tsv",
+ delete=False) as outfile:
+ cmd = ["mmseqs", "convertalis", query_db.as_posix(), target_db.as_posix(), aln_db.name, outfile.name,
+ "--format-mode", "2"]
+
+ logging.getLogger().info("Extracting alignments...")
+ logging.getLogger().debug(" ".join(cmd))
+ subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True)
+
+ return Path(outfile.name)
+
+
+def map_input_gene_to_family_all_aln(aln_res: Path, outdir: Path,
+ pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]:
"""
- Read alignment result to link input sequence to pangenome
+    Read alignment result to link input sequences to pangenome gene families.
+    The alignment was made against all genes of the pangenome.
:param aln_res: Alignement result file
+ :param outdir: Output directory
:param pangenome: Input pangenome
- :return: Dictionnary with sequence link to pangenome gene families and actual name of resulting alignment file
+    :return: Dictionary linking input sequences to pangenome gene families, and the path to the cleaned alignment file
"""
+
seq2pang = {}
- outname = open(aln_res.absolute().as_posix().replace("_tmp", ""), "w") # write the actual result file
- with open(aln_res, "r") as alnFile:
+ aln_file_clean = outdir / "alignment_input_seqs_to_all_pangenome_genes.tsv" # write the actual result file
+ logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}')
+
+ with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl:
for line in alnFile:
- line = line.replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id
- outname.write(line)
- line = line.split()
- if seq2pang.get(line[0]) is None: # if no results were found yet
- seq2pang[line[0]] = pangenome.get_gene_family(line[1]) # then the best hit is the first one we see.
- outname.close()
- return seq2pang, outname.name
+ line_splitted = line.split()
+
+ line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id
+ line_splitted[0] = line_splitted[0].replace("ppanggolin_", "")
+
+ input_seq_id, gene_id = line_splitted[0:2]
+
+ aln_outfl.write("\t".join(line_splitted) + "\n")
+
+ if seq2pang.get(input_seq_id) is None: # if no results were found yet
+ family = pangenome.get_gene(gene_id).family
+ seq2pang[input_seq_id] = family # then the best hit is the first one we see.
+
+ return seq2pang, aln_file_clean
+
+
+def map_input_gene_to_family_rep_aln(aln_res: Path, outdir: Path,
+                                     pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], Path]:
+ """
+    Read alignment result to link input sequences to pangenome gene families.
+    The alignment was made against the representative sequences of the pangenome gene families.
+
+ :param aln_res: Alignement result file
+ :param outdir: Output directory
+ :param pangenome: Input pangenome
+
+    :return: Dictionary linking input sequences to pangenome gene families, and the path to the cleaned alignment file
+ """
+ seq2pang = {}
+ aln_file_clean = outdir / "alignment_input_seqs_to_pangenome_gene_families.tsv" # write the actual result file
+
+ logging.getLogger().debug(f'Writing alignment file in {aln_file_clean}')
+
+ with open(aln_res, "r") as alnFile, open(aln_file_clean, "w") as aln_outfl:
+ for line in alnFile:
+ line_splitted = line.split()
+
+ line_splitted[1] = line_splitted[1].replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id
+ line_splitted[0] = line_splitted[0].replace("ppanggolin_", "")
+
+ aln_outfl.write("\t".join(line_splitted) + "\n")
+
+ input_seq_id, gene_family_id = line_splitted[0:2]
+
+ if seq2pang.get(input_seq_id) is None: # if no results were found yet
+ family = pangenome.get_gene_family(gene_family_id) # then the best hit is the first one we see.
+ seq2pang[input_seq_id] = family
+
+ return seq2pang, aln_file_clean
-def get_seq(seq_file: TextIOWrapper) -> Set[str]:
+def get_seq_ids(seq_file: TextIOWrapper) -> Tuple[Set[str], bool]:
"""
- get sequence from sequence input file
+    Get sequence IDs from a sequence input file in FASTA format and guess the sequence type from the first 20 sequences.
- :param seq_file: file containing sequences
+ :param seq_file: A file object containing sequences in FASTA format.
- :return: set of sequences
+ :return: A tuple containing a set of sequence IDs and a boolean indicating if the sequences are nucleotide sequences.
"""
- seqset = set()
+ dna_expected_char = {'A', 'T', 'G', 'C', 'N'}
+ seq_set = set()
+ seq_count = 0
+ first_seq_concat = ""
+
for line in seq_file:
if line.startswith(">"):
- seqset.add(line[1:])
- return seqset
+ seq_set.add(line[1:].split()[0].strip())
+ seq_count += 1
+ elif seq_count <= 20:
+ first_seq_concat += line.strip()
+
+ char_counter = Counter(first_seq_concat)
+ is_nucleotide = all(char in dna_expected_char for char in char_counter)
+
+ return seq_set, is_nucleotide
-def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = ""):
+def write_gene_fam_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False):
"""
- Export the sequence of genes in families
+    Export the sequences of the gene families
:param pangenome: Pangenome containing families
:param file_obj: Temporary file where sequences will be written
:param add: Add prefix to sequence name
+ :param disable_bar: disable progress bar
"""
- for fam in pangenome.gene_families:
+ for fam in tqdm(pangenome.gene_families, unit="families", disable=disable_bar,
+ total=pangenome.number_of_gene_families):
file_obj.write(">" + add + fam.name + "\n")
file_obj.write(fam.sequence + "\n")
- file_obj.flush()
+ # file_obj.flush()
+
+
+def write_all_gene_sequences(pangenome: Pangenome, file_obj: IO, add: str = "", disable_bar: bool = False):
+ """
+    Export the sequences of the pangenome genes
+
+ :param pangenome: Pangenome containing genes
+ :param file_obj: Temporary file where sequences will be written
+ :param add: Add prefix to sequence name
+ :param disable_bar: disable progress bar
+
+ """
+
+ if pangenome.status["geneSequences"] == "inFile":
+ get_non_redundant_gene_sequences_from_file(pangenome.file, file_obj, add=add, disable_bar=disable_bar)
+ else:
+ # this should never happen if the pangenome has been properly checked before launching this function.
+ raise Exception("The pangenome does not include gene sequences")
-def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> str:
+def project_and_write_partition(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path:
"""
- Project the partition of each sequence from the input file
+    Project the partition of each sequence from the input file and write them to a file
- :param seq_to_pang: dictionnary which link sequence and pangenome
+    :param seqid_to_gene_family: dictionary linking input sequence IDs to pangenome gene families
:param seq_set: input sequences
:param output: Path of the output directory
@@ -142,13 +278,35 @@ def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], out
partition_proj = output.absolute() / "sequences_partition_projection.tsv"
with open(partition_proj, "w") as partProjFile:
- for key, pang_fam in seq_to_pang.items():
- partProjFile.write(key + "\t" + pang_fam.named_partition + "\n")
- for remaining_seq in (seq_to_pang.keys() & seq_set):
+ for input_seq, gene_fam in seqid_to_gene_family.items():
+ partProjFile.write(input_seq + "\t" + gene_fam.named_partition + "\n")
+ for remaining_seq in seq_set - seqid_to_gene_family.keys():
partProjFile.write(remaining_seq + "\tcloud\n") # if there is no hit, it's going to be cloud genes.
return partition_proj
+def write_gene_to_gene_family(seqid_to_gene_family: Dict[str, GeneFamily], seq_set: Set[str], output: Path) -> Path:
+ """
+ Write input gene to gene family.
+
+    :param seqid_to_gene_family: dictionary linking input sequence IDs to pangenome gene families
+ :param seq_set: input sequences
+ :param output: Path of the output directory
+
+    :return: Path to the file mapping input genes to gene families
+ """
+
+ gene_fam_map_file = output.absolute() / "gene_to_gene_family.tsv"
+    with open(gene_fam_map_file, "w") as gene_fam_map_fl:
+        for input_seq, gene_fam in seqid_to_gene_family.items():
+            gene_fam_map_fl.write(f"{input_seq}\t{gene_fam.name}\n")
+
+        for remaining_seq in seq_set - seqid_to_gene_family.keys():
+            gene_fam_map_fl.write(f"{remaining_seq}\t{remaining_seq}\n")  # if there is no hit, the gene family is the gene itself.
+
+ return gene_fam_map_file
+
+
def get_fam_to_rgp(pangenome, multigenics: set) -> dict:
"""
Associate families to the RGP they belong to, and those they are bordering
@@ -162,7 +320,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict:
for rgp in pangenome.regions:
for fam in rgp.families:
fam2rgp[fam].append(rgp.name)
- for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"],
+ for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"],
multigenics) for gene in border]:
fam2rgp[fam].append(rgp.name)
return fam2rgp
@@ -187,7 +345,7 @@ def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) \
for rgp in spot.regions:
fams |= set(rgp.families)
fams_border |= set([gene.family for border in # Set of families in border of spot
- rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics)
+ rgp.get_bordering_genes(pangenome.parameters["spot"]["set_size"], multigenics)
for gene in border])
for fam in fams:
fam2spot[fam].append(spot)
@@ -235,7 +393,7 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
:return:
"""
logging.getLogger("PPanGGOLiN").info("Writing RGP and spot information related to hits in the pangenome")
- multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])
+ multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"])
finfo = open(output / "info_input_seq.tsv", "w")
finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n")
@@ -256,8 +414,8 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
drawn_spots.add(spot)
logging.getLogger("PPanGGOLiN").info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization "
f"related to hits of the input sequences...")
- draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spots"]["overlapping_match"],
- pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"],
+ draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spot"]["overlapping_match"],
+ pangenome.parameters["spot"]["exact_match_size"], pangenome.parameters["spot"]["set_size"],
disable_bar=disable_bar)
fam2mod = {} # fam2module
@@ -272,54 +430,112 @@ def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: Path, draw_rel
f"{output / 'info_input_seq.tsv'}")
-def get_seq2pang(pangenome: Pangenome, sequence_file: Path, output: Path, tmpdir: Path, cpu: int = 1,
- no_defrag: bool = False, identity: float = 0.8, coverage: float = 0.8) -> Tuple[set, str, dict]:
+def get_input_seq_to_family_with_rep(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path,
+ tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
+ identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
+ disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
"""
- Assign a pangenome gene family to the input sequences.
+ Assign gene families from a pangenome to input sequences.
- :param pangenome: Pangenome with gene families to align with the given input sequences
- :param sequence_file: Path to sequences in a .fasta file to align with the given Pangenome
- :param output: Path of the output directory
- :param tmpdir: Temporary directory
- :param cpu: number of CPU cores to use
- :param no_defrag: do not use the defrag workflow if true
- :param identity: minimal identity threshold for the alignment
- :param coverage: minimal identity threshold for the alignment
+ This function aligns input sequences to gene families in a pangenome using MMseqs2 and assigns them
+ to appropriate gene families based on alignment results.
+
+ :param pangenome: Annotated pangenome containing gene families.
+ :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
+ :param output: Path to the output directory where alignment results will be stored.
+ :param tmpdir: Temporary directory for intermediate files.
+    :param is_input_seq_nt: Whether the input sequences are nucleotide sequences.
+ :param cpu: Number of CPU cores to use for the alignment (default: 1).
+ :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
+ :param identity: Minimum identity threshold for the alignment (default: 0.8).
+ :param coverage: Minimum coverage threshold for the alignment (default: 0.8).
+ :param translation_table: Translation table to use if sequences need to be translated (default: 11).
+ :param disable_bar: If True, disable the progress bar.
+
+ :return: A tuple containing the path to the alignment result file,
+ and a dictionary mapping input sequences to gene families.
- :return: sequence set, blast-tab result file string, and sequences aligned with families
"""
- tmp_pang_file = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir)
+    # delete=False so that temporary files can be kept. If temporary files are not kept, the whole tmpdir is removed, so there is no need to delete them individually.
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
+ prefix="representative_genes", suffix=".faa") as tmp_pang_file:
+ logging.getLogger().debug(f'Write gene family sequences in {tmp_pang_file.name}')
+ write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
- write_gene_fam_sequences(pangenome, tmp_pang_file, add="ppanggolin_")
+ align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files,
+ tmpdir=tmpdir, cpu=cpu,
+ no_defrag=no_defrag, identity=identity, coverage=coverage,
+ is_query_nt=is_input_seq_nt, is_target_nt=False,
+ translation_table=translation_table)
- with read_compressed_or_not(sequence_file) as seqFileObj:
- seq_set = get_seq(seqFileObj)
- align_file = align_seq_to_pang(tmp_pang_file, seqFileObj, output, tmpdir, cpu, no_defrag, identity, coverage)
+ seq2pang, align_file = map_input_gene_to_family_rep_aln(align_file, output, pangenome)
+
+ return align_file, seq2pang
- seq2pang, align_file = read_alignments(align_file, pangenome)
- tmp_pang_file.close()
+def get_input_seq_to_family_with_all(pangenome: Pangenome, sequence_files: Iterable[Path], output: Path,
+ tmpdir: Path, is_input_seq_nt: bool, cpu: int = 1, no_defrag: bool = False,
+ identity: float = 0.8, coverage: float = 0.8, translation_table: int = 11,
+ disable_bar: bool = False) -> Tuple[Path, Dict[str, GeneFamily]]:
+ """
+ Assign gene families from a pangenome to input sequences.
- return seq_set, align_file, seq2pang
+    This function aligns input sequences to all genes of the pangenome using MMseqs2 and assigns them
+    to gene families based on alignment results.
+ :param pangenome: Annotated pangenome containing genes.
+ :param sequence_files: Iterable of paths of FASTA files containing input sequences to align.
+ :param output: Path to the output directory where alignment results will be stored.
+ :param tmpdir: Temporary directory for intermediate files.
+    :param is_input_seq_nt: Whether the input sequences are nucleotide sequences.
+ :param cpu: Number of CPU cores to use for the alignment (default: 1).
+ :param no_defrag: If True, the defragmentation workflow is skipped (default: False).
+ :param identity: Minimum identity threshold for the alignment (default: 0.8).
+ :param coverage: Minimum coverage threshold for the alignment (default: 0.8).
+ :param translation_table: Translation table to use if sequences need to be translated (default: 11).
+ :param disable_bar: If True, disable the progress bar.
-def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8,
- coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False,
- draw_related: bool = False, tmpdir: Path = None, disable_bar: bool = False):
+ :return: A tuple containing the path to the alignment result file,
+ and a dictionary mapping input sequences to gene families.
"""
- Main function to align pangenome sequences with fasta file using MMSeqs2
- :param pangenome: Pangenome with gene families to align with the given input sequences
- :param sequence_file: Path to sequences in a .fasta file to align with the given Pangenome
- :param output: Path of the output directory
- :param identity: minimal identity threshold for the alignment
- :param coverage: minimal coverage threshold for the alignment
- :param no_defrag: do not use the defrag workflow if true
- :param cpu: number of CPU cores to use
- :param getinfo: Extract info related to the best hit of each query, such as the RGP it is in, or the spots.
- :param draw_related: Draw figures and graphs in a gexf format of spots associated to the input sequences
- :param tmpdir: Temporary directory
- :param disable_bar: Disable the progresse bar
+ with tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.as_posix(), delete=False,
+ prefix="all_pangenome_genes", suffix=".fna") as tmp_pang_file:
+ logging.getLogger().debug(f'Write all pangenome gene sequences in {tmp_pang_file.name}')
+ write_all_gene_sequences(pangenome, tmp_pang_file, add="ppanggolin_", disable_bar=disable_bar)
+
+ align_file = align_seq_to_pang(target_seq_file=Path(tmp_pang_file.name), query_seq_files=sequence_files,
+ tmpdir=tmpdir, cpu=cpu,
+ no_defrag=no_defrag, identity=identity, coverage=coverage,
+ is_query_nt=is_input_seq_nt, is_target_nt=True,
+ translation_table=translation_table)
+
+ seq2pang, align_file = map_input_gene_to_family_all_aln(align_file, output, pangenome)
+
+ return align_file, seq2pang
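+
+# Illustrative call (hypothetical paths and a pre-loaded Pangenome `pan`; shown only
+# to clarify the expected inputs and outputs, not part of the module):
+#   align_file, seq2pang = get_input_seq_to_family_with_all(
+#       pangenome=pan, sequence_files=[Path("queries.fasta")], output=Path("out"),
+#       tmpdir=Path("tmp"), is_input_seq_nt=True, cpu=4)
+#   # seq2pang maps each query sequence ID to its best-matching GeneFamily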
+
+
+def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: float = 0.8,
+ coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False,
+ use_representatives: bool = False, draw_related: bool = False, translation_table: int = 11,
+ tmpdir: Path = None, disable_bar: bool = False, keep_tmp: bool = False):
+ """
+ Aligns pangenome sequences with sequences in a FASTA file using MMSeqs2.
+
+ :param pangenome: Pangenome object containing gene families to align with the input sequences.
+ :param sequence_file: Path to a FASTA file containing sequences to align with the pangenome.
+ :param output: Path to the output directory.
+ :param identity: Minimum identity threshold for the alignment.
+ :param coverage: Minimum coverage threshold for the alignment.
+ :param no_defrag: If True, the defrag workflow will not be used.
+ :param cpu: Number of CPU cores to use.
+ :param getinfo: If True, extract information related to the best hit of each query, such as the RGP it is in or the spots.
+ :param use_representatives: If True, use representative sequences of gene families rather than all sequences to align input genes.
+ :param draw_related: If True, draw figures and graphs in a gexf format of spots associated with the input sequences.
+ :param translation_table: Translation table ID for nucleotide sequences.
+ :param tmpdir: Temporary directory for intermediate files.
+ :param disable_bar: If True, disable the progress bar.
+ :param keep_tmp: If True, keep temporary files.
"""
tmpdir = Path(tempfile.gettempdir()) if tmpdir is None else tmpdir
@@ -339,21 +555,34 @@ def align(pangenome: Pangenome, sequence_file: Path, output: Path, identity: flo
else:
check_pangenome_info(pangenome, need_families=True, disable_bar=disable_bar)
- # TODO add possibility to keep_tmp
- new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
- tmp_path = Path(new_tmpdir.name)
- seq_set, align_file, seq2pang = get_seq2pang(pangenome, sequence_file, output, tmp_path, cpu, no_defrag, identity,
- coverage)
+ with read_compressed_or_not(sequence_file) as seqFileObj:
+ seq_set, is_nucleotide = get_seq_ids(seqFileObj)
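+ # get_seq_ids returns the set of query IDs and a flag telling whether the FASTA
+ # contains nucleotide sequences (used downstream to set up the MMseqs2 search)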
+
+ with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
+
+ if use_representatives:
+ align_file, seq2pang = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output, new_tmpdir,
+ is_input_seq_nt=is_nucleotide,
+ cpu=cpu, no_defrag=no_defrag, identity=identity,
+ coverage=coverage,
+ translation_table=translation_table,
+ disable_bar=disable_bar)
+ else:
+ align_file, seq2pang = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=[sequence_file],
+ output=output, tmpdir=new_tmpdir,
+ is_input_seq_nt=is_nucleotide,
+ cpu=cpu, no_defrag=no_defrag, identity=identity,
+ coverage=coverage,
+ translation_table=translation_table,
+ disable_bar=disable_bar)
if getinfo or draw_related: # TODO Add getinfo to function and remove if
get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar)
- part_proj = project_partition(seq2pang, seq_set, output) # write the partition assignation only
- logging.getLogger("PPanGGOLiN").info(f"sequences partition projection : '{part_proj}'")
- logging.getLogger("PPanGGOLiN").info(
- f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.")
- logging.getLogger("PPanGGOLiN").info(f"Blast-tab file of the alignment : '{align_file}'")
- new_tmpdir.cleanup()
+ part_proj = project_and_write_partition(seq2pang, seq_set, output) # write the partition assignation only
+ logging.getLogger().info(f"sequences partition projection : '{part_proj}'")
+ logging.getLogger().info(f"{len(seq2pang)} sequences over {len(seq_set)} have at least one hit in the pangenome.")
+ logging.getLogger().info(f"Blast-tab file of the alignment : '{align_file}'")
def launch(args: argparse.Namespace):
@@ -365,9 +594,12 @@ def launch(args: argparse.Namespace):
mk_outdir(args.output, args.force)
pangenome = Pangenome()
pangenome.add_file(args.pangenome)
- align(pangenome=pangenome, sequence_file=args.sequences, output=args.output, tmpdir=args.tmpdir,
- identity=args.identity, coverage=args.coverage, no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo,
- draw_related=args.draw_related, disable_bar=args.disable_prog_bar)
+ align(pangenome=pangenome, sequence_file=args.sequences, output=args.output,
+ tmpdir=args.tmpdir, identity=args.identity, coverage=args.coverage,
+ no_defrag=args.no_defrag, cpu=args.cpu, getinfo=args.getinfo,
+ use_representatives=args.fast, draw_related=args.draw_related,
+ translation_table=args.translation_table,
+ disable_bar=args.disable_prog_bar, keep_tmp=args.keep_tmp)
def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
@@ -406,6 +638,9 @@ def parser_align(parser: argparse.ArgumentParser):
help="min identity percentage threshold")
optional.add_argument('--coverage', required=False, type=float, default=0.8,
help="min coverage percentage threshold")
+ optional.add_argument("--fast", required=False, action="store_true",
+ help="Use representative sequences of gene families for input gene alignment. "
+ "This option is faster but may be less sensitive. By default, all pangenome genes are used.")
optional.add_argument("--translation_table", required=False, default="11",
help="Translation table (genetic code) to use.")
optional.add_argument("--getinfo", required=False, action="store_true",
@@ -421,11 +656,13 @@ def parser_align(parser: argparse.ArgumentParser):
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
help="directory for storing temporary files")
+ optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
+ help="Keeping temporary files (useful for debugging).")
if __name__ == '__main__':
"""To test local change and allow using debugger"""
- from ppanggolin.utils import check_log, set_verbosity_level, add_common_arguments
+ from ppanggolin.utils import set_verbosity_level, add_common_arguments
main_parser = argparse.ArgumentParser(
description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors",
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
index 6c5ca044..3a2b7680 100644
--- a/ppanggolin/annotate/annotate.py
+++ b/ppanggolin/annotate/annotate.py
@@ -4,24 +4,28 @@
# default libraries
import argparse
import logging
+from concurrent.futures import ProcessPoolExecutor
from multiprocessing import get_context
import os
from pathlib import Path
import tempfile
import time
-from typing import List, Set, Tuple
+from typing import List, Set, Tuple, Iterable
# installed libraries
from tqdm import tqdm
# local libraries
-from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence
+from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence, init_contig_counter, contig_counter
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files
from ppanggolin.formats import write_pangenome
+ctg_counter = contig_counter
+
+
def check_annotate_args(args: argparse.Namespace):
"""Check That the given arguments are usable
@@ -104,6 +108,8 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
:return: Organism complete and true for sequence in file
"""
+ global ctg_counter
+
organism = Organism(organism_name)
logging.getLogger("PPanGGOLiN").debug(f"Extracting genes informations from the given gbff {gbff_file_path.name}")
# revert the order of the file, to read the first line first.
@@ -115,14 +121,15 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
# beginning of contig
is_circ = False
contig_id = None
+ contig_len = None
if line.startswith('LOCUS'):
if "CIRCULAR" in line.upper():
# this line contains linear/circular word telling if the dna sequence is circularized or not
is_circ = True
# TODO maybe it could be a good thing to add a elif for linear
# and if circular or linear are not found raise a warning
-
contig_id = line.split()[1]
+ contig_len = int(line.split()[2])
# If contig_id is not specified in VERSION afterward like with Prokka, in that case we use the one in LOCUS
while not line.startswith('FEATURES'):
if line.startswith('VERSION') and line[12:].strip() != "":
@@ -134,8 +141,12 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
try:
contig = organism.get(contig_id)
except KeyError:
- contig = Contig(contig_id, True if contig_id in circular_contigs or is_circ else False)
+ with contig_counter.get_lock():
+ contig = Contig(contig_counter.value, contig_id,
+ True if contig_id in circular_contigs or is_circ else False)
+ contig_counter.value += 1
organism.add(contig)
+ contig.length = contig_len
# start of the feature object.
dbxref = set()
gene_name = ""
@@ -185,9 +196,6 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
pass
# don't know what to do with that, ignoring for now.
# there is a protein with a frameshift mecanism.
- elif curr_type == 'source': # Get Contig length
- start, end = map(int, map(str.strip, line[21:].split('..')))
- contig.length = end - start + 1
elif useful_info: # current info goes to current objtype, if it's useful.
if line[21:].startswith("/db_xref"):
dbxref.add(line.split("=")[1].replace('"', '').strip())
@@ -230,6 +238,9 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
while not line.startswith('//'):
sequence += line[10:].replace(" ", "").strip().upper()
line = lines.pop()
+
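+ # sanity check: the length parsed from the LOCUS line must match the sequence length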
+ if contig.length != len(sequence):
+ raise ValueError("The contig lenght defined is different than the sequence length")
# get each gene's sequence.
for gene in contig.genes:
gene.add_sequence(get_dna_sequence(sequence, gene))
@@ -248,6 +259,8 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str]
:return: Organism object and if there are sequences associated or not
"""
+ global ctg_counter
+
(gff_seqname, _, gff_type, gff_start, gff_end, _, gff_strand, _, gff_attribute) = range(0, 9)
# Missing values: source, score, frame. They are unused.
@@ -300,9 +313,12 @@ def get_id_attribute(attributes_dict: dict) -> str:
has_fasta = True
elif line.startswith('sequence-region', 2, 17):
fields = [el.strip() for el in line.split()]
- contig = Contig(fields[1], True if fields[1] in circular_contigs else False)
+ with contig_counter.get_lock():
+ contig = Contig(contig_counter.value, fields[1],
+ True if fields[1] in circular_contigs else False)
+ contig_counter.value += 1
org.add(contig)
- contig.length = int(fields[-1]) - int(fields[3]) + 1
+ contig.length = int(fields[-1]) - int(fields[2]) + 1
continue
elif line.startswith('#'): # comment lines to be ignores by parsers
@@ -340,6 +356,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
genetic_code=genetic_code)
gene.fill_parents(org, contig)
gene_counter += 1
+ contig.add(gene)
+
elif "RNA" in fields_gff[gff_type]:
rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4))
rna.fill_annotations(start=int(fields_gff[gff_start]), stop=int(fields_gff[gff_end]),
@@ -347,11 +365,15 @@ def get_id_attribute(attributes_dict: dict) -> str:
product=product, local_identifier=gene_id)
rna.fill_parents(org, contig)
rna_counter += 1
+ contig.add_rna(rna)
# GET THE FASTA SEQUENCES OF THE GENES
if has_fasta and fasta_string != "":
contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length
for contig in org.contigs:
+ if contig.length != len(contig_sequences[contig.name]):
+ raise ValueError("The contig lenght defined is different than the sequence length")
+
for gene in contig.genes:
gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene))
for rna in contig.RNAs:
@@ -359,18 +381,8 @@ def get_id_attribute(attributes_dict: dict) -> str:
return org, has_fasta
-def launch_read_anno(args: Tuple[str, Path, List[str], bool]) -> Tuple[Organism, bool]:
- """ Allow to launch in multiprocessing the read of genome annotation
-
- :param args: Pack of argument for annotate_organism function
+def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str], pseudo: bool = False) -> Tuple[Organism, bool]:
- :return: Organism object for pangenome
- """
- return read_anno_file(*args)
-
-
-def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[str],
- pseudo: bool = False) -> Tuple[Organism, bool]:
"""
Read a GBFF file for one organism
@@ -379,8 +391,9 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[st
:param circular_contigs: list of sequence in contig
:param pseudo: allow to read pseudogenes
- :return: Annotated organism for pangenome
+ :return: Annotated organism for pangenome and true for sequence in file
"""
+ global ctg_counter
filetype = detect_filetype(filename)
if filetype == "gff":
try:
@@ -395,7 +408,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: List[st
else: # Fasta type obligatory because unknow raise an error in detect_filetype function
raise Exception("Wrong file type provided. This looks like a fasta file. "
"You may be able to use --fasta instead.")
-
+
def chose_gene_identifiers(pangenome: Pangenome) -> bool:
"""
@@ -406,19 +419,36 @@ def chose_gene_identifiers(pangenome: Pangenome) -> bool:
:return: Boolean stating True if local identifiers are used, and False otherwise
"""
+
+ if local_identifiers_are_unique(pangenome.genes):
+
+ for gene in pangenome.genes:
+ gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers
+ gene.local_identifier = "" # this is now useless, setting it to default value
+ pangenome._mk_gene_getter() # re-build the gene getter
+ return True
+
+ else:
+ return False
+
+
+def local_identifiers_are_unique(genes: Iterable[Gene]) -> bool:
+ """
+ Check if the local identifiers of genes are unique in order to decide if they should be used as gene IDs.
+
+ :param genes: Iterable of gene objects
+
+ :return: Boolean stating True if local identifiers are unique, and False otherwise
+ """
gene_id_2_local = {}
local_to_gene_id = {}
- for gene in pangenome.genes:
+ for gene in genes:
gene_id_2_local[gene.ID] = gene.local_identifier
local_to_gene_id[gene.local_identifier] = gene.ID
if len(local_to_gene_id) != len(gene_id_2_local):
# then, there are non unique local identifiers
return False
# if we reach this line, local identifiers are unique within the pangenome
- for gene in pangenome.genes:
- gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers
- gene.local_identifier = "" # this is now useless, setting it to default value
- pangenome._mk_gene_getter() # re-build the gene getter
return True
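+
+# Example of the underlying idea (hypothetical identifiers): two genes that share
+# the local id "locus_1" but have distinct gene IDs collapse to one key in
+# local_to_gene_id, so the two dict lengths differ and the function returns False.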
@@ -433,6 +463,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
:param pseudo: allow to read pseudogenes
:param disable_bar: Disable the progress bar
"""
+
logging.getLogger("PPanGGOLiN").info(f"Reading {organisms_file.name} the list of organism files ...")
pangenome.status["geneSequences"] = "Computed"
@@ -445,12 +476,22 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
if not org_path.exists(): # Check tsv sanity test if it's not one it's the other
org_path = organisms_file.parent.joinpath(org_path)
args.append((elements[0], org_path, elements[2:], pseudo))
- with get_context('fork').Pool(cpu) as p:
- for org, flag in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args),
- disable=disable_bar):
- pangenome.add_organism(org)
- if not flag:
- pangenome.status["geneSequences"] = "No"
+
+ with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu,
+ initializer=init_contig_counter, initargs=(contig_counter, )) as executor:
+ with tqdm(total=len(args), unit="file", disable=disable_bar) as progress:
+ futures = []
+
+ for fn_args in args:
+ future = executor.submit(read_anno_file, *fn_args)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
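+ # results are collected in submission order; the done-callback above only
+ # advances the progress bar and never consumes the results themselves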
+ for future in futures:
+ org, flag = future.result()
+ pangenome.add_organism(org)
+ if not flag:
+ pangenome.status["geneSequences"] = "No"
# decide whether we use local ids or ppanggolin ids.
used_local_identifiers = chose_gene_identifiers(pangenome)
@@ -462,10 +503,10 @@ def read_annotations(pangenome: Pangenome, organisms_file: Path, cpu: int = 1, p
"PPanGGOLiN will use self-generated identifiers.")
pangenome.status["genomesAnnotated"] = "Computed"
- pangenome.parameters["annotation"] = {}
- pangenome.parameters["annotation"]["used_local_identifiers"] = used_local_identifiers
- pangenome.parameters["annotation"]["read_pseudogenes"] = pseudo
- pangenome.parameters["annotation"]["read_annotations_from_file"] = True
+ pangenome.parameters["annotate"] = {}
+ pangenome.parameters["annotate"]["# used_local_identifiers"] = used_local_identifiers
+ pangenome.parameters["annotate"]["use_pseudo"] = pseudo
+ pangenome.parameters["annotate"]["# read_annotations_from_file"] = True
def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]):
@@ -510,16 +551,6 @@ def get_gene_sequences_from_fastas(pangenome: Pangenome, fasta_files: List[Path]
pangenome.status["geneSequences"] = "Computed"
-def launch_annotate_organism(pack: Tuple[str, Path, List[str], str, int, bool, str, bool, str]) -> Organism:
- """ Allow to launch in multiprocessing the genome annotation
-
- :param pack: Pack of argument for annotate_organism function
-
- :return: Organism object for pangenome
- """
- return annotate_organism(*pack)
-
-
def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu: int = 1, translation_table: int = 11,
kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False,
procedure: str = None, disable_bar: bool = False):
@@ -556,22 +587,29 @@ def annotate_pangenome(pangenome: Pangenome, fasta_list: Path, tmpdir: str, cpu:
raise Exception("There are no genomes in the provided file")
logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...")
- with get_context('fork').Pool(processes=cpu) as p:
- for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome",
- total=len(arguments), disable=disable_bar):
- pangenome.add_organism(organism)
- p.close()
- p.join()
+ with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu,
+ initializer=init_contig_counter, initargs=(contig_counter,)) as executor:
+ with tqdm(total=len(arguments), unit="genome", disable=disable_bar) as progress:
+ futures = []
+
+ for fn_args in arguments:
+ future = executor.submit(annotate_organism, *fn_args)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ for future in futures:
+ pangenome.add_organism(future.result())
logging.getLogger("PPanGGOLiN").info("Done annotating genomes")
pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated.
pangenome.status["geneSequences"] = "Computed" # the gene objects have their respective gene sequences.
- pangenome.parameters["annotation"] = {}
- pangenome.parameters["annotation"]["remove_Overlapping_CDS"] = allow_overlap
- pangenome.parameters["annotation"]["annotate_RNA"] = True if not norna else False
- pangenome.parameters["annotation"]["kingdom"] = kingdom
- pangenome.parameters["annotation"]["translation_table"] = translation_table
- pangenome.parameters["annotation"]["read_annotations_from_file"] = False
+ pangenome.parameters["annotate"] = {}
+ pangenome.parameters["annotate"]["norna"] = norna
+ pangenome.parameters["annotate"]["kingdom"] = kingdom
+ pangenome.parameters["annotate"]["translation_table"] = translation_table
+ pangenome.parameters["annotate"]["prodigal_procedure"] = None if procedure is None else procedure
+ pangenome.parameters["annotate"]["allow_overlap"] = allow_overlap
+ pangenome.parameters["annotate"]["# read_annotations_from_file"] = False
def launch(args: argparse.Namespace):
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
index c21dbdfd..18e512ab 100644
--- a/ppanggolin/annotate/synta.py
+++ b/ppanggolin/annotate/synta.py
@@ -6,17 +6,30 @@
import os
import tempfile
from io import TextIOWrapper
+from multiprocessing import Value
from subprocess import Popen, PIPE
import ast
from collections import defaultdict
from typing import Dict, List, Union
from pathlib import Path
+# installed libraries
+from pyrodigal import GeneFinder, Sequence
+
# local libraries
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import is_compressed, read_compressed_or_not
+contig_counter: Value = Value('i', 0)
+
+
+def init_contig_counter(value: Value):
+ """Initialize the contig counter for later use"""
+ global contig_counter
+ contig_counter = value
+
+
def reverse_complement(seq: str):
"""reverse complement the given dna sequence
@@ -61,51 +74,46 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict:
line_data = line.split()
start, stop = map(int, ast.literal_eval(line_data[2].replace("c", "")))
c += 1
- gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(3))
+ gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4))
gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+",
gene_type="tRNA", product=line_data[1] + line_data[4])
gene_objs[header].add(gene)
return gene_objs
-def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str = None) -> defaultdict:
+def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int = 11, use_meta: bool = False) -> defaultdict:
"""
- Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the found genes.
+ Launches Pyrodigal to annotate CDS. Takes contig sequences and uses the organism name to give IDs to the predicted genes.
- :param fna_file: file-like object containing the uncompressed fasta sequences
+ :param contig_sequences: Dict containing contig sequences for pyrodigal
:param org: Organism which will be annotated
:param code: Translation table (genetic code) to use.
- :param procedure: prodigal procedure used
+ :param use_meta: use meta procedure in Prodigal
:return: Annotated genes in a list of gene objects
"""
-
- locustag = org.name
- cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", procedure, "-q"]))
- logging.getLogger("PPanGGOLiN").debug(f"prodigal command : {' '.join(cmd)}")
- p = Popen(cmd, stdout=PIPE)
-
gene_objs = defaultdict(set)
- c = 0
- header = ""
- for line in p.communicate()[0].decode().split("\n"):
- if line.startswith("# Sequence Data: "):
- for data in line.split(";"):
- if data.startswith("seqhdr"):
- header = data.split("=")[1].replace('"', "").split()[0]
-
- elif line.startswith(">"):
- c += 1
- line_data = line[1:].split("_") # not considering the '>'
- gene = Gene(gene_id=locustag + "_CDS_" + str(c).zfill(4))
- gene.fill_annotations(start=int(line_data[1]), stop=int(line_data[2]), strand=line_data[3], gene_type="CDS",
- genetic_code=code)
- gene_objs[header].add(gene)
-
+ sequences = {contig_name: Sequence(sequence) for contig_name, sequence in contig_sequences.items()}
+ gene_finder = GeneFinder(
+ meta=use_meta, # '-p meta' if meta is true else '-p single'
+ closed=True, # -c: Closed ends. Do not allow genes to run off edges.
+ mask=True, # -m: Treat runs of N as masked sequence; don't build genes across them.
+ min_gene=120 # prevents errors with mmseqs translatenucs on sequences that are too short
+ )
+ gene_finder.train(max(sequences.values(), key=len), force_nonsd=False,
+ translation_table=code) # -g: Specify a translation table to use (default 11).
+ gene_counter = 1
+ for contig_name, sequence in sequences.items():
+ for pred in gene_finder.find_genes(sequence):
+ gene = Gene(gene_id=f"{org.name}_CDS_{str(gene_counter).zfill(4)}")
+ gene.fill_annotations(start=pred.begin, stop=pred.end, strand='-' if pred.strand == -1 else '+',
+ gene_type="CDS", genetic_code=code)
+ gene_counter += 1
+ gene_objs[contig_name].add(gene)
return gene_objs
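+
+# Note: pyrodigal replaces the external prodigal binary here; the GeneFinder is
+# trained once on the longest contig, then find_genes() is run on each contig.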
-def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict:
+def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict:
"""
Launches Infernal in hmmer-only mode to annotate rRNAs.
@@ -145,7 +153,7 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "
line_data = line.split()
strand = line_data[9]
start, stop = map(int, (line_data[8], line_data[7]) if strand == "-" else (line_data[7], line_data[8]))
- gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(3))
+ gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(4))
gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA",
product=" ".join(line_data[17:]))
gene_objs[line_data[2]].add(gene)
@@ -160,6 +168,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
:return: Dictionnary with contig_name as keys and contig sequence in values
"""
+ global contig_counter
try:
contigs = {}
contig_seq = ""
@@ -173,7 +182,9 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
try:
contig = org.get(line.split()[0][1:])
except KeyError:
- contig = Contig(line.split()[0][1:])
+ with contig_counter.get_lock():
+ contig = Contig(contig_counter.value, line.split()[0][1:])
+ contig_counter.value += 1
org.add(contig)
else:
contig_seq += line.strip()
@@ -213,25 +224,27 @@ def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrappe
return tmp_file
-def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, norna: bool = False,
- kingdom: str = "bacteria", code: int = 11, procedure: str = None) -> defaultdict:
+def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, contig_sequences: Dict[str, str],
+ tmpdir: str, norna: bool = False, kingdom: str = "bacteria",
+ code: int = 11, use_meta: bool = False) -> defaultdict:
"""
Runs the different software for the syntaxic annotation.
:param org: Organism which will be annotated
:param fasta_file: file-like object containing the uncompressed fasta sequences
+ :param contig_sequences: Dict containing contig sequences for pyrodigal
:param tmpdir: Path to temporary directory
:param norna: Use to avoid annotating RNA features.
:param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation.
:param code: Translation table (genetic code) to use.
- :param procedure: prodigal procedure used
+ :param use_meta: Use meta prodigal procedure
:return: list of genes in the organism
"""
# launching tools for syntaxic annotation
genes = defaultdict(list)
- for key, items in launch_prodigal(fna_file=fasta_file.name, org=org, code=code, procedure=procedure).items():
+ for key, items in launch_prodigal(contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta).items():
genes[key].extend(items)
if not norna:
for key, items in launch_aragorn(fna_file=fasta_file.name, org=org).items():
@@ -277,8 +290,7 @@ def overlap_filter(all_genes: defaultdict, allow_overlap: bool = False) -> defau
def get_dna_sequence(contig_seq: str, gene: Gene) -> str:
- """
- Return the gene sequence
+ """Return the gene sequence
:param contig_seq: Contig sequence
:param gene: Gene
@@ -317,13 +329,14 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str
if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj
fasta_file = write_tmp_fasta(contig_sequences, tmpdir)
if procedure is None: # prodigal procedure is not force by user
- all_contig_len = sum(len(contig) for contig in org.contigs)
- logging.getLogger("PPanGGOLiN").debug(all_contig_len)
- if all_contig_len < 20000: # case of short sequence
- procedure = "meta"
+ max_contig_len = max(len(contig) for contig in org.contigs)
+ if max_contig_len < 20000: # case of short sequence
+ use_meta = True
else:
- procedure = "single"
- genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code, procedure)
+ use_meta = False
+ else:
+ use_meta = (procedure == "meta")
+ genes = syntaxic_annotation(org, fasta_file, contig_sequences, tmpdir, norna, kingdom, code, use_meta)
genes = overlap_filter(genes, allow_overlap=allow_overlap)
for contig_name, genes in genes.items():
diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
index 77637015..4ed3ad2e 100644
--- a/ppanggolin/cluster/cluster.py
+++ b/ppanggolin/cluster/cluster.py
@@ -10,6 +10,7 @@
import argparse
from typing import TextIO, Tuple, Dict, Set
from pathlib import Path
+import time
# installed libraries
from networkx import Graph
@@ -23,6 +24,7 @@
from ppanggolin.formats.writeBinaries import write_pangenome, erase_pangenome
from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file
from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations
+from ppanggolin.utils import mk_outdir
# Global functions
@@ -55,7 +57,7 @@ def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: TextIO, force
check_pangenome_former_clustering(pangenome, force)
if pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
# we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric.
- write_gene_sequences_from_annotations(pangenome, tmp_file, add="ppanggolin_", disable_bar=disable_bar)
+ write_gene_sequences_from_annotations(pangenome.genes, tmp_file, add="ppanggolin_", disable_bar=disable_bar)
elif pangenome.status["geneSequences"] == "inFile":
get_gene_sequences_from_file(pangenome.file, tmp_file, add="ppanggolin_",
disable_bar=disable_bar) # write CDS sequences to the tmpFile
@@ -260,6 +262,8 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F
link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False
if link and len(gene_to_fam) != pangenome.number_of_genes: # then maybe there are genes with identical IDs
+ logging.getLogger("PPanGGOLiN").debug(f"gene_to_fam size: {len(gene_to_fam)}, "
+ f"Pangenome nb genes: {pangenome.number_of_genes}")
raise Exception("Something unexpected happened during clustering (have less genes clustered than genes "
"in the pangenome). A probable reason is that two genes in two different organisms have "
"the same IDs; If you are sure that all of your genes have non identical IDs, please post an "
@@ -280,24 +284,31 @@ def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = F
def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool = True, code: int = 11,
coverage: float = 0.8, identity: float = 0.8, mode: int = 1, force: bool = False,
- disable_bar: bool = False):
+ disable_bar: bool = False, keep_tmp_files: bool = False):
"""
- Main function to cluster pangenome gene sequences into families
-
- :param pangenome: Annotated Pangenome
- :param tmpdir: Path to temporary directory
- :param cpu: number of CPU cores to use
- :param defrag: Allow to remove fragment
- :param code: Genetic code used
- :param coverage: minimal coverage threshold for the alignment
- :param identity: minimal identity threshold for the alignment
- :param mode: MMseqs2 clustering mode
- :param force: force to write in the pangenome
- :param disable_bar: Allow to disable progress bar
+ Cluster gene sequences from an annotated pangenome into families.
+
+ :param pangenome: Annotated Pangenome object.
+ :param tmpdir: Path to a temporary directory for intermediate files.
+ :param cpu: Number of CPU cores to use for clustering.
+ :param defrag: Allow removal of fragmented sequences during clustering.
+ :param code: Genetic code used for sequence translation.
+ :param coverage: Minimum coverage threshold for sequence alignment during clustering.
+ :param identity: Minimum identity threshold for sequence alignment during clustering.
+ :param mode: Clustering mode (MMseqs2 mode).
+ :param force: Force writing clustering results back to the pangenome.
+ :param disable_bar: Disable the progress bar during clustering.
+ :param keep_tmp_files: Keep temporary files (useful for debugging).
"""
- newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
- tmp_path = Path(newtmpdir.name)
+ if keep_tmp_files:
+ dir_name = 'clustering_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S", time.localtime())
+ tmp_path = Path(tmpdir) / dir_name
+ mk_outdir(tmp_path, force=True)
+ else:
+ newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
+ tmp_path = Path(newtmpdir.name)
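+ # the timestamped directory above survives the run when keep_tmp_files is set;
+ # otherwise TemporaryDirectory.cleanup() removes it once clustering is done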
+
with open(tmp_path/'nucleotid_sequences', "w") as sequence_file:
check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar)
logging.getLogger("PPanGGOLiN").info("Clustering all of the genes sequences...")
@@ -312,7 +323,8 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
aln = align_rep(rep, tmp_path, cpu, coverage, identity)
genes2fam, fam2seq = refine_clustering(tsv, aln, fam2seq)
pangenome.status["defragmented"] = "Computed"
- newtmpdir.cleanup()
+ if not keep_tmp_files:
+ newtmpdir.cleanup()
read_fam2seq(pangenome, fam2seq)
read_gene2fam(pangenome, genes2fam, disable_bar=disable_bar)
@@ -322,9 +334,12 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
pangenome.parameters["cluster"] = {}
pangenome.parameters["cluster"]["coverage"] = coverage
pangenome.parameters["cluster"]["identity"] = identity
- pangenome.parameters["cluster"]["defragmentation"] = defrag
+ pangenome.parameters["cluster"]["mode"] = mode
+ pangenome.parameters["cluster"]["# defragmentation"] = defrag
+ pangenome.parameters["cluster"]["no_defrag"] = not defrag
+
pangenome.parameters["cluster"]["translation_table"] = code
- pangenome.parameters["cluster"]["read_clustering_from_file"] = False
+ pangenome.parameters["cluster"]["# read_clustering_from_file"] = False
# Read clustering
@@ -340,12 +355,12 @@ def mk_local_to_gene(pangenome: Pangenome) -> dict:
old_len = len(local_dict)
local_dict[gene.local_identifier] = gene
if len(local_dict) == old_len:
- if pangenome.parameters["annotation"]["read_annotations_from_file"] and not \
- pangenome.parameters["annotation"]["used_local_identifiers"]:
+ if pangenome.parameters["annotate"]["# read_annotations_from_file"] and not \
+ pangenome.parameters["annotate"]["# used_local_identifiers"]:
raise Exception(f"'{gene.local_identifier}' was found multiple times used as an identifier. "
f"The identifier of the genes (locus_tag, protein_id in gbff, ID in gff) were not "
f"unique throughout all of the files. It is thus impossible to differentiate the genes."
- f" To use this function while importing annotation, all identifiers MUST be unique "
+ f" To use this function while importing annotate, all identifiers MUST be unique "
f"throughout all of your genomes")
return {} # local identifiers are not unique.
return local_dict
@@ -433,7 +448,7 @@ def read_clustering(pangenome: Pangenome, families_tsv_file: Path, infer_singlet
if frag: # if there was fragment information in the file.
pangenome.status["defragmented"] = "Computed"
pangenome.parameters["cluster"] = {}
- pangenome.parameters["cluster"]["read_clustering_from_file"] = True
+ pangenome.parameters["cluster"]["# read_clustering_from_file"] = True
pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton
@@ -451,7 +466,7 @@ def launch(args: argparse.Namespace):
"creation. To infer singleton you should give a clustering")
clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table,
coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force,
- disable_bar=args.disable_prog_bar)
+ disable_bar=args.disable_prog_bar, keep_tmp_files=args.keep_tmp)
logging.getLogger("PPanGGOLiN").info("Done with the clustering")
else:
if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table,
@@ -510,6 +525,9 @@ def parser_clust(parser: argparse.ArgumentParser):
optional = parser.add_argument_group(title="Optional arguments")
optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
help="directory for storing temporary files")
+ optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
+ help="Keeping temporary files (useful for debugging).")
+
if __name__ == '__main__':
diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py
index 803e5c48..535bc9be 100644
--- a/ppanggolin/context/searchGeneContext.py
+++ b/ppanggolin/context/searchGeneContext.py
@@ -6,8 +6,12 @@
import logging
import tempfile
import time
+import os
+from typing import List, Dict, Tuple, Iterable, Hashable, Iterator, Set
+from itertools import chain
+from collections import defaultdict
from pathlib import Path
-from typing import Set
# installed libraries
from tqdm import tqdm
@@ -17,174 +21,482 @@
# local libraries
from ppanggolin.formats import check_pangenome_info
from ppanggolin.genome import Gene, Contig
-from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components
+from ppanggolin.utils import mk_outdir, restricted_float, create_tmpdir, read_compressed_or_not, extract_contig_window
from ppanggolin.pangenome import Pangenome
-from ppanggolin.align.alignOnPang import get_seq2pang, project_partition
+from ppanggolin.align.alignOnPang import project_and_write_partition, get_input_seq_to_family_with_rep, \
+ get_input_seq_to_family_with_all, get_seq_ids
from ppanggolin.region import GeneContext
+from ppanggolin.geneFamily import GeneFamily
+from ppanggolin.projection.projection import write_gene_to_gene_family
-def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequences: Path = None,
+def search_gene_context_in_pangenome(pangenome: Pangenome, output: Path, tmpdir: Path, sequence_file: Path = None,
families: Path = None, transitive: int = 4, identity: float = 0.5,
- coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False,
- cpu: int = 1, disable_bar=True):
+ coverage: float = 0.8, use_representatives: bool = False,
+ jaccard_threshold: float = 0.85,
+ window_size: int = 1, no_defrag: bool = False,
+ cpu: int = 1, graph_format: str = "graphml", disable_bar=True,
+ translation_table: int = 11, keep_tmp: bool = False):
"""
Main function to search common gene contexts between sequence set and pangenome families
:param pangenome: Pangenome containing GeneFamilies to align with sequence set
- :param sequences: Path to file containing the sequences
+ :param sequence_file: Path to file containing the sequences
:param families: Path to file containing families name
:param output: Path to output directory
:param tmpdir: Path to temporary directory
:param transitive: number of genes to check on both sides of a family aligned with an input sequence
:param identity: minimum identity threshold between sequences and gene families for the alignment
:param coverage: minimum coverage threshold between sequences and gene families for the alignment
- :param jaccard: Jaccard index to filter edges in graph
+ :param use_representatives: Use representative sequences of gene families rather than all sequences to align input genes
+ :param jaccard_threshold: Jaccard index threshold to filter edges in graph
+ :param window_size: Number of genes to consider in the gene context.
:param no_defrag: do not use the defrag workflow if true
:param cpu: Number of core used to process
+ :param graph_format: Format of the output context graph; can be "graphml" or "gexf"
:param disable_bar: Allow preventing bar progress print
+ :param translation_table: The translation table to use when the input sequences are nucleotide sequences.
+ :param keep_tmp: If True, keep temporary files.
"""
-
# check statuses and load info
- if sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
+ if sequence_file is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
raise Exception("Cannot use this function as your pangenome does not have gene families representatives "
"associated to it. For now this works only if the clustering is realised by PPanGGOLiN.")
check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=disable_bar)
- gene_families = {}
- fam_2_seq = None
- if sequences is not None:
+
+ families_of_interest = set()
+ family_2_input_seqid = {}
+ if sequence_file is not None:
# Alignment of sequences on pangenome families
- new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
- tmp_path = Path(new_tmpdir.name)
- seq_set, _, seq2pan = get_seq2pang(pangenome, sequences, output, tmp_path, cpu, no_defrag, identity, coverage)
- project_partition(seq2pan, seq_set, output)
- new_tmpdir.cleanup()
- for k, v in seq2pan.items():
- gene_families[v.name] = v
- fam_2_seq = fam2seq(seq2pan)
+ with read_compressed_or_not(sequence_file) as seqFileObj:
+ seq_set, is_nucleotide = get_seq_ids(seqFileObj)
+
+ logging.debug(f"Input sequences are {'nucleotide' if is_nucleotide else 'protein'} sequences")
+
+ with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
+
+ if use_representatives:
+ _, seqid2fam = get_input_seq_to_family_with_rep(pangenome, [sequence_file], output,
+ new_tmpdir, is_input_seq_nt=is_nucleotide,
+ cpu=cpu, no_defrag=no_defrag,
+ identity=identity, coverage=coverage,
+ translation_table=translation_table,
+ disable_bar=disable_bar)
+ else:
+ _, seqid2fam = get_input_seq_to_family_with_all(pangenome=pangenome,
+ sequence_files=[sequence_file],
+ output=output, tmpdir=new_tmpdir,
+ is_input_seq_nt=is_nucleotide,
+ cpu=cpu, no_defrag=no_defrag,
+ identity=identity, coverage=coverage,
+ translation_table=translation_table,
+ disable_bar=disable_bar)
+
+ project_and_write_partition(seqid2fam, seq_set, output)
+ write_gene_to_gene_family(seqid2fam, seq_set, output)
+
+ family_2_input_seqid = defaultdict(set)
+ for seqid, gf in seqid2fam.items():
+ family_2_input_seqid[gf].add(seqid)
+
+ for pan_family in seqid2fam.values():
+ families_of_interest.add(pan_family)
if families is not None:
- with open(families, 'r') as f:
+ with read_compressed_or_not(families) as f:
for fam_name in f.read().splitlines():
- gene_families[fam_name] = pangenome.get_gene_family(fam_name)
+ families_of_interest.add(pangenome.get_gene_family(fam_name))
# Compute the graph with transitive closure size provided as parameter
start_time = time.time()
- logging.getLogger("PPanGGOLiN").info("Building the graph...")
- g = compute_gene_context_graph(families=gene_families, t=transitive, disable_bar=disable_bar)
- logging.getLogger("PPanGGOLiN").info(
+
+ logging.getLogger().info("Building the graph...")
+
+ gene_context_graph = compute_gene_context_graph(families=families_of_interest, transitive=transitive,
+ window_size=window_size, disable_bar=disable_bar)
+
+ logging.getLogger().info(
f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find common gene contexts")
- logging.getLogger("PPanGGOLiN").debug(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges")
- # extract the modules from the graph
- common_components = compute_gene_context(g, jaccard)
+ logging.getLogger().debug(
+ f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges")
+
+ compute_edge_metrics(gene_context_graph, jaccard_threshold)
- families = set()
- for gene_context in common_components:
- families |= set(gene_context.families)
+ # Filter graph
+ filter_flag = f'is_jaccard_gene_>_{jaccard_threshold}'
+
+ edges_to_remove = [(n, v) for n, v, d in gene_context_graph.edges(data=True) if not d[filter_flag]]
+ gene_context_graph.remove_edges_from(edges_to_remove)
+
+ logging.getLogger().debug(f"Filtering context graph on {filter_flag}")
+ logging.getLogger().debug(
+ f"Context graph made of {nx.number_of_nodes(gene_context_graph)} nodes and {nx.number_of_edges(gene_context_graph)} edges")
+
+ gene_contexts = get_gene_contexts(gene_context_graph, families_of_interest)
+
+ gene_context_graph = make_graph_writable(gene_context_graph)
+ out_graph_file = write_graph(gene_context_graph, output, graph_format)
+
+ if len(gene_contexts) != 0:
+ logging.getLogger().info(
+ f"There are {sum((len(gc) for gc in gene_contexts))} families among {len(gene_contexts)} gene contexts")
+
+ output_file = output / "gene_contexts.tsv"
+ export_context_to_dataframe(gene_contexts, family_2_input_seqid, families_of_interest, output_file)
- if len(families) != 0:
- export_to_dataframe(families, common_components, fam_2_seq, output)
else:
logging.getLogger("PPanGGOLiN").info("No gene contexts were found")
logging.getLogger("PPanGGOLiN").info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds")
+ return gene_context_graph, out_graph_file
-def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph:
+
+def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFamily]) -> Set[GeneContext]:
+ """
+ Extract gene contexts from a context graph based on the provided set of gene families of interest.
+
+ Gene contexts are extracted from a context graph by identifying connected components.
+ The function filters the connected components based on the following criteria:
+ - Remove singleton families (components with only one gene family).
+ - Remove components that do not contain any gene families of interest.
+
+ For each remaining connected component, a GeneContext object is created.
+
+ :param context_graph: The context graph from which to extract gene contexts.
+ :param families_of_interest: Set of gene families of interest.
+ :return: Set of GeneContext objects representing the extracted gene contexts.
"""
- Construct the graph of gene contexts between families of the pangenome
- :param families: Gene families of interest
- :param t: transitive value
- :param disable_bar: Prevents progress bar printing
+ connected_components = nx.connected_components(context_graph)
+
+ # Connected component graph filtering
+
+ # remove singleton families
+ connected_components = (component for component in connected_components if len(component) > 1)
+
+ # remove components made only of families not initially requested
+ connected_components = (component for component in connected_components if component & families_of_interest)
+
+ gene_contexts = set()
+ families_in_context = set()
+
+ for i, component in enumerate(connected_components):
+ families_in_context |= component
+ family_of_interest_of_gc = component & families_of_interest
+ gene_context = GeneContext(gc_id=i, families=component, families_of_interest=family_of_interest_of_gc)
+
+ # add gc id to node attribute
+ node_attributes = {n: {"gene_context_id": i, "families_of_interest": n in families_of_interest} for n in
+ component}
+ nx.set_node_attributes(context_graph, node_attributes)
+
+ gene_contexts.add(gene_context)
- :return: Graph of gene contexts between interesting gene families of the pangenome
+ node_not_in_context = set(context_graph.nodes()) - families_in_context
+ context_graph.remove_nodes_from(node_not_in_context)
+
+ return gene_contexts
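+
+# Note: nodes that end up in no retained component are removed from the graph in
+# place, so callers get both the gene contexts and a pruned context graph.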
+
+
+def make_graph_writable(context_graph: nx.Graph) -> nx.Graph:
"""
+ The original context graph contains ppanggolin objects as nodes and lists and dictionaries in edge attributes.
+ Since these objects cannot be written to the output graph,
+ this function creates a new graph that contains only writable objects.
+
+ :param context_graph: Context graph whose nodes are gene family objects.
+
+ :return: A new graph restricted to writable (scalar) node and edge attributes.
+ """
+ def filter_attribute(data: dict):
+ """
+ Helper function to filter the edge attributes.
+
+ :param data: The edge attribute data.
+ :return: A filtered dictionary containing only non-collection attributes.
+ """
+ return {k: v for k, v in data.items() if type(v) not in [set, dict, list]}
+
+ G = nx.Graph()
- g = nx.Graph()
- for family in tqdm(families.values(), unit="families", disable=disable_bar):
- for gene in family.genes:
- contig = list(gene.contig.genes)
- pos_left, in_context_left, pos_right, in_context_right = extract_gene_context(gene, contig, families, t)
- if in_context_left or in_context_right:
- for env_gene in contig[pos_left:pos_right + 1]:
- _compute_gene_context_graph(g, env_gene, contig, pos_right)
- return g
+ G.add_edges_from((f1.name, f2.name, filter_attribute(d)) for f1, f2, d in context_graph.edges(data=True))
+ # convert transitivity dict to str
+ edges_with_transitivity_str = {(f1.name, f2.name): str(d['transitivity']) for f1, f2, d in
+ context_graph.edges(data=True)}
-def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos_r: int):
+ nx.set_edge_attributes(G, edges_with_transitivity_str, name="transitivity")
+
+ nodes_attributes_filtered = {f.name: filter_attribute(d) for f, d in context_graph.nodes(data=True)}
+
+ # on top of attributes already contained in node of context graph
+ # add organisms and genes count that have the family, the partition and if the family was in initially requested
+ nodes_family_data = {f.name: {"organisms": f.number_of_organisms,
+ "partition": f.named_partition,
+ "genes": f.number_of_genes} for f in context_graph.nodes()}
+
+ for f, d in G.nodes(data=True):
+ d.update(nodes_family_data[f])
+ d.update(nodes_attributes_filtered[f])
+
+ return G
+
+
+def write_graph(G: nx.Graph, output_dir: Path, graph_format: str) -> Path:
"""
- Compute graph of gene contexts between one gene and the other part of the contig
+ Write a graph to file in either the GraphML or the GEXF format.
+
+ :param G: Graph to write
+ :param output_dir: The output directory where the graph file will be written.
+ :param graph_format: Format of the output graph; can be "graphml" or "gexf"
+
+ :return: Path to the written graph file.
- :param: Graph of gene contexts between interesting gene families of the pangenome
- :param env_gene: Gene of the current position
- :param contig: Current contig to search a gene context
- :param pos_r: Gene to search a gene context
"""
- g.add_node(env_gene.family)
- add_gene(g.nodes[env_gene.family], env_gene, fam_split=False)
- pos = env_gene.position + 1
- while pos <= pos_r:
- if env_gene.family != contig[pos].family:
- g.add_edge(env_gene.family, contig[pos].family)
- edge = g[env_gene.family][contig[pos].family]
- add_gene(edge, env_gene)
- add_gene(edge, contig[pos])
- pos += 1
+ if "graphml" == graph_format:
+ out_file = output_dir / "graph_context.graphml"
+ logging.info(f'Writing context graph in {out_file}')
+ nx.write_graphml_lxml(G, out_file)
+
+ elif "gexf" == graph_format:
+ out_file = output_dir / "graph_context.gexf"
+ logging.info(f'Writing context graph in {out_file}')
+ nx.readwrite.gexf.write_gexf(G, out_file)
+ else:
+ raise ValueError(f'The given graph format ({graph_format}) is not correct. It should be "graphml" or "gexf".')
+
+ return out_file
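+
+# e.g. write_graph(G, Path("out"), "graphml") returns Path("out/graph_context.graphml")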
-def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) -> (int, bool, int, bool):
+def compute_edge_metrics(context_graph: nx.Graph, gene_proportion_cutoff: float) -> None:
+ """
+ Compute various metrics on the edges of the context graph.
+
+ :param context_graph: The context graph.
+ :param gene_proportion_cutoff: The minimum proportion of shared genes between two features for their edge to be considered significant.
"""
- Extract gene context and whether said gene context exists
+ # compute jaccard on organism and on genes
+ for f1, f2, data in context_graph.edges(data=True):
+ data['jaccard_organism'] = len(data['organisms']) / len(set(f1.organisms) | set(f2.organisms))
+
+ f1_gene_proportion = len(data['genes'][f1]) / f1.number_of_genes
+ f2_gene_proportion = len(data['genes'][f2]) / f2.number_of_genes
+
+ data['f1'] = f1.name
+ data['f2'] = f2.name
+ data['f1_jaccard_gene'] = f1_gene_proportion
+ data['f2_jaccard_gene'] = f2_gene_proportion
+
+ data[f'is_jaccard_gene_>_{gene_proportion_cutoff}'] = (f1_gene_proportion >= gene_proportion_cutoff) and (
+ f2_gene_proportion >= gene_proportion_cutoff)
+
+ transitivity_counter = data['transitivity']
- :param gene: Gene of interest
- :param contig: list of genes in contig
- :param families: Alignment results
- :param t: transitive value
+ mean_transitivity = sum(
+ (transitivity * counter for transitivity, counter in transitivity_counter.items())) / sum(
+ (counter for counter in transitivity_counter.values()))
- :return: Position of the context and if it exists for each side ('left' and 'right')
+ data['mean_transitivity'] = mean_transitivity
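+ # e.g. a transitivity counter {0: 3, 1: 1} gives (0*3 + 1*1) / 4 = 0.25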
+
+ # the following commented out lines are additional metrics that could be used
+
+ # data['min_jaccard_organism'] = len(data['organisms'])/min(len(f1.organisms), len(f2.organisms))
+ # data['max_jaccard_organism'] = len(data['organisms'])/max(len(f1.organisms), len(f2.organisms))
+ # f1_gene_proportion_partial = len(data['genes'][f1])/len(context_graph.nodes[f1]['genes'])
+ # f2_gene_proportion_partial = len(data['genes'][f2])/len(context_graph.nodes[f2]['genes'])
+ # data[f'f1_jaccard_gene_partial'] = f1_gene_proportion_partial
+ # data[f'f2_jaccard_gene_partial'] = f2_gene_proportion_partial
+
+
+def add_edges_to_context_graph(context_graph: nx.Graph,
+ contig_genes: List[Gene],
+ contig_windows: List[Tuple[int, int]],
+ transitivity: int,
+ is_circular: bool):
"""
+ Add edges to the context graph based on contig genes and windows.
- pos_left, pos_right = (max(0, gene.position - t),
- min(gene.position + t, len(contig) - 1)) # Gene positions to compare family
- in_context_left, in_context_right = (False, False)
- while pos_left < gene.position and not in_context_left:
- if contig[pos_left].family in families.values():
- in_context_left = True
- else:
- pos_left += 1
+ :param context_graph: The context graph to which edges will be added.
+ :param contig_genes: An iterable of genes in the contig.
+ :param contig_windows: A list of tuples representing the start and end positions of contig windows.
+ :param transitivity: The number of next genes to consider when adding edges.
+ :param is_circular: A boolean indicating if the contig is circular.
- while pos_right > gene.position and not in_context_right:
- if contig[pos_right].family in families.values():
- in_context_right = True
- else:
- pos_right -= 1
+ """
+ for window_start, window_end in contig_windows:
+ for gene_index in range(window_start, window_end + 1):
+ gene = contig_genes[gene_index]
+ next_genes = get_n_next_genes_index(gene_index, next_genes_count=transitivity + 1,
+ contig_size=len(contig_genes), is_circular=is_circular)
+ next_genes = list(next_genes)
+
+ for i, next_gene_index in enumerate(next_genes):
+ # Check if the next gene is within the contig windows
+ if not any(lower <= next_gene_index <= upper for (lower, upper) in contig_windows):
+ # next_gene_index is not in any range of genes in the context
+ # so it is ignored along with all following genes
+ break
- return pos_left, in_context_left, pos_right, in_context_right
+ next_gene = contig_genes[next_gene_index]
+ if next_gene.family == gene.family:
+ # If the next gene has the same family, the two genes refer to the same node
+ # so they are ignored
+ continue
+ context_graph.add_edge(gene.family, next_gene.family)
-def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set:
+ edge_dict = context_graph[gene.family][next_gene.family]
+
+ if i == 0:
+ edge_dict['adjacent_family'] = True
+
+ # Store information of the transitivity used to link the two genes:
+ if "transitivity" not in edge_dict:
+ edge_dict['transitivity'] = {i: 0 for i in range(transitivity + 1)}
+ edge_dict['transitivity'][i] += 1
+
+ # Add node attributes
+ node_gene_dict = context_graph.nodes[gene.family]
+ next_gene_gene_dict = context_graph.nodes[next_gene.family]
+
+ increment_attribute_counter(node_gene_dict, "genes_count")
+ increment_attribute_counter(next_gene_gene_dict, "genes_count")
+
+ add_val_to_dict_attribute(node_gene_dict, "genes", gene)
+ add_val_to_dict_attribute(next_gene_gene_dict, "genes", next_gene)
+
+ # Add edge attributes
+ edge_dict = context_graph[gene.family][next_gene.family]
+ try:
+ genes_edge_dict = edge_dict['genes']
+ except KeyError:
+ genes_edge_dict = {}
+ edge_dict['genes'] = genes_edge_dict
+
+ add_val_to_dict_attribute(genes_edge_dict, gene.family, gene)
+ add_val_to_dict_attribute(genes_edge_dict, next_gene.family, next_gene)
+
+ add_val_to_dict_attribute(edge_dict, "organisms", gene.organism)
+
+ increment_attribute_counter(edge_dict, "gene_pairs")
+
+ assert gene.organism == next_gene.organism, f"Genes of the same contig have different organisms: {gene.organism} and {next_gene.organism}"
+
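Each edge of the context graph thus carries an adjacent_family flag and a per-rank transitivity counter. Below is a minimal sketch (not part of the patch) of how those counters accumulate on a toy contig, using plain strings as stand-ins for GeneFamily objects and ignoring the window filtering:

import networkx as nx

families = ["A", "B", "C", "A"]  # family of each gene along a toy linear contig
g = nx.Graph()
transitivity = 2
for i, fam in enumerate(families):
    # link fam to the families of up to transitivity + 1 following genes
    for rank, j in enumerate(range(i + 1, min(i + transitivity + 2, len(families)))):
        if families[j] == fam:
            continue  # same family -> same node, skipped
        g.add_edge(fam, families[j])
        edge = g[fam][families[j]]
        if rank == 0:
            edge["adjacent_family"] = True
        edge.setdefault("transitivity", {k: 0 for k in range(transitivity + 1)})
        edge["transitivity"][rank] += 1

# ('A', 'B') was seen at rank 0 (A next to B) and at rank 1 (B .. A),
# so its counter ends up as {0: 1, 1: 1, 2: 0}
print(g.edges(data=True))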
+
+def add_val_to_dict_attribute(attr_dict: dict, attribute_key, attribute_value):
"""
- Compute the gene contexts in the graph
+ Add an attribute value to an edge or node dictionary set.
- :param g: Graph of gene contexts between interesting gene families of the pangenome
- :param jaccard: Jaccard index
+ :param attr_dict: The dictionary containing the edge/node attributes.
+ :param attribute_key: The key of the attribute.
+ :param attribute_value: The value of the attribute to be added.
- :return: Set of gene contexts find in graph
"""
- gene_contexts = set()
- c = 1
- for comp in connected_components(g, removed=set(), weight=jaccard):
- gene_contexts.add(GeneContext(gc_id=c, families=comp))
- c += 1
- return gene_contexts
+ try:
+ attr_dict[attribute_key].add(attribute_value)
+ except KeyError:
+ attr_dict[attribute_key] = {attribute_value}
+
+
+def increment_attribute_counter(edge_dict: dict, key: Hashable):
+ """
+ Increment the counter for an edge/node attribute in the edge/node dictionary.
+
+ :param edge_dict: The dictionary containing the attributes.
+ :param key: The key of the attribute.
+
+ """
+
+ try:
+ edge_dict[key] += 1
+ except KeyError:
+ edge_dict[key] = 1
+
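A tiny illustrative check of the two helpers above; plain dicts behave exactly like networkx attribute dicts here:

attrs = {}
increment_attribute_counter(attrs, "genes_count")
increment_attribute_counter(attrs, "genes_count")
add_val_to_dict_attribute(attrs, "genes", "gene_1")
add_val_to_dict_attribute(attrs, "genes", "gene_1")  # sets deduplicate repeated values
add_val_to_dict_attribute(attrs, "genes", "gene_2")
assert attrs == {"genes_count": 2, "genes": {"gene_1", "gene_2"}}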
+
+def get_n_next_genes_index(current_index: int, next_genes_count: int,
+ contig_size: int, is_circular: bool = False) -> Iterator[int]:
+ """
+ Generate the indices of the next genes based on the current index and contig properties.
+
+ :param current_index: The index of the current gene.
+ :param next_genes_count: The number of next genes to consider.
+ :param contig_size: The total number of genes in the contig.
+ :param is_circular: Flag indicating whether the contig is circular (default: False).
+ :return: An iterator yielding the indices of the next genes.
+
+ :raises IndexError: If the current index is out of range for the given contig size.
+
+ """
+
+ # Check if the current index is out of range
+ if current_index >= contig_size:
+ raise IndexError("Current gene index is out of range. "
+ f"Contig has {contig_size} genes while the given gene index is {current_index}")
+ if is_circular:
+ next_genes = chain(range(current_index + 1, contig_size), range(0, current_index))
+ else:
+ next_genes = range(current_index + 1, contig_size)
+
+ for i, next_gene_index in enumerate(next_genes):
+ if i == next_genes_count:
+ break
+ yield next_gene_index
+
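A worked example of the generator above: on a circular contig the indices wrap around past the origin, on a linear contig the iterator simply stops at the last gene.

assert list(get_n_next_genes_index(4, next_genes_count=3, contig_size=6, is_circular=True)) == [5, 0, 1]
assert list(get_n_next_genes_index(4, next_genes_count=3, contig_size=6, is_circular=False)) == [5]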
+def get_contig_to_genes(gene_families: Iterable[GeneFamily]) -> Dict[Contig, Set[Gene]]:
+ """
+ Group genes from specified gene families by contig.
+
+ :param gene_families: An iterable of gene families object.
+
+ :return: A dictionary mapping contigs to sets of genes.
+ """
+
+ contig_to_genes_of_interest = defaultdict(set)
+ for gene_family in gene_families:
+ for gene in gene_family.genes:
+ contig = gene.contig
+ contig_to_genes_of_interest[contig].add(gene)
+ return contig_to_genes_of_interest
+
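An equivalent grouping sketch with plain data (toy values), showing the defaultdict pattern used above:

from collections import defaultdict

genes = [("ctg1", "g1"), ("ctg1", "g2"), ("ctg2", "g3")]  # (contig, gene) pairs
contig_to_genes = defaultdict(set)
for contig, gene in genes:
    contig_to_genes[contig].add(gene)
assert contig_to_genes == {"ctg1": {"g1", "g2"}, "ctg2": {"g3"}}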
+
+def compute_gene_context_graph(families: Iterable[GeneFamily], transitive: int = 4, window_size: int = 0,
+ disable_bar: bool = False) -> nx.Graph:
+ """
+ Construct the graph of gene contexts between families of the pangenome.
+
+ :param families: An iterable of gene families.
+ :param transitive: Size of the transitive closure used to build the graph.
+ :param window_size: Size of the window for extracting gene contexts (default: 0).
+ :param disable_bar: Flag to disable the progress bar (default: False).
+
+ :return: The constructed gene context graph.
+ """
+
+ context_graph = nx.Graph()
+
+ contig_to_genes_of_interest = get_contig_to_genes(families)
+
+ for contig, genes_of_interest in tqdm(contig_to_genes_of_interest.items(), unit="contig",
+ total=len(contig_to_genes_of_interest), disable=disable_bar):
+ genes_count = contig.number_of_genes
+
+ genes_of_interest_positions = [g.position for g in genes_of_interest]
+
+ contig_windows = extract_contig_window(genes_count, genes_of_interest_positions,
+ window_size=window_size, is_circular=contig.is_circular)
+
+ add_edges_to_context_graph(context_graph,
+ contig.get_genes(),
+ contig_windows,
+ transitive,
+ contig.is_circular)
+ return context_graph
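The window extraction is delegated to extract_contig_window (imported from ppanggolin.utils, not shown in this patch). Only its signature and its list-of-(start, end)-tuples return shape are visible here; the merging of overlapping windows in this sketch is an assumption for illustration:

from ppanggolin.utils import extract_contig_window

# 10-gene linear contig, genes of interest at positions 2 and 4, window of 1 gene
# on each side: the windows [1, 3] and [3, 5] overlap and are assumed to merge.
windows = list(extract_contig_window(10, {2, 4}, window_size=1, is_circular=False))
print(windows)  # expected under that assumption: [(1, 5)]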
-def fam2seq(seq_to_pan: dict) -> dict:
+
+def fam_to_seq(seq_to_pan: dict) -> dict:
"""
Create a dictionary with gene families as keys and list of sequences id as values
@@ -203,33 +515,40 @@ def fam2seq(seq_to_pan: dict) -> dict:
return fam_2_seq
-def export_to_dataframe(families: Set[GeneFamily], gene_contexts: Set[GeneContext], fam_to_seq: dict, output: str):
- """ Export the results into dataFrame
+def export_context_to_dataframe(gene_contexts: set, fam2seq: Dict[str, int], families_of_interest: Set[GeneFamily], output: Path):
+ """
+ Export the results into a DataFrame
- :param families: Families related to the connected components
:param gene_contexts: connected components found in the pangenome
- :param fam_to_seq: Dictionary with gene families as keys and list of sequence ids as values
+ :param fam2seq: Dictionary with gene family IDs as keys and lists of sequence IDs as values
+ :param families_of_interest: Families of interest that are at the origin of the context.
:param output: output path
"""
- logging.getLogger("PPanGGOLiN").debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts")
-
lines = []
for gene_context in gene_contexts:
for family in gene_context.families:
- line = [gene_context.ID]
- if fam_to_seq is None or fam_to_seq.get(family.ID) is None:
- line += [family.name, None, family.number_of_organisms, family.named_partition]
+ if fam2seq.get(family) is None:
+ sequence_id = None
else:
- line += [family.name, ','.join(fam_to_seq.get(family.ID)),
- family.number_of_organisms, family.named_partition]
- lines.append(line)
- df = pd.DataFrame(lines,
- columns=["GeneContext ID", "Gene family name", "Sequence ID", "Nb Genomes", "Partition"]
- ).set_index("GeneContext ID")
- df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last').to_csv(
- path_or_buf=f"{output}/gene_contexts.tsv", sep="\t", na_rep='NA')
- logging.getLogger("PPanGGOLiN").info(f"detected gene context(s) are listed in: '{output}/gene_contexts.tsv'")
+ sequence_id = ','.join(fam2seq.get(family))
+
+ family_info = {"GeneContext ID": gene_context.ID,
+ "Gene family name": family.name,
+ "Sequence ID": sequence_id,
+ "Nb Genomes": family.number_of_organisms,
+ "Partition": family.named_partition,
+ "Target family": family in families_of_interest}
+
+ lines.append(family_info)
+
+ df = pd.DataFrame(lines).set_index("GeneContext ID")
+
+ df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position='last')
+
+ df.to_csv(output, sep="\t", na_rep='NA')
+
+ logging.getLogger("PPanGGOLiN").debug(f"Detected gene context(s) are listed in: '{output}'")
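For reference, a toy DataFrame with the same columns shows the shape of the resulting gene-contexts TSV (values hypothetical):

import pandas as pd

rows = [{"GeneContext ID": 1, "Gene family name": "fam_A", "Sequence ID": "seq_1",
         "Nb Genomes": 12, "Partition": "persistent", "Target family": True},
        {"GeneContext ID": 1, "Gene family name": "fam_B", "Sequence ID": None,
         "Nb Genomes": 8, "Partition": "shell", "Target family": False}]
df = pd.DataFrame(rows).set_index("GeneContext ID")
df = df.sort_values(["GeneContext ID", "Sequence ID"], na_position="last")
df.to_csv("gene_contexts.tsv", sep="\t", na_rep="NA")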
def launch(args: argparse.Namespace):
@@ -241,13 +560,27 @@ def launch(args: argparse.Namespace):
if not any([args.sequences, args.family]):
raise Exception("At least one of --sequences or --family option must be given")
+
mk_outdir(args.output, args.force)
+
pangenome = Pangenome()
pangenome.add_file(args.pangenome)
+
+ # check statuses and load info
+ if args.sequences is not None and pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
+ raise Exception("Cannot use this function as your pangenome does not have gene families representatives "
+ "associated to it. For now this works only if the clustering has been made by PPanGGOLiN.")
+
+ check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar)
+
search_gene_context_in_pangenome(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir,
- sequences=args.sequences, families=args.family, transitive=args.transitive,
- identity=args.identity, coverage=args.coverage, jaccard=args.jaccard,
- no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar)
+ sequence_file=args.sequences, families=args.family, transitive=args.transitive,
+ identity=args.identity, coverage=args.coverage, use_representatives=args.fast,
+ jaccard_threshold=args.jaccard,
+ window_size=args.window_size,
+ no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar,
+ graph_format=args.graph_format,
+ translation_table=args.translation_table, keep_tmp=args.keep_tmp)
def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
@@ -275,6 +608,8 @@ def parser_context(parser: argparse.ArgumentParser):
description="All of the following arguments are required :")
required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome.h5 file")
required.add_argument('-o', '--output', required=False, type=Path,
+ default="ppanggolin_context" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S",
+ time.localtime()) + "_PID" + str(os.getpid()),
help="Output directory where the file(s) will be written")
onereq = parser.add_argument_group(title="Input file", description="One of the following argument is required :")
onereq.add_argument('-S', '--sequences', required=False, type=Path,
@@ -286,20 +621,35 @@ def parser_context(parser: argparse.ArgumentParser):
optional.add_argument('--no_defrag', required=False, action="store_true",
help="DO NOT Realign gene families to link fragments with"
"their non-fragmented gene family.")
- optional.add_argument('--identity', required=False, type=float, default=0.5,
+ optional.add_argument("--fast", required=False, action="store_true",
+ help="Use representative sequences of gene families for input gene alignment. "
+ "This option is recommended for faster processing but may be less sensitive. "
+ "By default, all pangenome genes are used for alignment. "
+ "This argument makes sense only when --sequences is provided.")
+ optional.add_argument('--identity', required=False, type=float, default=0.8,
help="min identity percentage threshold")
optional.add_argument('--coverage', required=False, type=float, default=0.8,
help="min coverage percentage threshold")
+ optional.add_argument("--translation_table", required=False, default="11",
+ help="The translation table (genetic code) to use when the input sequences are nucleotide sequences.")
optional.add_argument("-t", "--transitive", required=False, type=int, default=4,
help="Size of the transitive closure used to build the graph. This indicates the number of "
"non related genes allowed in-between two related genes. Increasing it will improve "
"precision but lower sensitivity a little.")
+ optional.add_argument("-w", "--window_size", required=False, type=int, default=5,
+ help="Number of neighboring genes that are considered on each side of "
+ "a gene of interest when searching for conserved genomic contexts.")
+
optional.add_argument("-s", "--jaccard", required=False, type=restricted_float, default=0.85,
help="minimum jaccard similarity used to filter edges between gene families. Increasing it "
"will improve precision but lower sensitivity a lot.")
+ optional.add_argument('--graph_format', help="Format of the context graph. Can be gexf or graphml.",
+ default='graphml', choices=['gexf', 'graphml'])
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
optional.add_argument("--tmpdir", required=False, type=str, default=Path(tempfile.gettempdir()),
help="directory for storing temporary files")
+ optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
+ help="Keep temporary files (useful for debugging).")
if __name__ == '__main__':
diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py
index 2b02dbf5..d40b38c3 100644
--- a/ppanggolin/figures/draw_spot.py
+++ b/ppanggolin/figures/draw_spot.py
@@ -566,7 +566,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome:
logging.getLogger("PPanGGOLiN").info("Ordering genes among regions, and drawing spots...")
- multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])
+ multigenics = pangenome.get_multigenics(pangenome.parameters["rgp"]["dup_margin"])
fam2mod = {}
for mod in pangenome.modules:
@@ -647,7 +647,10 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar:
need_rgp=True, need_spots=True, need_modules=need_mod, disable_bar=disable_bar)
if spot_list == 'all' or any(x == 'all' for x in spot_list):
- logging.getLogger("PPanGGOLiN").debug("all is found in spot list, all spot are drawn.")
+ logging.getLogger("PPanGGOLiN").debug("'all' value is found in spot list, all spots are drawn.")
+ selected_spots = list(pangenome.spots)
+ elif spot_list == "synteny" or any(x == 'synteny' for x in spot_list):
+ logging.getLogger("PPanGGOLiN").debug("'synteny' value is found in spot list, all spots with more than 1 conserved synteny are drawn.")
selected_spots = [s for s in pangenome.spots if len(s.get_uniq_ordered_set()) > 1]
else:
curated_spot_list = {'spot_' + str(s) if not s.startswith("spot_") else str(s) for s in spot_list}
@@ -666,6 +669,6 @@ def draw_spots(pangenome: Pangenome, output: Path, spot_list: str, disable_bar:
logging.getLogger("PPanGGOLiN").info(f"Drawing {len(selected_spots)} spots")
draw_selected_spots(selected_spots, pangenome, output,
- overlapping_match=pangenome.parameters["spots"]["overlapping_match"],
- exact_match=pangenome.parameters["spots"]["exact_match"],
- set_size=pangenome.parameters["spots"]["set_size"], disable_bar=disable_bar)
+ overlapping_match=pangenome.parameters["spot"]["overlapping_match"],
+ exact_match=pangenome.parameters["spot"]["exact_match_size"],
+ set_size=pangenome.parameters["spot"]["set_size"], disable_bar=disable_bar)
diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py
index bd03d21b..2e09674f 100644
--- a/ppanggolin/figures/drawing.py
+++ b/ppanggolin/figures/drawing.py
@@ -91,7 +91,7 @@ def parser_draw(parser: argparse.ArgumentParser):
optional.add_argument("--draw_spots", required=False, default=False, action="store_true",
help="draw plots for spots of the pangenome")
optional.add_argument("--spots", required=False, default='all', nargs='+',
- help="a comma-separated list of spots to draw (or 'all' to draw all spots).")
+ help="a comma-separated list of spots to draw (or 'all' to draw all spots, or 'synteny' to draw spots with different RGP syntenies).")
if __name__ == '__main__':
diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py
index 1a900c5b..e0ff3f14 100644
--- a/ppanggolin/figures/tile_plot.py
+++ b/ppanggolin/figures/tile_plot.py
@@ -172,9 +172,8 @@ def draw_tile_plot(pangenome: Pangenome, output: Path, nocloud: bool = False, di
tickfont=dict(size=10)),
shapes=shapes,
plot_bgcolor='#ffffff')
- logging.getLogger().info("Drawing the figure itself...")
+ logging.getLogger("PPanGGOLiN").info("Drawing the figure itself...")
- #fig = go.Figure(data=[heatmap], layout=layout)
fig = go.Figure(data=[heatmap])
fig.add_trace(go.Scatter(x=dendro_org['icoord'],
diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py
index 968c1f2c..5b5a2af6 100644
--- a/ppanggolin/formats/readBinaries.py
+++ b/ppanggolin/formats/readBinaries.py
@@ -3,9 +3,8 @@
# default libraries
import logging
-import sys
from pathlib import Path
-from typing import TextIO, List, Dict, Tuple
+from typing import TextIO, Dict, Any, List
# installed libraries
from tables import Table
@@ -20,7 +19,6 @@
from ppanggolin.metadata import Metadata
-
class Genedata:
"""
This is a general class storing unique gene-related data to be written in a specific
@@ -105,6 +103,7 @@ def fix_partitioned(pangenome_file: str):
del status_group._v_attrs.Partitionned
h5f.close()
+
def get_status(pangenome: Pangenome, pangenome_file: Path):
"""
Checks which elements are already present in the file.
@@ -138,7 +137,6 @@ def get_status(pangenome: Pangenome, pangenome_file: Path):
if hasattr(status_group._v_attrs, "modules") and status_group._v_attrs.modules:
pangenome.status["modules"] = "inFile"
- # pangenome.status["annotations_sources"] = status_group._v_attrs.annotations_sources
if hasattr(status_group._v_attrs, "metadata") and status_group._v_attrs.metadata:
metastatus = status_group.metastatus
@@ -152,6 +150,7 @@ def get_status(pangenome: Pangenome, pangenome_file: Path):
pangenome.parameters = info_group._v_attrs.parameters
h5f.close()
+
def read_chunks(table: Table, column: str = None, chunk: int = 10000):
"""
Reading entirely the provided table (or column if specified) chunk per chunk to limit RAM usage.
@@ -202,20 +201,57 @@ def read_sequences(h5f: tables.File) -> dict:
return seqid2seq
-def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '',
+def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, add: str = '',
+ disable_bar: bool = False):
+ """
+ Writes the non-redundant CDS sequences of the pangenome to a file object,
+ adding the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file.
+
+ :param pangenome_filename: Name of the pangenome file
+ :param file_obj: Output file object
+ :param add: Prefix to add to the sequence headers
+ :param disable_bar: Disable the progress bar
+
+ """
+
+ logging.getLogger("PPanGGOLiN").info(
+ f"Extracting and writing non-redundant CDS sequences from {pangenome_filename} to {file_obj.name}")
+
+ with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f:
+
+ # Get a dictionary mapping seqid to cds_name.
+ # seqids are unique and can correspond to multiple CDS names;
+ # we only need one CDS name per seqid to write non-redundant fasta sequences.
+ seqid2cds_name = {}
+ for row in read_chunks(h5f.root.annotations.geneSequences, chunk=20000):
+ # Read the table chunk per chunk otherwise RAM dies on big pangenomes
+ seqid2cds_name[row["seqid"]] = row["gene"].decode()
+
+ table = h5f.root.annotations.sequences
+ for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar):
+ cds_name = seqid2cds_name[row["seqid"]]
+ file_obj.write(f'>{add}{cds_name}\n')
+ file_obj.write(f'{row["dna"].decode()}\n')
+
+ file_obj.flush()
+
+
+def get_gene_sequences_from_file(pangenome_filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '',
disable_bar: bool = False):
"""
Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS,
and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file.
- :param filename: Name of the pangenome file
+ :param pangenome_filename: Name of the pangenome file
:param file_obj: Name of the output file
:param list_cds: An iterable object of CDS
:param add: Add a prefix to sequence header
:param disable_bar: Disable the progress bar
"""
- logging.getLogger("PPanGGOLiN").info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...")
- h5f = tables.open_file(filename, "r", driver_core_backing_store=0)
+ logging.getLogger("PPanGGOLiN").info(
+ f"Extracting and writing CDS sequences from {pangenome_filename} to a fasta file...")
+ h5f = tables.open_file(pangenome_filename, "r", driver_core_backing_store=0)
+
table = h5f.root.annotations.geneSequences
list_cds = set(list_cds) if list_cds is not None else None
seqid2seq = read_sequences(h5f)
@@ -385,9 +421,10 @@ def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = Fal
family = pangenome.get_gene_family(row['geneFam'].decode())
curr_module.add(family)
for module in modules.values():
- pangenome.add_module(module)
+ pangenome.add_module(module)
pangenome.status["modules"] = "Loaded"
+
def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
disable_bar: bool = False):
"""Read organism table in pangenome file to add them to the pangenome object
@@ -397,14 +434,13 @@ def read_organisms(pangenome: Pangenome, table: tables.Table, chunk_size: int =
:param chunk_size: Size of the chunck reading
:param disable_bar: Disable progress bar
"""
- contig2organism = {}
for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="genome", disable=disable_bar):
organism = Organism(row["name"].decode())
pangenome.add_organism(organism)
def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20000,
- disable_bar: bool = False):
+ disable_bar: bool = False):
"""Read contig table in pangenome file to add them to the pangenome object
:param pangenome: Pangenome object
@@ -413,8 +449,7 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20
:param disable_bar: Disable progress bar
"""
for row in tqdm(read_chunks(table, chunk=chunk_size), total=table.nrows, unit="contig", disable=disable_bar):
- contig = Contig(name=row["name"].decode())
- contig.is_circular = row["is_circular"]
+ contig = Contig(identifier=int(row["ID"]), name=row["name"].decode(), is_circular=row["is_circular"])
contig.length = int(row["length"])
try:
organism = pangenome.get_organism(row["organism"].decode())
@@ -423,6 +458,7 @@ def read_contigs(pangenome: Pangenome, table: tables.Table, chunk_size: int = 20
else:
organism.add(contig)
+
def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int, Genedata],
link: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
"""Read genes in pangenome file to add them to the pangenome object
@@ -446,7 +482,7 @@ def read_genes(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[in
genetic_code=genedata.genetic_code, product=genedata.product, local_identifier=local)
gene.is_fragment = row["is_fragment"]
if link:
- contig = pangenome.get_contig(row["contig"].decode())
+ contig = pangenome.get_contig(int(row["contig"]))
gene.fill_parents(contig.organism, contig)
contig.add(gene)
@@ -469,13 +505,14 @@ def read_rnas(pangenome: Pangenome, table: tables.Table, genedata_dict: Dict[int
gene_type=genedata.gene_type, name=genedata.name,
product=genedata.product)
if link:
- contig = pangenome.get_contig(row["contig"].decode())
+ contig = pangenome.get_contig(int(row["contig"]))
rna.fill_parents(contig.organism, contig)
contig.add_rna(rna)
def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool = True, load_contigs: bool = True,
- load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000, disable_bar: bool = False):
+ load_genes: bool = True, load_rnas: bool = True, chunk_size: int = 20000,
+ disable_bar: bool = False):
"""
Read annotation in pangenome hdf5 file to add in pangenome object
@@ -486,22 +523,21 @@ def read_annotation(pangenome: Pangenome, h5f: tables.File, load_organisms: bool
annotations = h5f.root.annotations
genedata_dict = None
if load_organisms:
- read_organisms(pangenome, annotations.genomes, disable_bar=disable_bar)
+ read_organisms(pangenome, annotations.genomes, chunk_size=chunk_size, disable_bar=disable_bar)
if load_contigs:
- read_contigs(pangenome, annotations.contigs, disable_bar=disable_bar)
+ read_contigs(pangenome, annotations.contigs, chunk_size=chunk_size, disable_bar=disable_bar)
if load_genes:
genedata_dict = read_genedata(h5f)
read_genes(pangenome, annotations.genes, genedata_dict,
- all([load_organisms, load_contigs]), disable_bar=disable_bar)
+ all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar)
if load_rnas:
read_rnas(pangenome, annotations.RNAs, read_genedata(h5f) if genedata_dict is None else genedata_dict,
- all([load_organisms, load_contigs]), disable_bar=disable_bar)
+ all([load_organisms, load_contigs]), chunk_size=chunk_size, disable_bar=disable_bar)
pangenome.status["genomesAnnotated"] = "Loaded"
-
def read_info(h5f: tables.File):
"""
Read the pangenome content
@@ -619,20 +655,33 @@ def read_metadata(pangenome: Pangenome, h5f: tables.File, metatype: str,
element.add_metadata(source=source, metadata=meta)
pangenome.status["metadata"][metatype] = "Loaded"
+
def read_parameters(h5f: tables.File):
"""
Read pangenome parameters
:param h5f: Pangenome HDF5 file
"""
+ step_to_parameters = get_pangenome_parameters(h5f)
+
+ for step, param_name_to_value in step_to_parameters.items():
+ print(f"{step}:")
+ for param_name, val in param_name_to_value.items():
+ print(f" {param_name} : {val}")
+
+
+def get_pangenome_parameters(h5f: tables.File) -> Dict[str, Dict[str, Any]]:
+ """
+ Read and return the pangenome parameters.
+
+ :param h5f: Pangenome HDF5 file
+ :return: A dictionary containing the name of the ppanggolin step as the key, and a dictionary of parameter names
+ and their corresponding values used for that step.
+ """
if "/info" in h5f:
info_group = h5f.root.info
if "parameters" in info_group._v_attrs._f_list():
- print("Parameters: ")
- for key, dic in info_group._v_attrs["parameters"].items():
- print(f"\t- {key}")
- for key2, val in dic.items():
- print(f"\t\t- {key2} : {val}")
+ return info_group._v_attrs["parameters"]
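A hedged usage sketch of the new accessor; 'pangenome.h5' is a placeholder path, and the function returns nothing when the file carries no recorded parameters:

import tables

with tables.open_file("pangenome.h5", "r") as h5f:
    step_to_parameters = get_pangenome_parameters(h5f) or {}
    for step, params in step_to_parameters.items():
        print(step, params)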
def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = False, graph: bool = False,
@@ -728,7 +777,8 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
if h5f.root.status._v_attrs.metadata:
metastatus = h5f.root.status._f_get_child("metastatus")
metasources = h5f.root.status._f_get_child("metasources")
- if metastatus._v_attrs[metatype] and all([True if source in metasources._v_attrs[metatype] else False for source in sources]):
+ if metastatus._v_attrs[metatype] and all(source in metasources._v_attrs[metatype] for source in sources):
logging.getLogger().info(f"Reading the {metatype} metadata from sources {sources}...")
read_metadata(pangenome, h5f, metatype, sources, disable_bar=disable_bar)
else:
@@ -736,6 +786,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa
f"or has been improperly filled")
h5f.close()
+
def check_pangenome_info(pangenome, need_annotations: bool = False, need_families: bool = False,
need_graph: bool = False, need_partitions: bool = False, need_rgp: bool = False,
need_spots: bool = False, need_gene_sequences: bool = False, need_modules: bool = False,
@@ -817,10 +868,11 @@ def check_pangenome_info(pangenome, need_annotations: bool = False, need_familie
if source in pangenome.status["metasources"][metatype]:
metadata = True
else:
- raise Exception(f"There is no metadata assign to {metatype} for source : {source} in your pangenome.")
+ raise Exception(
+ f"There is no metadata assigned to {metatype} for source '{source}' in your pangenome.")
else:
metadata = True
- elif not pangenome.status["metastatus"][metatype] in ["Computed", "Loaded"]:
+ elif pangenome.status["metastatus"][metatype] not in ["Computed", "Loaded"]:
raise Exception(f"Your pangenome don't have any metadata for {metatype}. See the 'metadata' subcommand")
if any([annotation, gene_families, graph, rgp, spots, gene_sequences, modules, metadata]):
diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py
index b09e4166..bb1de011 100644
--- a/ppanggolin/formats/writeAnnotations.py
+++ b/ppanggolin/formats/writeAnnotations.py
@@ -84,7 +84,8 @@ def contig_desc(contig_len: int, org_len: int) -> Dict[str, Union[tables.StringC
:return: Formatted table
"""
- return {'name': tables.StringCol(itemsize=contig_len),
+ return {'ID': tables.UInt32Col(),
+ 'name': tables.StringCol(itemsize=contig_len),
"is_circular": tables.BoolCol(dflt=False),
'length': tables.UInt32Col(),
"organism": tables.StringCol(itemsize=org_len)}
@@ -104,6 +105,7 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
logging.getLogger("PPanGGOLiN").debug(f"Writing {pangenome.number_of_contigs} contigs")
contig_row = contig_table.row
for contig in tqdm(pangenome.contigs, total=pangenome.number_of_contigs, unit="contigs", disable=disable_bar):
+ contig_row["ID"] = contig.ID
contig_row["name"] = contig.name
contig_row["is_circular"] = contig.is_circular
contig_row["length"] = len(contig)
@@ -112,12 +114,11 @@ def write_contigs(pangenome: Pangenome, h5f: tables.File, annotation: tables.Gro
contig_table.flush()
-def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
+def gene_desc(id_len: int, max_local_id: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col, tables.BoolCol]]:
"""Table description to save gene-related information
:param id_len: Maximum size of gene name
:param max_local_id: Maximum size of gene local identifier
- :param max_contig_len: Maximum size of contig identifier
:return: Formatted table
"""
@@ -125,7 +126,7 @@ def gene_desc(id_len: int, max_local_id: int, max_contig_len: int) -> Dict[str,
'genedata_id': tables.UInt32Col(),
'local': tables.StringCol(itemsize=max_local_id),
'is_fragment': tables.BoolCol(dflt=False),
- 'contig': tables.StringCol(itemsize=max_contig_len)}
+ 'contig': tables.UInt32Col()}
def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
@@ -150,7 +151,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou
gene_row["ID"] = gene.ID
gene_row["is_fragment"] = gene.is_fragment
gene_row["local"] = gene.local_identifier
- gene_row["contig"] = gene.contig.name
+ gene_row["contig"] = gene.contig.ID
genedata = get_genedata(gene)
genedata_id = genedata2gene.get(genedata)
if genedata_id is None:
@@ -163,7 +164,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou
return genedata2gene
-def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]:
+def rna_desc(id_len: int) -> Dict[str, Union[tables.StringCol, tables.UInt32Col]]:
"""Table description to save rna-related information
:param id_len: Maximum size of RNA identifier
@@ -173,7 +174,7 @@ def rna_desc(id_len: int, max_contig_len: int) -> Dict[str, Union[tables.StringC
"""
return {'ID': tables.StringCol(itemsize=id_len),
'genedata_id': tables.UInt32Col(),
- 'contig': tables.StringCol(itemsize=max_contig_len)}
+ 'contig': tables.UInt32Col()}
def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group,
@@ -196,7 +197,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group
rna_row = rna_table.row
for rna in tqdm(pangenome.RNAs, total=pangenome.number_of_rnas, unit="RNA", disable=disable_bar):
rna_row["ID"] = rna.ID
- rna_row["contig"] = rna.contig.name
+ rna_row["contig"] = rna.contig.ID
genedata = get_genedata(rna)
genedata_id = genedata2rna.get(genedata)
if genedata_id is None:
@@ -336,11 +337,11 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo
desc = contig_desc(contig_len, org_len)
write_contigs(pangenome, h5f, annotation, desc, disable_bar)
if rec_genes:
- desc = gene_desc(gene_id_len, gene_local_id, contig_len)
+ desc = gene_desc(gene_id_len, gene_local_id)
genedata2gene = write_genes(pangenome, h5f, annotation, desc, disable_bar)
write_genedata(pangenome, h5f, annotation, genedata2gene, disable_bar)
if rec_rnas:
- desc = rna_desc(rna_id_len, contig_len)
+ desc = rna_desc(rna_id_len)
genedata2rna = write_rnas(pangenome, h5f, annotation, desc, disable_bar)
write_genedata(pangenome, h5f, annotation, genedata2rna, disable_bar)
diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py
index 30e19b2a..c45cdd76 100644
--- a/ppanggolin/formats/writeBinaries.py
+++ b/ppanggolin/formats/writeBinaries.py
@@ -635,7 +635,6 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo
raise AssertionError
except AssertionError:
raise AssertionError("To erase metadata. You should provide metatype and source")
-
h5f = tables.open_file(pangenome.file, "a")
status_group = h5f.root.status
info_group = h5f.root.info
@@ -741,14 +740,14 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable
h5f = tables.open_file(filename, "a")
if pangenome.status["geneSequences"] == "Computed":
- logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences")
+ logging.getLogger("PPanGGOLiN").info("writing the protein coding gene dna sequences in pangenome...")
write_gene_sequences(pangenome, h5f, disable_bar=disable_bar)
pangenome.status["geneSequences"] = "Loaded"
if pangenome.status["genesClustered"] == "Computed":
- logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations...")
+ logging.getLogger("PPanGGOLiN").info("Writing gene families and gene associations in pangenome...")
write_gene_families(pangenome, h5f, force, disable_bar=disable_bar)
- logging.getLogger("PPanGGOLiN").info("Writing gene families information...")
+ logging.getLogger("PPanGGOLiN").info("Writing gene families information in pangenome...")
write_gene_fam_info(pangenome, h5f, force, disable_bar=disable_bar)
if pangenome.status["genomesAnnotated"] in ["Loaded", "inFile"] and \
pangenome.status["defragmented"] == "Computed":
@@ -757,7 +756,7 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable
update_gene_fragments(pangenome, h5f, disable_bar=disable_bar)
pangenome.status["genesClustered"] = "Loaded"
if pangenome.status["neighborsGraph"] == "Computed":
- logging.getLogger("PPanGGOLiN").info("Writing the edges...")
+ logging.getLogger("PPanGGOLiN").info("Writing the edges of neighbors graph in pangenome...")
write_graph(pangenome, h5f, force, disable_bar=disable_bar)
pangenome.status["neighborsGraph"] = "Loaded"
if pangenome.status["partitioned"] == "Computed" and \
@@ -766,17 +765,17 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable
pangenome.status["partitioned"] = "Loaded"
if pangenome.status['predictedRGP'] == "Computed":
- logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity...")
+ logging.getLogger("PPanGGOLiN").info("Writing Regions of Genomic Plasticity in pangenome...")
write_rgp(pangenome, h5f, force, disable_bar=disable_bar)
pangenome.status['predictedRGP'] = "Loaded"
if pangenome.status["spots"] == "Computed":
- logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion...")
+ logging.getLogger("PPanGGOLiN").info("Writing Spots of Insertion in pangenome...")
write_spots(pangenome, h5f, force, disable_bar=disable_bar)
pangenome.status['spots'] = "Loaded"
if pangenome.status["modules"] == "Computed":
- logging.getLogger("PPanGGOLiN").info("Writing Modules...")
+ logging.getLogger("PPanGGOLiN").info("Writing Modules in pangenome...")
write_modules(pangenome, h5f, force, disable_bar=disable_bar)
pangenome.status["modules"] = "Loaded"
@@ -787,3 +786,5 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable
h5f.close()
logging.getLogger("PPanGGOLiN").info(f"Done writing the pangenome. It is in file : {filename}")
+
+
\ No newline at end of file
diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py
index d63d0f79..e12170dd 100644
--- a/ppanggolin/formats/writeFlat.py
+++ b/ppanggolin/formats/writeFlat.py
@@ -5,20 +5,32 @@
import argparse
import logging
from multiprocessing import get_context
+from itertools import combinations
from collections import Counter, defaultdict
+from typing import TextIO, List, Dict
from pathlib import Path
from typing import TextIO
from importlib.metadata import distribution
from statistics import median, mean, stdev
import os
+import random
+
+
+import networkx as nx
+from plotly.express.colors import qualitative
+
# local libraries
from ppanggolin.edge import Edge
from ppanggolin.geneFamily import GeneFamily
-from ppanggolin.genome import Organism
+from ppanggolin.genome import Organism, Gene, Contig, RNA
+from ppanggolin.region import Region, Spot, Module
from ppanggolin.pangenome import Pangenome
-from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float
+from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float, extract_contig_window, parse_input_paths_file
from ppanggolin.formats.readBinaries import check_pangenome_info
+from ppanggolin.formats.write_proksee import write_proksee_organism
+from ppanggolin.formats.writeSequences import read_genome_file, write_spaced_fasta
# global variable to store the pangenome
pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ?
@@ -263,10 +275,10 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95):
gexf.write(f' \n')
if pan.number_of_spots > 0:
str_spot = "|".join([str(s) for s in list(fam.spots)])
- gexf.write(f' \n')
+ gexf.write(f' \n')
if pan.number_of_modules > 0:
str_module = "|".join([str(m) for m in list(fam.modules)])
- gexf.write(f' \n')
+ gexf.write(f' \n')
shift = 14
source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata}
for source_metadata_families in pan.metadata_sources("families"):
@@ -336,13 +348,18 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False):
txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..."
logging.getLogger("PPanGGOLiN").info(txt)
- outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf"
+ outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf{'.gz' if compress else ''}"
with write_compressed_or_not(outname, compress) as gexf:
+ graph_type = 'light gexf' if light else 'gexf'
+ logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} header...")
write_gexf_header(gexf, light)
+ logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} nodes...")
write_gexf_nodes(gexf, light)
+ logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} edges...")
write_gexf_edges(gexf, light)
+ logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} closing tags...")
write_gexf_end(gexf)
- logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{outname.as_posix()}'")
+ logging.getLogger("PPanGGOLiN").info(f"Done writing the gexf file : '{gexf.name}'")
def write_matrix(output: Path, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False):
@@ -529,16 +546,17 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05,
if gene.family in core:
nb_gene_core += 1
completeness = "NA"
+ org_families = set(org.families)
if len(single_copy_markers) > 0:
- completeness = round((len(set(org.families) & single_copy_markers) /
+ completeness = round((len(org_families & single_copy_markers) /
len(single_copy_markers)) * 100, 2)
outfile.write("\t".join(map(str, [org.name,
org.number_of_families(),
nb_pers,
nb_shell,
nb_cloud,
- len(core) + org.number_of_families(),
- len(soft) + org.number_of_families(),
+ len(core & org_families),
+ len(soft & org_families),
org.number_of_genes(),
nb_gene_pers,
nb_gene_shell,
@@ -546,7 +564,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05,
nb_gene_core,
nb_gene_soft,
completeness,
- org.number_of_families() + len(single_copy_markers)])) + "\n")
+ len(org_families & single_copy_markers)])) + "\n")
logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics")
@@ -616,6 +634,292 @@ def write_projections(output: Path, compress: bool = False):
logging.getLogger("PPanGGOLiN").info("Done writing the projection files")
+def write_proksee(output: Path, fasta: Path = None, anno: Path = None):
+ """
+ Generate ProkSee data for multiple organisms and write it to the specified output directory.
+
+ :param output: The directory where the ProkSee data will be written.
+ :param fasta: The path to a FASTA file containing genome sequences (optional).
+ :param anno: The path to an annotation file (optional).
+
+ This function generates ProkSee data for multiple organisms and writes it to the specified output directory.
+ If genome sequences are provided in a FASTA file, or annotations in a separate file, they are used to add
+ sequence data to the ProkSee file of each organism.
+ """
+
+ proksee_outdir = output / "proksee"
+ mk_outdir(proksee_outdir, True)
+
+ organisms_file = fasta if fasta is not None else anno
+
+ if organisms_file:
+ org_dict = parse_input_paths_file(organisms_file)
+
+ org_to_modules = defaultdict(set)
+
+ # Create a mapping of organisms to the modules they belong to
+ for mod in pan.modules:
+ for org in mod.organisms:
+ org_to_modules[org].add(mod)
+
+ # Generate a color mapping for modules
+ module_to_colors = manage_module_colors(list(pan.modules))
+
+ features = ["all"]
+
+ for organism in pan.organisms:
+ if organisms_file:
+ genome_sequences = read_genome_file(org_dict[organism.name]['path'], organism)
+ else:
+ genome_sequences = None
+
+ # Generate a color mapping for modules specific to the organism
+ org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in org_to_modules[organism]}
+
+ output_file = proksee_outdir / f"{organism.name}.json"
+
+ # Write ProkSee data for the organism
+ write_proksee_organism(organism, output_file, features=features, module_to_colors=org_module_to_color, rgps=pan.regions,
+ genome_sequences=genome_sequences)
+
+ logging.getLogger("PPanGGOLiN").info("Done writing the proksee files")
+
+def manage_module_colors(modules: List[Module], window_size: int = 50) -> Dict[Module, str]:
+ """
+ Manages colors for a list of modules based on gene positions and a specified window size.
+
+ :param modules: A list of module objects for which you want to determine colors.
+ :param window_size: Minimum number of genes between two modules to color them with the same color.
+ A higher value results in more module colors.
+ :return: A dictionary that maps each module to its assigned color.
+ """
+
+ color_mod_graph = nx.Graph()
+ color_mod_graph.add_nodes_from(modules)
+
+ contig_to_mod_genes = defaultdict(set)
+ gene_to_module = {}
+
+ for module in modules:
+ for fam in module.families:
+ for gene in fam.genes:
+ contig_to_mod_genes[gene.contig].add(gene)
+ gene_to_module[gene] = module
+
+ for contig, mod_genes in contig_to_mod_genes.items():
+ gene_positions = (gene.position for gene in mod_genes)
+ contig_windows = extract_contig_window(
+ contig.number_of_genes, gene_positions, window_size=window_size, is_circular=contig.is_circular
+ )
+ contig_windows = list(contig_windows)
+
+ for (start, end) in contig_windows:
+ module_in_window = {gene_to_module[gene] for gene in mod_genes if start <= gene.position <= end}
+
+ # Add edges between closely located modules
+ color_mod_graph.add_edges_from(combinations(module_in_window, 2))
+
+ module_to_color_int = nx.coloring.greedy_color(color_mod_graph)
+
+ # If you want to export the graph to see the coloring:
+ # nx.set_node_attributes(color_mod_graph, color_dict, name="color")
+ # nx.readwrite.graphml.write_graphml(color_mod_graph, f"module_graph_window_size{window_size}.graphml")
+
+ nb_colors = len(set(module_to_color_int.values()))
+ logging.getLogger("PPanGGOLiN").debug(f"{nb_colors} colors are needed to color the modules.")
+ colors = palette(nb_colors)
+ module_to_color = {mod: colors[col_i] for mod, col_i in module_to_color_int.items()}
+
+ return module_to_color
+
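A minimal sketch of the coloring idea: modules that co-occur within a window become adjacent in a conflict graph, and nx.coloring.greedy_color assigns each one a color index such that neighbors never share a color (toy module names):

import networkx as nx

conflicts = nx.Graph([("mod_1", "mod_2"), ("mod_2", "mod_3")])  # toy proximity graph
coloring = nx.coloring.greedy_color(conflicts)
assert coloring["mod_1"] != coloring["mod_2"] != coloring["mod_3"]
# mod_1 and mod_3 are not adjacent, so they may legitimately share a color.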
+def palette(nb_colors: int) -> List[str]:
+ """
+ Generates a palette of colors for visual representation.
+
+ :param nb_colors: The number of colors needed in the palette.
+
+ :return: A list of color codes in hexadecimal format.
+ """
+
+ # Combine two sets of predefined colors for variety
+ colors = qualitative.Vivid + qualitative.Safe
+
+ if len(colors) < nb_colors:
+ # Generate random colors if not enough predefined colors are available
+ random.seed(1)
+ random_colors = ["#" + ''.join([random.choice('0123456789ABCDEF') for _ in range(6)]) for _ in range(nb_colors - len(colors))]
+ colors += random_colors
+ else:
+ colors = colors[:nb_colors]
+
+ return colors
+
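Two quick illustrative properties of this palette: it is deterministic (the random fallback is seeded), and it always returns exactly the requested number of colors, padding beyond the predefined plotly sets with random hex codes:

assert palette(5) == palette(5)  # seeded fallback keeps module colors stable across runs
assert len(palette(30)) == 30    # more colors than Vivid + Safe provide still works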
+
+def write_gff(output: Path, compress: bool = False, fasta: Path = None, anno: Path = None):
+
+ """
+ Write the gff files for all organisms
+
+ :param output: Path to output directory
+ :param compress: Compress the file in .gz
+ :param fasta: The path to a FASTA file containing genome sequences (optional).
+ :param anno: The path to an annotation file (optional).
+
+ """
+ logging.getLogger("PPanGGOLiN").info("Writing the gff files...")
+
+ organisms_file = fasta if fasta is not None else anno
+
+ if organisms_file:
+ org_dict = parse_input_paths_file(organisms_file)
+
+ outdir = output / "gff"
+ mk_outdir(outdir, True)
+
+ if pan.parameters["annotate"]["# read_annotations_from_file"]:
+ annotation_sources = {"rRNA": "external",
+ "tRNA": "external",
+ "CDS": "external"}
+ else:
+ annotation_sources = {}
+
+ contig_to_rgp = defaultdict(list)
+ for rgp in pan.regions:
+ contig_to_rgp[rgp.contig].append(rgp)
+
+ rgp_to_spot_id = {rgp: f"spot_{spot.ID}" for spot in pan.spots for rgp in spot.regions}
+
+ for org in pan.organisms:
+ if organisms_file:
+ genome_sequences = read_genome_file(org_dict[org.name]['path'], org)
+ else:
+ genome_sequences = None
+
+ write_gff_file(org, contig_to_rgp, rgp_to_spot_id, outdir, compress, annotation_sources, genome_sequences)
+
+ logging.getLogger("PPanGGOLiN").info("Done writing the gff files")
+
+
+def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, List[Region]],
+ rgp_to_spotid: Dict[Region, str], outdir: Path, compress: bool,
+ annotation_sources: Dict[str, str], genome_sequences: Dict[str, str]):
+ """
+ Write the GFF file of the provided organism.
+
+ :param org: Organism object for which the GFF file is being written.
+ :param contig_to_rgp: Dictionary mapping Contig objects to their corresponding Region objects.
+ :param rgp_to_spotid: Dictionary mapping Region objects to their corresponding spot IDs.
+ :param outdir: Path to the output directory where the GFF file will be written.
+ :param compress: If True, compress the output GFF file using .gz format.
+ :param annotation_sources: A dictionary that maps types of features to their source information.
+ :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None).
+ """
+
+ # sort contigs by name
+ sorted_contigs = sorted(org.contigs, key=lambda x: x.name)
+
+ with write_compressed_or_not(outdir / f"{org.name}.gff", compress) as outfile:
+ # write gff header
+ outfile.write('##gff-version 3\n')
+ for contig in sorted_contigs:
+ if contig.length is None:
+ raise AttributeError(f'Contig {contig.name} has no length defined.')
+
+ outfile.write(f'##sequence-region {contig.name} 1 {contig.length}\n')
+
+ for contig in sorted_contigs:
+ contig_elements = sorted(contig_to_rgp[contig] + list(contig.genes) + list(contig.RNAs), key=lambda x: x.start)
+
+ for feature in contig_elements:
+
+ if type(feature) in [Gene, RNA]:
+ feat_type = feature.type
+
+ strand = feature.strand
+
+ source = annotation_sources.get(feat_type, "external")
+
+ # Before the CDS or RNA line, a gene line is created with the following ID
+ parent_gene_id = f"gene-{feature.ID}"
+
+ attributes = [("ID", feature.ID),
+ ("Name", feature.name),
+ ('Parent', parent_gene_id),
+ ("product", feature.product),
+ ]
+
+ score = '.'
+
+ if isinstance(feature, Gene):
+ rgp = feature.RGP.name if feature.RGP else ""
+ attributes += [
+ ("Family", feature.family.name),
+ ("Partition", feature.family.named_partition),
+ ('RGP', rgp),
+ ('Module', ','.join(f"module_{module.ID}" for module in feature.family.modules))
+ ]
+
+ # add an extra line of type gene
+
+ gene_line = [contig.name,
+ source,
+ 'gene',
+ feature.start,
+ feature.stop,
+ '.',
+ strand,
+ ".",
+ f'ID={parent_gene_id}'
+ ]
+
+ line_str = '\t'.join(map(str, gene_line))
+ outfile.write(line_str + "\n")
+
+ elif isinstance(feature, Region):
+ feat_type = "region"
+ source = "ppanggolin"
+ strand = "."
+ score = feature.score # TODO does RGP score make sense and do we want it in the gff file?
+ attributes = [
+ ("Name", feature.name),
+ ("Spot", rgp_to_spotid.get(feature, "No_spot")),
+ ("Note", "Region of Genomic Plasticity (RGP)")
+ ]
+
+ else:
+ raise TypeError(f"The feature to write in the gff file does not have an expected type: {type(feature)}")
+
+
+ attributes_str = ';'.join([f"{k}={v}" for k, v in attributes if v != "" and v is not None])
+
+ line = [contig.name,
+ source, # Source
+ feat_type,
+ feature.start,
+ feature.stop,
+ score,
+ strand,
+ ".",
+ attributes_str,
+ ]
+
+ line_str = '\t'.join(map(str, line))
+ outfile.write(line_str + "\n")
+
+ if genome_sequences:
+ logging.getLogger("PPanGGOLiN").debug("Writing fasta section of gff file...")
+ outfile.write("##FASTA\n")
+ for contig in sorted_contigs:
+ outfile.write(f">{contig.name}\n")
+
+ outfile.write(write_spaced_fasta(genome_sequences[contig.name], space=60))
+
+
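A hedged mini-example of the attribute serialization used in write_gff_file: empty strings and None values are dropped before joining with ';' (values hypothetical):

attributes = [("ID", "GENE_0001"), ("Name", ""), ("Parent", "gene-GENE_0001"), ("product", None)]
attributes_str = ";".join(f"{k}={v}" for k, v in attributes if v != "" and v is not None)
assert attributes_str == "ID=GENE_0001;Parent=gene-GENE_0001"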
def write_parts(output: Path, soft_core: float = 0.95):
"""
Write the list of gene families for each partition
@@ -669,7 +973,7 @@ def write_gene_families_tsv(output: Path, compress: bool = False):
tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier,
"F" if gene.is_fragment else ""]) + "\n")
logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and "
- f"gene families : '{outname}'")
+ f"gene families: '{outname}'")
def write_regions(output: Path, compress: bool = False):
@@ -689,20 +993,24 @@ def write_regions(output: Path, compress: bool = False):
region.is_whole_contig])) + "\n")
-def summarize_spots(spots: set, output: Path, compress: bool = False):
+def summarize_spots(spots: set, output: Path, compress: bool = False, file_name: str = "summarize_spots.tsv"):
"""
Write a file providing summarize information about hotspots
:param spots: set of spots in pangenome
:param output: Path to output directory
:param compress: Compress the file in .gz
+ :param file_name: Name of the output file
"""
def r_and_s(value: float):
"""rounds to dp figures and returns a str of the provided value"""
return str(round(value, 3)) if isinstance(value, float) else str(value)
- with write_compressed_or_not(output / "summarize_spots.tsv", compress) as fout:
+
+ file_path = output / file_name
+
+ with write_compressed_or_not(file_path, compress) as fout:
fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t"
"stdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n")
for spot in sorted(spots, key=lambda x: len(x), reverse=True):
@@ -718,7 +1026,7 @@ def r_and_s(value: float):
min_size = min(size_list)
fout.write("\t".join(map(r_and_s, [f"{str(spot)}", len(spot), len(tot_fams), len_uniq_content,
mean_size, stdev_size, max_size, min_size])) + "\n")
- logging.getLogger("PPanGGOLiN").info(f"Done writing spots in : '{output.as_posix() + '/summarize_spots.tsv'}'")
+ logging.getLogger("PPanGGOLiN").info(f"Done writing spots in '{file_path}'")
def spot2rgp(spots: set, output: Path, compress: bool = False):
@@ -758,7 +1066,7 @@ def write_borders(output: Path, dup_margin: float = 0.05, compress: bool = False
with write_compressed_or_not(output / "spot_borders.tsv", compress) as fout:
fout.write("spot_id\tnumber\tborder1\tborder2\n")
for spot in sorted(pan.spots, key=lambda x: len(x), reverse=True):
- curr_borders = spot.borders(pan.parameters["spots"]["set_size"], multigenics)
+ curr_borders = spot.borders(pan.parameters["spot"]["set_size"], multigenics)
for c, border in curr_borders:
famstring1 = ",".join([fam.name for fam in border[0]])
famstring2 = ",".join([fam.name for fam in border[1]])
@@ -829,8 +1137,8 @@ def write_org_modules(output: Path, compress: bool = False):
for fam in mod.families:
mod_orgs |= set(fam.organisms)
for org in mod_orgs:
- completion = round((org.number_of_families() + len(mod)) / len(mod), 2)
- fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n")
+ completion = len(set(org.families) & set(mod.families)) / len(mod)
+ fout.write(f"module_{mod.ID}\t{org.name}\t{completion:.2}\n")
fout.close()
logging.getLogger("PPanGGOLiN").info(
f"Done writing modules to organisms associations to: '{output.as_posix() + '/modules_in_organisms.tsv'}'")
@@ -916,10 +1224,10 @@ def write_rgp_modules(output: Path, compress: bool = False):
def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95,
dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False,
- light_gexf: bool = False, projection: bool = False, stats: bool = False, json: bool = False,
+ light_gexf: bool = False, projection: bool = False, gff: bool = False, proksee: bool = False, stats: bool = False, json: bool = False,
partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False,
borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False,
- disable_bar: bool = False):
+ disable_bar: bool = False, fasta=None, anno=None):
"""
Main function to write flat files from pangenome
@@ -933,6 +1241,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core
:param gexf: write pangenome graph in gexf format
:param light_gexf: write pangenome graph with only gene families
:param projection: write projection of pangenome for organisms
+ :param gff: write a gff file with pangenome annotations for each organism
+ :param proksee: write a ProkSee JSON map file with pangenome annotations for each organism
:param stats: write statistics about pangenome
:param json: write pangenome graph in json file
:param partitions: write the gene families for each partition
@@ -946,7 +1255,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core
:param disable_bar: Disable progress bar
"""
# TODO Add force parameter to check if output already exist
- if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders,
+ if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, proksee, stats, json, partitions, regions, spots, borders,
families_tsv, modules, spot_modules]):
raise Exception("You did not indicate what file you wanted to write.")
@@ -966,10 +1275,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core
pan = pangenome
if csv or gene_pa or gexf or light_gexf or projection or stats or json or partitions or regions or spots or \
- families_tsv or borders or modules or spot_modules:
+ families_tsv or borders or modules or spot_modules or gff or proksee:
needAnnotations = True
needFamilies = True
- if projection or stats or partitions or regions or spots or borders:
+ if projection or stats or partitions or regions or spots or borders or gff or proksee:
needPartitions = True
if gexf or light_gexf or json:
needGraph = True
@@ -987,7 +1296,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core
needSpots = True
if modules or spot_modules: # or projection:
needModules = True
- if projection:
+ if projection or gff or proksee:
needRegions = True if pan.status["predictedRGP"] == "inFile" else False
needSpots = True if pan.status["spots"] == "inFile" else False
needModules = True if pan.status["modules"] == "inFile" else False
@@ -1003,11 +1312,15 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core
if gene_pa:
processes.append(p.apply_async(func=write_gene_presence_absence, args=(output, compress)))
if gexf:
- processes.append(p.apply_async(func=write_gexf, args=(output, False, soft_core)))
+ processes.append(p.apply_async(func=write_gexf, args=(output, False, compress)))
if light_gexf:
- processes.append(p.apply_async(func=write_gexf, args=(output, True, soft_core)))
+ processes.append(p.apply_async(func=write_gexf, args=(output, True, compress)))
if projection:
processes.append(p.apply_async(func=write_projections, args=(output, compress)))
+ if proksee:
+ processes.append(p.apply_async(func=write_proksee, args=(output, fasta, anno)))
+ if gff:
+ processes.append(p.apply_async(func=write_gff, args=(output, compress, fasta, anno)))
if stats:
processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress)))
if json:
@@ -1044,10 +1357,10 @@ def launch(args: argparse.Namespace):
global pan
pan.add_file(args.pangenome)
write_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv,
- gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection,
+ gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, proksee=args.proksee,
stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions,
families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules,
- spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar)
+ spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar, fasta=args.fasta, anno=args.anno)
def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
@@ -1093,11 +1406,19 @@ def parser_flat(parser: argparse.ArgumentParser):
optional.add_argument("--projection", required=False, action="store_true",
help="a csv file for each organism providing information on the projection of the graph "
"on the organism")
+ optional.add_argument("--gff", required=False, action="store_true",
+ help="Generate a gff file for each organism containing pangenome annotations.")
+
+ optional.add_argument("--proksee", required=False, action="store_true",
+ help="Generate JSON map files for PROKSEE for each organism containing pangenome annotations to be used to in proksee.")
+
optional.add_argument("--stats", required=False, action="store_true",
help="tsv files with some statistics for each organism and for each gene family")
+
optional.add_argument("--partitions", required=False, action="store_true",
help="list of families belonging to each partition, with one file per partitions and "
"one family per line")
+
optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz")
optional.add_argument("--json", required=False, action="store_true", help="Writes the graph in a json file format")
optional.add_argument("--regions", required=False, action="store_true",
@@ -1114,6 +1435,19 @@ def parser_flat(parser: argparse.ArgumentParser):
optional.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus")
+ context = parser.add_argument_group(title="Contextually required arguments",
+ description="With --proksee and -gff, the following arguments can be "
+ "used to add sequence information to the output file:")
+
+ context.add_argument('--fasta', required=False, type=Path,
+ help="A tab-separated file listing the organism names, and the fasta filepath of its genomic "
+ "sequence(s) (the fastas can be compressed with gzip). One line per organism.")
+
+ context.add_argument('--anno', required=False, type=Path,
+ help="A tab-separated file listing the organism names, and the gff/gbff filepath of its "
+ "annotations (the files can be compressed with gzip). One line per organism. "
+ "If this is provided, those annotations will be used.")
+
if __name__ == '__main__':
"""To test local change and allow using debugger"""
from ppanggolin.utils import set_verbosity_level, add_common_arguments
diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py
index 6c7363f5..af189edd 100644
--- a/ppanggolin/formats/writeMSA.py
+++ b/ppanggolin/formats/writeMSA.py
@@ -237,14 +237,12 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str,
genome_id = ""
seq = ""
curr_len = 0
- dup_gene = 0 # TODO Remove ?
curr_phylo_dict = {}
for line in fin:
if line.startswith('>'):
if genome_id != "":
if genome_id not in missing_genomes:
- dup_gene += 1
# duplicated genes. Replacing them with gaps.
curr_phylo_dict[genome_id] = "-" * curr_len
else:
diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py
index 1d0ffa34..0cb7e2cb 100644
--- a/ppanggolin/formats/writeMetadata.py
+++ b/ppanggolin/formats/writeMetadata.py
@@ -42,6 +42,9 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group:
if metastatus["genes"] in ["Computed", "Loaded", "inFile"]:
metadata_group._v_attrs.genes = True
metasources_group._v_attrs.genes = metasources["genes"]
+ if metastatus["contigs"] in ["Computed", "Loaded", "inFile"]:
+ metadata_group._v_attrs.contigs = True
+ metasources_group._v_attrs.contigs = metasources["contigs"]
if metastatus["genomes"] in ["Computed", "Loaded", "inFile"]:
metadata_group._v_attrs.genomes = True
metasources_group._v_attrs.genomes = metasources["genomes"]
@@ -121,7 +124,7 @@ def get_metadata_len(select_elem: List[Module], source: str) -> Tuple[Dict[str,
if isinstance(value, float) or isinstance(value, int):
if attr in type_dict:
if type_dict[attr] != type(value):
- if type(value) == float and type_dict[attr] == int:
+ if isinstance(value, float) and type_dict[attr] == int:
type_dict[attr] = tables.Float64Col()
else:
if isinstance(value, float):
@@ -238,6 +241,14 @@ def write_metadata(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = F
"genomes", select_genomes, disable_bar)
pangenome.status["metadata"]["genomes"] = "Loaded"
+ if pangenome.status["metadata"]["contigs"] == "Computed":
+ logging.getLogger().info("Writing contigs metadata in pangenome")
+ select_genomes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["contigs"][-1],
+ metatype="contigs"))
+ write_metadata_metatype(h5f, pangenome.status["metasources"]["contigs"][-1],
+ "contigs", select_genomes, disable_bar)
+ pangenome.status["metadata"]["contigs"] = "Loaded"
+
if pangenome.status["metadata"]["genes"] == "Computed":
logging.getLogger().info("Writing genes metadata in pangenome")
select_genes = list(pangenome.get_elem_by_sources(source=pangenome.status["metasources"]["genes"][-1],
diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
index 84634065..6ad32c29 100644
--- a/ppanggolin/formats/writeSequences.py
+++ b/ppanggolin/formats/writeSequences.py
@@ -6,7 +6,7 @@
import logging
import re
from pathlib import Path
-from typing import TextIO, Dict, Set
+from typing import TextIO, Dict, Set, Iterable
# installed libraries
from tqdm import tqdm
@@ -14,33 +14,34 @@
# local libraries
from ppanggolin.pangenome import Pangenome
from ppanggolin.geneFamily import GeneFamily
+from ppanggolin.genome import Gene, Organism
from ppanggolin.utils import write_compressed_or_not, mk_outdir, read_compressed_or_not, restricted_float, detect_filetype
from ppanggolin.formats.readBinaries import check_pangenome_info, get_gene_sequences_from_file
-module_regex = re.compile(r'^module_[0-9]+')
+module_regex = re.compile(r'^module_\d+')  # \d is equivalent to [0-9]
poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex]
poss_values_log = f"Possible values are {', '.join(poss_values[:-1])}, module_X with X being a module id."
-def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None,
- add: str = '', disable_bar: bool = False):
- """
- Writes the CDS sequences given through list_CDS of the Pangenome object to a tmpFile object,
- and adds the str provided through add in front of it.
- Loads the sequences from previously computed or loaded annotations
- :param pangenome: Pangenome object with gene families sequences
- :param file_obj: Output file to write sequences
- :param list_cds: Selected genes
- :param add: Add prefix to gene ID
- :param disable_bar: Disable progress bar
+def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], file_obj: TextIO, add: str = '',
+ disable_bar: bool = False):
"""
- logging.getLogger("PPanGGOLiN").info("Writing all of the CDS sequences...")
- for gene in tqdm(sorted(list_cds if list_cds is not None else pangenome.genes, key=lambda x: x.ID),
- unit="gene", disable=disable_bar):
+ Writes the CDS sequences to a File object,
+ and adds the string provided through `add` in front of it.
+ Loads the sequences from previously computed or loaded annotations.
+
+ :param genes_to_write: Genes to write.
+ :param file_obj: Output file to write sequences.
+ :param add: Add prefix to gene ID.
+ :param disable_bar: Disable progress bar.
+ """
+ logging.getLogger("PPanGGOLiN").info(f"Writing all CDS sequences in {file_obj.name}")
+ for gene in tqdm(genes_to_write, unit="gene", disable=disable_bar):
if gene.type == "CDS":
- file_obj.write('>' + add + gene.ID + "\n")
- file_obj.write(gene.dna + "\n")
+ gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier
+ file_obj.write(f'>{add}{gene_id}\n')
+ file_obj.write(f'{gene.dna}\n')
file_obj.flush()
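+
+# A minimal usage sketch (illustrative, not part of the module): dumping the CDS
+# sequences of one organism to a fasta file. `organism` is assumed to be an
+# Organism whose genes carry `.type`, `.ID`/`.local_identifier` and `.dna`, as
+# required by the function above; the output path is hypothetical.
+#
+#     with open("organism_cds.fna", "w") as fh:
+#         write_gene_sequences_from_annotations(sorted(organism.genes, key=lambda g: g.ID),
+#                                               fh, add="ppanggolin_", disable_bar=True)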
@@ -73,7 +74,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
get_gene_sequences_from_file(pangenome.file, fasta, set([gene.ID for gene in genes_to_write]),
disable_bar=disable_bar)
elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]:
- write_gene_sequences_from_annotations(pangenome, fasta, genes_to_write, disable_bar=disable_bar)
+ write_gene_sequences_from_annotations(genes_to_write, fasta, disable_bar=disable_bar)
else:
# this should never happen if the pangenome has been properly checked before launching this function.
raise Exception("The pangenome does not include gene sequences")
@@ -189,15 +190,16 @@ def read_fasta_or_gff(file_path: Path) -> Dict[str, str]:
sequence_dict = {}
seqname = ""
seq = ""
- z = False
+ in_fasta_part = False
with read_compressed_or_not(file_path) as f:
for line in f:
if line.startswith(">"):
- z = True
- if z:
+ in_fasta_part = True
+ if in_fasta_part:
if line.startswith('>'):
if seq != "":
sequence_dict[seqname] = seq
+ seq = ""
seqname = line[1:].strip().split()[0]
else:
seq += line.strip()
@@ -244,26 +246,33 @@ def read_fasta_gbk(file_path: Path) -> Dict[str, str]:
return sequence_dict
-def read_genome_file(file_dict: Dict[str, Path], genome_name: str) -> Dict[str, str]:
+def read_genome_file(genome_file: Path, organism: Organism) -> Dict[str, str]:
"""
- Read the genome file associated to organism
+ Read the genome file associated to organism to extract sequences
- :param file_dict: Dictionary given association between organism and fasta file
- :param genome_name: organism name
+ :param genome_file: Path to a fasta file or gbff/gff file
+ :param organism: Organism object to which the genome file belongs
:return: Dictionary with all sequences associated to contig
"""
- filetype = detect_filetype(file_dict[genome_name])
+ filetype = detect_filetype(genome_file)
if filetype in ["fasta", "gff"]:
- return read_fasta_or_gff(file_dict[genome_name])
+ contig_to_sequence = read_fasta_or_gff(genome_file)
elif filetype == "gbff":
- return read_fasta_gbk(file_dict[genome_name])
+ contig_to_sequence = read_fasta_gbk(genome_file)
else:
- raise Exception(f"Unknown filetype detected: '{file_dict[genome_name]}'")
+ raise Exception(f"Unknown filetype detected: '{genome_file}'")
+
+ # Check that the contig names in the file match those stored in the pangenome
+ if set(contig_to_sequence) != {contig.name for contig in organism.contigs}:
+ raise Exception(f"Contig name inconsistency detected in organism '{organism.name}' between the "
+ f"information stored in the pangenome file and the contigs found in '{genome_file}'.")
+ return contig_to_sequence
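+
+# Illustrative call (a sketch; the file path is hypothetical). The function accepts
+# a fasta, gff, or gbff file, optionally gzipped, and raises if the contig names do
+# not match the ones recorded for this Organism in the pangenome:
+#
+#     contig_to_seq = read_genome_file(Path("GCF_002776845.1_genomic.fna.gz"), organism)
+#     first_contig_seq = contig_to_seq[next(organism.contigs).name]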
def write_spaced_fasta(sequence: str, space: int = 60) -> str:
- """Write a maximum of element per line
+ """
+ Write a sequence with a maximum number of characters per line
:param sequence: sequence to write
:param space: maximum of size for one line
@@ -320,8 +329,8 @@ def write_regions_sequences(pangenome: Pangenome, output: Path, regions: str, fa
loaded_genome = ""
for region in tqdm(regions_to_write, unit="rgp", disable=disable_bar):
if region.organism.name != loaded_genome:
- loaded_genome = region.organism.name
- genome_sequence = read_genome_file(org_dict, loaded_genome)
+ organism = region.organism
+ genome_sequence = read_genome_file(org_dict[organism.name], organism)
fasta.write(f">{region.name}\n")
fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.starter.start:region.stopper.stop], 60))
logging.getLogger("PPanGGOLiN").info(f"Done writing the regions nucleotide sequences: '{outname}'")
@@ -356,14 +365,18 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
need_regions = False
need_modules = False
- if any(x is not None for x in [regions, genes, gene_families, prot_families]):
+ if prot_families is not None:
+ need_families = True
+
+ if any(x is not None for x in [regions, genes, gene_families]):
need_annotations = True
need_families = True
if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
+ need_annotations = True
need_regions = True
if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
need_partitions = True
- for x in (genes, gene_families, prot_families):
+ for x in (genes, gene_families):
if x is not None and 'module_' in x:
need_modules = True
diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py
new file mode 100644
index 00000000..9df61e6a
--- /dev/null
+++ b/ppanggolin/formats/write_proksee.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+# coding:utf-8
+
+# default libraries
+import json
+import logging
+from pathlib import Path
+from tqdm import tqdm
+from typing import Dict, List, Tuple
+from collections import defaultdict
+
+# installed libraries
+
+
+# local libraries
+from ppanggolin.genome import Organism, Gene
+from ppanggolin.region import Module, Region
+from ppanggolin.pangenome import Pangenome
+
+
+
+def write_legend_items(features: List[str], module_to_color: Dict[Module, str]):
+ """
+ Generates legend items based on the selected features and module-to-color mapping.
+
+ :param features: A list of features to include in the legend.
+ :param module_to_color: A dictionary mapping modules to their assigned colors.
+
+ :return: A data structure containing legend items based on the selected features and module colors.
+ """
+ # use https://medialab.github.io/iwanthue/ to find nice colors
+ # that associate well with established partition colors (orange, light green, light blue)
+ main_colors = {
+ "orange": "#e59c04",
+ "light green": "#00d860" ,
+ "light blue": "#79deff",
+ "purple": "#a567bb",
+ "dark green": "#7a9a4c",
+ "dark red": "#ca5c55",
+ }
+
+ legend_data = {"items" : [
+ {"name": "persistent", "swatchColor": main_colors['orange'], "decoration": "arrow"},
+ {"name": "shell", "swatchColor": main_colors['light green'], "decoration": "arrow"},
+ {"name": "cloud", "swatchColor": main_colors['light blue'], "decoration": "arrow"},
+ {"name": "RNA", "swatchColor": main_colors['purple'], "decoration": "arrow"},
+ ]
+ }
+ if "rgp" in features or "all" in features:
+ legend_data["items"].append({"name": "RGP", "swatchColor": main_colors['dark green'], "decoration": "arc"}),
+
+ if "modules" in features or "all" in features:
+ for mod, color in sorted(module_to_color.items(), key=lambda x: x[0].ID):
+ legend_data["items"].append({"name": f"module_{mod.ID}", "decoration": "arc", "swatchColor": color, "visible":False})
+
+ return legend_data
+
+def write_tracks(features: List[str]):
+ """
+ Generates track information based on the selected features.
+
+ :param features: A list of features to include in the ProkSee data.
+
+ :return: A list of track configurations based on the selected features.
+ """
+ tracks = [
+ {
+ "name": "Gene",
+ "separateFeaturesBy": "strand",
+ "position": "outside",
+ "thicknessRatio": 1,
+ "dataType": "feature",
+ "dataMethod": "source",
+ "dataKeys": "Gene"
+ }
+ ]
+
+ if "rgp" in features or "all" in features:
+ tracks.append({
+ "name": "RGP",
+ "separateFeaturesBy": "None",
+ "position": "inside",
+ "thicknessRatio": 1,
+ "dataType": "feature",
+ "dataMethod": "source",
+ "dataKeys": "RGP"
+ })
+
+ if "modules" in features or "all" in features:
+ tracks.append({
+ "name": "Module",
+ "separateFeaturesBy": "None",
+ "position": "inside",
+ "thicknessRatio": 1,
+ "dataType": "feature",
+ "dataMethod": "source",
+ "dataKeys": "Module"
+ })
+
+ return tracks
+
+
+def initiate_proksee_data(features: List[str], org_name: str, module_to_color: Dict[Module, str]):
+ """
+ Initializes ProkSee data structure with legends, tracks, and captions.
+
+ :param features: A list of features to include in the ProkSee data.
+ :param org_name: The name of the organism for which the ProkSee data is being generated.
+ :param module_to_color: A dictionary mapping modules to their assigned colors.
+
+ :return: ProkSee data structure containing legends, tracks, and captions.
+ """
+ proksee_legends = write_legend_items(features, module_to_color)
+ proksee_tracks = write_tracks(features)
+
+ proksee_captions = {
+ "name": f"{org_name} annotated with PPanGGOLiN",
+ "position": "bottom-center",
+ "font": "sans-serif,plain,18",
+ "backgroundColor": "rgba(255,255,255,0.4)"
+ }
+
+ cgview_data = {
+ "name": "PPanGGOLiN annotation at genome level",
+ "version": "1.5.0",
+ 'settings': {},
+ "legend": proksee_legends,
+ "tracks": proksee_tracks,
+ "sequence": {},
+ 'captions': [proksee_captions],
+ }
+
+ return {"cgview": cgview_data}
+
+
+def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> List[Dict]:
+ """
+ Writes contig data for a given organism in proksee format.
+
+ :param organism: The organism for which contig data will be written.
+ :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None).
+
+ :return: A list of contig data in a structured format.
+ """
+ contigs_data_list = []
+
+ for contig in tqdm(organism.contigs, unit="contig", disable=True):
+ contig_info = {
+ "name": contig.name,
+ "length": contig.length,
+ "orientation": "+",
+ }
+
+ if genome_sequences:
+ contig_info['seq'] = genome_sequences.get(contig.name, "")
+
+ contigs_data_list.append(contig_info)
+
+ return contigs_data_list
+
+
+
+def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]:
+ """
+ Writes gene data for a given organism, including both protein-coding genes and RNA genes.
+
+ :param organism: The organism for which gene data will be written.
+ :param disable_bar: A flag to disable the progress bar when processing genes (default: True).
+
+ :return: A tuple containing a list of gene data in a structured format and a dictionary mapping gene families to genes.
+ """
+ genes_data_list = []
+ gf2gene = defaultdict(list)
+
+ # Process protein-coding genes
+ for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=disable_bar):
+ gf = gene.family
+ gf2gene[gf.name].append(gene)
+
+ genes_data_list.append({
+ "name": gene.name,
+ "type": "Gene",
+ "contig": gene.contig.name,
+ "start": gene.start,
+ "stop": gene.stop,
+ "strand": 1 if gene.strand == "+" else -1,
+ "product": gene.product,
+ "tags": [gene.family.named_partition, gene.family.name],
+ "source": "Gene",
+ "legend": gene.family.named_partition,
+ "meta": "" # annotations
+ })
+
+ # Process RNA genes
+ for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=disable_bar):
+ genes_data_list.append({
+ "name": gene.name,
+ "type": "Gene",
+ "contig": gene.contig.name,
+ "start": gene.start,
+ "stop": gene.stop,
+ "strand": 1 if gene.strand == "+" else -1,
+ "product": gene.product,
+ "tags": [],
+ "source": "Gene",
+ "legend": "RNA",
+ "meta": "" # annotations
+ })
+
+ return genes_data_list, gf2gene
+
+
+def write_rgp(rgps: List[Region], organism: Organism):
+ """
+ Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format.
+
+ :param rgps: The list of RGPs (e.g. the pangenome's regions) to write.
+ :param organism: The specific organism for which RGP data will be written.
+
+ :return: A list of RGP data in a structured format.
+ """
+ rgp_data_list = []
+
+ # Iterate through each RGP in the pangenome
+ for rgp in tqdm(rgps, unit="RGP", disable=True):
+ if rgp.organism == organism:
+ # Create an entry for the RGP in the data list
+ rgp_data_list.append({
+ "name": rgp.name,
+ "contig": rgp.contig.name,
+ "start": rgp.start,
+ "stop": rgp.stop,
+ "legend": "RGP",
+ "source": "RGP",
+ "tags": []
+ })
+
+ return rgp_data_list
+
+
+def write_modules(modules: List[Module], organism: Organism, gf2genes: Dict[str, List[Gene]]):
+ """
+ Writes module data in proksee format for a list of modules associated with a given organism.
+
+ :param modules: A list of modules for which data will be written.
+ :param organism: The organism to which the modules are associated.
+ :param gf2genes: A dictionary that maps gene families to the genes they contain.
+
+ :return: A list of module data in a structured format.
+ """
+ modules_data_list = []
+
+ # Iterate through each module and find intersecting gene families
+ for module in modules:
+ gf_intersection = set(organism.families) & set(module.families)
+
+ if gf_intersection:
+ # Calculate the completion percentage
+ completion = round(len(gf_intersection) / len(set(module.families)), 2)
+
+ # Create module data entries for genes within intersecting gene families
+ for gf in gf_intersection:
+ for gene in gf2genes[gf.name]:
+ modules_data_list.append({
+ "name": f"Module_{module.ID}",
+ "presence": "Module",
+ "start": gene.start,
+ "stop": gene.stop,
+ "contig": gene.contig.name,
+ "legend": f"module_{module.ID}",
+ "source": "Module",
+ "tags": [],
+ "meta": {
+ "completion": completion
+ }
+ })
+
+ return modules_data_list
+
+
+def write_proksee_organism(organism: Organism, output_file: Path,
+ features: List[str] = None,
+ module_to_colors: Dict[Module, str] = None,
+ rgps: List[Region] = None,
+ genome_sequences: Dict[str,str] = None):
+ """
+ Write ProkSee data for a given organism.
+
+ :param organism: The organism for which ProkSee data will be written.
+ :param output_file: The output file where ProkSee data will be written.
+ :param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"].
+ :param module_to_colors: A dictionary mapping modules to their assigned colors.
+ :param rgps: List of RGPs that belong to the organism.
+ :param genome_sequences: The genome sequences for the organism.
+
+ This function writes ProkSee data for a given organism, including contig information, genes colored by partition, RGPs,
+ and modules. The resulting data is saved as a JSON file in the specified output file.
+ """
+ proksee_data = initiate_proksee_data(features, organism.name, module_to_colors)
+
+ proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences)
+
+ genes_features, gf2genes = write_genes(organism)
+
+ proksee_data["cgview"]["features"] = genes_features
+
+ if "rgp" in features or "all" in features:
+ proksee_data["cgview"]["features"] += write_rgp(rgps, organism=organism)
+
+ if "modules" in features or "all" in features:
+ proksee_data["cgview"]["features"] += write_modules(modules=module_to_colors, organism=organism, gf2genes=gf2genes)
+
+ logging.debug(f"Write ProkSee for {organism.name}")
+ with open(output_file, "w") as out_json:
+ json.dump(proksee_data, out_json, indent=2)
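+
+# A hedged usage sketch (not a prescribed API): writing a ProkSee JSON map for one
+# organism with all features. `module_colors` is assumed to map Module objects to
+# hex color strings (e.g. produced by a helper such as manage_module_colors in
+# writeFlat); the output path is hypothetical.
+#
+#     write_proksee_organism(organism, Path("out") / f"{organism.name}.json",
+#                            features=["all"], module_to_colors=module_colors,
+#                            rgps=list(pangenome.regions), genome_sequences=None)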
diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py
index 2e075bda..a8a8f3ff 100644
--- a/ppanggolin/geneFamily.py
+++ b/ppanggolin/geneFamily.py
@@ -273,7 +273,7 @@ def number_of_edges(self) -> int:
def number_of_genes(self) -> int:
"""Get the number of genes for the current gene family
"""
- return len(self._genes)
+ return len(self._genes_getter)
@property
def number_of_organisms(self) -> int:
@@ -341,16 +341,16 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
"""
self.bitarray = gmpy2.xmpz() # pylint: disable=no-member
if partition == 'all':
- logging.getLogger("PPanGGOLiN").debug(f"all")
+ logging.getLogger("PPanGGOLiN").debug("all")
for org in self.organisms:
self.bitarray[index[org]] = 1
elif partition in ['shell', 'cloud']:
- logging.getLogger("PPanGGOLiN").debug(f"shell, cloud")
+ logging.getLogger("PPanGGOLiN").debug("shell, cloud")
if self.named_partition == partition:
for org in self.organisms:
self.bitarray[index[org]] = 1
elif partition == 'accessory':
- logging.getLogger("PPanGGOLiN").debug(f"accessory")
+ logging.getLogger("PPanGGOLiN").debug("accessory")
if self.named_partition in ['shell', 'cloud']:
for org in self.organisms:
self.bitarray[index[org]] = 1
diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py
index 205b08dc..d74e8779 100644
--- a/ppanggolin/genome.py
+++ b/ppanggolin/genome.py
@@ -200,7 +200,7 @@ class Gene(Feature):
Fields:
- position: the position of the gene in the genome.
- family: the family that the gene belongs to.
- - RGP: a set of resistance gene profiles associated with the gene.
+ - RGP: A putative Region of Genomic Plasticity that contains the gene.
- genetic_code: the genetic code associated with the gene.
- Protein: the protein sequence corresponding to the translated gene.
"""
@@ -285,7 +285,7 @@ def add_protein(self, protein: str):
self.protein = protein
-class Contig:
+class Contig(MetaFeatures):
"""
Describe the contig content and some information
Methods:
@@ -299,12 +299,14 @@ class Contig:
- RNAs: Set of RNA annotations present in the contig.
"""
- def __init__(self, name: str, is_circular: bool = False):
+ def __init__(self, identifier: int, name: str, is_circular: bool = False):
"""Constructor method
:param name: Name of the contig
:param is_circular: saves if the contig is circular
"""
+ super().__init__()
+ self.ID = identifier
self.name = name
self.is_circular = is_circular
self._rna_getter = set() # Saving the rna annotations. We're not using them in the vast majority of cases.
@@ -331,7 +333,9 @@ def __setitem__(self, start: int, gene: Gene):
if not isinstance(gene, Gene):
raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object")
if start in self._genes_getter:
- raise ValueError(f"Gene with start position {start} already exists in the contig")
+ raise ValueError(f"Gene '{self._genes_getter[start].ID}' with start position {start} already exists in the "
+ f"contig '{self.name}' {f'from organism {self.organism}' if self.organism is not None else ''}, "
+ f"cannot add gene '{gene.ID}' {f'from organism {gene.organism}' if gene.organism is not None else ''}")
if gene.position is None:
raise AttributeError("The gene object needs to have its position in the contig filled before adding it")
# Adding empty values.
@@ -355,7 +359,13 @@ def length(self, contig_len: int):
raise TypeError("Contig length is expected to be an integer")
if contig_len < 0:
raise ValueError("Contig length must be positive")
- self._length = contig_len
+
+ if self._length is None:
+ self._length = contig_len
+ elif self.length != contig_len:
+ logging.getLogger("PPanGGOLiN").debug(f"Known contig length = {self.length}, new length = {contig_len}")
+ raise ValueError('Attempting to define a contig length different from the previously defined value.')
+
def __len__(self):
return self.length
@@ -433,8 +443,10 @@ def remove(self, position):
raise TypeError(f"Position to get gene must be an integer. The provided type was {type(position)}")
del self[position]
- def get_genes(self, begin: int, end: int) -> List[Gene]:
- """Gets a list of genes within a range
+ def get_genes(self, begin: int = 0, end: int = None) -> List[Gene]:
+ """
+ Gets a list of genes within a range.
+ If no arguments are given, it returns all genes.
:param begin: Position of the first gene to retrieve
:param end: Position of the last gene to not retrieve
@@ -444,10 +456,16 @@ def get_genes(self, begin: int, end: int) -> List[Gene]:
:raises TypeError: If begin or end is not an integer
:raises ValueError: If begin position is greater than end positon
"""
+
+ if end is None:
+ end = self.length
+
if not isinstance(begin, int) or not isinstance(end, int):
- raise TypeError(f"Expected type is int, given type was '{type(begin)}, {type(end)}'")
- if end < begin:
- raise ValueError("End position is lower than begin position")
+ raise TypeError(f"Expected type int for 'begin' and 'end', but received types '{type(begin)}' and '{type(end)}'.")
+
+ if begin >= end:
+ raise ValueError("The 'begin' position must be less than the 'end' position.")
+
else:
return self._genes_position[begin: end]
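+
+ # Illustrative usage (arguments are gene positions on the contig, not nucleotide
+ # coordinates; the values are hypothetical):
+ #     first_ten_genes = contig.get_genes(0, 10)
+ #     all_genes = contig.get_genes()  # defaults cover the whole contig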
@@ -642,12 +660,28 @@ def genes(self) -> Generator[Gene, None, None]:
for contig in self.contigs:
yield from contig.genes
+ @property
+ def rna_genes(self) -> Generator[RNA, None, None]:
+ """Generator to get genes in the organism
+
+ :return: Generator of genes
+ """
+ for contig in self.contigs:
+ yield from contig.RNAs
+
def number_of_genes(self) -> int:
""" Get number of genes in the organism
:return: Number of genes
"""
- return sum([contig.number_of_genes for contig in self.contigs])
+ return sum((contig.number_of_genes for contig in self.contigs))
+
+ def number_of_rnas(self) -> int:
+ """ Get number of genes in the organism
+
+ :return: Number of genes
+ """
+ return sum((contig.number_of_rnas for contig in self.contigs))
@property
def contigs(self) -> Generator[Contig, None, None]:
@@ -657,6 +691,16 @@ def contigs(self) -> Generator[Contig, None, None]:
"""
yield from self._contigs_getter.values()
+
+ @property
+ def number_of_contigs(self) -> int:
+ """ Get number of contigs in organism
+
+ :return: Number of contigs in organism
+ """
+ return len(self._contigs_getter)
+
+
def add(self, contig: Contig):
"""Add a contig to organism
@@ -672,6 +716,7 @@ def add(self, contig: Contig):
else:
raise KeyError(f"Contig {contig.name} already in organism {self.name}")
+
def get(self, name: str) -> Contig:
"""
Get contig with the given identifier in the organism
@@ -682,6 +727,7 @@ def get(self, name: str) -> Contig:
"""
return self[name]
+
def remove(self, name: str) -> Contig:
"""
Remove a contig with the given identifier in the organism
@@ -692,6 +738,7 @@ def remove(self, name: str) -> Contig:
"""
del self[name]
+
def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
"""Produces a bitarray representing the presence / absence of families in the organism using the provided index
The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type.
@@ -718,3 +765,4 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
self.bitarray[index[fam]] = 1
else:
raise Exception("There is not any partition corresponding please report a github issue")
+
diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py
index 84033e8d..69557705 100644
--- a/ppanggolin/graph/makeGraph.py
+++ b/ppanggolin/graph/makeGraph.py
@@ -110,10 +110,8 @@ def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0,
pangenome.status["neighborsGraph"] = "Computed"
pangenome.parameters["graph"] = {}
- pangenome.parameters["graph"]["removed_high_copy_number_families"] = False
if remove_copy_number > 0:
- pangenome.parameters["graph"]["removed_high_copy_number_families"] = True
- pangenome.parameters["graph"]["removed_high_copy_number_of_families_above"] = remove_copy_number
+ pangenome.parameters["graph"]["remove_high_copy_number"] = remove_copy_number
def launch(args: argparse.Namespace):
diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py
index 18671733..638281d3 100644
--- a/ppanggolin/info/info.py
+++ b/ppanggolin/info/info.py
@@ -9,7 +9,7 @@
import tables
# local libraries
-from ppanggolin.formats import read_info, read_parameters, fix_partitioned
+from ppanggolin.formats import read_info, read_parameters, fix_partitioned
def read_status(h5f: tables.File):
diff --git a/ppanggolin/main.py b/ppanggolin/main.py
index 2d94fc7b..09002c52 100644
--- a/ppanggolin/main.py
+++ b/ppanggolin/main.py
@@ -13,8 +13,8 @@
# local modules
import ppanggolin.pangenome
from ppanggolin.utils import check_input_files, set_verbosity_level, add_common_arguments, manage_cli_and_config_args
-import ppanggolin.nem.rarefaction
import ppanggolin.nem.partition
+import ppanggolin.nem.rarefaction
import ppanggolin.graph
import ppanggolin.annotate
import ppanggolin.cluster
@@ -65,6 +65,7 @@ def cmd_line() -> argparse.Namespace:
desc += " partition Partition the pangenome graph\n"
desc += " rarefaction Compute the rarefaction curve of the pangenome\n"
desc += " msa Compute Multiple Sequence Alignments for pangenome gene families\n"
+ desc += " projection Annotate an input genome with an existing pangenome\n"
desc += " metadata Add metadata to elements in pangenome\n"
desc += " \n"
desc += " Output:\n"
@@ -132,20 +133,26 @@ def cmd_line() -> argparse.Namespace:
set_verbosity_level(args)
if args.subcommand == "annotate" and args.fasta is None and args.anno is None:
- parser.error("You must provide at least a file with the --fasta option to annotate from sequences, "
- "or a file with the --gff option to load annotations through the command line or the config file.")
+ parser.error("Please provide either a sequence file using the --fasta option or an annotation file using the --anno option "
+ "to enable annotation. Use the command line or the config file.")
cmds_pangenome_required = ["cluster", "info", "module", "graph", "align",
"context", "write", "msa", "draw", "partition",
- "rarefaction", "spot", "fasta", "metrics", "rgp"]
+ "rarefaction", "spot", "fasta", "metrics", "rgp", "projection", "metadata"]
if args.subcommand in cmds_pangenome_required and args.pangenome is None:
- parser.error("You must provide a pangenome file with the --pangenome "
- "argument through the command line or the config file.")
+ parser.error("Please specify a pangenome file using the --pangenome argument, "
+ "either through the command line or the config file.")
- if args.subcommand == "align" and args.sequences is None:
- parser.error("You must provide sequences (nucleotides or amino acids) to align on the pangenome gene families "
- "with the --sequences argument through the command line or the config file.")
+ if args.subcommand == "align" and args.sequences is None:
+ parser.error("Please provide sequences (nucleotides or amino acids) for alignment with the pangenome gene families "
+ "using the --sequences argument, either through the command line or the config file.")
+
+ if args.subcommand == "projection":
+ # check argument correctness and determine input mode (single or multiple files) and add it to args.
+ input_mode = ppanggolin.projection.projection.check_projection_arguments(args, parser)
+ setattr(args, "input_mode", input_mode)
+
return args
@@ -186,6 +193,8 @@ def main():
ppanggolin.metrics.metrics.launch(args)
elif args.subcommand == "align":
ppanggolin.align.launch(args)
+ elif args.subcommand == "projection":
+ ppanggolin.projection.projection.launch(args)
elif args.subcommand == "rgp":
ppanggolin.RGP.genomicIsland.launch(args)
elif args.subcommand == "spot":
diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py
index b07ea8c8..f7506599 100644
--- a/ppanggolin/meta/meta.py
+++ b/ppanggolin/meta/meta.py
@@ -64,8 +64,8 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame:
:return: Dataframe with metadata loaded
"""
- assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]
- colname_check = re.compile('^[a-zA-Z_][a-zA-Z0-9_]*$')
+ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"]
+ colname_check = re.compile(r'^[a-zA-Z_]\w*$')  # \w = [A-Za-z0-9_]
metadata_df = pd.read_csv(metadata, sep="\t", header=0, quoting=csv.QUOTE_NONE,
dtype={metatype: str})
metadata_df.replace(to_replace='-', value=pd.NA, inplace=True)
@@ -95,7 +95,24 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str
:raise KeyError: element name is not find in pangenome
:raise AssertionError: Metatype is not recognized
"""
- assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]
+ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"]
+
+ org2contig = None
+
+ def check_duplicate_contig_name():
+ contig_names = set()
+ for contig in pangenome.contigs:
+ old_len = len(contig_names)
+ contig_names.add(contig.name)
+ if len(contig_names) == old_len:
+ raise Exception("There are 2 contigs with the same name in the pangenome and "
+ "you did not provide the genome linked to contig. "
+ "Add a column 'genomes' to indicate to which genome the contig belongs to.")
+
+ if metatype == "contigs" and "genomes" not in metadata_df.columns:
+ check_duplicate_contig_name()
+ org2contig = {contig.name: contig.organism.name for contig in pangenome.contigs}
+
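+ # Illustrative metadata TSV for metatype "contigs" (tab-separated). The first
+ # column must be named "contigs"; the optional "genomes" column disambiguates
+ # contigs that share a name across genomes; "topology" is a hypothetical
+ # user-defined metadata field:
+ #
+ #     contigs    genomes    topology
+ #     contig_1   genomeA    circular
+ #     contig_1   genomeB    linear
+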
for row in tqdm(metadata_df.iterrows(), unit='row',
total=metadata_df.shape[0], disable=disable_bar):
row = row[1]
@@ -104,6 +121,10 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str
element = pangenome.get_gene_family(row[metatype])
elif metatype == "genomes":
element = pangenome.get_organism(row[metatype])
+ elif metatype == "contigs":
+ org = row["genomes"] if "genomes" in metadata_df.columns else org2contig[row[metatype]]
+ print("pika")
+ element = pangenome.get_contig(name=row[metatype], organism_name=org)
elif metatype == "genes":
element = pangenome.get_gene(row[metatype])
elif metatype == "RGPs":
@@ -119,6 +140,8 @@ def assign_metadata(metadata_df: pd.DataFrame, pangenome: Pangenome, source: str
logging.getLogger().debug(f"{metatype}: {row[metatype]} doesn't exist")
else:
meta = Metadata(source=source, **{k: v for k, v in row.to_dict().items() if k != metatype})
+ if metatype == "contigs":
+ meta.genomes = element.organism.name
element.add_metadata(source=source, metadata=meta)
pangenome.status["metadata"][metatype] = "Computed"
@@ -169,7 +192,7 @@ def parser_meta(parser: argparse.ArgumentParser):
required.add_argument("-s", "--source", required=False, type=str, nargs="?",
help='Name of the metadata source')
required.add_argument("-a", "--assign", required=False, type=str, nargs="?",
- choices=["families", "genomes", "genes", "RGPs", "spots", "modules"],
+ choices=["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"],
help="Select to which pangenome element metadata will be assigned")
optional = parser.add_argument_group(title="Optional arguments")
optional.add_argument("--omit", required=False, action="store_true",
diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py
index 23f2f94b..df3c8c5b 100644
--- a/ppanggolin/mod/module.py
+++ b/ppanggolin/mod/module.py
@@ -129,12 +129,12 @@ def predict_modules(pangenome: Pangenome, dup_margin: float = 0.05, size: int =
logging.getLogger("PPanGGOLiN").info(f"Computing modules took {round(time.time() - start_time, 2)} seconds")
pangenome.status["modules"] = "Computed"
- pangenome.parameters["modules"] = {}
- pangenome.parameters["modules"]["size"] = size
- pangenome.parameters["modules"]["min_presence"] = min_presence
- pangenome.parameters["modules"]["transitive"] = transitive
- pangenome.parameters["modules"]["jaccard"] = jaccard
- pangenome.parameters["modules"]["dup_margin"] = dup_margin
+ pangenome.parameters["module"] = {}
+ pangenome.parameters["module"]["size"] = size
+ pangenome.parameters["module"]["min_presence"] = min_presence
+ pangenome.parameters["module"]["transitive"] = transitive
+ pangenome.parameters["module"]["jaccard"] = jaccard
+ pangenome.parameters["module"]["dup_margin"] = dup_margin
def launch(args: argparse.Namespace):
diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py
index 137f2f27..dfa852a4 100644
--- a/ppanggolin/nem/partition.py
+++ b/ppanggolin/nem/partition.py
@@ -479,20 +479,25 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d
pangenome.parameters["partition"] = {}
pangenome.parameters["partition"]["beta"] = beta
+ pangenome.parameters["partition"]["max_degree_smoothing"] = sm_degree
pangenome.parameters["partition"]["free_dispersion"] = free_dispersion
- pangenome.parameters["partition"]["max_node_degree_for_smoothing"] = sm_degree
+ pangenome.parameters["partition"]["ICL_margin"] = icl_margin
+ pangenome.parameters["partition"]["seed"] = seed
if len(organisms) > chunk_size:
pangenome.parameters["partition"]["chunk_size"] = chunk_size
- pangenome.parameters["partition"]["computed_K"] = False
+ pangenome.parameters["partition"]["# computed nb of partitions"] = False
+ # the K value initally given by the user
+ pangenome.parameters["partition"]["nb_of_partitions"] = kval
if kval < 2:
- pangenome.parameters["partition"]["computed_K"] = True
+ pangenome.parameters["partition"]["# computed nb of partitions"] = True
logging.getLogger("PPanGGOLiN").info("Estimating the optimal number of partitions...")
kval = evaluate_nb_partitions(organisms, output, sm_degree, free_dispersion, chunk_size, kmm,
icl_margin, draw_icl, cpu, seed, tmp_path, disable_bar)
logging.getLogger("PPanGGOLiN").info(f"The number of partitions has been evaluated at {kval}")
- pangenome.parameters["partition"]["K"] = kval
+ pangenome.parameters["partition"]["# final nb of partitions"] = kval
+ pangenome.parameters["partition"]["krange"] = kmm
init = "param_file"
partitioning_results = {}
diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py
index 786ebbe0..af2e33b6 100644
--- a/ppanggolin/nem/rarefaction.py
+++ b/ppanggolin/nem/rarefaction.py
@@ -362,8 +362,8 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No
ppp.pan = pangenome # use the global from partition to store the pangenome, so that it is usable
try:
- krange[0] = ppp.pan.parameters["partition"]["K"] if krange[0] < 0 else krange[0]
- krange[1] = ppp.pan.parameters["partition"]["K"] if krange[1] < 0 else krange[1]
+ krange[0] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[0] < 0 else krange[0]
+ krange[1] = ppp.pan.parameters["partition"]["# final nb of partitions"] if krange[1] < 0 else krange[1]
except KeyError:
krange = [3, 20]
check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar)
@@ -378,7 +378,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No
if kval < 3 and kestimate is False: # estimate K once and for all.
try:
- kval = ppp.pan.parameters["partition"]["K"]
+ kval = ppp.pan.parameters["partition"]["# final nb of partitions"]
logging.getLogger("PPanGGOLiN").info(f"Reuse the number of partitions {kval}")
except KeyError:
logging.getLogger("PPanGGOLiN").info("Estimating the number of partitions...")
diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py
index b5bf7c88..7727193f 100644
--- a/ppanggolin/pangenome.py
+++ b/ppanggolin/pangenome.py
@@ -29,15 +29,15 @@ def __init__(self):
self.file = None
# basic parameters
- self._famGetter = {}
+ self._fam_getter = {}
self._org_index = None
self._fam_index = None
self._max_fam_id = 0
- self._orgGetter = {}
- self._edgeGetter = {}
- self._regionGetter = {}
- self._spotGetter = {}
- self._moduleGetter = {}
+ self._org_getter = {}
+ self._edge_getter = {}
+ self._region_getter = {}
+ self._spot_getter = {}
+ self._module_getter = {}
self.status = {
'genomesAnnotated': "No",
'geneSequences': "No",
@@ -51,12 +51,14 @@ def __init__(self):
'modules': 'No',
"metadata": {"families": 'No',
"genes": 'No',
+ "contigs": 'No',
"genomes": 'No',
"RGPs": 'No',
"spots": 'No',
"modules": 'No'},
"metasources": {"families": [],
"genes": [],
+ "contigs": [],
"genomes": [],
"RGPs": [],
"spots": [],
@@ -101,16 +103,16 @@ def genes(self) -> Generator[Gene, None, None]:
def _mk_gene_getter(self):
"""
- Builds the attribute _geneGetter of the pangenome
+ Builds the attribute _gene_getter of the pangenome
Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig),
the pangenome cannot directly extract a gene from a geneID since it does not 'know' them.
- If at some point we want to extract genes from a pangenome we'll create a geneGetter.
+ If at some point we want to extract genes from a pangenome we'll create a gene_getter.
The assumption behind this is that the pangenome has been filled and no more gene will be added.
"""
- self._geneGetter = {}
+ self._gene_getter = {}
for gene in self.genes:
- self._geneGetter[gene.ID] = gene
+ self._gene_getter[gene.ID] = gene
def get_gene(self, gene_id: str) -> Gene:
"""Returns the gene that has the given gene ID
@@ -125,7 +127,7 @@ def get_gene(self, gene_id: str) -> Gene:
assert isinstance(gene_id, str), "Gene id should be an integer"
try:
- return self._geneGetter[gene_id]
+ return self._gene_getter[gene_id]
except AttributeError:
# in that case, either the gene getter has not been computed, or the geneID is not in the pangenome.
self._mk_gene_getter() # make it
@@ -140,10 +142,10 @@ def number_of_genes(self) -> int:
:return: The number of genes
"""
try:
- return len(self._geneGetter)
+ return len(self._gene_getter)
except AttributeError: # in that case the gene getter has not been computed
self._mk_gene_getter() # make it
- return len(self._geneGetter)
+ return len(self._gene_getter)
"""RNAs methods"""
@property
@@ -185,7 +187,7 @@ def gene_families(self) -> Generator[GeneFamily, None, None]:
:return: Generator of gene families
"""
- for family in self._famGetter.values():
+ for family in self._fam_getter.values():
yield family
@property
@@ -194,7 +196,7 @@ def number_of_gene_families(self) -> int:
:return: The number of gene families
"""
- return len(self._famGetter)
+ return len(self._fam_getter)
def get_gene_family(self, name: str) -> GeneFamily:
"""Returns the gene family that has the given `name`
@@ -208,7 +210,7 @@ def get_gene_family(self, name: str) -> GeneFamily:
"""
assert isinstance(name, str), "Name of gene family should be a string"
try:
- fam = self._famGetter[name]
+ fam = self._fam_getter[name]
except KeyError:
raise KeyError(f"Gene family with name={name} is not in pangenome")
except Exception as error:
@@ -229,7 +231,7 @@ def add_gene_family(self, family: GeneFamily):
try:
_ = self.get_gene_family(family.name)
except KeyError:
- self._famGetter[family.name] = family
+ self._fam_getter[family.name] = family
self.max_fam_id += 1
except Exception as error:
raise Exception(error)
@@ -243,7 +245,7 @@ def edges(self) -> Generator[Edge, None, None]:
:return: Generator of edge
"""
- for edge in self._edgeGetter.values():
+ for edge in self._edge_getter.values():
yield edge
def add_edge(self, gene1: Gene, gene2: Gene) -> Edge:
@@ -265,10 +267,10 @@ def add_edge(self, gene1: Gene, gene2: Gene) -> Edge:
raise AttributeError("Genes are not linked to families. Check that you compute the gene families and post an"
" issue on our GitHub")
key = frozenset([family_1, family_2 ])
- edge = self._edgeGetter.get(key)
+ edge = self._edge_getter.get(key)
if edge is None:
edge = Edge(gene1, gene2)
- self._edgeGetter[key] = edge
+ self._edge_getter[key] = edge
else:
edge.add_genes(gene1, gene2)
return edge
@@ -279,7 +281,7 @@ def number_of_edges(self) -> int:
:return: The number of gene families
"""
- return len(self._edgeGetter)
+ return len(self._edge_getter)
"""Organism methods"""
@property
@@ -288,7 +290,7 @@ def organisms(self) -> Generator[Organism, None, None]:
:return: Generator :class:`ppanggolin.genome.Organism`
"""
- for organism in self._orgGetter.values():
+ for organism in self._org_getter.values():
yield organism
@property
@@ -297,7 +299,7 @@ def number_of_organisms(self) -> int:
:return: The number of organism
"""
- return len(self._orgGetter)
+ return len(self._org_getter)
@property
def contigs(self) -> Generator[Contig, None, None]:
@@ -322,28 +324,45 @@ def _mk_contig_getter(self):
"""
self._contig_getter = {}
for contig in self.contigs:
- self._contig_getter[contig.name] = contig
+ self._contig_getter[contig.ID] = contig
- def get_contig(self, name: str) -> Contig:
- """Returns the contig that has the given name
+ def get_contig(self, identifier: int = None, name: str = None, organism_name: str = None) -> Contig:
+ """Returns the contig by his identifier or by his name. If name is given the organism name is needed
- :param name: The ,ame of the contig to look for
+ :param identifier: ID of the contig to look for
+ :param name: The name of the contig to look for
+ :param organism_name: Name of the organism to which the contig belongs
:return: Returns the wanted contig
:raises AssertionError: If the `gene_id` is not an integer
:raises KeyError: If the `gene_id` is not in the pangenome
"""
- assert isinstance(name, str), "Contig name should be a string"
-
- try:
- return self._contig_getter[name]
- except AttributeError:
- # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome.
- self._mk_contig_getter() # make it
- return self.get_contig(name) # Return what was expected. If geneID does not exist it will raise an error.
- except KeyError:
- raise KeyError(f"Contig: {name}, does not exist in the pangenome.")
+ if identifier is None:
+ if name is None:
+ raise ValueError("Neiher identifier or name of the contig are given.")
+ else:
+ if not isinstance(name, str):
+ raise AssertionError("Contig name should be a string")
+
+ if organism_name is None:
+ raise ValueError("You should provide the name of the organism to which the contig belong")
+ else:
+ if not isinstance(organism_name, str):
+ raise AssertionError("Organism name should be a string")
+ organism = self.get_organism(organism_name)
+ return organism.get(name)
+ else:
+ if not isinstance(identifier, int):
+ raise AssertionError("Contig ID should be an integer")
+ try:
+ return self._contig_getter[identifier]
+ except AttributeError:
+ # in that case, either the contig getter has not been computed, or the contig ID is not in the pangenome.
+ self._mk_contig_getter() # make it
+ return self.get_contig(identifier) # Return what was expected. If the contig ID does not exist, it will raise a KeyError.
+ except KeyError:
+ raise KeyError(f"Contig: {identifier}, does not exist in the pangenome.")
def get_organism(self, name: str) -> Organism:
"""
Get an organism that is expected to be in the pangenome using its name, which is supposedly unique.
@@ -358,7 +377,7 @@ def get_organism(self, name: str) -> Organism:
"""
assert isinstance(name, str), "Organism name should be a string"
try:
- return self._orgGetter[name]
+ return self._org_getter[name]
except KeyError:
raise KeyError(f"{name} does not seem to be in your pangenome")
@@ -378,7 +397,7 @@ def add_organism(self, organism: Organism):
try:
self.get_organism(organism.name)
except KeyError:
- self._orgGetter[organism.name] = organism
+ self._org_getter[organism.name] = organism
else:
raise KeyError(f"Redondant organism name was found ({organism.name})."
f"All of your organisms must have unique names.")
@@ -450,7 +469,7 @@ def regions(self) -> Generator[Region, None, None]:
:return: list of RGP
"""
- for region in self._regionGetter.values():
+ for region in self._region_getter.values():
yield region
def get_region(self, name: str) -> Region:
@@ -466,7 +485,7 @@ def get_region(self, name: str) -> Region:
assert isinstance(name, str), "RGP name should be a string"
try:
- rgp = self._regionGetter[name]
+ rgp = self._region_getter[name]
except KeyError: # then the region is not stored in this pangenome.
raise KeyError(f"There is no RGP with name={name}")
else:
@@ -507,7 +526,7 @@ def add_region(self, region: Region):
try:
self.get_region(region.name)
except KeyError:
- self._regionGetter[region.name] = region
+ self._region_getter[region.name] = region
else:
raise KeyError(f"A RGP with this name ({region.name} already exist in pangenome")
@@ -517,7 +536,7 @@ def number_of_rgp(self) -> int:
:return: The number of gene families
"""
- return len(self._regionGetter)
+ return len(self._region_getter)
"""Spot methods"""
@property
@@ -525,7 +544,7 @@ def spots(self) -> Generator[Spot, None, None]:
"""Generate spots in the pangenome
:return: Spot generator"""
- yield from self._spotGetter.values()
+ yield from self._spot_getter.values()
def get_spot(self, spot_id: Union[int, str]) -> Spot:
# TODO Change for only str or only int
@@ -549,7 +568,7 @@ def get_spot(self, spot_id: Union[int, str]) -> Spot:
raise ValueError(f"The provided spot ID '{spot_id}' does not have the expected format."
"It should be an integer or in the format 'spot_'.")
try:
- spot = self._spotGetter[spot_id]
+ spot = self._spot_getter[spot_id]
except KeyError:
raise KeyError(f"Spot {spot_id} does not exist in the pangenome.")
else:
@@ -567,7 +586,7 @@ def add_spot(self, spot: Spot):
try:
self.get_spot(spot.ID)
except KeyError:
- self._spotGetter[spot.ID] = spot
+ self._spot_getter[spot.ID] = spot
except Exception as error:
raise Exception(error)
else:
@@ -579,14 +598,14 @@ def number_of_spots(self) -> int:
:return: The number of gene families
"""
- return len(self._spotGetter)
+ return len(self._spot_getter)
"""Modules methods"""
@property
def modules(self) -> Generator[Module, None, None]:
"""Generate modules in the pangenome
"""
- yield from self._moduleGetter.values()
+ yield from self._module_getter.values()
def get_module(self, module_id: Union[int, str]) -> Module:
# TODO Change for only str or only int
@@ -612,7 +631,7 @@ def get_module(self, module_id: Union[int, str]) -> Module:
"It should be an integer or in the format 'module_'.")
try:
- module = self._moduleGetter[module_id]
+ module = self._module_getter[module_id]
except KeyError:
raise KeyError(f"Module {module_id} does not exist in the pangenome.")
else:
@@ -630,7 +649,7 @@ def add_module(self, module: Module):
try:
self.get_module(module.ID)
except KeyError:
- self._moduleGetter[module.ID] = module
+ self._module_getter[module.ID] = module
except Exception as error:
raise Exception(error)
else:
@@ -661,7 +680,7 @@ def number_of_modules(self) -> int:
:return: The number of modules
"""
- return len(self._moduleGetter)
+ return len(self._module_getter)
"""Metadata"""
def select_elem(self, metatype: str):
@@ -680,6 +699,8 @@ def select_elem(self, metatype: str):
return self.gene_families
elif metatype == "genomes":
return self.organisms
+ elif metatype == "contigs":
+ return self.contigs
elif metatype == "genes":
return self.genes
elif metatype == "RGPs":
@@ -737,7 +758,7 @@ def get_elem_by_sources(self, source: List[str], metatype: str) -> Generator[
:return: Gene families with the source
"""
- assert metatype in ["families", "genomes", "genes", "RGPs", "spots", "modules"]
+ assert metatype in ["families", "genomes", "contigs", "genes", "RGPs", "spots", "modules"]
for elem in self.select_elem(metatype):
if elem.get_metadata_by_source(source) is not None:
yield elem
diff --git a/ppanggolin/projection/__init__.py b/ppanggolin/projection/__init__.py
new file mode 100644
index 00000000..56bb37d6
--- /dev/null
+++ b/ppanggolin/projection/__init__.py
@@ -0,0 +1 @@
+from .projection import subparser, launch
\ No newline at end of file
diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py
new file mode 100644
index 00000000..8b0d6161
--- /dev/null
+++ b/ppanggolin/projection/projection.py
@@ -0,0 +1,1384 @@
+#!/usr/bin/env python3
+# coding:utf-8
+
+# default libraries
+import argparse
+from concurrent.futures import ProcessPoolExecutor
+from multiprocessing import get_context, Value
+import logging
+import os
+import time
+from pathlib import Path
+import tempfile
+from typing import Tuple, Set, Dict, Optional, List, Iterable, Any
+from collections import defaultdict
+import csv
+from itertools import chain
+
+
+# installed libraries
+from tqdm import tqdm
+import networkx as nx
+import yaml
+import pandas as pd
+
+
+# # local libraries
+from ppanggolin.annotate.synta import read_fasta, get_dna_sequence
+from ppanggolin.annotate.annotate import init_contig_counter, read_anno_file, annotate_organism, local_identifiers_are_unique
+from ppanggolin.annotate import subparser as annotate_subparser
+from ppanggolin.pangenome import Pangenome
+from ppanggolin.utils import detect_filetype, create_tmpdir, read_compressed_or_not, write_compressed_or_not, \
+ restricted_float, mk_outdir, get_config_args, parse_config_file, get_default_args, \
+ check_input_files, parse_input_paths_file
+from ppanggolin.align.alignOnPang import write_gene_to_gene_family, get_input_seq_to_family_with_rep,get_input_seq_to_family_with_all, project_and_write_partition
+from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations
+from ppanggolin.formats.readBinaries import check_pangenome_info
+from ppanggolin.RGP.genomicIsland import naming_scheme, compute_org_rgp
+from ppanggolin.RGP.spot import make_spot_graph, check_sim, add_new_node_in_spot_graph, write_spot_graph
+from ppanggolin.genome import Organism
+from ppanggolin.geneFamily import GeneFamily
+from ppanggolin.region import Region, Spot, Module
+from ppanggolin.formats.writeFlat import summarize_spots, write_proksee_organism, manage_module_colors, write_gff_file
+from ppanggolin.formats.writeSequences import read_genome_file
+
+class NewSpot(Spot):
+ """
+ This class represents a hotspot specifically
+ created for the projected genome.
+ """
+
+ def __str__(self):
+ return f'new_spot_{self.ID}'
+
+def check_pangenome_for_projection(pangenome: Pangenome, fast_aln: bool):
+ """
+ Check the status of a pangenome and determine whether projection is possible.
+
+ :param pangenome: The pangenome to be checked.
+ :param fast_aln: Whether to use the fast alignment option for gene projection.
+
+ This function checks various attributes of a pangenome to determine whether it is suitable for projecting
+ features into a provided genome.
+
+ Returns:
+ A tuple indicating whether RGP prediction, spot projection, and module projection
+ are possible (True) or not (False) based on the pangenome's status.
+
+ Raises:
+ NameError: If the pangenome has not been partitioned.
+ Exception: If the pangenome lacks gene sequences or gene family sequences, and fast alignment is not enabled.
+ """
+
+ project_modules = True
+ predict_rgp = True
+ project_spots = True
+
+
+ if pangenome.status["partitioned"] not in ["Computed", "Loaded", "inFile"]:
+ raise NameError("The provided pangenome has not been partitioned. "
+ "Annotation of an external genome is therefore not possible. "
+ "See the 'partition' subcommands.")
+
+ if pangenome.status["predictedRGP"] not in ["Computed", "Loaded", "inFile"]:
+ logging.getLogger('PPanGGOLiN').info("RGPs have not been predicted in the provided pangenome. "
+ "Projection of RGPs and spots into the provided genome will not be performed.")
+ predict_rgp = False
+ project_spots = False
+
+ elif pangenome.status["spots"] not in ["Computed", "Loaded", "inFile"]:
+ logging.getLogger('PPanGGOLiN').info("Spots have not been predicted in the provided pangenome. "
+ "Projection of spots into the provided genome will not be performed.")
+ project_spots = False
+
+ if pangenome.status["modules"] not in ["Computed", "Loaded", "inFile"]:
+ logging.getLogger('PPanGGOLiN').info("Modules have not been predicted in the provided pangenome. "
+ "Projection of modules into the provided genome will not be performed.")
+
+ project_modules = False
+
+ if pangenome.status["geneSequences"] not in ["Loaded", "Computed", "inFile"] and not fast_aln:
+ raise Exception("The provided pangenome has no gene sequences. "
+ "Projection is still possible with the --fast option to use representative "
+ "sequences rather than all genes to annotate input genes.")
+
+ if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"]:
+ raise Exception("The provided pangenome has no gene families sequences. "
+ "This is not possible to annotate an input organism to this pangenome.")
+
+ return predict_rgp, project_spots, project_modules
+
+
+def manage_input_genomes_annotation(pangenome, input_mode, anno, fasta,
+ organism_name, circular_contigs, pangenome_params,
+ cpu, use_pseudo, disable_bar, tmpdir, config):
+ """
+ """
+ genome_name_to_path = None
+
+ if input_mode == "multiple":
+ if anno:
+ input_type = "annotation"
+ genome_name_to_path = parse_input_paths_file(anno)
+
+ elif fasta:
+ input_type = "fasta"
+ genome_name_to_path = parse_input_paths_file(fasta)
+
+    else:  # args.input_mode == "single"
+
+ circular_contigs = circular_contigs if circular_contigs else []
+ if anno:
+ input_type = "annotation"
+ genome_name_to_path = {organism_name: {"path": anno,
+ "circular_contigs": circular_contigs}}
+
+ elif fasta:
+ input_type = "fasta"
+ genome_name_to_path = {organism_name: {"path": fasta,
+ "circular_contigs": circular_contigs}}
+
+ if input_type == "annotation":
+ check_input_names(pangenome, genome_name_to_path)
+
+ organisms, org_2_has_fasta = read_annotation_files(genome_name_to_path, cpu=cpu, pseudo=use_pseudo,
+ disable_bar=disable_bar)
+
+ if not all((has_fasta for has_fasta in org_2_has_fasta.values())):
+ organisms_with_no_fasta = {org for org, has_fasta in org_2_has_fasta.items() if not has_fasta}
+        if fasta:
+            # Use the FASTA paths (not the annotation paths) to retrieve the missing sequences.
+            if input_mode == "multiple":
+                genome_name_to_fasta_path = parse_input_paths_file(fasta)
+            else:
+                genome_name_to_fasta_path = {organism_name: {"path": fasta,
+                                                             "circular_contigs": circular_contigs}}
+            get_gene_sequences_from_fasta_files(organisms_with_no_fasta, genome_name_to_fasta_path)
+
+ else:
+ raise ValueError(f"You provided GFF files for {len(organisms_with_no_fasta)} (out of {len(organisms)}) "
+ "organisms without associated sequence data, and you did not provide "
+ "FASTA sequences using the --fasta or --single_fasta_file options. Therefore, it is impossible to project the pangenome onto the input genomes. "
+ f"The following organisms have no associated sequence data: {', '.join(o.name for o in organisms_with_no_fasta)}")
+
+ elif input_type == "fasta":
+ annotate_param_names = ["norna", "kingdom",
+ "allow_overlap", "prodigal_procedure"]
+
+ annotate_params = manage_annotate_param(annotate_param_names, pangenome_params.annotate, config)
+
+
+ check_input_names(pangenome, genome_name_to_path)
+ organisms = annotate_fasta_files(genome_name_to_fasta_path=genome_name_to_path, tmpdir=tmpdir, cpu=cpu,
+ translation_table=int(pangenome_params.cluster.translation_table), norna=annotate_params.norna, kingdom=annotate_params.kingdom,
+ allow_overlap=annotate_params.allow_overlap, procedure=annotate_params.prodigal_procedure, disable_bar=disable_bar)
+ return organisms, genome_name_to_path, input_type
+
+
+def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], input_org_2_rgps: Dict[Organism, Set[Region]],
+                             input_org_to_spots: Dict[Organism, Set[Spot]],
+                             input_orgs_to_modules: Dict[Organism, Set[Module]],
+                             input_org_to_lonely_genes_count: Dict[Organism, int],
+                             write_proksee: bool, write_gff: bool, add_sequences: bool,
+                             genome_name_to_path: Dict[str, dict], input_type: str,
+                             output_dir: Path, dup_margin: float):
+ """
+    Write the results of the projection of the pangenome onto the input genomes.
+
+ :param pangenome: The pangenome onto which the projection is performed.
+ :param organisms: A set of input organisms for projection.
+ :param input_org_2_rgps: A dictionary mapping input organisms to sets of regions of genomic plasticity (RGPs).
+ :param input_org_to_spots: A dictionary mapping input organisms to sets of spots.
+ :param input_orgs_to_modules: A dictionary mapping input organisms to sets of modules.
+ :param input_org_to_lonely_genes_count: A dictionary mapping input organisms to the count of lonely genes.
+ :param write_proksee: Whether to write ProkSee JSON files.
+ :param write_gff: Whether to write GFF files.
+ :param add_sequences: Whether to add sequences to the output files.
+ :param genome_name_to_path: A dictionary mapping genome names to file paths.
+ :param input_type: The type of input data (e.g., "annotation").
+ :param output_dir: The directory where the output files will be written.
+ :param dup_margin: The duplication margin used to compute completeness.
+
+ Note:
+ - If `write_proksee` is True and input organisms have modules, module colors for ProkSee are obtained.
+ - The function calls other functions such as `summarize_projection`, `read_genome_file`, `write_proksee_organism`,
+ `write_gff_file`, and `write_summaries` to generate various output files and summaries.
+ """
+
+ if write_proksee and input_orgs_to_modules:
+ # get module color for proksee
+ module_to_colors = manage_module_colors(set(pangenome.modules))
+
+ single_copy_families = get_single_copy_families(pangenome, dup_margin)
+
+ organism_2_summary = {}
+
+ for organism in organisms:
+
+ org_outdir = output_dir / organism.name
+
+ # summarize projection for all input organisms
+ organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_families,
+ input_org_2_rgps.get(organism, None),
+ input_org_to_spots.get(organism, None),
+ input_orgs_to_modules.get(organism, None),
+ input_org_to_lonely_genes_count[organism])
+
+ if (write_proksee or write_gff) and add_sequences:
+ genome_sequences = read_genome_file(genome_name_to_path[organism.name]['path'], organism)
+ else:
+ genome_sequences = None
+
+ if write_proksee:
+ org_module_to_color = {org_mod: module_to_colors[org_mod] for org_mod in input_orgs_to_modules.get(organism, [])}
+
+ output_file = output_dir / organism.name / f"{organism.name}_proksee.json"
+
+
+ write_proksee_organism(organism, output_file, features='all', module_to_colors=org_module_to_color,
+ rgps=input_org_2_rgps.get(organism, None),
+ genome_sequences=genome_sequences)
+
+
+ if write_gff:
+ if input_type == "annotation": # if the genome has not been annotated by PPanGGOLiN
+ annotation_sources = {"rRNA": "external",
+ "tRNA": "external",
+ "CDS":"external"}
+ else:
+ annotation_sources = {}
+
+ contig_to_rgp, rgp_to_spot_id = {}, {}
+
+ if organism in input_org_2_rgps:
+ contig_to_rgp = defaultdict(list)
+ for rgp in input_org_2_rgps[organism]:
+ contig_to_rgp[rgp.contig].append(rgp)
+
+ if organism in input_org_to_spots:
+ rgp_to_spot_id = {rgp:f"spot_{spot.ID}" for spot in input_org_to_spots[organism] for rgp in spot.regions if rgp in input_org_2_rgps[organism] }
+
+
+ write_gff_file(organism, contig_to_rgp, rgp_to_spot_id, outdir=org_outdir, compress=False,
+ annotation_sources=annotation_sources, genome_sequences=genome_sequences)
+
+
+ write_summaries(organism_2_summary, output_dir)
+
+
+def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11,
+ kingdom: str = "bacteria", norna: bool = False, allow_overlap: bool = False, procedure: str = None,
+ disable_bar: bool = False):
+ """
+    Annotate the input genomes from FASTA files.
+
+    :param genome_name_to_fasta_path: Mapping of genome names to their FASTA path and circular contigs
+    :param tmpdir: Path to temporary directory
+    :param cpu: number of CPU cores to use
+    :param translation_table: Translation table (genetic code) to use.
+    :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation.
+    :param norna: Use to avoid annotating RNA features.
+    :param allow_overlap: Use to not remove genes overlapping with RNA features
+    :param procedure: prodigal procedure used
+    :param disable_bar: Disable the progress bar
+ """
+
+ organisms = []
+    arguments = []  # Arguments passed to annotate_organism, in the same order as its prototype
+ for org_name, org_info in genome_name_to_fasta_path.items():
+
+ arguments.append((org_name, org_info['path'], org_info['circular_contigs'], tmpdir, translation_table,
+ norna, kingdom, allow_overlap, procedure))
+
+ logging.getLogger("PPanGGOLiN").info(f"Annotating {len(arguments)} genomes using {cpu} cpus...")
+ contig_counter = Value('i', 0)
+ with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu,
+ initializer=init_contig_counter, initargs=(contig_counter,)) as executor:
+ with tqdm(total=len(arguments), unit="file", disable=disable_bar) as progress:
+ futures = []
+
+ for fn_args in arguments:
+ future = executor.submit(annotate_organism, *fn_args)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ for future in futures:
+ organisms.append(future.result())
+
+ return organisms
+
+
+def read_annotation_files(genome_name_to_annot_path: Dict[str,dict], cpu: int = 1, pseudo: bool = False,
+ disable_bar: bool = False) -> Tuple[List[Organism], Dict[Organism,bool]]:
+ """
+    Read the annotations from GBFF or GFF files.
+
+    :param genome_name_to_annot_path: Mapping of genome names to their annotation path and circular contigs
+    :param cpu: number of CPU cores to use
+    :param pseudo: allow reading pseudogenes
+    :param disable_bar: Disable the progress bar
+ """
+
+ args = []
+ organisms = []
+
+ # we assume there are gene sequences in the annotation files,
+ # unless a gff file without fasta is met (which is the only case where sequences can be absent)
+ org_to_has_fasta_flag = {}
+
+ args = [(org_name, org_info['path'], org_info['circular_contigs'], pseudo)
+ for org_name, org_info in genome_name_to_annot_path.items()]
+
+ contig_counter = Value('i', 0)
+ with ProcessPoolExecutor(mp_context=get_context('fork'), max_workers=cpu,
+ initializer=init_contig_counter, initargs=(contig_counter,)) as executor:
+ with tqdm(total=len(args), unit="file", disable=disable_bar) as progress:
+ futures = []
+
+ for fn_args in args:
+ future = executor.submit(read_anno_file, *fn_args)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ for future in futures:
+ org, has_fasta = future.result()
+ organisms.append(org)
+ org_to_has_fasta_flag[org] = has_fasta
+
+    # Use a list rather than a generator: the genes are iterated twice,
+    # once to check identifier uniqueness and once to rename them.
+    genes = [gene for org in organisms for gene in org.genes]
+
+ if local_identifiers_are_unique(genes):
+ for gene in genes:
+ gene.ID = gene.local_identifier # Erase ppanggolin generated gene ids and replace with local identifiers
+ gene.local_identifier = "" # this is now useless, setting it to default value
+
+ logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were unique, "
+ "PPanGGOLiN will use them.")
+ else:
+ logging.getLogger("PPanGGOLiN").info("Gene identifiers used in the provided annotation files were not unique, "
+ "PPanGGOLiN will use self-generated identifiers.")
+ return organisms, org_to_has_fasta_flag
+
+
+def get_gene_sequences_from_fasta_files(organisms, genome_name_to_annot_path):
+ """
+    Get gene sequences from the FASTA files associated with each organism.
+
+    :param organisms: organisms whose genes need sequences
+    :param genome_name_to_annot_path: mapping of genome names to their FASTA path
+ """
+
+ org_names = {org.name for org in organisms}
+
+ if org_names & set(genome_name_to_annot_path) != org_names:
+        missing = org_names - set(genome_name_to_annot_path)
+        raise ValueError(f"You did not provide FASTA files for all of the organisms found in the annotation files. "
+                         f"{len(missing)} are missing (out of {len(organisms)}). Missing organisms: {', '.join(missing)}")
+
+ for org in organisms:
+
+ org_fasta_file = genome_name_to_annot_path[org.name]['path']
+
+ with read_compressed_or_not(org_fasta_file) as currFastaFile:
+ org_contig_to_seq, _ = read_fasta(org, currFastaFile)
+
+ for contig in org.contigs:
+ try:
+ contig_seq = org_contig_to_seq[contig.name]
+ except KeyError:
+ msg = f"Fasta file for organism {org.name} did not have the contig {contig.name} " \
+ f"that was read from the annotation file. "
+ msg += f"The provided contigs in the fasta were : " \
+ f"{', '.join([contig for contig in org_contig_to_seq])}."
+ raise KeyError(msg)
+
+ for gene in contig.genes:
+ gene.add_sequence(get_dna_sequence(contig_seq, gene))
+
+ for rna in contig.RNAs:
+ rna.add_sequence(get_dna_sequence(contig_seq, rna))
+
+
+def check_input_names(pangenome, input_names):
+ """
+ Check if input organism names already exist in the pangenome.
+
+ :param pangenome: The pangenome object.
+ :param input_names: List of input organism names to check.
+ :raises NameError: If duplicate organism names are found in the pangenome.
+ """
+ duplicated_names = set(input_names) & {org.name for org in pangenome.organisms}
+ if len(duplicated_names) != 0:
+ raise NameError(f"{len(duplicated_names)} provided organism names already exist in the given pangenome: {' '.join(duplicated_names)}")
+
+
+
+def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_dir: Path):
+ """
+ Write summary information to YAML files and create a summary projection in TSV format.
+
+ This function takes a dictionary where keys are input organisms and values are dictionaries containing summary
+ information. It writes this information to YAML files for each organism and creates a summary projection in TSV format.
+
+ :param organism_2_summary: A dictionary where keys are input organisms and values are dictionaries containing
+ summary information.
+ :param output_dir: The directory where the summary files will be written.
+ """
+ flat_summaries = []
+
+ for input_organism, summary_info in organism_2_summary.items():
+ yaml_string = yaml.dump(summary_info, default_flow_style=False, sort_keys=False, indent=4)
+
+ with open(output_dir / input_organism.name / "projection_summary.yaml", 'w') as flout:
+ flout.write('Projection_summary:')
+ flout.write(yaml_string)
+
+ flat_summary = {}
+ for key, val in summary_info.items():
+ if isinstance(val, dict):
+ for nest_k, nest_v in val.items():
+ flat_summary[f"{key} {nest_k}"] = nest_v
+ else:
+ flat_summary[key] = val
+
+ flat_summaries.append(flat_summary)
+
+ df_summary = pd.DataFrame(flat_summaries)
+
+ df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False)
+
+def get_single_copy_families(pangenome: Pangenome, dup_margin:float):
+ """
+ Get single copy families
+
+ :param pangenome: The pangenome onto which the projection is performed.
+ :param dup_margin: The duplication margin used to compute single copy families.
+
+    :return: The set of persistent gene families considered single copy.
+    """
+
+ # TODO make this single_copy_fams a method of class Pangenome that should be used in write --stats
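+    # Worked example: with dup_margin = 0.05, a persistent family present in 100
+    # genomes and multicopy (fragments excluded) in 4 of them has a duplication
+    # ratio of 4/100 = 0.04 < 0.05, so it is counted as single copy.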
+ single_copy_families = set()
+
+ for fam in pangenome.gene_families:
+ if fam.named_partition == "persistent":
+ dup = len([genes for genes in fam.get_org_dict().values() if
+ len([gene for gene in genes if not gene.is_fragment]) > 1])
+
+ if (dup / fam.number_of_organisms) < dup_margin:
+ single_copy_families.add(fam)
+
+ return single_copy_families
+
+
+def summarize_projection(input_organism: Organism, pangenome: Pangenome, single_copy_families: Set[GeneFamily],
+                         input_org_rgps: Optional[Set[Region]], input_org_spots: Optional[Set[Spot]],
+                         input_org_modules: Optional[Set[Module]], singleton_gene_count: int):
+ """
+ Summarize the projection of an input organism onto a pangenome.
+
+    :param input_organism: The input organism for projection.
+    :param pangenome: The pangenome onto which the input organism is projected.
+    :param single_copy_families: Single copy persistent families, used to compute completeness.
+    :param input_org_rgps: The regions of genomic plasticity (RGPs) in the input organism.
+    :param input_org_spots: The spots in the input organism.
+    :param input_org_modules: The modules in the input organism.
+    :param singleton_gene_count: Number of genes that do not cluster with any gene families in the pangenome.
+
+ Returns:
+ A dictionary containing summary information about the projection, including organism details,
+ gene and family counts, completeness, and counts of RGPs, spots, new spots, and modules.
+
+ """
+
+ partition_to_gene = defaultdict(set)
+ contigs_count = 0
+ for contig in input_organism.contigs:
+ contigs_count += 1
+ for gene in contig.genes:
+ partition_to_gene[gene.family.named_partition].add(gene)
+
+ persistent_gene_count = len(partition_to_gene['persistent'])
+ shell_gene_count = len(partition_to_gene['shell'])
+ cloud_gene_count = len(partition_to_gene['cloud'])
+
+ completeness = "NA"
+
+ single_copy_markers_count = len(set(input_organism.families) & single_copy_families)
+ if len(single_copy_families) > 0:
+ completeness = round((single_copy_markers_count /
+ len(single_copy_families)) * 100, 2)
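+    # For instance, if 950 of the pangenome's 1000 single-copy persistent markers
+    # are found in the input genome, completeness is 950/1000 * 100 = 95.0.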
+
+ gene_count = persistent_gene_count + shell_gene_count + cloud_gene_count
+
+ persistent_family_count = len({g.family for g in partition_to_gene['persistent']})
+ shell_family_count = len({g.family for g in partition_to_gene['shell']})
+ cloud_family_count = len({g.family for g in partition_to_gene['cloud']})
+
+ families_count = persistent_family_count + shell_family_count + cloud_family_count
+
+ rgp_count = "Not computed" if input_org_rgps is None else len(input_org_rgps)
+ spot_count = "Not computed" if input_org_spots is None else len(input_org_spots)
+ new_spot_count = "Not computed" if input_org_spots is None else sum(1 for spot in input_org_spots if isinstance(spot, NewSpot))
+ module_count = "Not computed" if input_org_modules is None else len(input_org_modules)
+
+ summary_info = {
+ "Organism name": input_organism.name,
+ "Pangenome file": pangenome.file,
+ "Contigs": contigs_count,
+ "Genes": gene_count,
+ "Families": families_count,
+ "Persistent": {"genes":persistent_gene_count, "families":persistent_family_count},
+ "Shell": {"genes":shell_gene_count, "families":shell_family_count},
+ "Cloud": {"genes":cloud_gene_count, "families":cloud_family_count - singleton_gene_count, "specific families":singleton_gene_count},
+ "Completeness":completeness,
+ "RGPs": rgp_count,
+ "Spots": spot_count,
+ "New spots": new_spot_count,
+ "Modules": module_count
+ }
+ return summary_info
+
+
+def annotate_input_genes_with_pangenome_families(pangenome: Pangenome, input_organisms: Iterable[Organism], output: Path,
+ cpu: int,use_representatives:bool, no_defrag: bool,
+ identity: float, coverage: float, tmpdir: Path,
+ translation_table: int, keep_tmp:bool = False, disable_bar: bool =False):
+ """
+ Annotate input genes with pangenome gene families by associating them to a cluster.
+
+ :param pangenome: Pangenome object.
+ :param input_organisms: Iterable of input organism objects.
+ :param output: Output directory for generated files.
+ :param cpu: Number of CPU cores to use.
+ :param no_defrag: Whether to use defragmentation.
+    :param use_representatives: Use representative sequences of gene families rather than all sequences to align input genes
+ :param identity: Minimum identity threshold for gene clustering.
+ :param coverage: Minimum coverage threshold for gene clustering.
+ :param tmpdir: Temporary directory for intermediate files.
+ :param translation_table: Translation table ID for nucleotide sequences.
+ :param keep_tmp: If True, keep temporary files.
+ :param disable_bar: Whether to disable progress bar.
+
+ :return: Number of genes that do not cluster with any of the gene families of the pangenome.
+ """
+ seq_fasta_files = []
+
+    logging.getLogger('PPanGGOLiN').info('Writing gene sequences of input genomes.')
+
+ for input_organism in input_organisms:
+
+ seq_outdir = output / input_organism.name
+ mk_outdir(seq_outdir, force=True)
+
+ seq_fasta_file = seq_outdir / "cds_sequences.fasta"
+
+ with open(seq_fasta_file, "w") as fh_out_faa:
+ write_gene_sequences_from_annotations(input_organism.genes, fh_out_faa, disable_bar=True, add="ppanggolin_")
+
+ seq_fasta_files.append(seq_fasta_file)
+
+ with create_tmpdir(main_dir=tmpdir, basename="align_input_seq_tmpdir", keep_tmp=keep_tmp) as new_tmpdir:
+
+ if use_representatives:
+ _, seqid_to_gene_family = get_input_seq_to_family_with_rep(pangenome, seq_fasta_files, output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True,
+ cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage, translation_table=translation_table)
+ else:
+ _, seqid_to_gene_family = get_input_seq_to_family_with_all(pangenome=pangenome, sequence_files=seq_fasta_files,
+ output=new_tmpdir, tmpdir=new_tmpdir, is_input_seq_nt=True,
+ cpu=cpu, no_defrag=no_defrag, identity=identity, coverage=coverage,
+ translation_table=translation_table, disable_bar=disable_bar)
+ input_org_to_lonely_genes_count = {}
+ for input_organism in input_organisms:
+
+ org_outdir = output / input_organism.name
+
+ seq_set = {gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in input_organism.genes}
+
+ project_and_write_partition(seqid_to_gene_family, seq_set, org_outdir)
+
+ write_gene_to_gene_family(seqid_to_gene_family, seq_set, org_outdir)
+
+ lonely_genes = set()
+ for gene in input_organism.genes:
+ gene_id = gene.ID if gene.local_identifier == "" else gene.local_identifier
+
+ try:
+ gene_family = seqid_to_gene_family[gene_id]
+ gene_family.add(gene)
+ except KeyError:
+                # The seqid is not in the dict, so it does not align with any pangenome family.
+                # We consider it a cloud gene.
+ try:
+                    # In some cases, a gene family with the same name as the gene ID already exists,
+                    # so the gene ID cannot be used directly as the new family name.
+ _ = pangenome.get_gene_family(gene_id)
+ except KeyError:
+ new_gene_family = GeneFamily(pangenome.max_fam_id, gene_id)
+
+ else:
+ # gene id already exists.
+ new_name=f"{input_organism.name}_{gene_id}"
+                    logging.getLogger('PPanGGOLiN').warning('The input organism has a specific gene that does not align to any '
+                                                            f'pangenome family but shares its ID ({gene_id}) with an existing gene family of the pangenome. '
+                                                            f'The organism name is therefore prepended to the new family name: {new_name}')
+ new_gene_family = GeneFamily(pangenome.max_fam_id, new_name)
+
+ pangenome.add_gene_family(new_gene_family)
+ new_gene_family.add(gene)
+ new_gene_family.partition = "Cloud"
+ lonely_genes.add(gene)
+
+ logging.getLogger('PPanGGOLiN').info(f"{input_organism.name} has {len(lonely_genes)}/{input_organism.number_of_genes()} "
+ "specific genes that do not align to any gene of the pangenome.")
+ # Write specific gene ids in a file
+ with open(org_outdir / "specific_genes.tsv", "w") as fl:
+ fl.write('\n'.join((gene.ID if gene.local_identifier == "" else gene.local_identifier for gene in lonely_genes)) + '\n')
+
+ input_org_to_lonely_genes_count[input_organism] = len(lonely_genes)
+
+ return input_org_to_lonely_genes_count
+
+
+def predict_RGP(pangenome: Pangenome, input_organisms: Iterable[Organism], persistent_penalty: int, variable_gain: int,
+ min_length: int, min_score: int, multigenics: float,
+ output_dir:Path, disable_bar: bool) -> Dict[Organism, Set[Region]]:
+ """
+ Compute Regions of Genomic Plasticity (RGP) for the given input organisms.
+
+ :param pangenome: The pangenome object.
+    :param input_organisms: The input organisms for which to compute RGPs.
+ :param persistent_penalty: Penalty score to apply to persistent genes.
+ :param variable_gain: Gain score to apply to variable genes.
+ :param min_length: Minimum length (bp) of a region to be considered as RGP.
+ :param min_score: Minimal score required for considering a region as RGP.
+ :param multigenics: multigenic families.
+ :param output_dir: Output directory where predicted rgps are going to be written.
+ :param disable_bar: Flag to disable the progress bar.
+
+ :return: Dictionary mapping organism with the set of predicted regions
+ """
+
+ logging.getLogger('PPanGGOLiN').info("Computing Regions of Genomic Plasticity...")
+
+ name_scheme = naming_scheme(chain(pangenome.organisms, input_organisms))
+ organism_to_rgps = {}
+
+ for input_organism in input_organisms:
+ rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length,
+ min_score, naming=name_scheme, disable_bar=disable_bar)
+
+ logging.getLogger('PPanGGOLiN').info(f"{len(rgps)} RGPs have been predicted in the input genomes.")
+
+
+ org_outdir = output_dir / input_organism.name
+
+ write_predicted_regions(rgps, output=org_outdir, compress=False)
+ organism_to_rgps[input_organism] = rgps
+
+ return organism_to_rgps
+
+
+def write_predicted_regions(regions: Set[Region],
+ output: Path, compress: bool = False):
+ """
+ Write the file providing information about predicted regions.
+
+ :param regions: Set of Region objects representing predicted regions.
+ :param output: Path to the output directory.
+ :param compress: Whether to compress the file in .gz format.
+ """
+ fname = output / "plastic_regions.tsv"
+ with write_compressed_or_not(fname, compress) as tab:
+ fieldnames = ["region", "organism", "contig", "start",
+ "stop", "genes", "contigBorder", "wholeContig"]
+
+ writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t')
+ writer.writeheader()
+
+ regions = sorted(regions, key=lambda x: (
+ x.organism.name, x.contig.name, x.ID))
+ for region in regions:
+ row = {
+ "region": region.name,
+ "organism": region.organism,
+ "contig": region.contig,
+ "start": region.starter,
+ "stop": region.stopper,
+ "genes": len(region),
+ "contigBorder": region.is_contig_border,
+ "wholeContig": region.is_whole_contig
+ }
+
+ writer.writerow(row)
+
+
+def write_rgp_to_spot_table(rgp_to_spots: Dict[Region, Set[str]], output: Path, filename: str, compress: bool = False):
+ """
+ Write a table mapping RGPs to corresponding spot IDs.
+
+ :param rgp_to_spots: A dictionary mapping RGPs to spot IDs.
+ :param output: Path to the output directory.
+ :param filename: Name of the file to write.
+ :param compress: Whether to compress the file.
+ """
+ fname = output / filename
+ logging.getLogger('PPanGGOLiN').debug(
+ f'Writing RGPs to spot table in {fname}')
+
+ with write_compressed_or_not(fname, compress) as tab:
+ fieldnames = ["region", "spot_id"]
+
+ writer = csv.DictWriter(tab, fieldnames=fieldnames, delimiter='\t')
+ writer.writeheader()
+
+ regions = sorted(rgp_to_spots.keys(), key=lambda x: (
+ x.organism.name, x.contig.name, x.ID))
+ for region in regions:
+ row = {
+ "region": region.name,
+ "spot_id": ';'.join(map(str, rgp_to_spots[region]))
+ }
+
+ writer.writerow(row)
+
+
+def retrieve_gene_sequences_from_fasta_file(input_organism, fasta_file):
+ """
+    Get gene sequences from a FASTA file.
+
+    :param input_organism: input organism whose genes need sequences
+    :param fasta_file: FASTA file containing the genome sequence
+ """
+
+ with read_compressed_or_not(fasta_file) as currFastaFile:
+        contig_id2seq, _ = read_fasta(input_organism, currFastaFile)
+
+ for contig in input_organism.contigs:
+ try:
+ for gene in contig.genes:
+ gene.add_dna(get_dna_sequence(
+                    contig_id2seq[contig.name], gene))
+
+ for rna in contig.RNAs:
+                rna.add_dna(get_dna_sequence(contig_id2seq[contig.name], rna))
+ except KeyError:
+ msg = f"Fasta file for input_organism {input_organism.name} did not have the contig {contig.name} " \
+ f"that was read from the annotation file. "
+ msg += f"The provided contigs in the fasta were : " \
+ f"{', '.join([contig for contig in contig_id2deq.keys()])}."
+ raise KeyError(msg)
+
+
+def manage_annotate_param(annotate_param_names: List[str], pangenome_args: argparse.Namespace,
+ config_file: Optional[str]) -> argparse.Namespace:
+ """
+ Manage annotate parameters by collecting them from different sources and merging them.
+
+ :param annotate_param_names: List of annotate parameter names to be managed.
+ :param pangenome_args: Annotate arguments parsed from pangenomes parameters.
+ :param config_file: Path to the config file, can be None if not provided.
+
+ :return: An argparse.Namespace containing the merged annotate parameters with their values.
+ """
+
+ default_annotate_args = get_default_args('annotate', annotate_subparser)
+
+ if config_file is None:
+ config_annotate_args = argparse.Namespace()
+ else:
+ config = defaultdict(dict, parse_config_file(config_file))
+ config_annotate_args = get_config_args(
+ 'annotate', annotate_subparser, config, "annotate", annotate_param_names, strict_config_check=False)
+
+ annotate_param_from_pangenome = {}
+ annotate_param_from_config = {}
+ annotate_param_from_default = {}
+
+ annotate_params = argparse.Namespace()
+
+ # Collecting annotate parameters from different sources
+ # if they are found in pangenome param they are used
+ # elif they are found in config they are used
+ # else use the default value.
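+    # For example, if the pangenome was built with '--kingdom archaea', that stored
+    # value takes precedence over both the config file and the annotate defaults.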
+ for annotate_arg in annotate_param_names:
+ if hasattr(pangenome_args, annotate_arg):
+ param_val = getattr(pangenome_args, annotate_arg)
+ annotate_param_from_pangenome[annotate_arg] = param_val
+ setattr(annotate_params, annotate_arg, param_val)
+
+ elif hasattr(config_annotate_args, annotate_arg):
+ param_val = getattr(config_annotate_args, annotate_arg)
+ annotate_param_from_config[annotate_arg] = param_val
+ setattr(annotate_params, annotate_arg, param_val)
+
+ else:
+ param_val = getattr(default_annotate_args, annotate_arg)
+ annotate_param_from_default[annotate_arg] = param_val
+ setattr(annotate_params, annotate_arg, param_val)
+
+ # Log the sources of the annotate parameters
+ if len(annotate_param_from_pangenome) > 0:
+ param_val_string = ' '.join(
+ [f'--{k} {v}' for k, v in annotate_param_from_pangenome.items()])
+ logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_pangenome)}/{len(annotate_param_names)} annotate parameters extracted from pangenome parameters "
+ f"(the parameters used to build the input pangenome): {param_val_string}")
+
+ if len(annotate_param_from_config) > 0:
+ param_val_string = ';'.join(
+ [f' {k} : {v}' for k, v in annotate_param_from_config.items()])
+ logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_config)}/{len(annotate_param_names)} annotate parameters were not found in pangenome internal parameters."
+ f" They have been parsed from the annotate section in the config file: {param_val_string}")
+
+ if len(annotate_param_from_default) > 0:
+ param_val_string = ';'.join(
+ [f' {k} : {v}' for k, v in annotate_param_from_default.items()])
+ logging.getLogger("PPanGGOLiN").debug(f"{len(annotate_param_from_default)}/{len(annotate_param_names)} annotate parameters were not found in the pangenome parameters "
+ f"nor in the config file. Default values have been used: {param_val_string}")
+
+ return annotate_params
+
+
+def check_spots_congruency(graph_spot: nx.Graph, spots: List[Spot]) -> None:
+ """
+ Check congruency of spots in the spot graph with the original spots.
+
+ :param graph_spot: The spot graph containing the connected components representing the spots.
+ :param spots: List of original spots in the pangenome.
+ :return: None.
+ """
+ rgp_to_spot = {region: spot for spot in spots for region in spot.regions}
+
+ for cc in nx.algorithms.components.connected_components(graph_spot):
+ # one connected component is a spot
+ regions_in_cc = set()
+ for node in cc:
+ regions_in_cc |= graph_spot.nodes[node]["rgp"]
+
+ # check that region in cc are the regions of a spot
+ spot_in_cc = {rgp_to_spot[rgp] for rgp in regions_in_cc}
+ assert len(
+ spot_in_cc) == 1, "More than one spot in a connected_components. Something went wrong when recomputing spots."
+ current_spot = spot_in_cc.pop()
+ # Add spot id to the graph
+ for node in cc:
+ graph_spot.nodes[node]["spot_id"] = str(current_spot)
+ graph_spot.nodes[node]["spots"] = {current_spot}
+
+
+
+def predict_spots_in_input_organisms(
+ initial_spots: List[Spot],
+ initial_regions: List[Region],
+ input_org_2_rgps: Dict[Organism, Set[Region]],
+ multigenics: Set[GeneFamily],
+ output: Path,
+ write_graph_flag: bool = False,
+ graph_formats: List[str] = ['gexf'],
+ overlapping_match: int = 2,
+ set_size: int = 3,
+ exact_match: int = 1 ) -> Dict[Organism, Set[Spot]]:
+ """
+ Create a spot graph from pangenome RGP and predict spots for input organism RGPs.
+
+ :param initial_spots: List of original spots in the pangenome.
+ :param initial_regions: List of original regions in the pangenome.
+ :param input_org_2_rgps: Dictionary mapping input organisms to their RGPs.
+ :param multigenics: Set of pangenome graph multigenic persistent families.
+ :param output: Output directory to save the spot graph.
+ :param write_graph_flag: If True, writes the spot graph in the specified formats. Default is False.
+ :param graph_formats: List of graph formats to write (default is ['gexf']).
+ :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. Default is 2.
+ :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. Default is 3.
+ :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. Default is 1.
+
+ :return: A dictionary mapping input organism RGPs to their predicted spots.
+ """
+
+ logging.getLogger("PPanGGOLiN").debug("Rebuilding original spot graph.")
+ graph_spot = make_spot_graph(rgps=initial_regions, multigenics=multigenics,
+ overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match)
+
+ original_nodes = set(graph_spot.nodes)
+
+ # Check congruency with already computed spot and add spot id in node attributes
+ check_spots_congruency(graph_spot, initial_spots)
+
+ new_spot_id_counter = max((s.ID for s in initial_spots)) + 1
+
+ input_org_to_spots = {}
+ for input_organism, rgps in input_org_2_rgps.items():
+
+ if len(rgps) == 0:
+ logging.getLogger('PPanGGOLiN').debug(f"{input_organism.name}: No RGPs have been found. "
+ "As a result, spot prediction and RGP output will be skipped.")
+
+ input_org_to_spots[input_organism] = set()
+ continue
+
+ outdir_org = output / input_organism.name
+        # Copy the spot graph, as each input organism is processed independently
+ graph_spot_cp = graph_spot.copy()
+
+ input_org_spots = predict_spot_in_one_organism(graph_spot_cp, input_org_rgps=rgps, original_nodes=original_nodes,
+ new_spot_id_counter=new_spot_id_counter, multigenics=multigenics, organism_name=input_organism.name,
+ output=outdir_org, write_graph_flag=write_graph_flag, graph_formats=graph_formats,
+ overlapping_match=overlapping_match, set_size=set_size, exact_match=exact_match)
+
+        if input_org_spots:
+            new_spot_id_counter = max(s.ID for s in input_org_spots) + 1
+
+ input_org_to_spots[input_organism] = input_org_spots
+
+ return input_org_to_spots
+
+def predict_spot_in_one_organism(
+ graph_spot: nx.Graph,
+ input_org_rgps: List[Region],
+ original_nodes: Set[int],
+ new_spot_id_counter: int,
+ multigenics: Set[GeneFamily],
+ organism_name: str,
+ output: Path,
+ write_graph_flag: bool = False,
+ graph_formats: List[str] = ['gexf'],
+ overlapping_match: int = 2,
+ set_size: int = 3,
+ exact_match: int = 1 ) -> Set[Spot]:
+ """
+ Predict spots for input organism RGPs.
+
+ :param graph_spot: The spot graph from the pangenome.
+ :param input_org_rgps: List of RGPs from the input organism to be associated with spots.
+ :param original_nodes: Set of original nodes in the spot graph.
+ :param new_spot_id_counter: Counter for new spot IDs.
+ :param multigenics: Set of pangenome graph multigenic persistent families.
+ :param organism_name: Name of the input organism.
+ :param output: Output directory to save the spot graph.
+ :param write_graph_flag: If True, writes the spot graph in the specified formats. Default is False.
+ :param graph_formats: List of graph formats to write (default is ['gexf']).
+ :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes. Default is 2.
+ :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation. Default is 3.
+ :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs. Default is 1.
+
+ Returns:
+ Set[Spot]: The predicted spots for the input organism RGPs.
+ """
+ # Check which input RGP has a spot
+ lost = 0
+ used = 0
+
+ input_org_node_to_rgps = defaultdict(set)
+
+ for rgp in input_org_rgps:
+ border = rgp.get_bordering_genes(set_size, multigenics)
+ if len(border[0]) < set_size or len(border[1]) < set_size:
+ lost += 1
+ else:
+ used += 1
+ border_node = add_new_node_in_spot_graph(graph_spot, rgp, border)
+ input_org_node_to_rgps[border_node].add(rgp)
+
+ if len(input_org_node_to_rgps) == 0:
+ logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: no RGPs of the input organism will be associated with any spot of insertion "
+ "as they are on a contig border (or have "
+ f"less than {set_size} persistent gene families until the contig border). "
+ "Projection of spots stops here")
+ return set()
+
+ # remove node that were already in the graph
+ new_nodes = set(input_org_node_to_rgps) - original_nodes
+
+ logging.getLogger("PPanGGOLiN").debug(f"{organism_name}: {lost} RGPs were not used as they are on a contig border (or have"
+ f"less than {set_size} persistent gene families until the contig border)")
+
+ logging.getLogger("PPanGGOLiN").debug(
+ f"{organism_name}: {used} RGPs of the input organism will be associated to a spot of insertion")
+
+ # add potential edges from new nodes to the rest of the nodes
+ all_nodes = list(graph_spot.nodes)
+ for nodei in new_nodes:
+ for nodej in all_nodes:
+ if nodei == nodej:
+ continue
+ node_obj_i = graph_spot.nodes[nodei]
+ node_obj_j = graph_spot.nodes[nodej]
+ if check_sim([node_obj_i["border0"], node_obj_i["border1"]],
+ [node_obj_j["border0"], node_obj_j["border1"]],
+ overlapping_match, set_size, exact_match):
+ graph_spot.add_edge(nodei, nodej)
+
+ input_rgp_to_spots = {}
+ new_spots = []
+
+ # determine spot ids of the new nodes and by extension to their rgps
+ for comp in nx.algorithms.components.connected_components(graph_spot):
+        # In very rare cases one cc can have several original spots:
+        # that would mean new nodes from the input organism have connected two old ccs.
+        # In this case we report all of these spots in the output.
+ spots_of_the_cc = set()
+ for node in comp:
+ if "spots" in graph_spot.nodes[node]:
+ spots_of_the_cc |= {
+ spot for spot in graph_spot.nodes[node]["spots"]}
+
+ if len(spots_of_the_cc) == 0:
+ # no spot associated with any node of the cc
+ # that means this cc is only composed of new nodes
+ # let's add a new spot id
+ new_spot = NewSpot(new_spot_id_counter)
+ new_spots.append(new_spot)
+            spots_of_the_cc = {new_spot}
+ new_spot_id_counter += 1
+
+ elif len(spots_of_the_cc) > 1:
+ # more than one spot in the cc
+ logging.getLogger("PPanGGOLiN").debug(f'{organism_name}: Some RGPs of the input organism '
+ f"are connected to {len(spots_of_the_cc)} original spots of the pangenome.")
+
+ input_rgps_of_the_cc = set()
+ for node in comp:
+ if node in input_org_node_to_rgps:
+ input_rgps_of_the_cc |= input_org_node_to_rgps[node]
+
+ if write_graph_flag:
+ graph_spot.nodes[node]["spots"] = spots_of_the_cc
+
+ graph_spot.nodes[node]["spot_id"] = ';'.join(
+ (str(spot) for spot in spots_of_the_cc))
+ graph_spot.nodes[node]["includes_RGPs_from_the_input_organism"] = True
+
+ for spot in spots_of_the_cc:
+ for region in input_rgps_of_the_cc:
+ spot.add(region)
+
+ input_rgp_to_spots.update(
+ {rgp: spots_of_the_cc for rgp in input_rgps_of_the_cc})
+
+ if write_graph_flag:
+        # remove the 'spots' node attribute, as a set of objects cannot be written to graph files
+ for node in graph_spot.nodes:
+ del graph_spot.nodes[node]["spots"]
+
+ write_spot_graph(graph_spot, output, graph_formats,
+ file_basename='projected_spotGraph')
+
+ write_rgp_to_spot_table(input_rgp_to_spots, output=output,
+ filename='input_organism_rgp_to_spot.tsv')
+
+ input_org_spots = {spot for spots in input_rgp_to_spots.values()
+ for spot in spots }
+ new_spots = {spot for spot in input_org_spots if isinstance(spot, NewSpot)}
+
+
+ logging.getLogger('PPanGGOLiN').debug(
+ f'{organism_name}: {len(new_spots)} new spots have been created for the input genome.')
+
+ if new_spots:
+ summarize_spots(new_spots, output, compress=False,
+ file_name="new_spots_summary.tsv")
+
+ return input_org_spots
+
+def project_and_write_modules(pangenome: Pangenome, input_organisms: Iterable[Organism],
+ output: Path, compress: bool = False):
+ """
+ Write a tsv file providing association between modules and the input organism
+
+ :param pangenome: Pangenome object
+    :param input_organisms: iterable of the organisms being annotated
+ :param output: Path to output directory
+ :param compress: Compress the file in .gz
+ """
+ input_orgs_to_modules = {}
+ for input_organism in input_organisms:
+ output_file = output / input_organism.name / "modules_in_input_organism.tsv"
+
+ input_organism_families = list(input_organism.families)
+ counter = 0
+ modules_in_input_org = []
+ with write_compressed_or_not(output_file, compress) as fout:
+ fout.write("module_id\torganism\tcompletion\n")
+
+ for mod in pangenome.modules:
+ module_in_input_organism = any(
+ (fam in input_organism_families for fam in mod.families))
+
+ if module_in_input_organism:
+ counter += 1
+ modules_in_input_org.append(mod)
+
+ completion = round(
+ len(set(input_organism.families) & set(mod.families)) / len(set(mod.families)), 2)
+ fout.write(
+ f"module_{mod.ID}\t{input_organism.name}\t{completion}\n")
+
+ logging.getLogger('PPanGGOLiN').debug(
+ f"{input_organism.name}: {counter} modules have been projected to the input genomes.")
+
+ logging.getLogger('PPanGGOLiN').debug(
+ f"{input_organism.name}: Projected modules have been written in: '{output_file}'")
+
+ input_orgs_to_modules[input_organism] = modules_in_input_org
+
+ return input_orgs_to_modules
+
+
+def infer_input_mode(input_file: Path, expected_types: List[str], parser: argparse.ArgumentParser) -> str:
+ """
+ Determine the input mode based on the provided input file and expected file types.
+
+ :param input_file: A Path object representing the input file.
+ :param expected_types: A list of expected file types (e.g., ['fasta', 'gff', 'gbff', 'tsv']).
+
+ :return: A string indicating the input mode ('single' or 'multiple').
+ """
+ if not input_file.exists():
+ parser.error(f"The provided file {input_file} does not exist.")
+
+ try:
+ filetype = detect_filetype(input_file)
+ except Exception:
+ parser.error("Based on its content, the provided file is not recognized as a valid input file. Please ensure it is in one of the supported formats (FASTA, GFF/GBFF, or TSV).")
+
+ if filetype == "tsv":
+ logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a TSV file.")
+ mode = "multiple"
+ elif filetype in expected_types:
+ logging.getLogger('PPanGGOLiN').debug(f"The provided file ({input_file}) is detected as a single {'/'.join(expected_types)} file.")
+ mode = "single"
+ else:
+ logging.getLogger('PPanGGOLiN').error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and {'/'.join(expected_types)} files of genomes to annotate.")
+ parser.error(f"The provided file {input_file} is not recognized as a valid {'/'.join(expected_types)} file or a TSV file listing names and files of genomes to annotate.")
+
+ return mode
+
+
+def check_projection_arguments(args: argparse.Namespace, parser: argparse.ArgumentParser ) -> str:
+ """
+ Check the arguments provided for genome projection and raise errors if they are incompatible or missing.
+
+ :param args: An argparse.Namespace object containing parsed command-line arguments.
+ :param parser : parser of the command
+ :return: A string indicating the input mode ('single' or 'multiple').
+ """
+
+ # Check if we annotate genomes from path files or only a single genome...
+ if not args.anno and not args.fasta:
+ parser.error("Please provide either a FASTA file or a tab-separated file listing sequence files using the '--fasta' option, "
+ "or an annotation file or a tab-separated file listing annotation files using the '--anno' option. "
+ "You can specify these either through the command line or the configuration file.")
+
+ mode_from_fasta, mode_from_anno = None, None
+ if args.fasta:
+ mode_from_fasta = infer_input_mode(args.fasta, ['fasta'], parser)
+ input_mode = mode_from_fasta
+
+ if args.anno:
+ mode_from_anno = infer_input_mode(args.anno, ['gff', "gbff"], parser)
+ input_mode = mode_from_anno
+
+ if mode_from_fasta and mode_from_anno and mode_from_fasta != mode_from_anno:
+ single_input, multiple_input = ("fasta", "anno") if mode_from_fasta == "single" else ("anno", "fasta")
+
+ parser.error(f"You've provided both a single annotation/fasta file using the '--{single_input}' option and a list of files using "
+ f"the '--{multiple_input}' option. Please choose either a single file or a tab-separated file listing genome files, but not both.")
+
+
+ if input_mode == "multiple":
+ # We are in paths file mode
+
+ if args.circular_contigs:
+ parser.error("You provided a TSV file listing the files of genomes you wish to annotate. "
+ "Therefore, the argument '--circular_contigs' is incompatible with this multiple genomes file.")
+
+ if args.fasta:
+ check_input_files(args.fasta, True)
+
+ if args.anno:
+ check_input_files(args.anno, True)
+
+ return input_mode
+
+
+
+
+def launch(args: argparse.Namespace):
+ """
+ Command launcher
+
+    :param args: All arguments provided by the user
+ """
+
+ output_dir = Path(args.output)
+ mk_outdir(output_dir, args.force)
+
+ # For the moment these elements of the pangenome are predicted by default
+
+ pangenome = Pangenome()
+ pangenome.add_file(args.pangenome)
+
+ predict_rgp, project_spots, project_modules = check_pangenome_for_projection(pangenome, args.fast)
+
+ check_pangenome_info(pangenome, need_annotations=True, need_families=True, disable_bar=args.disable_prog_bar,
+ need_rgp=predict_rgp, need_modules=project_modules, need_gene_sequences=False,
+ need_spots=project_spots)
+
+ logging.getLogger('PPanGGOLiN').info('Retrieving parameters from the provided pangenome file.')
+ pangenome_params = argparse.Namespace(
+ **{step: argparse.Namespace(**k_v) for step, k_v in pangenome.parameters.items()})
+
+ organisms, genome_name_to_path, input_type = manage_input_genomes_annotation(pangenome=pangenome,
+ input_mode=args.input_mode,
+ anno=args.anno, fasta=args.fasta,
+ organism_name=args.organism_name,
+ circular_contigs=args.circular_contigs,
+ pangenome_params=pangenome_params,
+ cpu=args.cpu, use_pseudo=args.use_pseudo,
+ disable_bar=args.disable_prog_bar,
+                                                                               tmpdir=args.tmpdir, config=args.config)
+
+ input_org_to_lonely_genes_count = annotate_input_genes_with_pangenome_families(pangenome, input_organisms=organisms,
+ output=output_dir, cpu=args.cpu, use_representatives=args.fast,
+ no_defrag=args.no_defrag, identity=args.identity,
+ coverage=args.coverage, tmpdir=args.tmpdir,
+ translation_table=int(pangenome_params.cluster.translation_table),
+ keep_tmp=args.keep_tmp,
+ disable_bar=args.disable_prog_bar)
+
+
+ input_org_2_rgps, input_org_to_spots, input_orgs_to_modules = {}, {}, {}
+
+ if predict_rgp:
+
+ logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.')
+
+ multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin)
+
+ input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain,
+ min_length=pangenome_params.rgp.min_length, min_score=pangenome_params.rgp.min_score, multigenics=multigenics, output_dir=output_dir,
+ disable_bar=args.disable_prog_bar)
+
+ if project_spots:
+ logging.getLogger('PPanGGOLiN').info('Predicting spot of insertion in input genomes.')
+ input_org_to_spots = predict_spots_in_input_organisms(initial_spots=list(pangenome.spots),
+ initial_regions=pangenome.regions,
+ input_org_2_rgps=input_org_2_rgps,
+ multigenics=multigenics,
+ output=output_dir,
+ write_graph_flag=args.spot_graph,
+ graph_formats=args.graph_formats,
+ overlapping_match=pangenome_params.spot.overlapping_match,
+ set_size=pangenome_params.spot.set_size,
+ exact_match=pangenome_params.spot.exact_match_size)
+
+ if project_modules:
+ input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir)
+
+ write_projection_results(pangenome, organisms, input_org_2_rgps,
+ input_org_to_spots,
+ input_orgs_to_modules,
+ input_org_to_lonely_genes_count,
+ write_proksee=args.proksee, write_gff=args.gff, add_sequences=args.add_sequences,
+ genome_name_to_path=genome_name_to_path, input_type=input_type,
+ output_dir=output_dir, dup_margin=args.dup_margin)
+
+
+def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
+ """
+ Subparser to launch PPanGGOLiN in Command line
+
+ :param sub_parser : sub_parser for projection command
+
+ :return : parser arguments for projection command
+ """
+ parser = sub_parser.add_parser(
+ "projection", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser_projection(parser)
+ return parser
+
+
+def parser_projection(parser: argparse.ArgumentParser):
+ """
+ Parser for specific argument of projection command
+
+ :param parser: parser for projection argument
+ """
+ required = parser.add_argument_group(title="Required arguments")
+
+ required.add_argument('-p', '--pangenome', required=False,
+ type=Path, help="The pangenome.h5 file")
+
+ required.add_argument('--fasta', required=False, type=Path,
+ help="Specify a FASTA file containing the genomic sequences of the organism(s) you wish to annotate, "
+ "or provide a tab-separated file listing organism names alongside their respective FASTA filepaths, with one line per organism.")
+
+ required.add_argument('--anno', required=False, type=Path,
+ help="Specify an annotation file in GFF/GBFF format for the genome you wish to annotate. "
+ "Alternatively, you can provide a tab-separated file listing organism names alongside their respective annotation filepaths, "
+ "with one line per organism. If both an annotation file and a FASTA file are provided, the annotation file will take precedence.")
+
+ required_single = parser.add_argument_group(title="Single Genome Arguments",
+ description="Use these options when providing a single FASTA or annotation file:")
+
+ required_single.add_argument("-n", '--organism_name', required=False, type=str, default="input_genome",
+ help="Specify the name of the organism whose genome you want to annotate when providing a single FASTA or annotation file.")
+
+ required_single.add_argument('--circular_contigs', nargs="+", required=False, type=tuple,
+ help="Specify the contigs of the input genome that should be treated as circular when providing a single FASTA or annotation file.")
+
+
+ optional = parser.add_argument_group(title="Optional arguments")
+
+ optional.add_argument('-o', '--output', required=False, type=Path,
+ default="ppanggolin_projection" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S",
+ time.localtime()) + "_PID" + str(os.getpid()),
+ help="Output directory")
+
+ optional.add_argument('--no_defrag', required=False, action="store_true",
+ help="DO NOT Realign gene families to link fragments with "
+ "their non-fragmented gene family. (default: False)")
+
+ optional.add_argument("--fast", required=False, action="store_true",
+ help="Use representative sequences of gene families for input gene alignment. "
+ "This option is faster but may be less sensitive. By default, all pangenome genes are used.")
+
+ optional.add_argument('--identity', required=False, type=restricted_float, default=0.8,
+ help="min identity percentage threshold")
+
+ optional.add_argument('--coverage', required=False, type=restricted_float, default=0.8,
+ help="min coverage percentage threshold")
+
+ optional.add_argument("--use_pseudo", required=False, action="store_true",
+ help="In the context of provided annotation, use this option to read pseudogenes. "
+ "(Default behavior is to ignore them)")
+
+ optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05,
+ help="minimum ratio of organisms in which the family must have multiple genes "
+ "for it to be considered 'duplicated'. "
+ "This metric is used to compute completeness and duplication of the input genomes")
+
+ optional.add_argument("--spot_graph", required=False, action="store_true",
+ help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs "
+ "as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.")
+
+ optional.add_argument('--graph_formats', required=False, type=str, choices=['gexf', "graphml"], nargs="+",
+ default=['gexf'], help="Format of the output graph.")
+
+ optional.add_argument("--gff", required=False, action="store_true",
+ help="Generate GFF files with projected pangenome annotations for each input organism.")
+
+ optional.add_argument("--proksee", required=False, action="store_true",
+ help="Generate JSON map files for PROKSEE with projected pangenome annotations for each input organism.")
+
+ optional.add_argument("--add_sequences", required=False, action="store_true",
+ help="Include input genome DNA sequences in GFF and Proksee output.")
+
+ optional.add_argument("-c", "--cpu", required=False,
+ default=1, type=int, help="Number of available cpus")
+
+ optional.add_argument("--tmpdir", required=False, type=Path, default=Path(tempfile.gettempdir()),
+ help="directory for storing temporary files")
+
+ optional.add_argument("--keep_tmp", required=False, default=False, action="store_true",
+ help="Keeping temporary files (useful for debugging).")
diff --git a/ppanggolin/region.py b/ppanggolin/region.py
index 83dc1479..a2abcc3c 100644
--- a/ppanggolin/region.py
+++ b/ppanggolin/region.py
@@ -6,6 +6,7 @@
import logging
# installed libraries
+import networkx as nx
from typing import Dict, Generator, List, Set
import gmpy2
@@ -50,13 +51,15 @@ def __init__(self, name: str):
self.starter = None
self.stopper = None
self.ID = Region.id_counter
+ self.spot = None
Region.id_counter += 1
def __str__(self):
return self.name
def __repr__(self) -> str:
- """Region representation
+ """
+ Region representation
"""
return f"RGP name:{self.name}"
@@ -132,6 +135,18 @@ def __getitem__(self, position: int) -> Gene:
except KeyError:
raise KeyError(f"There is no gene at position {position} in RGP {self.name}")
+    def add_spot(self, spot: "Spot"):
+ """Sets the spot of the RGP
+
+ :param spot: spot to which the RGP is added
+
+ :raise TypeError: if the given spot is not a Spot.
+ """
+        if isinstance(spot, Spot):
+            self.spot = spot  # only one spot is possible per RGP
+        else:
+            raise TypeError(f"Unexpected type {type(spot)} when adding a spot to an RGP")
+
def __delitem__(self, position):
"""Remove the gene at the given position
@@ -228,6 +243,25 @@ def contig(self) -> Contig:
:return: Contig corresponding to the region
"""
return self.starter.contig
+
+ @property
+ def start(self) -> int:
+ """
+        Get the start position of the RGP, i.e. the start of its first gene (the starter).
+
+ :return: start position in the contig of the first gene of the RGP
+ """
+ return self.starter.start
+
+ @property
+ def stop(self) -> int:
+ """
+        Get the stop position of the RGP, i.e. the stop of its last gene (the stopper).
+
+        :return: stop position in the contig of the last gene of the RGP
+ """
+ return self.stopper.stop
+
@property
def is_whole_contig(self) -> bool:
@@ -702,6 +736,18 @@ def families(self) -> Generator[GeneFamily, None, None]:
:return: Families belonging to the module
"""
yield from self._families_getter.values()
+
+ @property
+ def organisms(self) -> Generator[Organism, None, None]:
+ """Returns all the Organisms that have this module
+
+ :return: Organisms that have this module
+ """
+ organisms = set()
+ for fam in self.families:
+ organisms |= set(fam.organisms)
+ yield from organisms
+
def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
"""Produces a bitarray representing the presence / absence of families in the organism using the provided index
@@ -736,24 +782,47 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
class GeneContext:
"""
- The GeneContext class represents a gene context, which is a collection of gene families related to a specific genomic context.
+ Represents a gene context: a collection of gene families related to a specific genomic context.
Methods
- families: Generator that yields all the gene families in the gene context.
+ - families: Generator that yields all the gene families in the gene context.
+ - add_family: Add a gene family to the gene context.
Fields
- ID: The identifier of the gene context.
+ - ID: The identifier of the gene context, set from the gc_id argument.
+ - graph: context graph corresponding to the gene context
"""
- def __init__(self, gc_id: int, families: set = None):
+
+ def __init__(self, gc_id: int, families: Set[GeneFamily] = None, families_of_interest: Set[GeneFamily] = None):
"""Constructor method
- :param gc_id : Identifier of the Gene context
- :param families: Gene families related to the GeneContext
+
+ :param gc_id: Identifier of the gene context.
+ :param families: Gene families related to the gene context.
+ :param families_of_interest: Input families for which the context is being searched.
"""
+
if not isinstance(gc_id, int):
raise TypeError(f"Gene context identifier must be an integer. Given type is {type(gc_id)}")
+
self.ID = gc_id
- self._families_getter = {family.name: family for family in families} if families is not None else {}
+ self._families_getter = {}
+ self.families_of_interest = families_of_interest
+ self._graph = None
+ if families is not None:
+ if not all(isinstance(fam, GeneFamily) for fam in families):
+ raise Exception("You provided elements that were not GeneFamily objects. "
+ "GeneContexts are only made of GeneFamily objects.")
+ self._families_getter = {family.name: family for family in families}
+
+ def __len__(self) -> int:
+ """
+ Get the number of gene families in the gene context.
+
+ :return: number of families in the gene context
+ """
+ return len(self._families_getter)
+
def __repr__(self) -> str:
"""Context representation
"""
@@ -829,6 +898,24 @@ def __delitem__(self, name):
except KeyError:
raise KeyError(f"There isn't gene family with the name {name} in the gene context")
+ @property
+ def graph(self) -> nx.Graph:
+ """Context graph corresponding to the gene context.
+
+ :raises ValueError: if no graph has been set
+ """
+ if self._graph is None:
+ raise ValueError("Graph has not been added to the context")
+ return self._graph
+
+ @graph.setter
+ def graph(self, graph: nx.Graph):
+ """
+ Add a context graph to the gene context.
+
+ :param graph: The context graph.
+ """
+ if not isinstance(graph, nx.Graph):
+ logging.getLogger("PPanGGOLiN").debug(f"given type: {type(graph)}")
+ raise TypeError("Context graph must be a networkx graph object.")
+ self._graph = graph
+
@property
def families(self) -> Generator[GeneFamily, None, None]:
"""Generator of the family in the context
@@ -836,3 +923,14 @@ def families(self) -> Generator[GeneFamily, None, None]:
:return: Gene families belonging to the context
"""
yield from self._families_getter.values()
+
+ def add_family(self, family: GeneFamily):
+ """
+ Add a gene family to the gene context.
+
+ :param family: The gene family to add.
+ """
+ if not isinstance(family, GeneFamily):
+ raise Exception("You did not provide a GeneFamily object. "
+ "GeneContexts are only made of GeneFamily objects.")
+ self[family.name] = family
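+
+# A minimal usage sketch (illustrative only; the family objects below are hypothetical):
+#
+# fam_a, fam_b = GeneFamily(1, "famA"), GeneFamily(2, "famB")
+# context = GeneContext(gc_id=0, families={fam_a}, families_of_interest={fam_a})
+# context.add_family(fam_b)
+# g = nx.Graph(); g.add_edge(fam_a, fam_b)
+# context.graph = g # must be a networkx Graph, else TypeError
+# assert len(context) == 2 # the context now holds two families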
diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py
index bbb43ac0..accea93a 100644
--- a/ppanggolin/utility/utils.py
+++ b/ppanggolin/utility/utils.py
@@ -150,6 +150,9 @@ def launch_default_config(args: argparse.Namespace):
# it is clearer if the order of the subcommand is conserved in wf config file
commands = [initial_command] + [sub_cmd for sub_cmd in ALL_WORKFLOW_DEPENDENCIES if
sub_cmd in workflow_dependencies]
+ elif initial_command == "projection":
+ commands = [initial_command] + ['annotate']
+
else:
commands = [initial_command]
@@ -211,7 +214,6 @@ def launch_default_config(args: argparse.Namespace):
arg_lines.append(f"\n{sub_command}:")
arg_lines += get_default_argument_lines(specific_actions)
- mk_outdir(args.output.parent, True) # Everytime it is True because the config file is already tested
logging.getLogger("PPanGGOLiN").info(f'Writting default config in {args.output}')
with open(args.output, 'w') as fl:
fl.write('\n'.join(arg_lines) + '\n')
diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py
index 41f7bf26..e6bc20cb 100755
--- a/ppanggolin/utils.py
+++ b/ppanggolin/utils.py
@@ -9,7 +9,11 @@
import argparse
from io import TextIOWrapper
from pathlib import Path
-from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable
+from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable, Dict
+from contextlib import contextmanager
+import tempfile
+import time
+from itertools import zip_longest
import networkx as nx
from importlib.metadata import distribution
@@ -24,11 +28,12 @@
from ppanggolin.geneFamily import GeneFamily
# all input params that exists in ppanggolin
-ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome']
+ALL_INPUT_PARAMS = ['fasta', 'anno', 'clusters', 'pangenome',
+ "fasta_file", "annot_file", "organism_name"] # the last three params is for projection cmd
# all params that should be in the general_parameters section of the config file
ALL_GENERAL_PARAMS = ['output', 'basename', 'rarefaction', 'no_flat_files', 'tmpdir', 'verbose', 'log',
- 'disable_prog_bar', 'force']
+ 'disable_prog_bar', 'force', "config"]
WORKFLOW_SUBCOMMANDS = {'all', 'workflow', 'panrgp', 'panModule'}
@@ -39,7 +44,7 @@
# Inside a workflow command, write output default is overwrite to output some flat files
WRITE_FLAG_DEFAULT_IN_WF = ["csv", "Rtab", "gexf", "light_gexf",
'projection', 'stats', 'json', 'partitions', 'regions',
- 'borders', 'modules', 'spot_modules', "draw_spots"]
+ 'borders', 'modules', 'spot_modules', "spots"]
DRAW_FLAG_DEFAULT_IN_WF = ["tile_plot", "ucurve", "draw_spots"]
@@ -88,7 +93,7 @@ def check_tsv_sanity(tsv: Path):
except IOError as ios_error:
raise IOError(ios_error)
except Exception as exception_error:
- raise Exception(f"The following unexpected error happened when opening the list of pangenomes : "
+ raise Exception(f"The following unexpected error happened when opening the list of genomes path: "
f"{exception_error}")
else:
name_set = set()
@@ -259,6 +264,23 @@ def mk_outdir(output: Path, force: bool = False):
raise FileExistsError(
f"{output} already exists. Use -f if you want to overwrite the files in the directory")
+@contextmanager
+def create_tmpdir(main_dir: Path, basename: str = "tmpdir", keep_tmp: bool = False):
+ """Create a temporary directory inside main_dir and yield it as a Path.
+
+ :param main_dir: parent directory in which the temporary directory is created
+ :param basename: prefix used to name the temporary directory
+ :param keep_tmp: if True, the directory is retained after the context exits; otherwise it is deleted
+ """
+ if keep_tmp:
+ dir_name = basename + time.strftime("_%Y-%m-%d_%H.%M.%S", time.localtime())
+
+ new_tmpdir = main_dir / dir_name
+ logging.getLogger("PPanGGOLiN").debug(f'Creating a temporary directory: {new_tmpdir.as_posix()}. This directory will be retained.')
+
+ mk_outdir(new_tmpdir, force=True)
+ yield new_tmpdir
+
+ else:
+ with tempfile.TemporaryDirectory(dir=main_dir, prefix=basename) as new_tmpdir:
+ logging.getLogger("PPanGGOLiN").debug(f"Creating a temporary directory: {new_tmpdir}. This directory won't be retained.")
+ yield Path(new_tmpdir)
+
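+# A minimal usage sketch (illustrative; `out_dir` stands for any existing Path):
+#
+# with create_tmpdir(out_dir, basename="align", keep_tmp=False) as tmp_path:
+# ... # write intermediate files under tmp_path
+# # tmp_path is deleted on exit unless keep_tmp=True
+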
def mk_file_name(basename: str, output: Path, force: bool = False) -> Path:
"""Returns a usable filename for a ppanggolin output file, or crashes.
@@ -297,6 +319,8 @@ def detect_filetype(filename: Path) -> str:
return 'gff'
elif first_line.startswith(">"):
return 'fasta'
+ elif "\t" in first_line:
+ return "tsv"
else:
raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') "
"nor gbff/gbk (file starts with 'LOCUS '). "
@@ -446,9 +470,7 @@ def add_common_arguments(subparser: argparse.ArgumentParser):
common.add_argument('-f', '--force', action="store_true",
help="Force writing in output directory and in pangenome output file.")
common.add_argument("--config", required=False, type=argparse.FileType(),
- help="Config file in yaml format to launch the different step of "
- "the workflow with specific arguments.")
-
+ help="Specify command arguments through a YAML configuration file.")
subparser._action_groups.append(common)
@@ -461,7 +483,7 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper
:return: Either a string or a TextIOWrapper object, depending on the type of the input argument.
"""
- if type(arg_val) == TextIOWrapper:
+ if isinstance(arg_val, TextIOWrapper):
return arg_val.name
return arg_val
@@ -469,14 +491,15 @@ def get_arg_name(arg_val: Union[str, TextIOWrapper]) -> Union[str, TextIOWrapper
def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Namespace, cli_args: argparse.Namespace):
"""
Overwrite args objects.
- When arguments are given in CLI, their value is used instead of the one found in config.
- When arguments are specified in config they overwrite default values.
+
+ When arguments are given in CLI, their values are used instead of the ones found in the config file.
+ When arguments are specified in the config file, they overwrite default values.
:param default_args: default arguments
- :param config_args: arguments parsed from config file
- :param cli_args: arguments parsed from command line
+ :param config_args: arguments parsed from the config file
+ :param cli_args: arguments parsed from the command line
- :return: final arguments
+ :return: final arguments
"""
args = argparse.Namespace()
all_params = [arg for arg in dir(default_args) if not arg.startswith('_')]
@@ -486,30 +509,48 @@ def overwrite_args(default_args: argparse.Namespace, config_args: argparse.Names
cli_val = getattr(cli_args, param, 'unspecified')
config_val = getattr(config_args, param, 'unspecified')
- if param in cli_args:
- # param is defined in cli, cli val is used
+ if param in cli_args and param not in config_args:
+ # Use the value from the command line argument
setattr(args, param, cli_val)
- if default_val != cli_val:
+ if default_val != cli_val and param != "config":
logging.getLogger("PPanGGOLiN").debug(
- f'Parameter "--{param} {get_arg_name(cli_val)}" has been specified in command line.'
- f' Its value overwrites putative config values.')
+ f'The parameter "--{param}: {get_arg_name(cli_val)}" has been specified in the command line with a non-default value.'
+ f' Its value overwrites the default value ({get_arg_name(default_val)}).')
- elif param in config_args:
- # parma is defined only in config. config val is used
+ elif param not in cli_args and param in config_args:
+ # Use the value from the config file
setattr(args, param, config_val)
if default_val != config_val:
logging.getLogger("PPanGGOLiN").debug(
- f'Parameter "{param}: {get_arg_name(config_val)}" has been specified in config file with non default value.'
- f' Its value overwrites default value ({get_arg_name(default_val)}).')
+ f'The parameter "--{param}: {get_arg_name(config_val)}" has been specified in the config file with a non-default value.'
+ f' Its value overwrites the default value ({get_arg_name(default_val)}).')
+
+ elif param in cli_args and param in config_args:
+ # Parameter is defined in both cli and config. The command-line value takes precedence.
+ setattr(args, param, cli_val)
+
+ if cli_val == config_val and cli_val != default_val:
+ logging.getLogger("PPanGGOLiN").debug(
+ f'The parameter "--{param} {get_arg_name(cli_val)}" has been specified in both the command line '
+ f'and the config file with the same non-default value. '
+ f'Its value overwrites the default value ({get_arg_name(default_val)}).')
+
+ elif cli_val != config_val and param != "config":
+ # Values in cli and config differ. Use the value from the command line argument (cli)
+ logging.getLogger("PPanGGOLiN").debug(
+ f'The parameter "--{param}" has been specified in both the command line ("{get_arg_name(cli_val)}") '
+ f'and the config file ("{get_arg_name(config_val)}") with different values. '
+ f'The value from the command line argument is used.')
else:
- # param is not defined in cli and in config. default value is applied
+ # Parameter is defined neither in cli nor in config. Use the default value.
setattr(args, param, default_val)
return args
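+
+# Precedence sketch with illustrative values: given a default of cpu=1,
+# "cpu: 4" in the config file and "--cpu 8" on the command line,
+# the resolved args.cpu is 8; without the CLI flag it would be 4,
+# and with neither it stays at the default 1.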
+
def combine_args(args: argparse.Namespace, another_args: argparse.Namespace):
"""
Combine two args object.
@@ -616,12 +657,16 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_
f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}")
# manage workflow command
+ workflow_steps = []
if subcommand in WORKFLOW_SUBCOMMANDS:
- for workflow_step in ALL_WORKFLOW_DEPENDENCIES:
+
+ # keep only the workflow steps that apply to the given subcommand
+ workflow_steps = [wf_step for wf_step in ALL_WORKFLOW_DEPENDENCIES
+ if not ((wf_step in ["rgp", "spot"] and subcommand in ["workflow", "panmodule"]) or
+ (wf_step == "module" and subcommand in ["workflow", "panmodule"]))]
+
+ for workflow_step in workflow_steps:
if (workflow_step in ["rgp", "spot"] and subcommand in ["workflow", "panmodule"]) or \
(workflow_step == "module" and subcommand in ["workflow", "panmodule"]):
continue
-
logging.getLogger("PPanGGOLiN").debug(f'Parsing {workflow_step} arguments in config file.')
step_subparser = subcommand_to_subparser[workflow_step]
@@ -664,7 +709,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_
if params_that_differ:
logging.getLogger("PPanGGOLiN").info(f'{len(params_that_differ)} parameters have a non-default value.')
- check_config_consistency(config, ALL_WORKFLOW_DEPENDENCIES)
+ check_config_consistency(config, workflow_steps)
return args
@@ -688,7 +733,7 @@ def count_different_values(values: Iterable[Union[int, str, Tuple, List]]) -> in
"""
hashable_values = set()
for value in values:
- hashable_value = tuple(value) if type(value) == list else value
+ hashable_value = tuple(value) if isinstance(value, list) else value
hashable_values.add(hashable_value)
return len(hashable_values)
@@ -720,14 +765,15 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list:
arguments_to_parse = []
for param, val in config_param_val.items():
- if type(val) == bool:
+ if isinstance(val, bool) or val is None or val == "None":
# param is a flag
if val is True:
arguments_to_parse.append(f"--{param}")
+ # if val is False or None, we do not add it to the arguments to parse
else:
arguments_to_parse.append(f"--{param}")
- if type(val) == list:
+ if isinstance(val, list):
# range of values need to be added one by one
arguments_to_parse += [str(v) for v in val]
else:
@@ -860,8 +906,6 @@ def get_cli_args(subparser_fct: Callable) -> argparse.Namespace:
# remove argument that have not been specified
delete_unspecified_args(cli_args)
delattr(cli_args, 'subcommand')
- if 'config' in cli_args:
- delattr(cli_args, 'config')
return cli_args
@@ -890,3 +934,104 @@ def delete_unspecified_args(args: argparse.Namespace):
for arg_name, arg_val in args._get_kwargs():
if arg_val is None:
delattr(args, arg_name)
+
+
+def extract_contig_window(contig_size: int, positions_of_interest: Iterable[int], window_size: int,
+ is_circular: bool = False) -> List[Tuple[int, int]]:
+ """
+ Extracts contiguous windows around positions of interest within a contig.
+
+ :param contig_size: Number of genes in contig.
+ :param positions_of_interest: An iterable containing the positions of interest.
+ :param window_size: The size of the window to extract around each position of interest.
+ :param is_circular: Indicates if the contig is circular.
+ :return: A list of (start, end) tuples, one per contiguous window.
+ """
+ windows_coordinates = []
+
+ # Sort the positions of interest
+ sorted_positions = sorted(positions_of_interest)
+
+ # Check if any position of interest is out of range
+ if sorted_positions[0] < 0 or sorted_positions[-1] >= contig_size:
+ raise IndexError(f'Positions of interest are out of range. '
+ f"Contig has {contig_size} genes while given min={sorted_positions[0]} & max={sorted_positions[-1]} positions")
+
+ if is_circular:
+ first_position = sorted_positions[0]
+ last_position = sorted_positions[-1]
+ # In a circular contig, if the window of a gene of interest overlaps the start or end of the contig,
+ # an out-of-scope position is added to the sorted positions to account for the wrap-around.
+ # The returned windows are always clamped to valid positions,
+ # so no out-of-scope position can end up in the final list.
+ if first_position - window_size < 0:
+ out_of_scope_position = contig_size + first_position
+ sorted_positions.append(out_of_scope_position)
+
+ if last_position + window_size >= contig_size:
+ out_of_scope_position = last_position - contig_size
+ sorted_positions.insert(0, out_of_scope_position)
+
+ start_po = max(sorted_positions[0] - window_size, 0)
+
+ for position, next_po in zip_longest(sorted_positions, sorted_positions[1:]):
+
+ if next_po is None:
+ # If there are no more positions, add the final window
+ end_po = min(position + window_size, contig_size - 1)
+ windows_coordinates.append((start_po, end_po))
+
+ elif position + window_size + 1 < next_po - window_size:
+ # If there is a gap between positions, add the current window
+ # and update the start position for the next window
+ end_po = min(position + window_size, contig_size - 1)
+
+ windows_coordinates.append((start_po, end_po))
+
+ start_po = max(next_po - window_size, 0)
+
+ return windows_coordinates
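+
+# Illustrative behaviour, mirrored in tests/context/test_context.py: with contig_size=200
+# and window_size=3, {12} yields [(9, 15)] and {19} yields [(16, 22)], while {12, 19}
+# merges the two overlapping windows into [(9, 22)].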
+
+
+def parse_input_paths_file(path_list_file: Path) -> Dict[str, Dict[str, List[str]]]:
+ """
+ Parse an input paths file to extract genome information.
+
+ This function reads an input paths file, which is in TSV format, and extracts genome information
+ including file paths and putative circular contigs.
+
+ :param path_list_file: The path to the input paths file.
+ :return: A dictionary where keys are genome names and values are dictionaries containing path information and
+ putative circular contigs.
+ :raises FileNotFoundError: If a specified genome file path does not exist.
+ :raises Exception: If there are no genomes in the provided file.
+ """
+ logging.getLogger("PPanGGOLiN").info(f"Reading {path_list_file} to process organism files")
+ genome_name_to_genome_path = {}
+
+ for line in read_compressed_or_not(path_list_file):
+ elements = [el.strip() for el in line.split("\t")]
+ if len(elements) <= 1:
+ raise Exception(f"Malformed line (expected a genome name and a file path separated by a tab) in: {path_list_file}")
+ genome_file_path = Path(elements[1])
+ genome_name = elements[0]
+ putative_circular_contigs = elements[2:]
+
+ if not genome_file_path.exists():
+ # Check if the file path doesn't exist and try an alternative path.
+ genome_file_path_alt = path_list_file.parent.joinpath(genome_file_path)
+
+ if not genome_file_path_alt.exists():
+ raise FileNotFoundError(f"The file path '{genome_file_path}' for genome '{genome_name}' specified in '{path_list_file}' does not exist.")
+ else:
+ genome_file_path = genome_file_path_alt
+
+ genome_name_to_genome_path[genome_name] = {
+ "path": genome_file_path,
+ "circular_contigs": putative_circular_contigs
+ }
+
+ if len(genome_name_to_genome_path) == 0:
+ raise Exception(f"There are no genomes in the provided file: {path_list_file} ")
+
+ return genome_name_to_genome_path
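+
+# Expected input format (illustrative): one genome per line, with tab-separated columns
+# giving the genome name, the genome file path, and optionally names of circular contigs:
+#
+# genome_A<TAB>/path/to/genome_A.fasta
+# genome_B<TAB>annotations/genome_B.gbff.gz<TAB>contig_1<TAB>contig_2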
+
diff --git a/requirements.txt b/requirements.txt
index 2b72f9a1..6396883f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
tqdm>=4.64
pytables>=3.7
-prodigal>=2.6.3
+pyrodigal>=3.0.1
aragorn>=1.2.41
infernal>=1.1.4
mmseqs2>=13.45111
-networkx>=2.7
+networkx>=2.7,<=3.1
dataclasses>=0.8
scipy>=1.7.3
plotly>=4.14.3
diff --git a/testingDataset/some_chlam_families.txt b/testingDataset/some_chlam_families.txt
index 535f87b6..77d70cc1 100644
--- a/testingDataset/some_chlam_families.txt
+++ b/testingDataset/some_chlam_families.txt
@@ -2,3 +2,5 @@ BKC02_RS00820
E150_RS00825
L2BLST_RS00795
AKW53_RS02300
+BKC03_RS00800
+G9768_RS02640
diff --git a/tests/context/test_context.py b/tests/context/test_context.py
new file mode 100644
index 00000000..6a610d07
--- /dev/null
+++ b/tests/context/test_context.py
@@ -0,0 +1,213 @@
+import pytest
+from ppanggolin.context.searchGeneContext import extract_contig_window, get_n_next_genes_index, add_edges_to_context_graph, compute_gene_context_graph
+
+from ppanggolin.geneFamily import GeneFamily
+from ppanggolin.genome import Gene, Contig, Organism
+
+import networkx as nx
+
+
+def test_extract_contig_window():
+ assert extract_contig_window(contig_size=15, positions_of_interest={8}, window_size=1) == [(7,9)]
+
+ # check that extracted window is inside contig limit
+ assert extract_contig_window(contig_size=16, positions_of_interest={15}, window_size=4) == [(11,15)]
+
+ assert extract_contig_window(contig_size=10, positions_of_interest={2, 8}, window_size=2) == [(0,4), (6,9)]
+
+ # the window of 12 is (9,15)
+ # the window of 19 is (16,22)
+ # so when both 12 and 19 are of interest, the windows merge into (9,22)
+ assert extract_contig_window(contig_size=200, positions_of_interest={12}, window_size=3) == [(9,15)]
+ assert extract_contig_window(contig_size=200, positions_of_interest={19}, window_size=3) == [(16,22)]
+ assert extract_contig_window(contig_size=200, positions_of_interest={12, 19}, window_size=3) == [(9,22)]
+
+ assert extract_contig_window(contig_size=10, positions_of_interest={2, 5, 8}, window_size=2) == [(0,9)]
+
+def test_extract_contig_window_with_circular_contig():
+ # check that circularity is properly taken into account
+ assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=2, is_circular=True) == [(0,3), (11,11)]
+ assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=3, is_circular=True) == [(0,4), (10,11)]
+ assert extract_contig_window(contig_size=12, positions_of_interest={10}, window_size=3, is_circular=True) == [(0,1), (7,11)]
+
+ assert extract_contig_window(contig_size=12, positions_of_interest={6}, window_size=6, is_circular=True) == [(0,11)]
+ assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=True) == [(0,11)]
+ assert extract_contig_window(contig_size=12, positions_of_interest={1}, window_size=6, is_circular=False) == [(0,7)]
+
+ assert extract_contig_window(contig_size=12, positions_of_interest={0, 9}, window_size=2, is_circular=False) == [(0,2), (7,11)]
+
+ assert extract_contig_window(contig_size=894, positions_of_interest=[151, 152, 153, 893], window_size=4, is_circular=True) == [(0, 3), (147, 157), (889, 893)]
+
+def test_extract_contig_window_out_of_range():
+ with pytest.raises(IndexError):
+ extract_contig_window(contig_size=15, positions_of_interest={15}, window_size=1)
+
+ with pytest.raises(IndexError):
+ extract_contig_window(contig_size=15, positions_of_interest={-1}, window_size=1)
+
+def test_get_n_next_genes_index():
+
+ assert list(get_n_next_genes_index(current_index=6, next_genes_count=3, contig_size=100, is_circular=False)) == [7, 8, 9]
+
+ # there is no next gene because the current index is at the end of a non-circular contig
+ assert list(get_n_next_genes_index(current_index=11, next_genes_count=2, contig_size=12, is_circular=False)) == []
+
+def test_get_n_next_genes_index_circular():
+ assert list(get_n_next_genes_index(current_index=10, next_genes_count=3, contig_size=12, is_circular=True)) == [11, 0, 1]
+ assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=12, is_circular=True)) == [11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+def test_get_n_next_genes_index_out_of_range():
+ with pytest.raises(IndexError):
+ assert list(get_n_next_genes_index(current_index=10, next_genes_count=16, contig_size=8, is_circular=False))
+
+@pytest.fixture()
+def simple_contig():
+
+ contig = Contig(identifier=1, name="contig1", is_circular=False)
+
+ contig_size=6
+ contig.length = contig_size
+ genes = [Gene(str(i)) for i in range(contig_size)]
+ organism = Organism('organism_A')
+ for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')):
+ family = GeneFamily(i, family_name)
+ gene.fill_annotations(start=i, stop=i+1, strand="+", position=i)
+
+ gene.fill_parents(organism, contig)
+
+ contig.add(gene)
+ family.add(gene)
+
+ return contig
+
+@pytest.fixture()
+def simple_circular_contig():
+
+ contig = Contig(identifier=2, name="contig2", is_circular=True)
+
+ contig_size=6
+ genes = [Gene(str(i)) for i in range(contig_size)]
+
+ for i, (gene, family_name) in enumerate(zip(genes, 'ABCDEFGHIJKLMNOP')):
+ family = GeneFamily(i, family_name)
+ gene.fill_annotations(start=i, stop=i+1, strand="+", position=i)
+
+ contig.add(gene)
+ family.add(gene)
+
+ return contig
+
+
+
+def test_add_edges_to_context_graph(simple_contig):
+ context_graph = nx.Graph()
+
+ # simple_contig families: ABCDEF
+
+ add_edges_to_context_graph(context_graph,
+ contig_genes = list(simple_contig.genes),
+ contig_windows = [(0,3)],
+ transitivity=1,
+ is_circular=simple_contig.is_circular)
+
+ nodes = sorted([n.name for n in context_graph.nodes()])
+ edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()}
+
+ assert nodes == ['A', "B", "C", "D"]
+ assert edges == {('A', 'B'),
+ ('A', 'C'),
+ ('B', 'C'),
+ ('B', 'D'),
+ ('C', 'D')}
+
+def test_add_edges_to_context_graph_2(simple_contig):
+ context_graph = nx.Graph()
+
+ # simple_contig families: A B-C-D E F
+
+ add_edges_to_context_graph(context_graph,
+ contig_genes = list(simple_contig.genes),
+ contig_windows = [(1,3)],
+ transitivity=0,
+ is_circular=simple_contig.is_circular)
+
+ nodes = sorted([n.name for n in context_graph.nodes()])
+ edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()}
+
+ assert nodes == ["B", "C", "D"]
+ assert edges == {('B', 'C'),
+ ('C', 'D')}
+
+def test_add_edges_to_context_graph_linear(simple_contig):
+
+ # genes : 0-1-2-3-4-5
+ # families : A-B-C-D-E-F
+ # windows : _____ ___ [(0,2) (4,5)]
+
+
+ context_graph = nx.Graph()
+
+ add_edges_to_context_graph(context_graph,
+ contig_genes = list(simple_contig.genes),
+ contig_windows = [(4,5), (0,2)],
+ transitivity=0,
+ is_circular=False)
+
+ nodes = sorted([n.name for n in context_graph.nodes()])
+ edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()}
+
+ assert nodes == ["A", "B", "C", "E", "F"]
+ assert edges == {('A', 'B'),
+ ('B', 'C'),
+ ('E', "F"),
+ }
+
+
+def test_add_edges_to_context_graph_circular(simple_contig):
+
+ # genes : 0-1-2-3-4-5
+ # families : A-B-C-D-E-F
+ # windows : _____ ___ [(0,2) (4,5)]
+
+ context_graph = nx.Graph()
+
+ add_edges_to_context_graph(context_graph,
+ contig_genes = list(simple_contig.genes),
+ contig_windows = [(4,5), (0,2)],
+ transitivity=0,
+ is_circular=True)
+
+ nodes = sorted([n.name for n in context_graph.nodes()])
+ edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()}
+
+ assert nodes == ["A", "B", "C", "E", "F"]
+ assert edges == {('A', 'B'),
+ ('B', 'C'),
+ ('E', "F"),
+ ('A', 'F')} # circular so F and A are linked
+
+
+def test_compute_gene_context_graph(simple_contig):
+
+ # genes : 0-1-2-3-4-5
+ # families : A-B-C-D-E-F
+ # family of interest : ^
+ # windows of 2 : ___ ___
+
+ # simple case with only one contig with 6 genes and 6 families
+
+ families_in_contigs = [g.family for g in simple_contig.genes ]
+ family_names_of_interest = ["C"]
+ families_of_interest = {f for f in families_in_contigs if f.name in family_names_of_interest }
+
+ context_graph = compute_gene_context_graph(families_of_interest,
+ transitive=0,
+ window_size = 2)
+ nodes = sorted([n.name for n in context_graph.nodes()])
+ edges = {tuple(sorted([n.name, v.name])) for n, v in context_graph.edges()}
+
+ assert nodes == ["A", "B", "C", "D", "E"]
+ assert edges == {('A', 'B'),
+ ('B', 'C'),
+ ('C', "D"),
+ ('D', 'E')}
\ No newline at end of file
diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py
index d6913d30..b1ded836 100644
--- a/tests/region/test_rgp_cluster.py
+++ b/tests/region/test_rgp_cluster.py
@@ -16,7 +16,7 @@ def genes() -> Generator[Set[Gene], None, None]:
"""Create a set of genes to fill gene families
"""
organism = Organism("organism")
- contig = Contig("contig")
+ contig = Contig(0, "contig")
genes = set()
for i in range(0, randint(11, 20)):
gene = Gene(f"gene_{str(i)}")
diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py
index 9baddb00..e1eefd3f 100644
--- a/tests/test_genefamily.py
+++ b/tests/test_genefamily.py
@@ -120,9 +120,11 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]:
nb_organisms = randint(2, 10)
nb_genes_per_organisms = len(genes) // nb_organisms
idx_org = 1
+ contig_counter = 0
while idx_org < nb_organisms:
organism = Organism(f"organism_{idx_org}")
- contig = Contig(f"contig_{idx_org}")
+ contig = Contig(contig_counter, f"contig_{idx_org}")
+ contig_counter += 1
organism.add(contig)
idx_genes = 0
while idx_genes < nb_genes_per_organisms:
@@ -134,7 +136,7 @@ def organisms(self, genes) -> Generator[Set[Organism], None, None]:
idx_org += 1
# last family fill with all the gene left
organism = Organism(f"organism_{idx_org}")
- contig = Contig(f"contig_{idx_org}")
+ contig = Contig(contig_counter, f"contig_{idx_org}")
organism.add(contig)
idx_genes = (idx_org - 1) * nb_genes_per_organisms
while idx_genes < len(genes):
diff --git a/tests/test_genome.py b/tests/test_genome.py
index 4119c943..35ac4714 100644
--- a/tests/test_genome.py
+++ b/tests/test_genome.py
@@ -92,7 +92,7 @@ def test_fill_parents(self, feature):
"""Tests that 'fill_parents' method associates the object with the given organism and contig
"""
organism = Organism('org_id')
- contig = Contig('contig_name')
+ contig = Contig(0, 'contig_name')
feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id')
feature.fill_parents(organism, contig)
assert feature.organism == organism
@@ -102,7 +102,7 @@ def test_fill_parents_with_organism_or_contig_only(self, feature):
"""Tests that Gene can be filled with only an organism or a contig
"""
organism = Organism('org')
- contig = Contig("ctg")
+ contig = Contig(0, "ctg")
feature.fill_annotations(1, 10, '+', 'gene_type', 'name', 'product', 'local_id')
feature.fill_parents(organism=organism)
assert feature.organism == organism
@@ -131,7 +131,7 @@ def test_set_organism_not_isinstance_organism(self, feature):
def test_set_contig(self, feature):
"""Tests that contig setter sets contig with the valid type
"""
- contig = Contig('contig')
+ contig = Contig(0, 'contig')
feature.contig = contig
assert feature.contig == contig
@@ -268,7 +268,7 @@ class TestContig:
def contig(self) -> Generator[Contig, None, None]:
"""Generate basic contig for tests
"""
- yield Contig("contig")
+ yield Contig(0, "contig")
@pytest.fixture
def gene(self) -> Generator[Gene, None, None]:
@@ -446,7 +446,7 @@ def organism(self) -> Generator[Organism, None, None]:
def contig(self) -> Generator[Contig, None, None]:
"""Generate a basic contig for test
"""
- yield Contig("contig")
+ yield Contig(0, "contig")
@pytest.fixture
def gene(self) -> Generator[Gene, None, None]:
@@ -498,7 +498,7 @@ def test_add_contig_existing_name(self, organism, contig):
"""
organism.add(contig)
with pytest.raises(KeyError):
- organism.add(Contig('contig'))
+ organism.add(Contig(0, 'contig'))
def test_get_contig(self, organism, contig):
"""Tests that a contig can be retrieved from an Organism instance
@@ -521,8 +521,10 @@ def test_get_nonexistent_contig(self, organism):
def test_number_of_contigs(self, organism):
"""Tests that the number of contigs in an organism instance can be retrieved
"""
- organism.add(Contig('contig1'))
- organism.add(Contig('contig2'))
+ organism.add(Contig(1, 'contig1'))
+ organism.add(Contig(2, 'contig2'))
+
+ assert organism.number_of_contigs == 2
assert isinstance(len(organism), int)
assert len(organism) == 2
diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py
index 6a4d7e72..d67eb1ce 100644
--- a/tests/test_pangenome.py
+++ b/tests/test_pangenome.py
@@ -39,15 +39,15 @@ def test_cstr(self, pangenome):
"""
pangenome_attr_type = {
"file": type(None),
- "_famGetter": dict,
+ "_fam_getter": dict,
"_org_index": type(None),
"_fam_index": type(None),
"_max_fam_id": int,
- "_orgGetter": dict,
- "_edgeGetter": dict,
- "_regionGetter": dict,
- "_spotGetter": dict,
- "_moduleGetter": dict,
+ "_org_getter": dict,
+ "_edge_getter": dict,
+ "_region_getter": dict,
+ "_spot_getter": dict,
+ "_module_getter": dict,
"status": dict,
"parameters": dict
}
@@ -335,8 +335,8 @@ def fill_org_with_genes(self) -> Generator[Union[Organism, Set[Gene]], None, Non
"""
genes = set()
organism = Organism(name="organism")
- for contig_id in range(randint(2, 10)):
- contig = Contig("k_{}".format(contig_id))
+ for ctg_counter, contig_id in enumerate(range(randint(2, 10))):
+ contig = Contig(ctg_counter, "k_{}".format(contig_id))
organism.add(contig)
for gene_idx in range(randint(2, 10)):
gene = Gene(gene_id=f"{organism.name}.{contig_id}.{gene_idx}")
@@ -455,8 +455,8 @@ def make_gene_pair(gene_id_1: int = 1, gene_id_2: int = 2) -> Tuple[Gene, Gene]:
gene2 = Gene(gene_id=f"gene_{gene_id_2}")
fam1 = GeneFamily(family_id=1, name=f"fam_{gene_id_1}")
fam2 = GeneFamily(family_id=2, name=f"fam_{gene_id_2}")
- ctg1 = Contig(name=f"ctg_{gene_id_1}")
- ctg2 = Contig(name=f"ctg_{gene_id_2}")
+ ctg1 = Contig(1, name=f"ctg_{gene_id_1}")
+ ctg2 = Contig(2, name=f"ctg_{gene_id_2}")
fam1.add(gene1)
fam2.add(gene2)
organism = Organism(name=f"org_{choices([gene_id_1, gene_id_2], k=1)}")
@@ -597,8 +597,8 @@ def test_add_region(self, pangenome):
"""
rgp = Region(name="rgp")
pangenome.add_region(rgp)
- assert len(pangenome._regionGetter) == 1
- assert pangenome._regionGetter["rgp"] == rgp
+ assert len(pangenome._region_getter) == 1
+ assert pangenome._region_getter["rgp"] == rgp
def test_add_region_already_in_pangenome(self, pangenome):
"""Tests that adding region already in pangenome return a KeyError.
@@ -665,8 +665,8 @@ def test_add_spot(self, pangenome):
"""
spot = Spot(spot_id=0)
pangenome.add_spot(spot)
- assert len(pangenome._spotGetter) == 1
- assert pangenome._spotGetter[0] == spot
+ assert len(pangenome._spot_getter) == 1
+ assert pangenome._spot_getter[0] == spot
def test_add_spot_already_in_pangenome(self, pangenome):
"""Tests that adding spot already in pangenome return a KeyError.
@@ -734,8 +734,8 @@ def test_add_module(self, pangenome):
"""
module = Module(module_id=0)
pangenome.add_module(module)
- assert len(pangenome._moduleGetter) == 1
- assert pangenome._moduleGetter[0] == module
+ assert len(pangenome._module_getter) == 1
+ assert pangenome._module_getter[0] == module
def test_add_module_already_in_pangenome(self, pangenome):
"""Tests that adding module already in pangenome return a KeyError.
@@ -808,7 +808,7 @@ def add_element_to_pangenome(self, pangenome):
pangenome.add_gene_family(family)
org = Organism("Org")
org.add_metadata(source=metadata.source, metadata=metadata)
- ctg = Contig("Ctg")
+ ctg = Contig(0, "Ctg")
org.add(ctg)
gene = Gene("Gene")
gene.position, gene.start = (0, 0)
diff --git a/tests/test_region.py b/tests/test_region.py
index 16a4d68f..dbff76b0 100644
--- a/tests/test_region.py
+++ b/tests/test_region.py
@@ -145,9 +145,9 @@ def test_add_genes_from_different_contigs(self, region):
gene1, gene2 = Gene('gene_1'), Gene('gene_2')
gene1.fill_annotations(start=0, stop=10, strand='+', position=0)
gene2.fill_annotations(start=11, stop=20, strand='+', position=1)
- gene1.fill_parents(None, Contig('contig_1'))
+ gene1.fill_parents(None, Contig(1, 'contig_1'))
region.add(gene1)
- gene2.fill_parents(None, Contig('contig_2'))
+ gene2.fill_parents(None, Contig(2, 'contig_2'))
with pytest.raises(Exception):
region.add(gene2)
@@ -223,7 +223,7 @@ def test_get_contig(self, region):
"""
gene = Gene('gene')
gene.fill_annotations(start=0, stop=10, strand='+', position=0)
- gene.fill_parents(contig=Contig("contig"))
+ gene.fill_parents(contig=Contig(0, "contig"))
region.add(gene)
assert region.contig.name == 'contig'
@@ -233,7 +233,7 @@ def test_is_whole_contig_true(self, region):
starter, stopper = Gene('starter'), Gene('stopper')
starter.fill_annotations(start=0, stop=10, strand='+', position=0)
stopper.fill_annotations(start=11, stop=20, strand='+', position=1)
- contig = Contig("contig")
+ contig = Contig(0, "contig")
contig[starter.start], contig[stopper.start] = starter, stopper
starter.fill_parents(None, contig), stopper.fill_parents(None, contig)
region.add(starter), region.add(stopper)
@@ -247,7 +247,7 @@ def test_is_whole_contig_false(self, region):
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
- contig = Contig("contig")
+ contig = Contig(0, "contig")
contig[before.start], contig[after.start] = before, after
contig[starter.start], contig[stopper.start] = starter, stopper
before.fill_parents(None, contig), after.fill_parents(None, contig)
@@ -263,7 +263,7 @@ def test_is_contig_border_true(self, region):
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
- contig = Contig("contig")
+ contig = Contig(0, "contig")
before.fill_parents(None, contig), after.fill_parents(None, contig)
starter.fill_parents(None, contig), stopper.fill_parents(None, contig)
# Test bordering right
@@ -284,7 +284,7 @@ def test_is_contig_border_false(self, region):
starter.fill_annotations(start=11, stop=20, strand='+', position=1)
stopper.fill_annotations(start=21, stop=30, strand='+', position=2)
after.fill_annotations(start=31, stop=40, strand='+', position=3)
- contig = Contig("contig")
+ contig = Contig(0, "contig")
contig[before.start], contig[after.start] = before, after
contig[starter.start], contig[stopper.start] = starter, stopper
before.fill_parents(None, contig), after.fill_parents(None, contig)