Merge remote-tracking branch 'origin/AnnotHDF5Reformat' into gff_output

labgem · Sep 19, 2023 · b87a2ee · b87a2ee
2 parents 95497b7 + e992a17
commit b87a2ee
Show file tree

Hide file tree

Showing 63 changed files with 7,761 additions and 3,969 deletions.
diff --git a/.github/workflows/check_recipes.yml b/.github/workflows/check_recipes.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest','macos-latest']
-        python-version: ['3.7','3.8','3.9','3.10']
+        python-version: ['3.8','3.9','3.10']
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest', 'macos-latest']
-        python-version: ['3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10']
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
     - uses: actions/checkout@v2
@@ -58,8 +58,8 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria --contig_filter 500
-        ppanggolin cluster -p stepbystep/pangenome.h5 --defrag --coverage 0.8 --identity 0.8
+        ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
+        ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
         ppanggolin graph -p stepbystep/pangenome.h5 -r 10
         ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
         ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
@@ -70,7 +70,7 @@ jobs:
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
         ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
-        ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
+        ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
         cd -
     - name: gbff parsing and MSA computing
@@ -86,9 +86,17 @@ jobs:
         cd testingDataset
         ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
         ppanggolin annotate --anno organisms.gbff.list --output readclusters
-        ppanggolin cluster --cluster clusters.tsv -p readclusters/pangenome.h5
+        ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
         cd -
+    - name: testing rgp_cluster command
+      shell: bash -l {0}
+      run: |
+        cd testingDataset
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --ignore_incomplete_rgp --grr_metric max_grr -f --graph_formats graphml gexf
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --no_identical_rgp_merging -o rgp_clustering_no_identical_rgp_merging --graph_formats graphml
+        cd -
     - name: testing align command
       shell: bash -l {0}
       run: |

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.2.132
+1.2.187
diff --git a/docs/user/Regions-of-Genome-Plasticity.md b/docs/user/Regions-of-Genome-Plasticity.md
@@ -27,3 +27,34 @@ Spots can be computed once RGPs have been predicted. You can do that using:
 For versions between 1.1.0 and 1.2.12, you can use additional option '--draw_hotspots' which uses [genoplotR](http://genoplotr.r-forge.r-project.org/) to draw those spots in png figures. For versions above 1.2.12, you can use the dedicated subcommand [draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw), which uses the python library [bokeh](http://docs.bokeh.org/en/latest/) to draw interactive figures which can be visualized and modified directly in the browser.
 
 Information about spots can then be written using `ppanggolin write -p pangenome --spots` which will provide a [file linking RGPs with their spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) and a [file showing multiple metrics for each spot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots)
+
+
+
+# RGP cluster based on their gene families
+
+To cluster RGPs (Regions of Genome Plasticity) based on their gene families, you can use the command `panggolin rgp_cluster`.
+The panggolin rgp_cluster command performs the following steps to cluster RGPs (Regions of Genome Plasticity) based on their gene families:
+
+1. Calculation of GRR (Gene Repertoire Relatedness): The command calculates the GRR values for all pairs of RGPs. The GRR metric evaluates the similarity between two RGPs by assessing their shared gene families.
+2. Graph Construction: The command constructs a graph representation of the RGPs, where each RGP is represented as a node in the graph. The edges between the nodes are weighted using the GRR values, indicating the strength of the relationship between the RGPs.
+3. Filtering GRR Values: GRR values below the `--grr_cutoff` threshold (default 0.8) are filtered out to remove noise from the analysis.
+4. Louvain Communities Clustering: The Louvain communities clustering algorithm is then applied to the graph. This algorithm identifies clusters of RGPs with similar gene family relationships.
+
+There are three modes available for calculating the GRR value: `min_grr`, `max_grr`, or `incomplete_aware_grr`.
+- `min_grr` mode: This mode computes the number of gene families shared between two RGPs and divides it by the smaller number of gene families among the two RGPs.
+- `max_grr` mode: In this mode, the number of gene families shared between two RGPs is calculated and divided by the larger number of gene families among the two RGPs.
+- `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGPs.
+
+
+The resulting RGP clusters are stored in a tsv file with the folowing columns:
+
+| column  | description                  |
+|---------|------------------------------|
+| RGP     | The unique region identifier |
+| cluster | The cluster id of the RGP    |
+| spot_id    | the spot ID of the RGP       |
+
+
+
+The command also generates an RGP graph in the gexf format, which can be utilized to explore the RGP clusters along with their spots of insertion. In this graph identical RGPs with the same family content and with the same spot are merged into a single node to simplify the graph representation. This feature can be disable with the parameter `--no_identical_rgp_merging`.
+
diff --git a/ppanggolin/RGP/__init__.py b/ppanggolin/RGP/__init__.py
@@ -1,2 +1,3 @@
 from .genomicIsland import subparser, launch
 from .spot import *
+from . import rgp_cluster
diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py
@@ -4,7 +4,8 @@
 # default libraries
 import logging
 import argparse
-
+from pathlib import Path
+from typing import Set
 # installed libraries
 from tqdm import tqdm
 
@@ -30,7 +31,7 @@ def changes(self, score):
         self.score = score if score >= 0 else 0
 
 
-def extract_rgp(contig, node, rgp_id, naming):
+def extract_rgp(contig, node, rgp_id, naming) -> Region:
     """
         Extract the region from the given starting node
     """
@@ -40,7 +41,7 @@ def extract_rgp(contig, node, rgp_id, naming):
     elif naming == "organism":
         new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id))
     while node.state:
-        new_region.append(node.gene)
+        new_region.add(node.gene)
         node.state = 0
         node.score = 0
         node = node.prev
@@ -148,7 +149,7 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia
 
 
 def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, min_score: int = 4,
-               persistent: int = 3, continuity: int = 1, naming: str = "contig") -> set:
+               persistent: int = 3, continuity: int = 1, naming: str = "contig") -> Set[Region]:
     """
     Processing matrix and 'emptying' it to get the regions.
 
@@ -163,6 +164,7 @@ def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000,
 
     :return:
     """
+
     def max_index_node(lst):
         """gets the last node with the highest score from a list of matriceNode"""
         if isinstance(lst, list):
@@ -182,18 +184,18 @@ def max_index_node(lst):
     while val >= min_score:
         new_region = extract_rgp(contig, matrix[index], len(contig_regions), naming)
         new_region.score = val
-        if (new_region[0].stop - new_region[-1].start) > min_length:
+        if new_region.length > min_length:
             contig_regions.add(new_region)
         rewrite_matrix(contig, matrix, index, persistent, continuity, multi)
         val, index = max_index_node(matrix)
     return contig_regions
 
 
 def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1,
-                    min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> set:
+                    min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]:
     org_regions = set()
     for contig in organism.contigs:
-        if len(contig.genes) != 0:  # some contigs have no coding genes...
+        if contig.number_of_genes != 0:  # some contigs have no coding genes...
             # can definitely multiprocess this part, as not THAT much information is needed...
             matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain)
             org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty,
@@ -208,8 +210,9 @@ def naming_scheme(pangenome: Pangenome):
             oldlen = len(contigsids)
             contigsids.add(contig.name)
             if oldlen == len(contigsids):
-                logging.getLogger().warning("You have contigs with identical identifiers in your assemblies. "
-                                            "identifiers will be supplemented with your provided organism names.")
+                logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your "
+                                                        "assemblies. identifiers will be supplemented with your "
+                                                        "provided organism names.")
                 return "organism"
     return "contig"
 
@@ -247,14 +250,15 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain
     check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True,
                          disable_bar=disable_bar)
 
-    logging.getLogger().info("Detecting multigenic families...")
+    logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...")
     multigenics = pangenome.get_multigenics(dup_margin)
-    logging.getLogger().info("Compute Regions of Genomic Plasticity ...")
+    logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...")
     name_scheme = naming_scheme(pangenome)
-    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar):
-        pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
-                                              min_score, naming=name_scheme))
-    logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP")
+    for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar):
+        for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
+                                              min_score, naming=name_scheme):
+	        pangenome.add_region(region)
+    logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp} RGP")
 
     # save parameters and save status
     pangenome.parameters["RGP"] = {}
@@ -301,7 +305,7 @@ def parser_rgp(parser: argparse.ArgumentParser):
     """
     required = parser.add_argument_group(title="Required arguments",
                                          description="One of the following arguments is required :")
-    required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome .h5 file")
+    required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")
 
     optional = parser.add_argument_group(title="Optional arguments")
     optional.add_argument('--persistent_penalty', required=False, type=int, default=3,
@@ -319,20 +323,13 @@ def parser_rgp(parser: argparse.ArgumentParser):
 
 if __name__ == '__main__':
     """To test local change and allow using debugger"""
-    from ppanggolin.utils import check_log, set_verbosity_level
+    from ppanggolin.utils import set_verbosity_level, add_common_arguments
 
     main_parser = argparse.ArgumentParser(
         description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors",
         formatter_class=argparse.RawTextHelpFormatter)
 
     parser_rgp(main_parser)
-    common = main_parser.add_argument_group(title="Common argument")
-    common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2],
-                        help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)")
-    common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file")
-    common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true",
-                        help="disables the progress bars")
-    common.add_argument('-f', '--force', action="store_true",
-                        help="Force writing in output directory and in pangenome output file.")
+    add_common_arguments(main_parser)
     set_verbosity_level(main_parser.parse_args())
     launch(main_parser.parse_args())