Merge remote-tracking branch 'origin/projection' into context

labgem · Oct 4, 2023 · 566fdad · 566fdad
2 parents 50aa4d5 + d3299a0
commit 566fdad
Show file tree

Hide file tree

Showing 72 changed files with 10,319 additions and 4,477 deletions.
diff --git a/.github/workflows/check_recipes.yml b/.github/workflows/check_recipes.yml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest','macos-latest']
-        python-version: ['3.7','3.8','3.9','3.10']
+        python-version: ['3.8','3.9','3.10']
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest', 'macos-latest']
-        python-version: ['3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.8', '3.9', '3.10']
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
     - uses: actions/checkout@v2
@@ -58,8 +58,8 @@ jobs:
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria --contig_filter 500
-        ppanggolin cluster -p stepbystep/pangenome.h5 --defrag --coverage 0.8 --identity 0.8
+        ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
+        ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
         ppanggolin graph -p stepbystep/pangenome.h5 -r 10
         ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
         ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
@@ -70,9 +70,9 @@ jobs:
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
         ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
-        ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
+        ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
-        cd -
+        cd - 
     - name: gbff parsing and MSA computing
       shell: bash -l {0}
       run: |
@@ -86,21 +86,33 @@ jobs:
         cd testingDataset
         ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
         ppanggolin annotate --anno organisms.gbff.list --output readclusters
-        ppanggolin cluster --cluster clusters.tsv -p readclusters/pangenome.h5
+        ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
         cd -
+    - name: testing rgp_cluster command
+      shell: bash -l {0}
+      run: |
+        cd testingDataset
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --ignore_incomplete_rgp --grr_metric max_grr -f --graph_formats graphml gexf
+        ppanggolin rgp_cluster --pangenome mybasicpangenome/pangenome.h5 --no_identical_rgp_merging -o rgp_clustering_no_identical_rgp_merging --graph_formats graphml
+        cd -
     - name: testing align command
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_align --draw_related --getinfo
+        ppanggolin align --pangenome mybasicpangenome/pangenome.h5 --sequences some_chlam_proteins.fasta \
+                         --output test_align --draw_related --getinfo --fast
         cd -
     - name: testing context command
       shell: bash -l {0}
       run: |
         cd testingDataset
-        ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context
-        ppanggolin context --pangenome readclusterpang/pangenome.h5 --family some_chlam_families.txt --output test_context -f
+        ppanggolin context --pangenome myannopang/pangenome.h5 --sequences some_chlam_proteins.fasta --output test_context --fast
+
+        # test from gene family ids. Test here with one family of module 1. The context should find all families of module 1
+        echo AP288_RS05055 > one_family_of_module_1.txt 
+        ppanggolin context --pangenome myannopang/pangenome.h5 --family one_family_of_module_1.txt  --output test_context_from_id
         cd -
     - name: testing metadata command
       shell: bash -l {0}
@@ -109,11 +121,24 @@ jobs:
         ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s test -m metadata/metadata_genes.tsv -a genes
         ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s test -m metadata/metadata_genomes.tsv -a genomes
         ppanggolin metadata -p mybasicpangenome/pangenome.h5 -s test -m metadata/metadata_families.tsv -a families --omit
+        ppanggolin write -p mybasicpangenome/pangenome.h5 --output mybasicpangenome -f --gexf --light_gexf --cpu 1
         cd -
     - name: testing config file
       shell: bash -l {0}
       run: |
         cd testingDataset
         ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
         ppanggolin panrgp  --anno organisms.gbff.list --cluster clusters.tsv -o test_config --config panrgp_default_config.yaml
- 
+        cd -
+    - name: testing projection cmd
+      shell: bash -l {0}
+      run: |
+        cd testingDataset
+        head organisms.gbff.list | sed 's/^/input_org_/g' > organisms.gbff.head.list
+        ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_lisy_of_gbff --anno organisms.gbff.head.list 
+
+
+        ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_single_fasta \
+                              --organism_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \
+                              --spot_graph --graph_formats graphml --fast --keep_tmp -f
+
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.2.127
+1.2.191
diff --git a/docs/user/Home.md b/docs/user/Home.md
@@ -44,4 +44,5 @@
     * [source](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#source)
     * [phylo](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#phylo)
   * [Info](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#info)
-  * [Metrics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#Metrics)
+  * [Metrics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#Metrics)
+  * [Metadata](https://github.com/labgem/PPanGGOLiN/wiki/Metadata)
diff --git a/docs/user/Regions-of-Genome-Plasticity.md b/docs/user/Regions-of-Genome-Plasticity.md
@@ -27,3 +27,34 @@ Spots can be computed once RGPs have been predicted. You can do that using:
 For versions between 1.1.0 and 1.2.12, you can use additional option '--draw_hotspots' which uses [genoplotR](http://genoplotr.r-forge.r-project.org/) to draw those spots in png figures. For versions above 1.2.12, you can use the dedicated subcommand [draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw), which uses the python library [bokeh](http://docs.bokeh.org/en/latest/) to draw interactive figures which can be visualized and modified directly in the browser.
 
 Information about spots can then be written using `ppanggolin write -p pangenome --spots` which will provide a [file linking RGPs with their spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) and a [file showing multiple metrics for each spot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots)
+
+
+
+# RGP cluster based on their gene families
+
+To cluster RGPs (Regions of Genome Plasticity) based on their gene families, you can use the command `ppanggolin rgp_cluster`.
+The `ppanggolin rgp_cluster` command performs the following steps to cluster RGPs (Regions of Genome Plasticity) based on their gene families:
+
+1. Calculation of GRR (Gene Repertoire Relatedness): The command calculates the GRR values for all pairs of RGPs. The GRR metric evaluates the similarity between two RGPs by assessing their shared gene families.
+2. Graph Construction: The command constructs a graph representation of the RGPs, where each RGP is represented as a node in the graph. The edges between the nodes are weighted using the GRR values, indicating the strength of the relationship between the RGPs.
+3. Filtering GRR Values: GRR values below the `--grr_cutoff` threshold (default 0.8) are filtered out to remove noise from the analysis.
+4. Louvain Communities Clustering: The Louvain communities clustering algorithm is then applied to the graph. This algorithm identifies clusters of RGPs with similar gene family relationships.
+
+There are three modes available for calculating the GRR value: `min_grr`, `max_grr`, or `incomplete_aware_grr`.
+- `min_grr` mode: This mode computes the number of gene families shared between two RGPs and divides it by the smaller number of gene families among the two RGPs.
+- `max_grr` mode: In this mode, the number of gene families shared between two RGPs is calculated and divided by the larger number of gene families among the two RGPs.
+- `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGPs.
+
+
+The resulting RGP clusters are stored in a tsv file with the following columns:
+
+| column  | description                  |
+|---------|------------------------------|
+| RGP     | The unique region identifier |
+| cluster | The cluster id of the RGP    |
+| spot_id | The spot ID of the RGP       |
+
+
+
+The command also generates an RGP graph in the gexf format, which can be utilized to explore the RGP clusters along with their spots of insertion. In this graph, identical RGPs with the same family content and with the same spot are merged into a single node to simplify the graph representation. This feature can be disabled with the parameter `--no_identical_rgp_merging`.
+
diff --git a/docs/user/_Sidebar.md b/docs/user/_Sidebar.md
@@ -19,3 +19,4 @@
   * [Fasta](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#fasta)
   * [MSA](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#MSA)
   * [Info](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#info)
+  * [Metadata](https://github.com/labgem/PPanGGOLiN/wiki/Metadata)
diff --git a/docs/user/metadata.md b/docs/user/metadata.md
@@ -0,0 +1,44 @@
+From version 2.0.0, it is possible to add metadata linked to pangenome elements using PPanGGOLiN.
+Metadata can be associated with genes, genomes, families, RGPs, spots and modules from a simple TSV file.
+To add metadata to your pangenome, you can launch the command as follows:
+
+`ppanggolin metadata -p PANGENOME --metadata METADATA.TSV --source SOURCE --assign ASSIGN`
+
+- `--source`: the origin of the metadata; it will be used as the storage key in the pangenome.
+- `--assign`: the type of pangenome element to which you want to add metadata, chosen from the following list {families,genomes,genes,RGPs,spots,modules}
+
+# Metadata format
+
+PPanGGOLiN allows you to use a highly flexible metadata file. Only one column name is mandatory, and it is identical to the 
+assignment argument chosen by the user.
+
+For example, a TSV file assigning functional annotation metadata to gene families could be as follows:
+
+| families | Accession | Function | Description |
+|----------|----------|----------|-------------|
+| GF_1     | Acc_1    | Fn_1     | Desc_1      |
+| GF_2     | Acc_2    | Fn_2     | Desc_2      |
+| GF_2     | Acc_3    | Fn_3     | Desc_3      |
+| ...      | ...      | ...      | ...         |
+| GF_n     | Acc_n    | Fn_n     | Desc_n      |
+
+*Note: As you can see in the above table, one element (here GF_2) can be associated with more than one metadata.*
+
+## Command specific option details
+
+### `--metadata`
+PPanGGOLiN accepts one TSV file at a time to add metadata. Look at [Metadata Format](https://github.com/labgem/PPanGGOLiN/wiki/Metadata#metadata-format)
+
+### `--source` 
+The source is the key used to access metadata in the pangenome.
+So if the name of the source already exists in the pangenome, it can be overwritten only with the `--force` option.
+This system allows multiple metadata sources to be stored, read and used in PPanGGOLiN.
+
+### `--assign` 
+PPanGGOLiN allows you to add metadata to all pangenome elements: families,genomes,genes,RGPs,spots,modules.
+However, the user can only give one metadata file at a time, as only one source, and therefore only one type of pangenome element, can be provided per command.
+
+### `--omit`
+You can use this option to skip the error raised when an ID is not found in the pangenome.
+This could be useful if you are using a general TSV with elements that are not in the pangenome, but it must be used carefully.
+
diff --git a/docs/user/projection.md b/docs/user/projection.md
@@ -0,0 +1,63 @@
+# Projection command
+The ppanggolin projection command allows you to annotate external genomes using an existing pangenome. This process eliminates the need to recompute all components, streamlining the annotation process. Input genomes are expected to belong to the same species.
+
+Genes within the input genome are aligned with genes in the pangenome to determine their gene families and partitions. Genes that do not align with any existing gene in the pangenome are considered specific to the input genome and are assigned to the "Cloud" partition. Based on the alignment and partition assignment, Regions of Genomic Plasticity (RGPs) within the input genome are predicted. Each RGP that is not located on a contig border is assigned to a spot of insertion. Finally, conserved modules of the pangenome found in the input genome are reported in the output files.
+
+## Input files:
+
+This command supports two input modes depending on whether you want to project a single genome or multiple genomes at once:
+
+Multiple Files in One TSV:
+- **Options**: `--fasta` or `--anno`
+- **Description**: You can provide a tab-separated file listing organism names alongside their respective FASTA genomic sequences or annotation filepaths, with one line per organism. This mode is suitable when you want to annotate multiple genomes in a single operation. The format of this file is identical to the format used in the annotate and workflow commands; for more details, refer here.
+
+Single File:
+- **Options**: `--organism_name` with `--fasta` or `--anno` and `--circular_contigs` (optional)
+- **Description**: When annotating a single genome, you can directly provide a single FASTA genomic sequence file or an annotation file in GFF/GBFF format. Additionally, specify the name of the organism using the `--organism_name` option. You can also indicate circular contigs using the `--circular_contigs` option when necessary.
+
+
+## Output files:
+
+The output directory contains `summary_projection.tsv`, giving an overview of the projection with one line per organism.
+
+
+| Column                               | Description|
+|--------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| Organism name                        | This column contains the name or identifier of the organism being analyzed.|
+| Pangenome file                       | The path to the pangenome file (pangenome.h5) used for the analysis.|
+| Contigs                              | The number of contigs in the projected genome.|
+| Genes                                | The total number of genes identified in the input genome.|
+| Families                             | The total number of gene families to which genes in the genome of the input organism are assigned.|
+| Persistent genes                     | The number of genes in the "Persistent" partition.|
+| Persistent families                  | The number of gene families in the "Persistent" partition.|
+| Shell genes                          | The number of genes in the "Shell" partition.|
+| Shell families                       | The number of gene families in the "Shell" partition.|
+| Cloud genes                          | The number of genes in the "Cloud" partition.|
+| Cloud families                       | The number of gene families in the "Cloud" partition.|
+| Cloud specific families              | The number of gene families that are specific to the input organism. These families are unique to the input organism and do not have homologs in any other genomes within the pangenome and have been assigned to the "Cloud" partition.|
+| RGPs (Regions of Genomic Plasticity) | The number of Regions of Genomic Plasticity (RGPs) predicted within the input genome.|
+| Spots                                | The total number of spots of insertion associated with RGPs in the input genome.|
+| New spots                            | The number of new insertion spots that have been identified in the input genome. These spots represent novel genomic regions compared to other genomes in the pangenome.|
+| Modules                              | The number of modules that have been projected onto the input genome.|
+
+
+Additionally, within the Output directory, there is a subdirectory for each input genome, named after the input genome itself. Each of these subdirectories contains several files:
+
+For Gene Family and Partition of Input Genes:
+
+- `cds_sequences.fasta`: This file contains the sequences of coding regions (CDS) from the input genome.
+- `gene_to_gene_family.tsv`: It provides the mapping of genes to gene families of the pangenome. Its format follows [this output](Outputs.md#gene-families-and-genes).
+- `sequences_partition_projection.tsv`: This file maps the input genes to their partition (Persistent, Shell or Cloud).
+- `specific_genes.tsv`: This file lists the genes of the input genome that do not align to any gene of the pangenome. These genes are assigned to the Cloud partition.
+
+For RGPs and Spots:
+
+- `plastic_regions.tsv`: This file contains information about Regions of Genomic Plasticity (RGPs) within the input genome. Its format follows [this output](Outputs.md#plastic-regions).
+- `input_organism_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this output](Outputs.md#spots).
+
+Optionally, you can produce a graph of the RGPs using the `--spot_graph` option. This graph is similar to the one produced by the `ppanggolin spot` command.
+
+For Modules:
+
+- `modules_in_input_organism.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this output](Outputs.md#modules-in-organisms).
+
diff --git a/ppanggolin/RGP/__init__.py b/ppanggolin/RGP/__init__.py
@@ -1,2 +1,3 @@
 from .genomicIsland import subparser, launch
 from .spot import *
+from . import rgp_cluster