diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5a0ac918..126cfc2d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,8 +32,7 @@ jobs: shell: bash -l {0} run: | conda install -y --file requirements.txt - conda install -y pytest - pip install . + pip install .[test] # Check that it is installed and displays help without error - name: Check that PPanGGOLiN is installed shell: bash -l {0} diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000..ff7f01ab --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,35 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 +python: + install: + - requirements: docs/requirements.txt + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +# python: +# install: +# - requirements: docs/requirements.txt \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 00000000..7cf776a1 --- /dev/null +++ b/README.md @@ -0,0 +1,146 @@ +# PPanGGOLiN: Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors + +[![Actions](https://img.shields.io/github/actions/workflow/status/althonos/pyrodigal/test.yml?branch=main&logo=github&style=flat-square&maxAge=300)](https://github.com/labgem/ppanggolin/actions) 
+[![License](https://anaconda.org/bioconda/ppanggolin/badges/license.svg)](http://www.cecill.info/licences.fr.html) +[![Bioconda](https://img.shields.io/conda/vn/bioconda/ppanggolin?style=flat-square&maxAge=3600&logo=anaconda)](https://anaconda.org/bioconda/ppanggolin) +[![Source](https://img.shields.io/badge/source-GitHub-303030.svg?maxAge=2678400&style=flat-square)](https://github.com/labgem/ppanggolin/) +[![GitHub issues](https://img.shields.io/github/issues/labgem/ppanggolin.svg?style=flat-square&maxAge=600)](https://github.com/labgem/ppanggolin/issues) +[![Docs](https://img.shields.io/readthedocs/ppanggolin/latest?style=flat-square&maxAge=600)](https://ppanggolin.readthedocs.io) +[![Downloads](https://anaconda.org/bioconda/ppanggolin/badges/downloads.svg)](https://bioconda.github.io/recipes/ppanggolin/README.html#download-stats) + +**PPanGGOLiN** +([Gautreau et al. 2020](https://doi.org/10.1371/journal.pcbi.1007732)) is a software suite used to create and manipulate prokaryotic pangenomes from a set of either genomic DNA sequences or provided genome annotations. +It is designed to scale up to tens of thousands of genomes. +It has the specificity to partition the pangenome using a statistical approach rather than using fixed thresholds which gives it the ability to work with low-quality data such as *Metagenomic Assembled Genomes (MAGs)* or *Single-cell Amplified Genomes (SAGs)* thus taking advantage of large scale environmental studies and letting users study the pangenome of uncultivable species. + +**PPanGGOLiN** builds pangenomes through a graphical model and a statistical method to partition gene families in persistent, shell and cloud genomes. +It integrates both information on protein-coding genes and their genomic neighborhood to build a graph of gene families where each node is a gene family, and each edge is a relation of genetic contiguity. 
+The partitioning method promotes that two gene families that are consistent neighbors in the graph are more likely to belong to the same partition. +It results in a Partitioned Pangenome Graph (PPG) made of persistent, shell and cloud nodes drawing genomes on rails like a subway map to help biologists navigate the great diversity of microbial life. + + +Moreover, the panRGP method ([Bazin et al. 2020](https://doi.org/10.1093/bioinformatics/btaa792)) included in **PPanGGOLiN** predicts, for each genome, Regions of Genome Plasticity (RGPs) that are clusters of genes made of shell and cloud genomes in the pangenome graph. +Most of them arise from Horizontal gene transfer (HGT) and correspond to Genomic Islands (GIs). +RGPs from different genomes are next grouped in spots of insertion based on their conserved flanking persistent genes. + + +Those RGPs can be further divided in conserved modules by panModule ([Bazin et al. 2021](https://doi.org/10.1101/2021.12.06.471380)). Those conserved modules correspond to groups of cooccurring and colocalized genes that are gained or lost together in the variable regions of the pangenome. + +```{image} _static/logo.png +:alt: ppanggolin logo +:align: center +:height: 300 +:width: 300 +``` + +# Installation + +**PPanGGOLiN** is easily installed via conda. +You will need the following conda channels if you don't have them already: + +```bash +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +``` + +Then, you can just run : + +```bash +conda install -c bioconda ppanggolin +``` + +# Quick usage + +**PPanGGOLiN** integrates some workflows to build and analyse easily and rapidly a pangenome. +These commands can be tuned with some parameters but are mostly automatic. +All workflow parameters are described [here](https://ppanggolin.readthedocs.io/en/updateenv/user/Basic-usage-and-practical-information.html#the-workflow-subcommand). 
+ +## Pangenome graph construction and partition + +To build and partition a pangenome, you can use the following command: +```bash +ppanggolin workflow --fasta ORGANISMS_FASTA_LIST +``` + +It uses parameters that we found to be generally the best when working with species pangenomes. + +The file ORGANISMS_FASTA_LIST is a tsv-separated file with the following organization : +1. The first column contains a unique organism name **(without space)** +2. The second column the path to the associated FASTA file +3. Circular contig identifiers are indicated in the following columns +4. Each line represents an organism + +An [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.fasta.list) with 50 *Chlamydia trachomatis* genomes can be found in the testingDataset/ directory. + + +You can also give **PPanGGOLiN** your own annotations using *.gff* or *.gbff/.gbk* files instead of *.fasta* files, +such as the ones provided by prokka using the following command : + +```bash +ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST +``` + +Another [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list) of such a file can be found in the testingDataset/ directory. + +Both of those commands write several output files and graphics (more information [here](https://ppanggolin.readthedocs.io/en/updateenv/user/Outputs.html#ppanggolin-outputs)). Most notably, an HDF-5 (pangenome.h5) file is written. +It can be used as input for any of the subcommands to rerun parts of the analysis with different parameters, +write and draw different representations of the pangenome or run additional analysis with **PPanGGOLiN**. + +A minimum of 5 genomes is generally required to perform a pangenomics analysis using the traditional *core genome*/*accessory genome* paradigm. +It is advised to use at least 15 genomes having genomic variations (and not only SNPs) to obtain robust results with the **PPanGGOLiN** statistical approach. 
+ +If you want to use personalized parameters for each subcommand, most options should be self-descriptive. +If you want to know more about what each output file is, or briefly how each subcommand works, +you can check the [step-by-step documentation](https://github.com/labgem/PPanGGOLiN/wiki) + + +## Region of plasticity detection + +Furthermore, you can also predict genomic islands and cluster them into spots of insertion using the **panRGP** pipeline. +The usage is identical to the previous 'workflow' command: + +```bash +ppanggolin panrgp --fasta ORGANISMS_FASTA_LIST +``` + +It will run more analyses after the pangenome has been partitioned. Further details are available [here](https://ppanggolin.readthedocs.io/en/updateenv/user/Basic-usage-and-practical-information.html#the-panrgp-subcommand) and in the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792) + +## Conserved module prediction +To detect the conserved modules in your pangenome, you can use the panModule workflow, as such: + +```bash +ppanggolin panmodule --fasta ORGANISMS_FASTA_LIST +``` + +Further details can be found [here](https://ppanggolin.readthedocs.io/en/updateenv/user/Basic-usage-and-practical-information.html#the-panmodule-subcommand) and in the [panModule publication](https://doi.org/10.1101/2021.12.06.471380) + + +Alternatively, to run all the possible analyses that **PPanGGOLiN** can run, you can use: + +```bash +ppanggolin all --fasta ORGANISMS_FASTA_LIST +``` + +Overall, ppanggolin has a lot of subcommands and possibilities. +Don't hesitate to check the command line help, and the [GitHub wiki](https://github.com/labgem/PPanGGOLiN/wiki) to see all the possible analyses, if you are missing a file you're looking for, or do not understand an output. +You can also raise an `issue` if you wish! + +# Issues, Questions, Remarks +If you have any question or issue with installing, +using or understanding **PPanGGOLiN**, please do not hesitate to post an issue! 
+We cannot correct bugs if we do not know about them, and will try to help you the best we can. + +# Citation +If you use this tool for your research, please cite: + +Gautreau G et al. (2020) **PPanGGOLiN**: Depicting microbial diversity via a partitioned pangenome graph. +PLOS Computational Biology 16(3): e1007732. + +If you use this tool to study genomic islands, please cite: + +Bazin et al., panRGP: a pangenome-based method to predict genomic islands and explore their diversity, Bioinformatics, Volume 36, Issue Supplement_2, December 2020, Pages i651–i658, + +If you use this tool to study modules, please cite: + +Bazin et al., panModule: detecting conserved modules in the variable regions of a pangenome graph. biorxiv. diff --git a/README.rst b/README.rst deleted file mode 100755 index 1afdf1b2..00000000 --- a/README.rst +++ /dev/null @@ -1,126 +0,0 @@ -PPanGGOLiN : Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors -======================================================================================================== - -PPanGGOLiN (Gautreau et al. 2020) is a software suite used to create and manipulate prokaryotic pangenomes from a set of either genomic DNA sequences or provided genome annotations. It is designed to scale up to tens of thousands of genomes. It has the specificity to partition the pangenome using a statistical approach rather than using fixed thresholds which gives it the ability to work with low-quality data such as Metagenomic Assembled Genomes (MAGs) or Single-cell Amplified Genomes (SAGs) thus taking advantage of large scale environmental studies and letting users study the pangenome of uncultivable species. - -PPanGGOLiN builds pangenomes through a graphical model and a statistical method to partition gene families in persistent, shell and cloud genomes. 
It integrates both information on protein-coding genes and their genomic neighborhood to build a graph of gene families where each node is a gene family and each edge is a relation of genetic contiguity. The partitioning method promotes that two gene families that are consistent neighbors in the graph are more likely to belong to the same partition. It results in a Partitioned Pangenome Graph (PPG) made of persistent, shell and cloud nodes drawing genomes on rails like a subway map to help biologists navigate the great diversity of microbial life. - -Moreover, the panRGP method (Bazin et al. 2020) included in PPanGGOLiN predicts, for each genome, Regions of Genome Plasticity (RGPs) that are clusters of genes made of shell and cloud genomes in the pangenome graph. -Most of them arise from Horizontal gene transfer (HGT) and correspond to Genomic Islands (GIs). -RGPs from different genomes are next grouped in spots of insertion based on their conserved flanking persistent genes. - -Those RGPs can be further divided in conserved modules by panModule (Bazin et al. 2021). Those conserved modules correspond to groups of cooccurring and colocalized genes that are gained or lost together in the variable regions of the pangenome. - -|installs| |bioconda| |plat| |version| - -.. |installs| image:: https://img.shields.io/conda/dn/bioconda/ppanggolin.svg?style=flag&label=BioConda%20install - :target: https://anaconda.org/bioconda/ppanggolin -.. |plat| image:: https://anaconda.org/bioconda/ppanggolin/badges/platforms.svg - :target: https://anaconda.org/bioconda/ppanggolin -.. |version| image:: https://anaconda.org/bioconda/ppanggolin/badges/version.svg - :target: https://anaconda.org/bioconda/ppanggolin -.. |bioconda| image:: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat - :target: http://bioconda.github.io/recipes/ppanggolin/README.html - - -.. 
image:: images/logo.png - :align: center - - -Installation -============ - - -PPanGGOLiN is easily installed via conda -You will need the following conda channels if you don't have them already: - -.. code:: bash - - conda config --add channels defaults - conda config --add channels bioconda - conda config --add channels conda-forge - -Then, you can just run : - -.. code:: bash - - conda install -c bioconda ppanggolin - - -Quick usage -=========== - -PPanGGOLiN has a minimal command for the non-expert users : - -.. code:: bash - - ppanggolin workflow --fasta ORGANISMS_FASTA_LIST - -It uses parameters that we found to be generally the best when working with species pangenomes. - -The file ORGANISMS_FASTA_LIST is a tsv-separated file with the following organisation : - 1. The first column contains a unique organism name **(without whitespace)** - 2. The second column the path to the associated FASTA file - 3. Circular contig identifiers are indicated in the following columns - 4. Each line represents an organism - -An `example `_ with 50 *Chlamydia trachomatis* genomes can be found in the testingDataset/ directory. - -You can also give PPanGGOLiN your own annotations using .gff or .gbff/.gbk files instead of .fasta files, such as the ones provided by prokka using the following command : - -.. code:: bash - - ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST - -Another `example `_ of such a file can be found in the testingDataset/ directory. - -Both of those commands write several output files and graphics. Most notably a HDF-5 (pangenome.h5) file is written. It can be used as input for any of the subcommands to rerun parts of the analysis with different parameters, write and draw different representations of the pangenome or run additional analysis with PPanGGOLiN. - -A minimum of 5 genomes is generally required to perform a pangenomics analysis using the traditional *core genome*/*accessory genome* paradigm. 
It is advised to use at least 15 genomes having genomic variations (and not only SNPs) to obtain robust results with the PPanGGOLiN statistical approach. - -If you want to use personalized parameters for each subcommand most options should be self descriptive. If you want to know more about what each output file is, or briefly how each subcommand works you can check the `github wiki `_ - -Furthermore, you can also predict genomic islands and cluster them into spots of insertion using the panRGP pipeline. The usage is identical to the previous 'workflow' command: - -.. code:: bash - - ppanggolin panrgp --fasta ORGANISMS_FASTA_LIST - -It will run more analyses after the pangenome has been partitioned. Further details are available `here `_ and in the panRPG publication (see below) - -To detect the conserved modules in your pangenome, you can use the panModule workflow, as such: - -.. code:: bash - - ppanggolin panmodule --fasta ORGANISMS_FASTA_LIST - -Further details can be found `in the wiki `_ and in the panModule publication (see below) - -Alternatively, to run all the possible analysis that PPanGGOLiN can run, you can use: - -.. code:: bash - - ppanggolin all --fasta ORGANISMS_FASTA_LIST - -Overall, ppanggolin has a lot of subcommands and possibilities. Don't hesitate to check the command line help, and the `github wiki `_ to see all the possible analysis, if you are missing a file you're looking for, or do not understand an output. You can also raise an `issue` if you wish! - -Issues, Questions, Remarks -========================== - -If you have any question or issue with installing, using or understanding PPanGGOLiN, please do not hesitate to post an issue ! We cannot correct bugs if we do not know about them, and will try to help you the best we can. - - -Citation -======== -If you use this tool for your research please cite: - -Gautreau G et al. (2020) PPanGGOLiN: Depicting microbial diversity via a partitioned pangenome graph. 
PLOS Computational Biology 16(3): e1007732. https://doi.org/10.1371/journal.pcbi.1007732 - -If you use this tool to study genomic islands, please cite: - -Bazin et al., panRGP: a pangenome-based method to predict genomic islands and explore their diversity, Bioinformatics, Volume 36, Issue Supplement_2, December 2020, Pages i651–i658, https://doi.org/10.1093/bioinformatics/btaa792 - -If you use this tool to study modules, please cite: - -Bazin et al., panModule: detecting conserved modules in the variable regions of a pangenome graph. biorxiv. https://doi.org/10.1101/2021.12.06.471380 - diff --git a/docs/dev/Makefile b/docs/Makefile similarity index 100% rename from docs/dev/Makefile rename to docs/Makefile diff --git a/docs/_static/drawspot_example.png b/docs/_static/drawspot_example.png new file mode 100644 index 00000000..73e1c90d Binary files /dev/null and b/docs/_static/drawspot_example.png differ diff --git a/docs/_static/evolution.png b/docs/_static/evolution.png new file mode 100644 index 00000000..34ad1ff1 Binary files /dev/null and b/docs/_static/evolution.png differ diff --git a/docs/_static/gephi.gif b/docs/_static/gephi.gif new file mode 100644 index 00000000..4618cc7b Binary files /dev/null and b/docs/_static/gephi.gif differ diff --git a/docs/_static/logo.png b/docs/_static/logo.png new file mode 100644 index 00000000..b8ecacc5 Binary files /dev/null and b/docs/_static/logo.png differ diff --git a/docs/_static/projection.png b/docs/_static/projection.png new file mode 100644 index 00000000..1221081e Binary files /dev/null and b/docs/_static/projection.png differ diff --git a/docs/_static/resampling.png b/docs/_static/resampling.png new file mode 100644 index 00000000..d8887701 Binary files /dev/null and b/docs/_static/resampling.png differ diff --git a/docs/_static/runtimes.png b/docs/_static/runtimes.png new file mode 100644 index 00000000..c05e1a65 Binary files /dev/null and b/docs/_static/runtimes.png differ diff --git a/docs/_static/tile_plot.png 
b/docs/_static/tile_plot.png new file mode 100644 index 00000000..01a88cfb Binary files /dev/null and b/docs/_static/tile_plot.png differ diff --git a/docs/_static/u_plot.png b/docs/_static/u_plot.png new file mode 100644 index 00000000..3ee5110d Binary files /dev/null and b/docs/_static/u_plot.png differ diff --git a/docs/_static/workflow.png b/docs/_static/workflow.png new file mode 100644 index 00000000..fb7fe3b9 Binary files /dev/null and b/docs/_static/workflow.png differ diff --git a/docs/api/ppanggolin.RGP.md b/docs/api/ppanggolin.RGP.md new file mode 100644 index 00000000..dc7f2f0e --- /dev/null +++ b/docs/api/ppanggolin.RGP.md @@ -0,0 +1,39 @@ +# ppanggolin.RGP package + +## Submodules + +## ppanggolin.RGP.genomicIsland module + +```{eval-rst} +.. automodule:: ppanggolin.RGP.genomicIsland + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.RGP.rgp_cluster module + +```{eval-rst} +.. automodule:: ppanggolin.RGP.rgp_cluster + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.RGP.spot module + +```{eval-rst} +.. automodule:: ppanggolin.RGP.spot + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.RGP + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.align.md b/docs/api/ppanggolin.align.md new file mode 100644 index 00000000..83505bbe --- /dev/null +++ b/docs/api/ppanggolin.align.md @@ -0,0 +1,21 @@ +# ppanggolin.align package + +## Submodules + +## ppanggolin.align.alignOnPang module + +```{eval-rst} +.. automodule:: ppanggolin.align.alignOnPang + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. 
automodule:: ppanggolin.align + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.annotate.md b/docs/api/ppanggolin.annotate.md new file mode 100644 index 00000000..e6c0e295 --- /dev/null +++ b/docs/api/ppanggolin.annotate.md @@ -0,0 +1,30 @@ +# ppanggolin.annotate package + +## Submodules + +## ppanggolin.annotate.annotate module + +```{eval-rst} +.. automodule:: ppanggolin.annotate.annotate + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.annotate.synta module + +```{eval-rst} +.. automodule:: ppanggolin.annotate.synta + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.annotate + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.cluster.md b/docs/api/ppanggolin.cluster.md new file mode 100644 index 00000000..fa24fb9a --- /dev/null +++ b/docs/api/ppanggolin.cluster.md @@ -0,0 +1,21 @@ +# ppanggolin.cluster package + +## Submodules + +## ppanggolin.cluster.cluster module + +```{eval-rst} +.. automodule:: ppanggolin.cluster.cluster + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.cluster + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.context.md b/docs/api/ppanggolin.context.md new file mode 100644 index 00000000..2f7ac70a --- /dev/null +++ b/docs/api/ppanggolin.context.md @@ -0,0 +1,21 @@ +# ppanggolin.context package + +## Submodules + +## ppanggolin.context.searchGeneContext module + +```{eval-rst} +.. automodule:: ppanggolin.context.searchGeneContext + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. 
automodule:: ppanggolin.context + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.figures.md b/docs/api/ppanggolin.figures.md new file mode 100644 index 00000000..ecf2eb6b --- /dev/null +++ b/docs/api/ppanggolin.figures.md @@ -0,0 +1,48 @@ +# ppanggolin.figures package + +## Submodules + +## ppanggolin.figures.draw_spot module + +```{eval-rst} +.. automodule:: ppanggolin.figures.draw_spot + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.figures.drawing module + +```{eval-rst} +.. automodule:: ppanggolin.figures.drawing + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.figures.tile_plot module + +```{eval-rst} +.. automodule:: ppanggolin.figures.tile_plot + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.figures.ucurve module + +```{eval-rst} +.. automodule:: ppanggolin.figures.ucurve + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.figures + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.formats.md b/docs/api/ppanggolin.formats.md new file mode 100644 index 00000000..c86f1c52 --- /dev/null +++ b/docs/api/ppanggolin.formats.md @@ -0,0 +1,66 @@ +# ppanggolin.formats package + +## Submodules + +## ppanggolin.formats.readBinaries module + +```{eval-rst} +.. automodule:: ppanggolin.formats.readBinaries + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeBinaries module + +```{eval-rst} +.. automodule:: ppanggolin.formats.writeBinaries + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeFlat module + +```{eval-rst} +.. automodule:: ppanggolin.formats.writeFlat + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeMSA module + +```{eval-rst} +.. 
automodule:: ppanggolin.formats.writeMSA + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeMetadata module + +```{eval-rst} +.. automodule:: ppanggolin.formats.writeMetadata + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeSequences module + +```{eval-rst} +.. automodule:: ppanggolin.formats.writeSequences + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.formats + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.graph.md b/docs/api/ppanggolin.graph.md new file mode 100644 index 00000000..42abbdbf --- /dev/null +++ b/docs/api/ppanggolin.graph.md @@ -0,0 +1,21 @@ +# ppanggolin.graph package + +## Submodules + +## ppanggolin.graph.makeGraph module + +```{eval-rst} +.. automodule:: ppanggolin.graph.makeGraph + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.graph + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.info.md b/docs/api/ppanggolin.info.md new file mode 100644 index 00000000..4c4d3f62 --- /dev/null +++ b/docs/api/ppanggolin.info.md @@ -0,0 +1,21 @@ +# ppanggolin.info package + +## Submodules + +## ppanggolin.info.info module + +```{eval-rst} +.. automodule:: ppanggolin.info.info + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. 
automodule:: ppanggolin.info + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.md b/docs/api/ppanggolin.md new file mode 100644 index 00000000..514f4319 --- /dev/null +++ b/docs/api/ppanggolin.md @@ -0,0 +1,116 @@ +(ppanggolin-api)= +# PPanGGOLiN Package + +## Subpackages + +```{toctree} +:maxdepth: 4 + +ppanggolin.RGP +ppanggolin.align +ppanggolin.annotate +ppanggolin.cluster +ppanggolin.context +ppanggolin.figures +ppanggolin.formats +ppanggolin.graph +ppanggolin.info +ppanggolin.meta +ppanggolin.metrics +ppanggolin.mod +ppanggolin.nem +ppanggolin.utility +ppanggolin.workflow +``` + +## Submodules + +## ppanggolin.edge module + +```{eval-rst} +.. automodule:: ppanggolin.edge + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.geneFamily module + +```{eval-rst} +.. automodule:: ppanggolin.geneFamily + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.genetic_codes module + +```{eval-rst} +.. automodule:: ppanggolin.genetic_codes + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.genome module + +```{eval-rst} +.. automodule:: ppanggolin.genome + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.main module + +```{eval-rst} +.. automodule:: ppanggolin.main + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.metadata module + +```{eval-rst} +.. automodule:: ppanggolin.metadata + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.pangenome module + +```{eval-rst} +.. automodule:: ppanggolin.pangenome + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.region module + +```{eval-rst} +.. automodule:: ppanggolin.region + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.utils module + +```{eval-rst} +.. automodule:: ppanggolin.utils + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. 
automodule:: ppanggolin + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.meta.md b/docs/api/ppanggolin.meta.md new file mode 100644 index 00000000..d0144b59 --- /dev/null +++ b/docs/api/ppanggolin.meta.md @@ -0,0 +1,21 @@ +# ppanggolin.meta package + +## Submodules + +## ppanggolin.meta.meta module + +```{eval-rst} +.. automodule:: ppanggolin.meta.meta + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.meta + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.metrics.md b/docs/api/ppanggolin.metrics.md new file mode 100644 index 00000000..028aac60 --- /dev/null +++ b/docs/api/ppanggolin.metrics.md @@ -0,0 +1,30 @@ +# ppanggolin.metrics package + +## Submodules + +## ppanggolin.metrics.fluidity module + +```{eval-rst} +.. automodule:: ppanggolin.metrics.fluidity + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.metrics.metrics module + +```{eval-rst} +.. automodule:: ppanggolin.metrics.metrics + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.metrics + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.mod.md b/docs/api/ppanggolin.mod.md new file mode 100644 index 00000000..fdcfaf74 --- /dev/null +++ b/docs/api/ppanggolin.mod.md @@ -0,0 +1,21 @@ +# ppanggolin.mod package + +## Submodules + +## ppanggolin.mod.module module + +```{eval-rst} +.. automodule:: ppanggolin.mod.module + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. 
automodule:: ppanggolin.mod + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.nem.md b/docs/api/ppanggolin.nem.md new file mode 100644 index 00000000..57092abc --- /dev/null +++ b/docs/api/ppanggolin.nem.md @@ -0,0 +1,30 @@ +# ppanggolin.nem package + +## Submodules + +## ppanggolin.nem.partition module + +```{eval-rst} +.. automodule:: ppanggolin.nem.partition + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.nem.rarefaction module + +```{eval-rst} +.. automodule:: ppanggolin.nem.rarefaction + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.nem + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.utility.md b/docs/api/ppanggolin.utility.md new file mode 100644 index 00000000..5230a9c5 --- /dev/null +++ b/docs/api/ppanggolin.utility.md @@ -0,0 +1,21 @@ +# ppanggolin.utility package + +## Submodules + +## ppanggolin.utility.utils module + +```{eval-rst} +.. automodule:: ppanggolin.utility.utils + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.utility + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/api/ppanggolin.workflow.md b/docs/api/ppanggolin.workflow.md new file mode 100644 index 00000000..c30b18e9 --- /dev/null +++ b/docs/api/ppanggolin.workflow.md @@ -0,0 +1,48 @@ +# ppanggolin.workflow package + +## Submodules + +## ppanggolin.workflow.all module + +```{eval-rst} +.. automodule:: ppanggolin.workflow.all + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.workflow.panModule module + +```{eval-rst} +.. automodule:: ppanggolin.workflow.panModule + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.workflow.panRGP module + +```{eval-rst} +.. 
automodule:: ppanggolin.workflow.panRGP + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.workflow.workflow module + +```{eval-rst} +.. automodule:: ppanggolin.workflow.workflow + :members: + :undoc-members: + :show-inheritance: +``` + +## Module contents + +```{eval-rst} +.. automodule:: ppanggolin.workflow + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/dev/source/conf.py b/docs/conf.py similarity index 79% rename from docs/dev/source/conf.py rename to docs/conf.py index 8dfdce2c..c56d026b 100644 --- a/docs/dev/source/conf.py +++ b/docs/conf.py @@ -10,19 +10,16 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -import os -import sys -sys.path.insert(0, os.path.abspath('../../ppanggolin/')) - +from pathlib import Path # -- Project information ----------------------------------------------------- project = 'PPanGGOLiN' -copyright = '2020, Adelme Bazin, Guillaume Gautreau' -author = 'Adelme Bazin, Guillaume Gautreau' +copyright = '2023, LABGeM' +author = 'Jérôme Arnoux' # The full version, including alpha/beta/rc tags -release = '1.1.67' +release = open(Path(__file__).resolve().parents[1]/"VERSION").read().rstrip() # Get release number in the VERSION file # -- General configuration --------------------------------------------------- @@ -30,7 +27,14 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = [ + "myst_parser", + # "sphinxcontrib.jquery", + "sphinx.ext.duration", + "sphinx.ext.autosectionlabel", + "sphinx.ext.autodoc", + 'sphinx_search.extension', +] # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] @@ -40,7 +44,7 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = [] - +suppress_warnings = ["myst.header", "autosectionlabel.*"] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for diff --git a/docs/dev/README.txt b/docs/dev/README.txt deleted file mode 100644 index 76c1857f..00000000 --- a/docs/dev/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -To build the doc, install the requirements listed in requirements.txt, and then run: - -make html - diff --git a/docs/dev/buildDoc.md b/docs/dev/buildDoc.md new file mode 100644 index 00000000..b6d23c53 --- /dev/null +++ b/docs/dev/buildDoc.md @@ -0,0 +1,256 @@ +# Build the documentation +This part describes the guidelines to build the documentation of PPanGGOLiN. + +```{danger} +When you merge your branch into master or open a pull request, a bot from Read the Docs will see it and update the doc online. +Be sure that your doc is clean and without error. +``` + +## Install required packages + +Required packages are listed below : +```text +sphinx==6.2.1 +sphinx_rtd_theme==1.2.2 +readthedocs-sphinx-search==0.3.1 +sphinx-autobuild==2021.3.14 +myst-parser==1.0.0 +``` +To build the doc you need to use an environment with panorama installed. +To make things easier, the [pyproject.toml file](../../pyproject.toml) contains the same list of requirements +and can install everything automatically with pip. +```shell +# PANORAMA=/path/to/panorama/ +pip install $PANORAMA[doc] # You can add -e to install in editable mode +``` +## Build documentation with sphinx + +You can preview your modifications live by using **sphinx-autobuild** (installed previously). 
+ +```shell +cd $PANORAMA/.docs +sphinx-autobuild source/ build/ +#copy server address, for me (as example) http://127.0.0.1:8000 +#paste the address in your browser +``` + +```{note} +The package [readthedocs-sphinx-search](https://readthedocs-sphinx-search.readthedocs.io/en/latest/) "enable search as +you type for docs hosted on Read the Docs". It only works on the Read the Docs website; locally you will see `[INFO] Docs are not being served on Read the Docs, readthedocs-sphinx-search will not work.`, so don't try to make it work. +``` + +### Modify existing documentation +In this part we will speak about how to change the already existing documentation files. +To add files for command, package, ... See [Adding section](#heading-adding) + +To modify the existing user or developer documentation, you simply need to go to the file where you want to make a change and modify it. + +The API documentation is automatically updated when you modify the docstring in the code. +It also works when you add a function, method, class, etc., in an already existing package, +but not if you add a new package (new file in the ppanggolin), for this look at [Update API documentation](#add-api-doc). + +(heading-adding)= +### Adding to existing documentation +#### Adding user documentation file +User documentation should contain files relative to new command, example and information about PPanGGOLiN. +To ensure efficiency, the file name should correspond to the main topic. +A file should not be long, prefer to split in multiple files. + +When the file is created, you can add it to the index in the *toctree UserGuide* by adding a line `user/filename` +without the file extension (.md) in the **index file**. + +#### New guidelines for development +All new guidelines that seem interesting are welcome. + +If you think that the guidelines could not be added to an existing file, you can create a new one. 
+Use an explicit name for your file and add it to the *toctree DevelopperGuide* + +(add-api-doc)= +#### Update API documentation +The API documentation is build automatically. +To update the API documentation and keep the automatic update when a new package, module, submodules is added follow the +next lines: +```shell +sphinx-apidoc -o api $PANORAMA/panorama -f +``` +```{attention} +*sphinx-apidoc* will generate ReStructeredText files. You need to convert them in markdown. For this follow the guides +[here](#rst2md) +``` + +### Creating a new documentation from scratch +#### Quickstart with sphinx +```{warning} +This must be discuss with repository administrators. +``` +To create the documentation from scratch, rename the existing documentation (or use another name for the new one) +and follow the next steps. + +```shell +DOCS=path/to/PPanGGOLiN/docs +sphinx-quickstart $DOCS +#Welcome to the Sphinx 6.2.1 quickstart utility. +# +#Please enter values for the following settings (just press Enter to +#accept a default value, if one is given in brackets). +# +#Selected root path: docs_scratch +# +#You have two options for placing the build directory for Sphinx output. +#Either, you use a directory "_build" within the root path, or you separate +#"source" and "build" directories within the root path. +#> Separate source and build directories (y/n) [n]: y +# +#The project name will occur in several places in the built documentation. +#> Project name: PPanGGOLiN +#> Author name(s): Jérôme Arnoux +#> Project release []: 1.2.174 +# +#If the documents are to be written in a language other than English, +#you can select a language here by its language code. Sphinx will then +#translate text that it generates into that language. +# +#For a list of supported codes, see +#https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-language. +#> Project language [en]: +# +#Creating file /home/jarnoux/Projects/PANORAMA/docs_scratch/source/conf.py. 
+#Creating file /home/jarnoux/Projects/PANORAMA/docs_scratch/source/index.rst. +#Creating file /home/jarnoux/Projects/PANORAMA/docs_scratch/Makefile. +#Creating file /home/jarnoux/Projects/PANORAMA/docs_scratch/make.bat. +# +#Finished: An initial directory structure has been created. +# +#You should now populate your master file /home/jarnoux/Projects/PANORAMA/docs_scratch/source/index.rst and create other documentation +#source files. Use the Makefile to build the docs, like so: +# make builder +#where "builder" is one of the supported builders, e.g. html, latex or linkcheck. +``` + +Now you have a documentation folder ready to use. +#### Configuration file +In the *source* directory you should find a `conf.py` file. Replace the code inside by the following. +```python +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +from pathlib import Path + +# -- Project information ----------------------------------------------------- + +project = 'PPanGGOLiN' +copyright = 'LABGeM' +author = 'Jérôme Arnoux' + +# The full version, including alpha/beta/rc tags +release = open(Path(__file__).resolve().parents[2]/"VERSION").read().rstrip() # Get release number in the VERSION file + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + "myst_parser", + # "sphinxcontrib.jquery", + "sphinx.ext.duration", + "sphinx.ext.autosectionlabel", + "sphinx.ext.autodoc", + 'sphinx_search.extension', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +``` +(rst2md)= +#### ReStructeredText to markdown +reStructuredText (rst) is the default plaintext markup language used by both Docutils and Sphinx. +More complete but a little bit older than Markdown, which is easier to use too. +We are going to change rst for Markdown (md). +To translate rst and keep all the features, we will use [MyST](https://mystmd.org/guide). + +For this case we will need to install a new package `rst-to-myst`. +```{note} We advice to use another environment, because as far as we know today, this package is not compatible with our sphinx version +``` + +```shell +pip install rst-to-myst[sphinx] +# Go to your environment with rst2myst +rst2myst convert source/index.rst +# Go back to your environment with panorama +rm source/index.rst +``` +#### README in index.md +It's possible to add the **README** file in the index to don't have to rewrite it in the doc. 
+Simply add the following line in `index.md` +```markdown + ```{include} ../../README.md + :relative-images: % To + ``` +% Without tabulation +``` + +#### User documentation +The user documentation is completely handwritten. Moreover, we advise respecting the following guidelines: + +1. One file per topic/command with an explicit text on the feature +2. One file for the installation guidelines +3. One file on how to report an issue or request an enhancement +4. Don't refer to any function in the panorama code. This is reserved for developer documentation + +#### Developer documentation +The developer documentation is handwritten too. We advise respecting the following guidelines: +1. Speak about the PEP rules +2. Give guidelines on how to use git and GitHub for version control +3. Explain how to write unit tests and modify GitHub workflows +4. Write how to enhance the documentation +5. Select some functions, classes or commands that are central in the code and provide a more complete description of them. + + +#### API documentation +To build the API documentation and use the docstring in code, you can use the command `sphinx-apidoc` as follows: +```shell +sphinx-apidoc -o api $PANORAMA/panorama +# Go to your environment with rst2myst +rst2myst convert api/*.rst +# Go back to your environment with sphinx +rm api/*.rst +``` +You now have documentation for the PPanGGOLiN API. To refer to the API in your doc you can paste **\{ref\}\`package panorama\`** + +```{tip} +With the "sphinx.ext.autosectionlabel", you will certainly get multiple warnings for duplicate labels. +To remove them you have to remove or modify the label in one of the cited files. +``` +```{tip} +When you use "sphinx-apidoc" a modules.md file is created but it is not used. We advise removing it to prevent warnings. 
+``` \ No newline at end of file diff --git a/docs/dev/devRules.md b/docs/dev/devRules.md new file mode 100644 index 00000000..20de4f53 --- /dev/null +++ b/docs/dev/devRules.md @@ -0,0 +1,4 @@ +# Development rules +```{warning} +This part of the documentation is in progress. +``` \ No newline at end of file diff --git a/docs/dev/git.md b/docs/dev/git.md new file mode 100644 index 00000000..5d4c5d77 --- /dev/null +++ b/docs/dev/git.md @@ -0,0 +1,6 @@ +# How to contribute to the project +```{warning} +This part of the documentation is in progress. +``` +## How to use git to version the code +## How to use our GitHub repository diff --git a/docs/dev/requirements.txt b/docs/dev/requirements.txt deleted file mode 100644 index fedd390e..00000000 --- a/docs/dev/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx -sphinx_rtd_theme - diff --git a/docs/dev/source/classes/edge.rst b/docs/dev/source/classes/edge.rst deleted file mode 100644 index bc984e55..00000000 --- a/docs/dev/source/classes/edge.rst +++ /dev/null @@ -1,7 +0,0 @@ -The ``Edge`` class -================== - -.. autoclass:: ppanggolin.pangenome.Edge - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/classes/geneFamily.rst b/docs/dev/source/classes/geneFamily.rst deleted file mode 100644 index f7c2f21a..00000000 --- a/docs/dev/source/classes/geneFamily.rst +++ /dev/null @@ -1,7 +0,0 @@ -The ``GeneFamily`` class -======================== - -.. autoclass:: ppanggolin.geneFamily.GeneFamily - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/classes/pangenome.rst b/docs/dev/source/classes/pangenome.rst deleted file mode 100644 index 4f2ebd31..00000000 --- a/docs/dev/source/classes/pangenome.rst +++ /dev/null @@ -1,7 +0,0 @@ -The ``Pangenome`` class -======================= - -.. 
autoclass:: ppanggolin.pangenome.Pangenome - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/index.rst b/docs/dev/source/index.rst deleted file mode 100644 index 14b36498..00000000 --- a/docs/dev/source/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. PPanGGOLiN documentation master file, created by - sphinx-quickstart on Thu Mar 12 10:23:28 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to PPanGGOLiN's documentation! -====================================== - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - ppanggolin - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/dev/source/ppanggolin.RGP.rst b/docs/dev/source/ppanggolin.RGP.rst deleted file mode 100644 index 3ffb12d1..00000000 --- a/docs/dev/source/ppanggolin.RGP.rst +++ /dev/null @@ -1,23 +0,0 @@ -The `RGP` package -====================== - -This package computes Regions of Genome Plasticity, and cluster them into spots of insertion. - -Submodules ----------- - -ppanggolin.RGP.genomicIsland module ------------------------------------ - -.. automodule:: ppanggolin.RGP.genomicIsland - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.RGP.spot module --------------------------- - -.. automodule:: ppanggolin.RGP.spot - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.align.rst b/docs/dev/source/ppanggolin.align.rst deleted file mode 100644 index e40dfe1a..00000000 --- a/docs/dev/source/ppanggolin.align.rst +++ /dev/null @@ -1,26 +0,0 @@ -The `align` package -=================== - -This package uses a pangenome as a reference to compute elements for a given genome, or a given set of proteins. As such, analysis that are usually run on multiple genomes can be run on the single genome or set of proteins that is provided. 
This subpackage depends on many of the other subpackages to run its analysis. -This package depends on the following packages: - -- `formats`, to check the pangenome status. -- `annotate`, to read the given input files that can be gff or gbff. -- `cluster`, to write gene sequences from annotations. -- `RGP`, to eventually compute RGP and spot predictions. - -It depends on the following modules: - -- `pangenome` -- `utils` - -Submodules ----------- - -ppanggolin.align.alignOnPang module ------------------------------------ - -.. automodule:: ppanggolin.align.alignOnPang - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.annotate.rst b/docs/dev/source/ppanggolin.annotate.rst deleted file mode 100644 index 92fd67e8..00000000 --- a/docs/dev/source/ppanggolin.annotate.rst +++ /dev/null @@ -1,33 +0,0 @@ -The `annotate` package -=========================== - -This package is made to either annotate genomes or read annotations from gbff or gff files. - -It depends on the following subpackage: - -- `formats`, to write the pangenome to the HDF-5 file. - -It depends on the following modules: - -- `pangenome` -- `genome` -- `utils` - -Submodules ----------- - -ppanggolin.annotate.annotate module ------------------------------------ - -.. automodule:: ppanggolin.annotate.annotate - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.annotate.synta module --------------------------------- - -.. automodule:: ppanggolin.annotate.synta - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.cluster.rst b/docs/dev/source/ppanggolin.cluster.rst deleted file mode 100644 index 82193ea2..00000000 --- a/docs/dev/source/ppanggolin.cluster.rst +++ /dev/null @@ -1,15 +0,0 @@ -The `cluster` package -===================== - -This package is there to built gene families, or to read gene families from used input. 
It will mainly use MMseqs2 for the computation. - -Submodules ----------- - -ppanggolin.cluster.cluster module ---------------------------------- - -.. automodule:: ppanggolin.cluster.cluster - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.figures.rst b/docs/dev/source/ppanggolin.figures.rst deleted file mode 100644 index 9b163b9d..00000000 --- a/docs/dev/source/ppanggolin.figures.rst +++ /dev/null @@ -1,31 +0,0 @@ -The `figures` package -========================== - -This package is there to draw informative figures about the pangenome after it has been computed. - -Submodules ----------- - -ppanggolin.figures.drawing module ---------------------------------- - -.. automodule:: ppanggolin.figures.drawing - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.figures.tile\_plot module ------------------------------------- - -.. automodule:: ppanggolin.figures.tile_plot - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.figures.ucurve module --------------------------------- - -.. automodule:: ppanggolin.figures.ucurve - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.formats.rst b/docs/dev/source/ppanggolin.formats.rst deleted file mode 100644 index afebb751..00000000 --- a/docs/dev/source/ppanggolin.formats.rst +++ /dev/null @@ -1,31 +0,0 @@ -The `formats` package -===================== - -This package is used by the other packages to read and write the pangenome to/from the HDF-5 file. - -Submodules ----------- - -ppanggolin.formats.readBinaries module --------------------------------------- - -.. automodule:: ppanggolin.formats.readBinaries - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.formats.writeBinaries module ---------------------------------------- - -.. 
automodule:: ppanggolin.formats.writeBinaries - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.formats.writeFlat module ------------------------------------ - -.. automodule:: ppanggolin.formats.writeFlat - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.graph.rst b/docs/dev/source/ppanggolin.graph.rst deleted file mode 100644 index 2de4840d..00000000 --- a/docs/dev/source/ppanggolin.graph.rst +++ /dev/null @@ -1,15 +0,0 @@ -The `graph` package -======================== - -This builds the pangenome graph, and eventually removes nodes from it. - -Submodules ----------- - -ppanggolin.graph.makeGraph module ---------------------------------- - -.. automodule:: ppanggolin.graph.makeGraph - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.info.rst b/docs/dev/source/ppanggolin.info.rst deleted file mode 100644 index 8f7645a9..00000000 --- a/docs/dev/source/ppanggolin.info.rst +++ /dev/null @@ -1,15 +0,0 @@ -The `info` package -======================= - -This package is used to query a pangenome to get summaries of its content, the parameters used to compute it and the different analyses that were run on it and are stored in the HDF-5 file. - -Submodules ----------- - -ppanggolin.info.info module ---------------------------- - -.. automodule:: ppanggolin.info.info - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.nem.rst b/docs/dev/source/ppanggolin.nem.rst deleted file mode 100644 index da4c0d2d..00000000 --- a/docs/dev/source/ppanggolin.nem.rst +++ /dev/null @@ -1,23 +0,0 @@ -The `nem` package -====================== - -This package is there to use the NEM algorithm. - -Submodules ----------- - -ppanggolin.nem.partition module -------------------------------- - -.. 
automodule:: ppanggolin.nem.partition - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.nem.rarefaction module ---------------------------------- - -.. automodule:: ppanggolin.nem.rarefaction - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/source/ppanggolin.rst b/docs/dev/source/ppanggolin.rst deleted file mode 100644 index 2592b890..00000000 --- a/docs/dev/source/ppanggolin.rst +++ /dev/null @@ -1,79 +0,0 @@ -Developper doc -============== - -``ppanggolin`` is both a command line tool and a python library for comparative genomics. It tries to prodive a solution for using cutting-edge methods for large scale comparative analysis and stores any computed results in a compact format so that they can be reused at will. -This part of the documentation is made for people that want to use PPanGGOLiN as a python library, or for those that need to maintain the package or want to modify it. - -If you were looking for the command line tool documentation of PPanGGOLiN, you should check the github wiki instead. - -Subpackages ------------ - -There is a ppanggolin subpackage for each specific step of the analysis. Each subpackage is associated to one or more subcommand. - -.. toctree:: - :maxdepth: 1 - - ppanggolin.RGP - ppanggolin.align - ppanggolin.annotate - ppanggolin.cluster - ppanggolin.figures - ppanggolin.formats - ppanggolin.graph - ppanggolin.info - ppanggolin.nem - ppanggolin.workflow - -Submodules ----------- - -Submodules includes all of the basic classes of PPanGGOLiN that will be used by the subpackages. - -.. toctree:: - :maxdepth: 2 - - classes/pangenome - classes/edge - classes/geneFamily - -ppanggolin.genome module ------------------------- - -.. automodule:: ppanggolin.genome - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.main module ----------------------- - -.. 
automodule:: ppanggolin.main - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.region module ------------------------- - -.. automodule:: ppanggolin.region - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.utils module ------------------------ - -.. automodule:: ppanggolin.utils - :members: - :undoc-members: - :show-inheritance: - - -Module contents ---------------- - -.. automodule:: ppanggolin - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/dev/source/ppanggolin.workflow.rst b/docs/dev/source/ppanggolin.workflow.rst deleted file mode 100644 index 4046fe4b..00000000 --- a/docs/dev/source/ppanggolin.workflow.rst +++ /dev/null @@ -1,23 +0,0 @@ -The `workflow` package -=========================== - -This package includes the different 'basic' workflows of ppanggolin. It will depend on most of the other packages to run its analyses. - -Submodules ----------- - -ppanggolin.workflow.panRGP module ---------------------------------- - -.. automodule:: ppanggolin.workflow.panRGP - :members: - :undoc-members: - :show-inheritance: - -ppanggolin.workflow.workflow module ------------------------------------ - -.. automodule:: ppanggolin.workflow.workflow - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file diff --git a/docs/dev/unitTest.md b/docs/dev/unitTest.md new file mode 100644 index 00000000..3a59f526 --- /dev/null +++ b/docs/dev/unitTest.md @@ -0,0 +1,4 @@ +# Unitary test +```{warning} +This part of the documentation is in progress. +``` \ No newline at end of file diff --git a/docs/dev/workflows.md b/docs/dev/workflows.md new file mode 100644 index 00000000..b4ea9df3 --- /dev/null +++ b/docs/dev/workflows.md @@ -0,0 +1,4 @@ +# GitHub workflows +```{warning} +This part of the documentation is in progress. 
+``` \ No newline at end of file diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..ea7064f6 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,189 @@ +% PPanGGOLiN documentation master file, created by +% sphinx-quickstart on Tue Sep 12 10:29:06 2023. +% You can adapt this file completely to your liking, but it should at least +% contain the root `toctree` directive. + +# PPanGGOLiN: Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors + +[![Actions](https://img.shields.io/github/actions/workflow/status/althonos/pyrodigal/test.yml?branch=main&logo=github&style=flat-square&maxAge=300)](https://github.com/labgem/ppanggolin/actions) +[![License](https://anaconda.org/bioconda/ppanggolin/badges/license.svg)](http://www.cecill.info/licences.fr.html) +[![Bioconda](https://img.shields.io/conda/vn/bioconda/ppanggolin?style=flat-square&maxAge=3600&logo=anaconda)](https://anaconda.org/bioconda/ppanggolin) +[![Source](https://img.shields.io/badge/source-GitHub-303030.svg?maxAge=2678400&style=flat-square)](https://github.com/labgem/ppanggolin/) +[![GitHub issues](https://img.shields.io/github/issues/labgem/ppanggolin.svg?style=flat-square&maxAge=600)](https://github.com/labgem/ppanggolin/issues) +[![Docs](https://img.shields.io/readthedocs/ppanggolin/latest?style=flat-square&maxAge=600)](https://ppanggolin.readthedocs.io) +[![Downloads](https://anaconda.org/bioconda/ppanggolin/badges/downloads.svg)](https://bioconda.github.io/recipes/ppanggolin/README.html#download-stats) + +**PPanGGOLiN** +([Gautreau et al. 2020](https://doi.org/10.1371/journal.pcbi.1007732)) is a software suite used to create and manipulate prokaryotic pangenomes from a set of either genomic DNA sequences or provided genome annotations. +It is designed to scale up to tens of thousands of genomes. 
+It has the specificity to partition the pangenome using a statistical approach rather than using fixed thresholds which gives it the ability to work with low-quality data such as *Metagenomic Assembled Genomes (MAGs)* or *Single-cell Amplified Genomes (SAGs)* thus taking advantage of large scale environmental studies and letting users study the pangenome of uncultivable species. + +**PPanGGOLiN** builds pangenomes through a graphical model and a statistical method to partition gene families in persistent, shell and cloud genomes. +It integrates both information on protein-coding genes and their genomic neighborhood to build a graph of gene families where each node is a gene family, and each edge is a relation of genetic contiguity. +The partitioning method promotes that two gene families that are consistent neighbors in the graph are more likely to belong to the same partition. +It results in a Partitioned Pangenome Graph (PPG) made of persistent, shell and cloud nodes drawing genomes on rails like a subway map to help biologists navigate the great diversity of microbial life. + + +Moreover, the panRGP method ([Bazin et al. 2020](https://doi.org/10.1093/bioinformatics/btaa792)) included in **PPanGGOLiN** predicts, for each genome, Regions of Genome Plasticity (RGPs) that are clusters of genes made of shell and cloud genomes in the pangenome graph. +Most of them arise from Horizontal gene transfer (HGT) and correspond to Genomic Islands (GIs). +RGPs from different genomes are next grouped in spots of insertion based on their conserved flanking persistent genes. + + +Those RGPs can be further divided in conserved modules by panModule ([Bazin et al. 2021](https://doi.org/10.1101/2021.12.06.471380)). Those conserved modules correspond to groups of cooccurring and colocalized genes that are gained or lost together in the variable regions of the pangenome. 
+ +```{image} _static/logo.png +:alt: ppanggolin logo +:align: center +:height: 300 +:width: 300 +``` + +# Installation + +**PPanGGOLiN** is easily installed via conda. +You will need the following conda channels if you don't have them already: + +```bash +conda config --add channels defaults +conda config --add channels bioconda +conda config --add channels conda-forge +``` + +Then, you can just run : + +```bash +conda install -c bioconda ppanggolin +``` + +# Quick usage + +**PPanGGOLiN** integrates some workflows to easily and rapidly build and analyse a pangenome. +These commands can be tuned with some parameters but are mostly automatic. +All workflow parameters are described [here](#step-by-step-section). + +## Pangenome graph construction and partition + +To build and partition a pangenome, you can use the following command: +```bash +ppanggolin workflow --fasta ORGANISMS_FASTA_LIST +``` + +It uses parameters that we found to be generally the best when working with species pangenomes. + +The file ORGANISMS_FASTA_LIST is a tab-separated file with the following organization : +1. The first column contains a unique organism name **(without space)** +2. The second column contains the path to the associated FASTA file +3. Circular contig identifiers are indicated in the following columns +4. Each line represents an organism + +An [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.fasta.list) with 50 *Chlamydia trachomatis* genomes can be found in the testingDataset/ directory. + + +You can also give **PPanGGOLiN** your own annotations using *.gff* or *.gbff/.gbk* files instead of *.fasta* files, +such as the ones provided by prokka using the following command : + +```bash +ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST +``` + +Another [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list) of such a file can be found in the testingDataset/ directory. 
+ +Both of those commands write several output files and graphics (more information [here](#output)). Most notably, an HDF-5 (pangenome.h5) file is written. +It can be used as input for any of the subcommands to rerun parts of the analysis with different parameters, +write and draw different representations of the pangenome or run additional analyses with **PPanGGOLiN**. + +A minimum of 5 genomes is generally required to perform a pangenomics analysis using the traditional *core genome*/*accessory genome* paradigm. +It is advised to use at least 15 genomes having genomic variations (and not only SNPs) to obtain robust results with the **PPanGGOLiN** statistical approach. + +If you want to use personalized parameters for each subcommand, most options should be self-descriptive. +If you want to know more about what each output file is, or briefly how each subcommand works, +you can check the [step-by-step documentation](https://github.com/labgem/PPanGGOLiN/wiki). + + +## Region of plasticity detection + +Furthermore, you can also predict genomic islands and cluster them into spots of insertion using the **panRGP** pipeline. +The usage is identical to the previous 'workflow' command: + +```bash +ppanggolin panrgp --fasta ORGANISMS_FASTA_LIST +``` + +It will run more analyses after the pangenome has been partitioned. 
Further details are available [here](#panrgp) and in the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792). + +## Conserved module prediction +To detect the conserved modules in your pangenome, you can use the panModule workflow, as such: + +```bash +ppanggolin panmodule --fasta ORGANISMS_FASTA_LIST +``` + +Further details can be found [here](#panmodule) and in the [panModule publication](https://doi.org/10.1101/2021.12.06.471380). + + +Alternatively, to run all the possible analyses that **PPanGGOLiN** can run, you can use: + +```bash +ppanggolin all --fasta ORGANISMS_FASTA_LIST +``` + +Overall, ppanggolin has a lot of subcommands and possibilities. +Don't hesitate to check the command line help, and the [GitHub wiki](https://github.com/labgem/PPanGGOLiN/wiki) to see all the possible analyses, if you are missing a file you're looking for, or do not understand an output. +You can also raise an `issue` if you wish! + +# Issues, Questions, Remarks +If you have any question or issue with installing, +using or understanding **PPanGGOLiN**, please do not hesitate to post an issue! +We cannot correct bugs if we do not know about them, and will try to help you the best we can. + +# Citation +If you use this tool for your research, please cite: + +Gautreau G et al. (2020) **PPanGGOLiN**: Depicting microbial diversity via a partitioned pangenome graph. +PLOS Computational Biology 16(3): e1007732. + +If you use this tool to study genomic islands, please cite: + +Bazin et al., panRGP: a pangenome-based method to predict genomic islands and explore their diversity, Bioinformatics, Volume 36, Issue Supplement_2, December 2020, Pages i651–i658. + +If you use this tool to study modules, please cite: + +Bazin et al., panModule: detecting conserved modules in the variable regions of a pangenome graph. biorxiv. 
+ + +```{toctree} +:caption: 'User Guide:' +:maxdepth: 2 + +user/Introduction +user/Installation +user/Basic-usage-and-practical-information +user/step-by-step-pangenome-analysis +user/Regions-of-Genome-Plasticity +user/Conserved-modules +user/Align +user/Genomic-context +user/metadata +user/Outputs +``` + +```{toctree} +:caption: 'Developer Guide:' +:maxdepth: 2 + +dev/devRules +dev/git +dev/unitTest +dev/workflows +dev/buildDoc +``` + + +# Indices and tables +[//]: # (- {ref}`ppanggolin package`) + +- {ref}`genindex` + +- {ref}`modindex` + +- {ref}`search` diff --git a/docs/dev/make.bat b/docs/make.bat similarity index 89% rename from docs/dev/make.bat rename to docs/make.bat index 6247f7e2..207019d5 100644 --- a/docs/dev/make.bat +++ b/docs/make.bat @@ -1,35 +1,35 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..e8024c8d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +sphinx==6.2.1 +sphinx_rtd_theme==1.2.2 +readthedocs-sphinx-search==0.3.1 +sphinx-autobuild==2021.3.14 +myst-parser==1.0.0 +docutils==0.18.1 \ No newline at end of file diff --git a/docs/user/Align.md b/docs/user/Align.md index 6e62cea2..5c552d20 100644 --- a/docs/user/Align.md +++ b/docs/user/Align.md @@ -1,3 +1,5 @@ +# Align + `ppanggolin align` is a command made to use a pangenome as a reference to get information about a set of sequences of interest. It requires a HDF-5 file of a previously computed pangenome as input, as well as either a set of sequences, nucleotides or proteins, as a .fasta file. The command will use MMseqs to compare the given input sequences to the pangenome gene family representatives, and will assign a gene family to each input sequence, if there is one which is close enough (the 'closeness' can be defined by some of the command parameters). If there are multiple assignable families, the closest one in bitscore is chosen. 
diff --git a/docs/user/Basic-usage-and-practical-information.md b/docs/user/Basic-usage-and-practical-information.md index 62743c76..08c41332 100644 --- a/docs/user/Basic-usage-and-practical-information.md +++ b/docs/user/Basic-usage-and-practical-information.md @@ -1,10 +1,17 @@ -# The 'workflow' subcommand +(basic)= +# Basic usage and practical information -We tried to make PPanGGOLiN relatively easy to use by making this 'workflow' subcommand. It runs a pangenome analysis whose exact steps will depend on the input files you provide it with. In the end, you will end up with some files and figures that describe the pangenome of your taxonomic group of interest in different ways. +## The 'workflow' subcommand -The minimal subcommand is as follow : +We tried to make PPanGGOLiN relatively easy to use by making this **'workflow'** subcommand. +It runs a pangenome analysis whose exact steps will depend on the input files you provide it with. +In the end, you will end up with some files and figures that describe the pangenome of your taxonomic group of interest in different ways. + +The minimal subcommand is as follows : -`ppanggolin workflow --fasta ORGANISMS_FASTA_LIST` +``` +ppanggolin workflow --fasta ORGANISMS_FASTA_LIST +``` It uses parameters that we found to be generally the best when working with species pangenomes. @@ -20,44 +27,116 @@ An [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/org You can also give PPanGGOLiN your own annotations using .gff or .gbff/.gbk files instead of .fasta files as long as they include the genomic dna sequences, such as the ones provided by prokka using the following command : -`ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST` +``` +ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST +``` Another [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list) of such a file can be found in the testingDataset/ directory. 
-# The 'panrgp' subcommand -This command works exactly like 'workflow'. The difference is that it will run more analysis related to [Regions of Genome Plasticity](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity). +```{note} +Look at the **annotate** command documentation for more information [here](#annotation) +``` -# Required computing resources +In addition, you can provide your own gene families. +PPanGGOLiN will use it to build and partition the pangenome graph. +You can do that through the command line : -Most of PPanGGOLiN's commands should be run with as many CPUs as you can give them by using the --cpu option as PPanGGOLiN's speed increases relatively well with the number of CPUs. While the 'smallest' pangenomes (up to a few hundred genomes) can be easily analyzed on a normal desktop computer, the biggest ones will require a good amount of RAM. -For example, 40 strains of _E. coli_ were analyzed in 3 minutes using 1.2Go of RAM using 16 threads. 1000 strains were analyzed in 45 minutes with 14 Go of RAM using 16 threads, and as of writing those lines, 20 656 genomes was the biggest pangenome we did and it required about a day and 120 Go of RAM. -The following graphic can give you an idea of the time it takes for a pangenome analysis given the number of genomes in input: +``` +ppanggolin workflow --fasta ORGANISMS_FASTA_LIST --anno ORGANISMS_ANNOTATION_LIST --clusters MY_CLUSTERS_FILE +``` -![runtime](https://github.com/labgem/PPanGGOLiN/blob/master/images/runtimes.png) +An example of what MY_CLUSTERS_FILE should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) + +Whether you use fasta or annotations, the workflow command options are the same. 
+ +| name | alias | default | type / choices | description | +|---------------------|-------|---------------------------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------| +| --output | -o | ppanggolin_output_DATE_HOUR_PID | Path | Output directory to save the pangenome and all the output files | +| --basename | | pangenome | string | basename for the pangenome file | +| --rarefaction | | False | bool | Use to compute the rarefaction curves (WARNING: can be time consuming) | +| --cpu | -c | 1 | integer | Number of available cpus | +| --translation_table | | 11 | integer | Translation table (genetic code) to use | +| --kingdom | | bacteria | {bacteria,archaea} | Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation | +| --mode | | 1 | {0,1,2,3} | the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component), 2: CD-HIT-like, 3: CD-HIT-like (lowmem) | +| --coverage | | 0.8 | 0<=float<=1 | Minimal coverage of the alignment for two proteins to be in the same cluster | +| --identity | | 0.8 | 0<=float<=1 | Minimal identity percent for two proteins to be in the same cluster | +| --nb_of_partitions | -K | -1 | integer | Number of partitions to use. Must be at least 2. If under 2, it will be detected automatically | +| --no_defrag | | False | bool | DO NOT Realign gene families to link fragments with their non-fragmented gene family | +| --no_flat_files | | False | bool | Generate only the HDF5 pangenome file | +| --tmpdir | | TMPDIR | Path | directory for storing temporary files | + +(panrgp)= +## The 'panrgp' subcommand + +This command works exactly like 'workflow'. The difference is that it will run more analysis related to [Regions of Genome Plasticity](#RGP-section). 
+You can use the panrgp command as follows: + +```bash +ppanggolin panrgp --fasta ORGANISMS_FASTA_LIST +``` -# Usage and basic options +The rgp analysis is launched after the pangenome partitioning and uses the default parameters. +If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. -As most programs in bioinformatics, you can always specify some utility options. -You can specify the number of CPUs to use (which is recommended ! The default is to use just one) using the option `--cpu`. +More details about RGP detection are available [here](#RGP-section) and in the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792) -You can specify the output directory (if not provided, one will be generated) using the option `--output`. +(panmodule)= +## The 'panmodule' subcommand -If you work in a strange environment that has no, or little available disk space in the '/tmp' (or your system equivalent, what is stored in TMPDIR) directory, you can specify a new temporary directory using `--tmp` +Again, it works like 'workflow', but it also detects the conserved modules in your pangenome. You can use the **panModule** workflow as such: + +```bash +ppanggolin panmodule --fasta ORGANISMS_FASTA_LIST +``` + +The module prediction is launched after the pangenome partitioning with the default parameters. +If you want to tune the module detection, you can use the `module` command after the `workflow`. + + +Further details can be found in the [conserved module analysis documentation](#module-section) and in the [panModule publication](https://doi.org/10.1101/2021.12.06.471380) + +## Run all PPanGGOLiN analysis + +Finally, it's also possible to run all analyses with one command wrapper `all`. +With this workflow, the pangenome will be built and partitioned, and RGPs, spots and modules will be predicted. 
+You can run all the analysis as such: + +```bash +ppanggolin all --fasta ORGANISMS_FASTA_LIST +``` + +## Configuration file -And if you want to redo an analysis from scratch and store it in a directory that already exists, you will have to use the `--force` option. Be wary, however, that the data in that directory will be overwritten if named identically as any output file written by ppanggolin. +Advanced users can provide a configuration file containing any or all parameters to PPanGGolin commands. +This feature is particularly useful for workflow commands such as `workflow`, `all`, `panrgp`, and `panmodule`, as it allows for the specification of all parameters for each subcommand launched in a workflow. +Additionally, a configuration file can be used to reuse a specific set of parameters across multiple pangenomes. -# Config file +To provide a configuration file to a PPanGGolin command, use the `--config` parameter. -Advanced users can provide a configuration file containing any or all parameters to PPanGGolin commands. This feature is particularly useful for workflow commands such as `workflow`, `all`, `panrgp`, and `panmodule`, as it allows for the specification of all parameters for each subcommand launched in a workflow. Additionally, a configuration file can be used to reuse a specific set of parameters across multiple pangenomes. +```{note} +Any command line arguments provided along with a configuration file will override the corresponding arguments specified in the configuration file. +When an argument is not specified in either the command line or the configuration file, the default value is used. +``` + +The configuration file is a JSON file that contains two sections common to all commands: `input_parameters` and `general_parameters`. +In addition, there is a section for each subcommand that contains its specific parameters. 
+ +You can generate a configuration file template with default values by using the `ppanggolin utils` command as follows: -To provide a configuration file to a PPanGGolin command, use the `--config_file` parameter. Note that any command line arguments provided along with a configuration file will override the corresponding arguments specified in the configuration file. When an argument is not specified in either the command line or the configuration file, the default value is used. -The configuration file is a JSON file that contains two sections common to all commands: `input_parameters` and `general_parameters`. In addition, there is a section for each subcommand that contains its specific parameters. +``` +ppanggolin utils --default_config CMD +``` -Users can generate a configuration file template with default values by using the `ppanggolin utils` command. For example, to generate a configuration file for the panrgp command with default values, use the command `ppanggolin utils --default_config panrgp`. This command will create the following configuration file: +For example, to generate a configuration file for the panrgp command with default values, use the command +``` +ppanggolin utils --default_config panrgp +``` + + This command will create the following configuration file: -```python +```yaml input_parameters: # A tab-separated file listing the organism names, and the fasta filepath of its # genomic sequence(s) (the fastas can be compressed with gzip). One line per organism. @@ -102,3 +181,37 @@ annotate: # Number of available cpus cpu: 1 ``` + +## Required computing resources + +Most of PPanGGOLiN's commands should be run with as many CPUs as you can give them by using the --cpu option as PPanGGOLiN's speed increases relatively well with the number of CPUs. +While the 'smallest' pangenomes (up to a few hundred genomes) can be easily analyzed on a normal desktop computer, +the biggest ones will require a good amount of RAM. +For example, 40 strains of *E. 
coli* were analyzed in 3 minutes using 1.2Go of RAM using 16 threads. +1000 strains were analyzed in 45 minutes with 14 Go of RAM using 16 threads, and as of writing those lines, +20 656 genomes was the biggest pangenome we did, and it required about a day and 120 Go of RAM. +The following graphic can give you an idea of the time it takes for a pangenome analysis given the number of genomes in input: + +```{image} ../_static/runtimes.png +:align: center +``` + +## Usage and basic options + +As most programs in bioinformatics, you can always specify some utility options. + +You can specify the number of CPUs to use (which is recommended ! The default is to use just one) using the option `--cpu`. + +You can specify the output directory (if not provided, one will be generated) using the option `--output`. + +If you work in a strange environment that has no, or little available disk space in the '/tmp' (or your system equivalent, what is stored in TMPDIR) directory, you can specify a new temporary directory using `--tmp` + +If you want to redo an analysis from scratch and store it in a directory that already exists, you will have to use the `--force` option. +Be wary, however, that the data in that directory will be overwritten if named identically as any output file written by ppanggolin. + +PPanGGOLiN is deliberately very verbose, to help users understand each stage of the analysis. +If you want, verbosity can be reduced in several ways. +First, you can specify the verbosity level with the `--verbose` option. +`0` will show only warnings and errors, `1` will add the information (default value), and if you encounter any problem you can use the debug level with value `2`. +Then you can also remove the progress bar with the option `--disable_prog_bar` +Finally, you can also save PPanGGOLiN logs in a file by specifying its path with the option `--log`. 
\ No newline at end of file diff --git a/docs/user/Conserved-modules.md b/docs/user/Conserved-modules.md index 87cd181b..bef64487 100644 --- a/docs/user/Conserved-modules.md +++ b/docs/user/Conserved-modules.md @@ -1,12 +1,8 @@ +(module-section)= +# Conserved Module From version 1.2.0, it is possible to predict and work with conserved modules using PPanGGOLiN. Modules are groups of genes that are part of the variable genome, and often found together in the different genomes. As such, they are conserved modules and potential functional modules. -This tool can be used using the `ppanggolin panmodule` command directly from your .fasta or .gbff file lists instead of the `ppanggolin workflow` command. It will run additional analysis related to module predictions only, with some more descriptive files related to modules. - -The analysis can also be run directly on your formerly computed pangenomes with a dedicated subcommand. - -# Module - -Once partitions have been computed, you can predict conserved modules. All of the options of the `module` subcommand are for tuning the parameters for the analysis. +Once partitions have been computed, you can predict conserved modules. All the options of the `module` subcommand are for tuning the parameters for the analysis. Details about each parameter and what they do is available in the related [preprint](https://www.biorxiv.org/content/10.1101/2021.12.06.471380v1). The command can be used simply as such: diff --git a/docs/user/Figures/Uplot.md b/docs/user/Figures/Uplot.md new file mode 100644 index 00000000..3f32b3c4 --- /dev/null +++ b/docs/user/Figures/Uplot.md @@ -0,0 +1,10 @@ +A U-shaped plot is a figure presenting the number of families (y axis) per number of organisms (x axis). +It is a .html file that can be opened with any browser and with which you can interact, zoom, move around, mouseover to see numbers in more detail, and you can save what you are seeing as a .png image file. 
+ +It can be generated using the 'draw' subcommand as such : + +`ppanggolin draw -p pangenome.h5 --ucurve` + +```{image} ../_static/u_plot.png +:align: center +``` \ No newline at end of file diff --git a/docs/user/Figures/rarefaction.md b/docs/user/Figures/rarefaction.md new file mode 100644 index 00000000..7389ec27 --- /dev/null +++ b/docs/user/Figures/rarefaction.md @@ -0,0 +1,26 @@ + +This figure is not drawn by default in the 'workflow' subcommand as it requires a lot of computations. It represents the evolution of the number of gene families for each partition as you add more genomes to the pangenome. It has been used a lot in the literature as an indicator of the diversity that you are missing with your dataset on your taxonomic group. The idea is that if at some point when you keep adding genomes to your pangenome you do not add any more gene families, you might have access to your entire taxonomic group's diversity. On the contrary if you are still adding a lot of genes you may be still missing a lot of gene families. + +There are 8 partitions represented. For each of the partitions there are multiple representations of the observed data. You can find the observed means, medians, 1st and 3rd quartiles of the number of gene families per number of genome used. And you can find the fitting of the data by the Heaps' law, which is usually used to represent this evolution of the diversity in terms of gene families in each of the partitions. + +It can be generated using the 'rarefaction' subcommand, which is dedicated to drawing this graph, as such : + +`ppanggolin rarefaction -p pangenome.h5` + + +```{image} ../_static/evolution.png +:align: center +``` + +A lot of options can be used with this subcommand to tune your rarefaction curves, most of them are the same as with the `partition` workflow. 
+The following 3 are related to the rarefaction alone: + +- `--depth` defines the number of samplings for each number of organisms (default 30) +- `--min` defines the minimal number of organisms in a sample (default 1) +- `--max` defines the maximal number of organisms in a sample (default 100) + +So for example the following command: +`ppanggolin rarefaction -p pangenome.h5 --min 5 --max 50 --depth 30` + +Will draw a rarefaction curve with sample sizes between 5 and 50 (between 5 and 50 genomes will be used), and with 30 samples at each point (so 30 samples of 5 genomes, 30 samples of 6 genomes ... up to 50 genomes). + diff --git a/docs/user/Figures/spots.md b/docs/user/Figures/spots.md new file mode 100644 index 00000000..391007cf --- /dev/null +++ b/docs/user/Figures/spots.md @@ -0,0 +1,21 @@ + +For versions 1.2.30 and above, the 'draw' command can draw specific spots of interest, whose IDs are provided, or all the spots if you wish. +It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. +The command can be used as such: + +`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive .html figure and a gexf file for all the spots. + +If you are interested in only a single spot, you can use its identifier to draw it, as such: + +`ppanggolin draw -p pangenome.h5 --spots spot_34` for spot_34, for example. + +The interactive figures that are drawn look like this: + +```{image} ../_static/drawspot_example.png +:align: center +``` + +The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called 'spot_X_identical_rgps.tsv', with X the spot_id. 
+ +They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be. +For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gexf-and-light-gexf). \ No newline at end of file diff --git a/docs/user/Figures/tilePlot.md b/docs/user/Figures/tilePlot.md new file mode 100644 index 00000000..d1e5430e --- /dev/null +++ b/docs/user/Figures/tilePlot.md @@ -0,0 +1,19 @@ + +A tile plot is a heatmap representing the gene families (y axis) in the organisms (x axis) making up your pangenome. The tiles on the graph will be colored if the gene family is present in an organism and uncolored if absent. The gene families are ordered by partition, and the genomes are ordered by a hierarchical clustering based on their shared gene families (basically two genomes that are close together in terms of gene family composition will be close together on the figure). + +This plot is quite helpful to observe potential structures in your pangenome, and can also help you to identify eventual outliers. You can interact with it, and mousing over a tile in the plot will indicate to you which is the gene identifier(s), the gene family and the organism that corresponds to the tile. + +If you build your pangenome using the 'workflow' subcommand and you have more than 500 organisms, only the 'shell' and the 'persistent' partitions will be drawn, leaving out the 'cloud' as the figure tends to be too heavy for a browser to open it otherwise. 
+ +It can be generated using the 'draw' subcommand as such : + +`ppanggolin draw -p pangenome.h5 --tile_plot` + + +```{image} ../_static/tile_plot.png +:align: center +``` + +and if you do not want the 'cloud' gene families as it is a lot of data and can be hard to open with a browser sometimes, you can use the following option : + +`ppanggolin draw -p pangenome.h5 --tile_plot --nocloud` diff --git a/docs/user/Flat/RGP.md b/docs/user/Flat/RGP.md new file mode 100644 index 00000000..6a0be260 --- /dev/null +++ b/docs/user/Flat/RGP.md @@ -0,0 +1,64 @@ +#### RGP +This file is a tsv file that lists all of the detected Regions of Genome Plasticity. This requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. + +It can be written with the following command: +`ppanggolin write -p pangenome.h5 --regions` + +The file has the following format : + +| column | description | +|--------|-------------| +| region | a unique identifier for the region. This is usually built from the contig it is on, with a number after it| +|organism| the organism it is in. This is the organism name provided by the user.| +|start| the start position of the RGP in the contig| +|stop| the stop position of the RGP in the contig| +|genes| the number of genes included in the RGP| +|contigBorder| this is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete.| +|wholeContig| this is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If an RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant| + +#### Spots + +This is a tsv file with two column. It links the spots of 'summarize_spots' with the RGPs of 'plastic_regions'. 
+ +It is written with the following command: +`ppanggolin write -p pangenome.h5 --spots` + +|column|description| +|------|------------| +|spot_id| The spot identifier (found in the 'spot' column of 'summarize_spots')| +|rgp_id| the RGP identifier (found in 'region' column of 'plastic_regions')| + +#### Summarize spots + +This is a tsv file that will associate each spot with multiple metrics that can indicate the dynamic of the spot. + +It is written with the following command: +`ppanggolin write -p pangenome.h5 --spots` + +|column| description| +|-------|------------| +|spot| the spot identifier. It is unique in the pangenome| +|nb_rgp| the number of RGPs present in the spot| +|nb_families| The number of different gene families that are found in the spot| +|nb_unique_family_sets| The number of RGPs with different gene family content. If two RGPs are identical, they will be counted only once. The difference between this number and the one provided in 'nb_rgp' can be a strong indicator of whether there is a high turnover in gene content in this area or not| +|mean_nb_genes| the mean number of genes on RGPs in the spot| +|stdev_nb_genes| the standard deviation of the number of genes in the spot| +|max_nb_genes| the longest RGP in number of genes of the spot| +|min_nb_genes| the shortest RGP in number of genes of the spot| + +#### Borders + +Each spot has at least one set of gene families bordering them. To write the list of gene families bordering a spot, you need to use the following option: +`ppanggolin write -p pangenome.h5 --borders` + +It will write a .tsv file with 4 columns: + +|column| description| +|-------|------------| +|spot_id| the spot identifier. 
It is unique in the pangenome| +|number| the number of RGPs present in the spot that have those bordering genes| +|border1| Comma-separated list of gene families of the 1st border| +|border2| Comma-separated list of gene families of the 2nd border| + +As there can be some variation in the borders, some spots will have multiple borders and as such multiple lines in this file. +The sum of the number for each spot_id should be exactly the number of RGPs in the spot. \ No newline at end of file diff --git a/docs/user/Flat/dupplication.md b/docs/user/Flat/dupplication.md new file mode 100644 index 00000000..7256b9f2 --- /dev/null +++ b/docs/user/Flat/dupplication.md @@ -0,0 +1,8 @@ +This file is a .tsv file, with a single parameter written as a comment at the beginning of the file, which indicates the proportion of genomes in which a gene family must be present more than once to be considered 'duplicated' (and not single copy marker). +This file lists the gene families, their duplication ratio, their mean presence in the pangenome and whether it is considered a 'single copy marker' or not, which is particularly useful when calculating the completeness recorded in the [organisms statistics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#organisms-statistics) file described previously. + +It can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --stats` + +This command will also generate the 'organisms_statistics.tsv' file. diff --git a/docs/user/Flat/fam2gen.md b/docs/user/Flat/fam2gen.md new file mode 100644 index 00000000..f356b360 --- /dev/null +++ b/docs/user/Flat/fam2gen.md @@ -0,0 +1,7 @@ + +You can write a list containing the gene family assigned to every single gene of your pangenome, in a file format exactly like the one provided by [MMseqs2](https://github.com/soedinglab/MMseqs2) through its subcommand 'createtsv'. 
+It is basically a three-column file listing the gene family name in the first column, and the gene names in the second. A third column is either empty, or has an "F" in it. In that case it indicates that the gene is potentially a gene fragment and not complete. This will be indicated only if the [defragmentation](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#defragmentation) pipeline is used. + +You can obtain it as such : + +`ppanggolin write -p pangenome.h5 --families_tsv` \ No newline at end of file diff --git a/docs/user/Flat/info.md b/docs/user/Flat/info.md new file mode 100644 index 00000000..373cf449 --- /dev/null +++ b/docs/user/Flat/info.md @@ -0,0 +1,34 @@ +When computing a pangenome, all of the information about it is saved in the .h5 file, notably parameters used at each step and metrics about the pangenome. You can easily retrieve those informations using the 'info' module. + +This command prints information on stdout, and does not write any file. 
+ +### Content + +This option indicates the following metrics about your pangenome, if they have been computed: +* The total number of genes +* The number of genomes +* The number of gene families +* The number of edges in the pangenome graph +* The number of persistent genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition +* The number of shell genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition +* The number of cloud genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition +* The number of partitions + +Additionally, if you have predicted RGPs and spots (with the subcommands 'panrgp', 'rgp' and 'spot', or 'all'), you will have the following metrics: +* The number of RGPs (Regions of Genome Plasticity) +* The number of spots of insertion + +Additionally, if you have predicted modules (with the subcommands 'panmodule', 'module' or 'all'): +* The number of modules +* The number of gene families in modules + +It is used as such: +`ppanggolin info -p pangenome.h5 --content` + +### Parameters + +This option indicates, for each step of the analysis, the PPanGGOLiN parameters that were used and the source of the data if appropriate. + +It is used as such: + +`ppanggolin info -p pangenome.h5 --parameters` diff --git a/docs/user/Flat/metrics.md b/docs/user/Flat/metrics.md new file mode 100644 index 00000000..19c09fbf --- /dev/null +++ b/docs/user/Flat/metrics.md @@ -0,0 +1,33 @@ +After computing a pangenome, it's interesting to get some metrics about it. +The `metrics` subcommand allows running some analyses and computing metrics. 
+ +All the metrics computed here will be saved in your pangenome file and +will be easily readable with the `info` subcommand + +### Genomic fluidity +The genomic fluidity is described as *a robust metric to categorize the +gene-level similarity among groups of sequenced isolates.* +[more information here](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-12-32) + +We add the possibility to get genomic fluidity for the whole pangenome or +for a specific partition. The genomic fluidity can be computed like this: +``` +ppanggolin metrics -p pangenome --genome_fluidity +... +Genomes fluidity : all=0.026, shell=0.477, cloud=0.045, accessory=0.554 +``` +*all* corresponds to all the families in the pangenome (core and accessory) + +### Module information +It could be necessary to get more information about the modules. +Here we provide information about families, and we separate modules +according to the partition. You can get this supplementary information +as such : +``` +ppanggolin metrics -p pangenome.h5 --info_modules +... +Modules : 3 +Families in Modules : 22 (min : 5, max : 9, sd : 2.08, mean : 7.33) + Sheel specific : 36.36 (sd : 4.62, mean : 2.67) + Cloud specific : 63.64 (sd : 4.51, mean : 4.67) +``` diff --git a/docs/user/Flat/module.md b/docs/user/Flat/module.md new file mode 100644 index 00000000..35ea8a37 --- /dev/null +++ b/docs/user/Flat/module.md @@ -0,0 +1,59 @@ +#### Functional modules +This .tsv file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple lines for each module. +It is written along with other files with the following command: +`ppanggolin write -p pangenome.h5 --modules` + +It follows the following format: +|column|description| +|------|------------| +|module_id| The module identifier| +|family_id| the family identifier| + +#### Modules in organisms +This .tsv file lists for each organism the modules that are present and how complete they are. 
Since some variability is allowed in the module predictions, occasionally some modules can be incomplete in some of the organisms where they are found. +This file is written along with other files with the following command: +`ppanggolin write -p pangenome.h5 --modules` + +And it follows the following format: +|column|description| +|------|------------| +|module_id| The module identifier| +|organism| the organism which has the indicated module| +|completion| a value between 0.0 and 1.0 which indicates how complete (in terms of gene family) the module is in the given organism| + +#### modules summary +This .tsv file lists a few characteristics for each detected module. There is one line for each module. +The file is written along with other files with the following command: +`ppanggolin write -p pangenome.h5 --modules` + +And it follows the following format: +|column|description| +|------|------------| +|module_id| The module identifier| +|nb_families| The number of families which are included in the module. The families themselves are listed in the 'functional_modules.tsv' file.| +|nb_organisms|The number of organisms in which the module is found. Those organisms are listed in the 'modules_in_organisms.tsv' file.| +|partition| The average partition of the families in the module.| +|mean_number_of_occurrence| the mean number of times a module is present in each organism. The expected value is around one, but it can be more if it is a module often repeated in the genomes (like a phage).| + +### spot modules +This command is available only if both modules and spots have been computed for your pangenome (see the command `all`, or the commands `spot` and `module` for that). +It indicates which modules are present in which spot and in which RGP. 
+The files are written with the following command: +```ppanggolin write -p pangenome.h5 --spot_modules``` +The format of the 'modules_spots.tsv' file is the following: + +|column|description| +|------|------------| +|module_id| The module identifier| +|spot_id| the spot identifier| + +The file 'modules_RGP_lists.tsv' lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following: + +|column|description| +|------|------------| +|representative_RGP| an RGP deemed representative for the group, and serving as a 'group of rgp id' (randomly picked)| +|nb_spots| The number of spots in which we see the RGPs which have the modules listed afterwards| +|mod_list| a list of the modules that are in the indicated RGPs| +|RGP_list| a list of RGPs that include exactly the modules listed previously| + +This information can also be visualized through figures that can be drawn with `ppanggolin draw --spots` (see [Spot plots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spot-plots)), and which can display modules. diff --git a/docs/user/Flat/orgStat.md b/docs/user/Flat/orgStat.md new file mode 100644 index 00000000..1bae7ac4 --- /dev/null +++ b/docs/user/Flat/orgStat.md @@ -0,0 +1,29 @@ + +The organisms_statistics.tsv file is a tab-separated file describing the content of each of the genomes used for building the pangenome. It might be useful when working with fragmented data such as MAGs or if you suspect some of your genomes to be chimeric, or to not belong to your taxonomic group (as those genomes will be outliers with regard to the numbers in this file). +The first lines starting with a '#' are indicators of parameters used when generating the numbers describing each organism, and should not be read if loading this into a spreadsheet. They will be skipped automatically if you load this file with R. 
+ +This file is made of 15 columns described in the following table + +| Column | Description | +|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| organism | Indicates the organism's name to whom the provided genome belongs to | +| nb_families | Indicates the number of gene families present in that genome | +| nb_persistent_families | The number of persistent families present in that genome | +| nb_shell_families | The number of shell families present in that genome | +| nb_cloud_families | The number of cloud families present in that genome | +| nb_exact_core | The number of exact core families present in that genome. This number should be identical in all genomes. | +| nb_soft_core | The number of soft core families present in that genome. The threshold used is indicated in the #soft_core line at the beginning of the file, and is 0.95 by default. | +| nb_genes | The number of genes in that genome | +| nb_persistent_genes | The number of genes whose family is persistent in that genome | +| nb_shell_genes | The number of genes whose family is shell in that genome | +| nb_cloud_genes | The number of genes whose family is cloud in that genome | +| nb_exact_core_genes | The number of genes whose family is exact core in that genome | +| nb_soft_core_genes | The number of genes whose family is soft core in that genome | +| completeness | This is an indicator of the proportion of single copy markers in the persistent that are present in the genome. 
While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provides a *de novo* estimation of the completeness based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group | +| nb_single_copy_markers | This indicates the number of present single copy markers in the genomes. They are computed using the parameter duplication_margin indicated at the beginning of the file. They correspond to all of the persistent gene families that are not present in more than one copy in 5% (or more) of the genomes by default. | + +It can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --stats` + +This command will also generate the 'mean_persistent_duplication.tsv' file. \ No newline at end of file diff --git a/docs/user/Flat/partition.md b/docs/user/Flat/partition.md new file mode 100644 index 00000000..76586801 --- /dev/null +++ b/docs/user/Flat/partition.md @@ -0,0 +1,5 @@ +Those files will be stored in the 'partitions' directory and will be named after the partition that they represent (like persistent.txt for the persistent partition). In each of those files there will be a list of gene family identifiers that correspond to the gene families belonging to that partition, one family per line, should you need it for your pipelines or during your analysis. + +You can generate those files as such : + +` ppanggolin write -p pangenome.h5 --partitions` \ No newline at end of file diff --git a/docs/user/Flat/presAbs.md b/docs/user/Flat/presAbs.md new file mode 100644 index 00000000..8560115b --- /dev/null +++ b/docs/user/Flat/presAbs.md @@ -0,0 +1,16 @@ +### gene presence absence + +This file is basically a presence absence matrix. The columns are the genomes used to build the pangenome, the lines are the gene families. 
The identifier of the gene family is the gene identifier chosen as a representative. + There is a 1 if the gene family is present in a genome, and 0 otherwise. It follows the exact same format as the 'gene_presence_absence.Rtab' file that you get from the pangenome analysis software [roary](https://sanger-pathogens.github.io/Roary/) + +It can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --Rtab` + +### matrix + +This file is a .csv file following a format similar to the gene_presence_absence.csv file generated by [roary](https://sanger-pathogens.github.io/Roary/), and works with [scoary](https://github.com/AdmiralenOla/Scoary) if you want to do pangenome-wide association studies. + +It can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --csv` diff --git a/docs/user/Flat/projection.md b/docs/user/Flat/projection.md new file mode 100644 index 00000000..19a41fa4 --- /dev/null +++ b/docs/user/Flat/projection.md @@ -0,0 +1,21 @@ +This option writes in a 'projection' directory. There will be a file written in the .tsv file format for every single genome in the pangenome. 
+The columns of this file are described in the following table : + +| Column | Description | +|----------------------|--------------------------------------------------------------------------------------------------------------------------------| +| gene | the unique identifier of the gene | +| contig | the contig that the gene is on | +| start | the start position of the gene | +| stop | the stop position of the gene | +| strand | The strand that the gene is on | +| ori | Will be T if the gene name is dnaA | +| family | the family identifier to which the gene belongs | +| nb_copy_in_org | The number of copies of the family in the organism (basically, if 1, the gene has no closely related paralog in that organism) | +| partition | the partition to which the gene family of the gene belongs | +| persistent_neighbors | The number of neighbors classified as 'persistent' in the pangenome graph | +| shell_neighbors | The number of neighbors classified as 'shell' in the pangenome graph | +| cloud_neighbors | The number of neighbors classified as 'cloud' in the pangenome graph | + +Those files can be generated as such : + +`ppanggolin write -p pangenome.h5 --projection` \ No newline at end of file diff --git a/docs/user/Genomic-context.md b/docs/user/Genomic-context.md index a9f66ef1..69021b00 100644 --- a/docs/user/Genomic-context.md +++ b/docs/user/Genomic-context.md @@ -1,9 +1,9 @@ +# Genomic context searching + From version 1.2.45, it is possible to search genomic context in a pangenome graph using PPanGGOLiN. A genomic context corresponds to a group of genes/proteins with a functional interest, often found together in the genomes. They are detected by extracting a subgraph obtained by filtering edges connecting the sequences of interest in the pangenome. The analysis can be run on your formerly computed pangenomes and one or more genomic context. -# Context - To search your genomic context of interest, there are two possibilities. 
You can search directly with genes/proteins sequences in a fasta file or use a list of the gene family ID. Both possibilities can be run in the same subcommand `context` and all the options are for tuning the parameters for the analysis. ## Search Genomic context with sequences @@ -41,10 +41,10 @@ There are 4 columns in `gene_context.tsv`. In **sequence Id**, it is possible to find a NA value. This case, correspond to another gene family found in the context. ## Detailed options -| option name | Description | -|-----------------------------|---------------------------------------------------------------------------| -| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 | -| --identity | Minimum identity percentage threshold | -| --coverage | Minimum coverage percentage threshold | +| option name | Description | +|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| --no_defrag | Do not use the defragmentation step, to align sequences with MMseqs2 | +| --identity | Minimum identity percentage threshold | +| --coverage | Minimum coverage percentage threshold | | -t, --transitive | Size of the transitive closure used to build the graph. This indicates the number of non-related genes allowed in-between two related genes. Increasing it will improve precision but lower sensitivity a little. | -| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. | \ No newline at end of file +| -s, --jaccard | Minimum jaccard similarity used to filter edges between gene families. Increasing it will improve precision but lower sensitivity a lot. 
| \ No newline at end of file diff --git a/docs/user/Home.md b/docs/user/Home.md deleted file mode 100644 index d52ccd19..00000000 --- a/docs/user/Home.md +++ /dev/null @@ -1,48 +0,0 @@ -* [Introduction](https://github.com/labgem/PPanGGOLiN/wiki/Introduction) -* [Installation instructions](https://github.com/labgem/PPanGGOLiN/wiki/Installation) -* [Basic usage and practical information](https://github.com/labgem/PPanGGOLiN/wiki/Basic-usage-and-practical-information) -* [step by step pangenome analysis](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis) - * [Annotation](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#annotation) - * [Cluster](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#clustering) - * [Graph](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#graph) - * [Partition](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#partition) -* [Regions of Genome Plasticity](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity) - * [RGP](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity#rgp) - * [Spots of insertion](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity#spots-of-insertion) -* [Conserved modules](https://github.com/labgem/PPanGGOLiN/wiki/Conserved-modules) -* [Align](https://github.com/labgem/PPanGGOLiN/wiki/Align) -* [Genomic context](https://github.com/labgem/PPanGGOLiN/wiki/Genomic-context) -* [Output](https://github.com/labgem/PPanGGOLiN/wiki/Outputs) - * [Draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw) - * [Ushaped plot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#u-shaped-plot) - * [Tile plot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#tile-plot) - * [Spot plots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spot-plots) - * 
[Rarefaction](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#rarefaction) - * [Write](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#write) - * [organisms statistics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#organisms-statistics) - * [GEXF files](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gexf-and-light-gexf) - * [JSON](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#json) - * [gene presence absence](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gene-presence-absence) - * [matrix](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#matrix) - * [mean persistent duplication](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#mean-persistent-duplication) - * [partitions](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#partitions) - * [projection](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#projection) - * [gene families](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gene-families-and-genes) - * [plastic regions](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#plastic-regions) - * [spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) - * [summarize spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots) - * [borders](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#borders) - * [Modules](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#modules) - * [Spot modules](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spot-modules) - * [Fasta](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#fasta) - * [Genes](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#Genes) - * [Protein families](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#protein-families) - * [Genes families](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gene-families) - * [Regions](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#regions) - * [MSA](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#MSA) - * [partitions](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#partitions-1) - * 
[source](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#source) - * [phylo](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#phylo) - * [Info](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#info) - * [Metrics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#Metrics) - * [Metadata](https://github.com/labgem/PPanGGOLiN/wiki/Metadata) \ No newline at end of file diff --git a/docs/user/Installation.md b/docs/user/Installation.md index 5128a531..b04009e4 100644 --- a/docs/user/Installation.md +++ b/docs/user/Installation.md @@ -1,6 +1,11 @@ -# Using conda +# Installation -The recommended way of installing PPanGGOLiN is to do it using conda.To use it you need to add the conda channels that store the dependencies as such : +## Latest version + +### Install with conda (recommended) + +The recommended way of installing PPanGGOLiN is to do it using conda. +To use it, you need to add the conda channels that store the dependencies as such: ``` conda config --add channels defaults ; @@ -12,32 +17,95 @@ Then you can just run : `conda install -c bioconda ppanggolin` -You can also use mamba, which is much quicker and sometimes help solving conflicting dependencies: - -``` -conda install mamba -mamba install -c bioconda ppanggolin +```{tip} +You can also use mamba, which is much quicker and sometimes help solving conflicting dependencies ``` -If you have troubles or if conda tells you something about conflicting dependencies, I recommend you install PPanGGOLiN on a separate conda environment as PPanGGOLiN has quite a few dependencies and their versions can be conflicting with other bioinformatics software. +If you have troubles or if conda tells you something about conflicting dependencies, I recommend you install PPanGGOLiN in a separate conda environment as PPanGGOLiN has quite a few dependencies, and their versions can be conflicting with other bioinformatics software. 
Sometimes, installation problems come from having an unsupported python version installed by default in your environment. Forcing the python version solves the problem: + ``` conda install -c bioconda ppanggolin python=3.8 ``` -# Using github +```{note} +Supported python versions are 3.8, 3.9 and 3.10 +``` + +### Install from source code (GitHub) + +If you want to install from the source code, you must first complete a few steps. + +```{warning} +This is a manual installation: we can help, but we cannot support every troubleshooting case. Prefer to use the conda installation whenever possible. +``` + +First, you must install the python dependencies. +For that, create a *dependencies.txt* file and copy/paste the following content. + +```text +tqdm>=4.66.0 +tables>=3.8.0 +networkx>=3.0 +scipy>=1.10.0 +plotly>=5.16.0 +gmpy2>=2.1.0 +pandas>=2.0.0 +colorlover>=0.3 +numpy>=1.24.0 +bokeh>=3.1.0 +``` + +Next, you can use **pip** to install dependencies + +```bash +python3 -m pip install -r dependencies.txt +``` +```{warning} +Be sure to use python version 3.8 or greater! +``` +Then you must install the following software: + +- [Prodigal>=2.6.3](https://github.com/hyattpd/Prodigal/wiki/installation) +- [Aragorn>=1.2.41](http://www.ansikte.se/ARAGORN/Downloads/) +- [Infernal>=1.1.4](http://eddylab.org/infernal/) +- [MMSeqs2>=13.45111](https://github.com/soedinglab/MMseqs2/wiki#installation) + +To finish, you can install ppanggolin by cloning the GitHub repository and using **pip** + +```bash +git clone https://github.com/labgem/PPanGGOLiN.git +cd PPanGGOLiN +pip install . +``` + +## Development version -If you want to use the development version you can use the 'master' branch on Github. While it is not guaranteed to work, it should most of the time. +If you want to use the development version, you can use the 'dev' branch on GitHub. While it is not guaranteed to work, it should most of the time. 
-You need to clone the repository on your computer, then you will need to have all of the dependencies (they are listed in the [requirements](https://github.com/labgem/PPanGGOLiN/blob/master/requirements.txt) file at the root of the repositories). You can install them using conda as such : -` conda install --file requirements.txt ` -Then you will need to install ppanggolin. You can use pip while in your conda environment which is the preferred way of doing. For this, place yourself at the root of the repository and run : +You need to clone the repository on your computer, as follows: -`pip install .` +```bash +git clone https://github.com/labgem/PPanGGOLiN.git +cd PPanGGOLiN +git checkout -b dev +git branch --set-upstream-to=origin/dev dev +git pull +``` -It should install PPanGGOLiN in your conda environment. -Alternatively, you can use the setup.py file directly, as such : +Then you will need to have all the dependencies. +You can install them as described [above](#install-from-source-code-github) with **pip** or by using **conda** and the [*requirements.txt file*](https://github.com/labgem/PPanGGOLiN/blob/dev/requirements.txt) as such: + +``` +conda install --file requirements.txt +``` + +Then you will need to install ppanggolin. You can use pip while in your conda environment, which is the preferred way of doing it. For this, place yourself at the root of the repository and run : + +``` +pip install . +``` -`python setup.py install .` \ No newline at end of file +It should install PPanGGOLiN in your conda environment. \ No newline at end of file diff --git a/docs/user/Introduction.md b/docs/user/Introduction.md index b631526f..41c52ec7 100644 --- a/docs/user/Introduction.md +++ b/docs/user/Introduction.md @@ -1,9 +1,18 @@ +# Introduction -PPanGGOLiN is built as a software suite to analyse groups of genomes potentially belonging to the same taxonomic group (subspecies, species, genus...). 
It is open source CeCiLL-licensed software implemented in C and python3 and works under Linux and MacOS. It is built to use multiple cores and is able to handle tens of thousands of genomes in a single analysis with (relatively) reasonable CPU and RAM usage. +PPanGGOLiN is built as a software suite to analyse groups of genomes potentially belonging to the same taxonomic group (subspecies, species, genus...). +It is open source CeCiLL-licensed software, implemented in C and python3, and works under Linux and MacOS. +It is built to use multiple cores and is able to handle tens of thousands of genomes in a single analysis with (relatively) reasonable CPU and RAM usage. -The software has a minimal one-liner that can be used to generate most of the outputs of PPanGGOLiN without parameter tuning. While it should work fine at the species or subspecies level, it will yield very poor results with more divergent genomes, and working with PPanGGOLiN on such datasets will require parameter tuning. -If you want to tune your analysis, the PPanGGOLiN suite can be used through multiple subcommands that allow tuning parameters for each step of the analysis. The results of each analysis are saved in the same file, which is in the HDF-5 file format (with the .h5 extension). When you run an analysis using this file as input, the results of that analysis will be added to the file to supplement the data that are already stored in it. -The idea behind this is that you can store and manipulate your pangenomes with PPanGGOLiN by using this file only. It will keep all the information about what was done, all the parameters used, and all of the pangenome's content. +The software has a minimal one-liner that can be used to generate most of the outputs of PPanGGOLiN without parameter tuning. +While it should work fine at the species or subspecies level, it will yield very poor results with more divergent genomes, and working with PPanGGOLiN on such datasets will require parameter tuning. 
+ + +If you want to tune your analysis, the PPanGGOLiN suite can be used through multiple subcommands that allow tuning parameters for each step of the analysis. +The results of each analysis are saved in the same file, which is in the HDF-5 file format (with the .h5 extension). +When you run an analysis using this file as input, the results of that analysis will be added to the file to supplement the data that are already stored in it. +The idea behind this is that you can store and manipulate your pangenomes with PPanGGOLiN by using this file only. It will keep all the information about what was done, all the parameters used, and all the pangenome's content. + Keeping it this way makes it possible to use the file as a pangenome reference database for a taxonomic group for some applications such as evaluating MAG completeness or contamination, or comparing a genome with a pangenome to assign partitions to the genome's genes. Other applications are currently in development. \ No newline at end of file diff --git a/docs/user/Outputs.md b/docs/user/Outputs.md index 70a0bebb..c88aef22 100644 --- a/docs/user/Outputs.md +++ b/docs/user/Outputs.md @@ -1,3 +1,6 @@ +(output)= +# PPanGGOLiN outputs + PPanGGOLiN provides multiple outputs to describe a pangenome. In this section the different outputs will be described. In most cases it will provide with a HDF-5 file named "pangenome.h5". This file stores all the information about your pangenome and the analysis that were run. If given to ppanggolin through most of the subcommands, it will read information from it. This is practical as you can regenerate figures or output files, or rerun parts of the analysis without redoing everything. @@ -6,495 +9,80 @@ In this section, each parts will describe a possible output of PPanGGOLiN, and w When using the same subcommand (like 'write' or 'draw' that can help you generate multiple file each), you can provide multiple options to write all of the file formats that you desire at once. 
-# Draw - -## U-shaped plot - -A U-shaped plot is a figure presenting the number of families (y axis) per number of organisms (x axis). -It is a .html file that can be opened with any browser and with which you can interact, zoom, move around, mouseover to see numbers in more detail, and you can save what you are seeing as a .png image file. - -It can be generated using the 'draw' subcommand as such : - -`ppanggolin draw -p pangenome.h5 --ucurve` - -## tile plot - -A tile plot is a heatmap representing the gene families (y axis) in the organisms (x axis) making up your pangenome. The tiles on the graph will be colored if the gene family is present in an organism and uncolored if absent. The gene families are ordered by partition, and the genomes are ordered by a hierarchical clustering based on their shared gene families (basically two genomes that are close together in terms of gene family composition will be close together on the figure). - -This plot is quite helpful to observe potential structures in your pangenome, and can also help you to identify eventual outliers. You can interact with it, and mousing over a tile in the plot will indicate to you which is the gene identifier(s), the gene family and the organism that corresponds to the tile. - -If you build your pangenome using the 'workflow' subcommand and you have more than 500 organisms, only the 'shell' and the 'persistent' partitions will be drawn, leaving out the 'cloud' as the figure tends to be too heavy for a browser to open it otherwise. 
- -It can be generated using the 'draw' subcommand as such : - -`ppanggolin draw -p pangenome.h5 --tile_plot` - -and if you do not want the 'cloud' gene families as it is a lot of data and can be hard to open with a browser sometimes, you can use the following option : - -`ppanggolin draw -p pangenome.h5 --tile_plot --nocloud` - - -## Spot plots - -For versions 1.2.30 and above, the 'draw' command can draw specific spots of interest, whose ID are provided, or all the spots if you wish. -It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. -The command can be used as such: - -`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive .html figure and a gexf file for all the spots. - -If you are interested in only a single spot, you can use its identifier to draw it, as such: - -`ppanggolin draw -p pangenome.h5 --spots spot_34` for spot_34, for example. - -The interactive figures that are drawn look like this: - -![interactive figure](https://github.com/labgem/PPanGGOLiN/raw/master/images/drawspot_example.png) +## PPanGGOLiN figures outputs -The plot represents the different gene organizations that are found in the spot. If there are RGPs with identical gene organization, the organization is represented only once (the represented RGP is picked at random among all identical RGPs). The list of RGPs with the same organization is accessible in the file written alongside the figure called 'spot_X_identical_rgps.tsv', with X the spot_id. - -They can be edited using the sliders and the radio buttons, to change various graphical parameters, and then the plot itself can be saved using the save button on the right of the screen, if need be. -For the gexf file, you can see how to visualize it in the section about the [pangenome gexf](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#gexf-and-light-gexf). 
- -# Rarefaction - -This figure is not drawn by default in the 'workflow' subcommand as it requires a lot of computations. It represents the evolution of the number of gene families for each partition as you add more genomes to the pangenome. It has been used a lot in the literature as an indicator of the diversity that you are missing with your dataset on your taxonomic group. The idea is that if at some point when you keep adding genomes to your pangenome you do not add any more gene families, you might have access to your entire taxonomic group's diversity. On the contrary if you are still adding a lot of genes you may be still missing a lot of gene families. - -There are 8 partitions represented. For each of the partitions there are multiple representations of the observed data. You can find the observed means, medians, 1st and 3rd quartiles of the number of gene families per number of genome used. And you can find the fitting of the data by the Heaps' law, which is usually used to represent this evolution of the diversity in terms of gene families in each of the partitions. - -It can be generated using the 'rarefaction' subcommand, which is dedicated to drawing this graph, as such : - -`ppanggolin rarefaction -p pangenome.h5` - -A lot of options can be used with this subcommand to tune your rarefaction curves, most of them are the same as with the `partition` workflow. 
-The following 3 are related to the rarefaction alone: - -- `--depth` defines the number of samplings for each number of organisms (default 30) -- `--min` defines the minimal number of organisms in a sample (default 1) -- `--max` defines the maximal number of organisms in a sample (default 100) - -So for example the following command: -`ppanggolin rarefaction -p pangenome.h5 --min 5 --max 50 --depth 30` - -Will draw a rarefaction curve with sample sizes between 5 and 50 (between 5 and 50 genomes will be used), and with 30 samples at each point (so 30 samples of 5 genomes, 30 samples of 6 genomes ... up to 50 genomes). - -# Write -## Organisms statistics - -The organisms_statistics.tsv file is a tab-separated file describing the content of each of the genomes used for building the pangenome. It might be useful when working with fragmented data such as MAGs or if you suspect some of your genomes to be chimeric, or to not belong to your taxonomic group (as those genomes will be outliers with regard to the numbers in this file). -The first lines starting with a '#' are indicators of parameters used when generating the numbers describing each organism, and should not be read if loading this into a spreadsheet. They will be skipped automatically if you load this file with R. 
- -This file is made of 15 columns described in the following table +### U-shaped plot +```{include} Figures/Uplot.md +``` -| Column | Description | -|------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| organism | Indicates the organism's name to whom the provided genome belongs to | -| nb_families | Indicates the number of gene families present in that genome | -| nb_persistent_families | The number of persistent families present in that genome | -| nb_shell_families | The number of shell families present in that genome | -| nb_cloud_families | The number of cloud families present in that genome | -| nb_exact_core | The number of exact core families present in that genome. This number should be identical in all genomes. | -| nb_soft_core | The number of soft core families present in that genome. The threshold used is indicated in the #soft_core line at the beginning of the file, and is 0.95 by default. | -| nb_genes | The number of genes in that genome | -| nb_persistent_genes | The number of genes whose family is persistent in that genome | -| nb_shell_genes | The number of genes whose family is shell in that genome | -| nb_cloud_genes | The number of genes whose family is cloud in that genome | -| nb_exact_core_genes | The number of genes whose family is exact core in that genome | -| nb_soft_core_genes | The number of genes whose family is soft core in that genome | -| completeness | This is an indicator of the proportion of single copy markers in the persistent that are present in the genome. 
While it is expected to be relatively close to 100 when working with isolates, it may be particularly interesting when working with very fragmented genomes as this provide a *de novo* estimation of the completess based on the expectation that single copy markers within the persistent should be mostly present in all individuals of the studied taxonomic group | -| nb_single_copy_markers | This indicates the number of present single copy markers in the genomes. They are computed using the parameter duplication_margin indicated at the beginning of the file. They correspond to all of the persistent gene families that are not present in more than one copy in 5% (or more) of the genomes by default. | +### tile plot +```{include} Figures/tilePlot.md +``` -It can be generated using the 'write' subcommand as such : +### Spot plots +```{include} Figures/spots.md +``` -`ppanggolin write -p pangenome.h5 --stats` +## Rarefaction +```{include} Figures/rarefaction.md +``` -This command will also generate the 'mean_persistent_duplication.tsv' file. +## Write +### Organisms statistics +```{include} Flat/orgStat.md +``` -## pangenomeGraph files +### pangenomeGraph files The pangenome's graph can be given through multiple data formats, in order to manipulate it with other softwares. -### gexf and light gexf -The Graph can be given through the .gexf and through the _light.gexf files. The _light.gexf file will contain the gene families as nodes and the edges between gene families describing their relationship, and the .gexf file will contain the same thing, but also include more informations about each gene and each relation between gene families. -We have made two different files representing the same graph because, while the non-light file is exhaustive, it can be very heavy to manipulate and most of the information in it are not of interest to everyone. The _light.gexf file should be the one you use to manipulate the pangenome graph most of the time. 
- -They can be manipulated and visualised through a software called [Gephi](https://gephi.org/), with which we have made extensive testing, or potentially any other software or libraries that can read gexf files such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. - -Using Gephi, the layout can be tuned as illustrated below: - -![Gephi layout](https://github.com/labgem/PPanGGOLiN/raw/master/images/gephi.gif) - -We advise the Gephi "Force Atlas 2" algorithm to compute the graph layout with "Stronger Gravity: on" and "scaling: 4000" but don't hesitate to tinker with the layout parameters. - -In the _light.gexf file : -The nodes will contain the number of genes belonging to the gene family, the most common gene name (if you provided annotations), the most common product name (if you provided annotations), the partitions it belongs to, its average and median size in nucleotides, and the number of organisms that have this gene family. - -The edges contain the number of times they are present in the pangenome. - -The .gexf non-light file will contain in addition to this all the information about genes belonging to each gene family, their names, their product string, their sizes and all the information about the neighborhood relationships of each pair of genes described through the edges. - -The light gexf can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --light_gexf` - -while the gexf file can be generated as such : - -`ppanggolin write -p pangenome.h5 --gexf` - -### json -The json file's content corresponds to the .gexf file content, but in json rather than gexf file format. 
It follows the 'node-link' format as shown in [this example](https://observablehq.com/@d3/force-directed-graph) in javascript, or as used in the [networkx](https://networkx.github.io/documentation/stable/reference/readwrite/json_graph.html) python library and it should be usable with both [D3js](https://d3js.org/) and [networkx](https://networkx.github.io/documentation/stable/index.html), or any other software or library that supports this format. - -The json can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --json` - -## gene presence absence - -This file is basically a presence absence matrix. The columns are the genomes used to build the pangenome, the lines are the gene families. The identifier of the gene family is the gene identifier chosen as a representative. - There is a 1 if the gene family is present in a genome, and 0 otherwise. It follows the exact same format than the 'gene_presence_absence.Rtab' file that you get from the pangenome analysis software [roary](https://sanger-pathogens.github.io/Roary/) - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --Rtab` - -## matrix - -This file is a .csv file following a format alike the gene_presence_absence.csv file generated by [roary](https://sanger-pathogens.github.io/Roary/), and works with [scoary](https://github.com/AdmiralenOla/Scoary) if you want to do pangenome-wide association studies. - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --csv` - -## mean persistent duplication - -This file is a .tsv file, with a single parameter written as a comment at the beginning of the file, which indicates the proportion of genomes in which a gene family must be present more than once to be considered 'duplicated' (and not single copy marker). 
-This file lists the gene families, their duplication ratio, their mean presence in the pangenome and whether it is considered a 'single copy marker' or not, which is particularly useful when calculating the completeness recorded in the [organisms statistics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#organisms-statistics) file described previously. - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --stats` - -This command will also generate the 'organisms_statistics.tsv' file. - -## partitions - -Those files will be stored in the 'partitions' directory and will be named after the partition that they represent (like persistent.txt for the persistent partition). In each of those file there will be a list of gene family identifiers that correspond to the gene families belonging to that partition, one family per line, should you need it for your pipelines or during your analysis. - -You can generate those files as such : - -` ppanggolin write -p pangenome.h5 --partitions` - -## projection - -This option writes in a 'projection' directory. There will be a file written in the .tsv file format for every single genome in the pangenome. 
-The columns of this file are described in the following table : - -| Column | Description | -|----------------------|--------------------------------------------------------------------------------------------------------------------------------| -| gene | the unique identifier of the gene | -| contig | the contig that the gene is on | -| start | the start position of the gene | -| stop | the stop position of the gene | -| strand | The strand that the gene is on | -| ori | Will be T if the gene name is dnaA | -| family | the family identifier to which the gene belongs to | -| nb_copy_in_org | The number of copy of the family in the organism (basically, if 1, the gene has no closely related paralog in that organism) | -| partition | the partition to which the gene family of the gene belongs to | -| persistent_neighbors | The number of neighbors classified as 'persistent' in the pangenome graph | -| shell_neighbors | The number of neighbors classified as 'shell' in the pangenome graph | -| cloud_neighbors | The number of neighbors classidied as 'cloud' in the pangenome graph | - -Those files can be generated as such : - -`ppanggolin write -p pangenome.h5 --projection` - -## Gene families and genes - -You can write a list containing the gene family assigned to every single gene of your pangenome, in a file format extactly like the one provided by [MMseqs2](https://github.com/soedinglab/MMseqs2) through its subcommand 'createtsv'. -It is basically a three-column file listing the gene family name in the first column, and the gene names in the second. A third column is either empty, or has an "F" in it. In that case it indicates that the gene is potentially a gene fragment and not complete. This will be indicated only if the [defragmentation](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#defragmentation) pipeline is used. 
- -You can obtain it as such : - -`ppanggolin write -p pangenome.h5 --families_tsv` - -## Plastic regions - -This file is a tsv file that lists all of the detected Regions of Genome Plasticity. This requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. - -It can be written with the following command: -`ppanggolin write -p pangenome.h5 --regions` - -The file has the following format : - -| column | description | -|--------|-------------| -| region | a unique identifier for the region. This is usually built from the contig it is on, with a number after it| -|organism| the organism it is in. This is the organism name provided by the user.| -|start| the start position of the RGP in the contig| -|stop| the stop position of the RGP in the contig| -|genes| the number of genes included in the RGP| -|contigBorder| this is a boolean column. If the RGP is on a contig border it will be True, otherwise, it will be False. This often can indicate that, if an RGP is on a contig border it is probably not complete.| -|wholeContig| this is a boolean column. If the RGP is an entire contig, it will be True, and False otherwise. If an RGP is an entire contig it can possibly be a plasmid, a region flanked with repeat sequences or a contaminant| - -## Spots - -This is a tsv file with two column. It links the spots of 'summarize_spots' with the RGPs of 'plastic_regions'. - -It is written with the following command: -`ppanggolin write -p pangenome.h5 --spots` - -|column|description| -|------|------------| -|spot_id| The spot identifier (found in the 'spot' column of 'summarize_spots')| -|rgp_id| the RGP identifier (found in 'region' column of 'plastic_regions')| - -## Summarize spots - -This is a tsv file that will associate each spot with multiple metrics that can indicate the dynamic of the spot. 
- -It is written with the following command: -`ppanggolin write -p pangenome.h5 --spots` - -|column| description| -|-------|------------| -|spot| the spot identifier. It is unique in the pangenome| -|nb_rgp| the number of RGPs present in the spot| -|nb_families| The number of different gene families that are found in the spot| -|nb_unique_family_sets| The number of RGPs with different gene family content. If two RGPs are identical, they will be counted only once. The difference between this number and the one provided in 'nb_rgp' can be a strong indicator of whether there is a high turnover in gene content in this area or not| -|mean_nb_genes| the mean number of genes on RGPs in the spot| -|stdev_nb_genes| the standard deviation of the number of genes in the spot| -|max_nb_genes| the longest RGP in number of genes of the spot| -|min_nb_genes| the shortest RGP in number of genes of the spot| - -## Borders - -Each spot has at least one set of gene families bordering them. To write the list of gene families bordering a spot, you need to use the following option: -`ppanggolin write -p pangenome.h5 --borders` - -It will write a .tsv file with 4 columns: - -|column| description| -|-------|------------| -|spot_id| the spot identifier. It is unique in the pangenome| -|number| the number of RGPs present in the spot that have those bordering genes| -|border1| Comma-separated list of gene families of the 1st border| -|border2| Comma-separated list of gene families of the 2nd border| - -As there can be some variation in the borders, some spots will have multiple borders and as such multiple lines in this file. -The sum of the number for each spot_id should be exactly the number of RGPs in the spot. - -## Modules -### Functional modules -This .tsv file lists the modules and the gene families that belong to them. It lists one family per line, and there are multiple lines for each module. 
-It is written along with other files with the following command: -`ppanggolin write -p pangenome.h5 --modules` - -It follows the following format: -|column|description| -|------|------------| -|module_id| The module identifier| -|family_id| the family identifier| - -### Modules in organisms -This .tsv file lists for each organism the modules that are present and how complete they are. Since there are some variability that are allowed in the module predictions, occasionnally some modules can be incomplete in some of the organisms where they are found. -This file is written along with other files with the following command: -`ppanggolin write -p pangenome.h5 --modules` - -And it follows the following format: -|column|description| -|------|------------| -|module_id| The module identifier| -|organism| the organism which has the indicated module| -|completion| a value between 0.0 and 1.0 which indicates how complete (in terms of gene family) the module is in the given organism| - -### modules summary -This .tsv file lists a few characteristics for each detected module. There is one line for each module. -The file is written along with other files with the following command: -`ppanggolin write -p pangenome.h5 --modules` - -And it follows the following format: -|column|description| -|------|------------| -|module_id| The module identifier| -|nb_families| The number of families which are included in the module The families themselves are listed in the 'functional_modules.tsv' file.| -|nb_organisms|The number of organisms in which the module is found. Those organisms are listed in the 'modules_in_organisms.tsv' file.| -|partition| The average partition of the families in the module.| -|mean_number_of_occurrence| the mean number of time a module is present in each organism. 
The expected value is around one, but it can be more if it is a module often repeated in the genomes (like a phage).| - -## spot modules -This command is available only if both modules and spots have been computed for your pangenome (see the command `all`, or the commands `spot` and `module` for that). -It indicates which modules are present in which spot and in which RGP. -The files are written with the following command: -```ppanggolin write -p pangenome.h5 --spot_modules``` -The format of the 'modules_spots.tsv' file is the following: - -|column|description| -|------|------------| -|module_id| The module identifier| -|spot_id| the spot identifier| - -The file 'modules_RGP_lists.tsv' lists RGPs that have the same modules. Those RGPs can have different gene families, however they will not have any other module than those that are indicated. The format of the 'modules_RGP_lists.tsv' is the following: - -|column|description| -|------|------------| -|representative_RGP| an RGP deemed representative for the group, and serving as a 'group of rgp id'(randomly picked)| -|nb_spots| The number of spots in which we see the RGPs which have the modules listed afterwards| -|mod_list| a list of the modules that are in the indicated RGPs| -|RGP_list| a list of RGP that include exactly the modules listed previously| - -This information can also be visualized through figures that can be drawn with `ppanggolin draw --spots` (see [Spot plots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spot-plots), and which can display modules. - -# Fasta -This command is available from 1.1.98 and on. -This command can be used to write fasta sequences of the pangenome or of parts of the pangenome. Most options expect a partition to write. Available partitions are: -* 'all' for the entire pangenome. 
-* 'Persistent' for persistent families -* 'Shell' for shell genes or families -* 'Cloud' for cloud genes or families -* 'rgp' for genes or families found in RGPs -* 'core' for core genes or families -* 'softcore' for softcore genes or families - -When using the 'softcore' filter, the '--soft_core' option can be used to modify the threshold used to determine what is part of the soft core. It is at 0.95 by default. - -## Genes - -This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example: - -```ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all``` - -Or to write only the persistent genes: - -```ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent``` - - -## Protein families - -This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: - -```ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all``` - -or for all of the shell families for example: - -```ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell``` - - -## Gene families - -This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: - -```ppanggolin fasta -p pangenome.h5 --output MY_GENES --gene_families all``` - -or for the cloud families for example: - -```ppanggolin fasta -p pangenome.h5 --output MY_GENES --gene_families cloud``` - -## Regions - -This option can be used to write the nucleotide sequences of the detected RGPs. It requires the fasta sequences that were used to compute the pangenome as they were provided originally when you computed your pangenome. 
This command only has two filters: -* all, for all regions -* complete, for only the 'complete' regions which are not on a contig border - -It can be used as such: - -```ppanggolin fasta -p pangenome.h5 --output MYREGIONS --regions all --fasta organisms.fasta.list``` - -# MSA - -This command is available from 1.1.103 and on. -It is used to call [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to compute MSA of any partition of the pangenome. Using multiple cpus is recommended as it is quite demanding in computational resources. - -By default it will write the strict 'core' (genes that are present in absolutely all genomes) and remove any duplicated genes. Beware however that, if you have many genomes (over 1000), the core will likely be either very small or even empty if you have fragmented genomes. - -It will write one MSA for each family. You can then provide the directory where the MSA are written to [IQ-TREE](https://github.com/Cibiv/IQ-TREE) for example, to do phylogenetic analysis. - -## partitions - -You can change the partition which is written, by using the --partition option. -`ppanggolin msa -p pangenome.h5 --partition persistent` for example will compute MSA for all the persistent gene families. - -Supported partitions are core, persistent, shell, cloud, softcore, accessory. If you wish to have additional filters, you can raise an issue with your demand, or write a PR directly, most possibilities should be quite straightforward to add. - -## source - -You can specify whether to use dna or protein sequences for the MSA by using --source. It uses protein sequences by default. - -`ppanggolin msa -p pangenome.h5 --source dna` - -## phylo - -It is also possible to write a single whole genome MSA file, which many phylogenetic softwares accept as input, by using the --phylo option as such: - -`ppanggolin msa -p pangenome.h5 --phylo` - -This will concatenate all of the family MSA into a single MSA, with one sequence for each genome. 
- -# Info - -When computing a pangenome, all of the information about it is saved in the .h5 file, notably parameters used at each step and metrics about the pangenome. You can easily retrieve those informations using the 'info' module. - -This command prints information on stdout, and does not write any file. - -## Content - -This option indicates the following metrics about your pangenome, if they have been computed: -* The total number of genes -* The number of genomes -* The number of gene families -* The number of edges in the pangenome graph -* The number of persistent genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition -* The number of shell genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition -* The number of cloud genes, with the minimal, maximal, sd and mean presence thresholds of the families in this partition -* The number of partitions - -Additionally, if you have predicted RGPs and spots (with the subcommands 'panrgp', 'rgp' and 'spot', or 'all'), you will have the following metrics: -* The number of RGPs (Regions of Genomic Plasticity) -* The number of spots of insertion - -Additionally, if you have predicted modules (with the subcommands 'panmodule', 'module' or 'all'): -* The number of modules -* The number of gene families in modules +#### gexf and light gexf +```{include} graphOut/GEXF.md +``` -It is used as such: -` ppanggolin info -p pangenome.h5 --content` -## Parameters +#### json +```{include} graphOut/JSON.md +``` -This option indicates, for each steps of the analysis, the PPanGGOLiN parameters that were used and the source of the data if appropriate. 
+```{include} Flat/presAbs.md +``` -It is used as such: +### mean persistent duplication +```{include} Flat/dupplication.md +``` -`ppanggolin info -p pangenome.h5 --parameters` +### partitions +```{include} Flat/partition.md +``` -# Metrics -After computing a pangenome, it's interesting to get some metrics about it. -The `metrics` subcommand allow running and compute some analysis and metrics. +### projection +```{include} Flat/projection.md +``` -All the metrics computed here will be saved in your pangenome file and -will be easily readable with the `info` subcommand +### Gene families and genes +```{include} Flat/fam2gen.md +``` -## Genomic fluidity -The genomic fluidity is described as *a robust metric to categorize the -gene-level similarity among groups of sequenced isolates.* -[more information here](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-12-32) +### Genomic Island +```{include} Flat/RGP.md +``` -We add the possibility to get genomic fluidity for all the pangenome or -for specific partition. The genomic fluidity is computable like that : +### Modules +```{include} Flat/module.md ``` -ppanggolin metrics -p pangenome --genome_fluidity -... -Genomes fluidity : all=0.026, shell=0.477, cloud=0.045, accessory=0.554 + +## Fasta +```{include} sequence/fasta.md ``` -*all* correspond to all the family in the pangenome (core and accessory) -## Module information -It could be necessary to get more information about the modules. -Here we provide information about families, and we separate modules in -function of the partition. You can get this supplementary information -as such : +## MSA +```{include} sequence/MSA.md ``` -ppanggolin metrics -p pangenome.h5 --info_modules -... 
-Modules : 3 -Families in Modules : 22 (min : 5, max : 9, sd : 2.08, mean : 7.33) - Sheel specific : 36.36 (sd : 4.62, mean : 2.67) - Cloud specific : 63.64 (sd : 4.51, mean : 4.67) + +## Info +```{include} Flat/info.md ``` + +## Metrics +```{include} Flat/metrics.md +``` \ No newline at end of file diff --git a/docs/user/PPanGGOLiN---step-by-step-pangenome-analysis.md b/docs/user/PPanGGOLiN---step-by-step-pangenome-analysis.md deleted file mode 100644 index b31265c0..00000000 --- a/docs/user/PPanGGOLiN---step-by-step-pangenome-analysis.md +++ /dev/null @@ -1,101 +0,0 @@ -The workflow subcommand of PPanGGOLiN uses what you can use through the other subcommands. In this section, the different steps will be briefly described along with the inputs that can be given, and which subcommands to use to run those specific parts yourself. - -This is useful only if you want to customize your workflow parameters. If you want to build the pangenome of your species without tuning parameters, you can use the subcommand `ppanggolin workflow` as described in the introduction. - -# Annotation - -As an input file, you can provide a list of .fasta files. If you do so, the provided genomes will be annotated using the following tools: The CDS will be annotated using [Prodigal](https://github.com/hyattpd/Prodigal), the tRNA will be annotated using [ARAGORN](http://130.235.244.92/ARAGORN/) and the rRNA are annotated using the [Infernal](http://eddylab.org/infernal/) command-line tools coupled with HMM of the bacterial and archaeal rRNAs downloaded from [RFAM](https://rfam.xfam.org/). Then the CDS overlapping any RNA genes will be deleted as they are usually false positive calls. - -To run this part of the pipeline, you can do : - -`ppanggolin annotate --fasta ORGANISM_FASTA_LIST` - -With ORGANISM_FASTA_LIST following the format described for the 'workflow' subcommand. You can check [this example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.fasta.list). 
- -If you do not want to use this pipeline, you can provide your annotation files (This is especially recommended if you already have functional annotations of your genome, as they will be added to the pangenome). - -They can be either gff3 files or .gbk files or .gbff files, or a mix of them, and should be provided through a list like [this example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list). .gbk or .gbff files are preferred. - -You can provide them using the following command : - -`ppanggolin annotate --anno ORGANISM_ANNOTATION_LIST` - -With ORGANISM_ANNOTATION_LIST being your file listing the organisms and the annotation file associated. If your annotation files do not have the genome sequence in them, you can use both options at the same time (to have both the gene annotations and the gene sequences) as such : - -`ppanggolin annotate --anno ORGANISM_ANNOTATION_LIST --fasta ORGANISM_FASTA_LIST` - -In addition, you can tune the command that is run with the annotation pipeline, or read in your annotation files using various options. You can check them through the command line `ppanggolin annotate --help`, the help should be self-explanatory. If not, don't hesitate to ask questions through the [issues page](https://github.com/labgem/PPanGGOLiN/issues). - -# Clustering - -Once we have the genes, we need to compare them to know which are similar, and to build gene families through this information. -If you provided .fasta files or annotation files with gene sequences in them, clustering can be run directly by providing the .h5 file that was generated, as such : - -`ppanggolin cluster -p pangenome.h5` - -PPanGGOLiN will call [MMseqs2](https://github.com/soedinglab/MMseqs2) to run the clustering on all the protein sequences by searching for connected components for the clustering step. You can tune its parameters using `--identity`(default 0.8) and `--coverage`(default 0.8). 
You can use other clustering algorithms of MMseqs by using --mode (default 1). Both protein sequences have to be covered by at least the proportion indicated by --coverage. - -## Providing your gene families - -If you do not want to use MMseqs2 and provide your clusters (or gene families) you can do so only if you provided the annotations in the first step. In the case of gff3 files, the 'ID' field in the 9th column is expected as a gene id. In the case of gbff or gbk files, the 'locus_tag' is used as a gene id, except with files coming from MaGe or from SEED, where the id provided in the 'db_xref' field is used. - -You will need to provide a .tsv file. The first column indicates the cluster id, and the second column indicates a unique gene id that is used in the annotation files. There is a single gene id per line. - -You can do that through the command line : - -`ppanggolin cluster -p pangenome.h5 --clusters MY_CLUSTERS_FILE` - -An example of what MY_CLUSTERS_FILE should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) - -There are other options that you can use to tune your clustering. Most of them should be self-explanatory. If not, do not hesitate to write an [issue](https://github.com/labgem/PPanGGOLiN/issues). The only tricky option is the '--no-defrag' option. - -## Defragmentation - -We noticed that most of the cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes, and so not informative on the pangenome's diversity. We added an additional workflow to reduce the number of gene families and reduce the computational load by trying to associate fragments to their original gene families. - -It adds a step to the clustering described previously. It will compare all of the gene families representative protein sequences using the same identity threshold as the first step. 
It will also use the same coverage threshold, but only the smallest of both protein sequences have to be covered by at least the value indicated by '--coverage'. - -After that, we build a similarity graph where the edges are the hits given by the comparison, and the nodes are the original gene families. Then we iter on all nodes and compare them to their neighbors. If the neighbor of a node is more numerous (has more members in the cluster it represents) and its representative sequence is longer, that node (and all the genes associated) is associated with the neighbor. The genes associated with this node are defined as 'fragments' of the gene family represented by the longer and more numerous neighboring node. - -You can use this pipeline by using the following command for versions 1.1.88 and under: - -`ppanggolin cluster -p pangenome.h5 --defrag` - -Starting from 1.1.89 and on, this strategy has become the default behavior of PPanGGOLiN. To avoid using it, you can run the following: - -`ppanggolin cluster -p pangenome.h5 --no_defrag` - -In any case and whichever pipeline you use, in the end, the gene families will be saved in the 'pangenome.h5' given as input. - -# Graph - -To partition a pangenome graph, you need to build a said pangenome graph. This can be done through the 'graph' subcommand. This will take a pangenome .h5 file as input and add edges to it. -This subcommand has only a single other option, which is '-r' or '--remove_high_copy_number'. If used, it will remove the gene families that are too duplicated in your genomes. This is useful if you want to visualize your pangenome afterward and want to remove the biggest hubs to have a clearer view. It can also be used to limit the influence of very duplicated genes such as transposase or ABC transporters in the partition step. - -The graph is constructed using the following subcommand : - -`ppanggolin graph -p pangenome.h5` - -And the results are saved in the pangenome.h5 file given as input. 
- -# Partition - -This is the step that will assign gene families to the 'persistent', 'shell', or 'cloud' partitions. - -The 'persistent' partition will group genes that are present throughout the entire species. They will be essential genes, genes required for important metabolic pathways and genes that define the metabolic and biosynthetic capabilities of the taxonomic group. - -The 'shell' partition groups genes that are present in only some individuals. Those are often genes that were acquired through horizontal gene transfers and encode for functions involved in environmental adaptations, pathogenicity, virulence or encoding secondary metabolites for example. - -The 'cloud' partition groups genes that are very rare in the pangenome and found in one, or very few, individuals. Most of the genes were associated with phage-related genes. They probably all were acquired through horizontal gene transfers. Antibiotic resistance genes were often found to be belonging to the cloud genome, as well as plasmid genes. - -It can be realized through the following subcommand : - -`ppanggolin partition -p pangenome.h5` - -It also has quite a few options. Most of them are not self-explanatory. If you want to know what they do, you should read the PPanGGOLiN paper (you can read it [here](https://journals.plos.org/ploscompbiol/article?rev=2&id=10.1371/journal.pcbi.1007732)) where the statistical methods used are thoroughly described. - -The one parameter that might be of importance is the '-K', or '--nb_of_partitions' parameter. This will define the number of classes used to partition the pangenome. This may be of use if you expect to have well-defined subpopulations in your pangenome, and you know exactly how many. If not, that number is detected automatically through an ICL criterion. The idea is that the most present partition will be 'persistent', the least present will be 'cloud', and all the others will be 'shell'. 
The number of partitions corresponding to the shell will be the number of expected subpopulations in your pangenome. (So if you expect 5 subpopulations, you could use -K 7). - -In most cases, you should let the statistical criterion used by PPanGGOLiN find the optimal number of partitions for you. - -All the results will be added to the given 'pangenome.h5' input file. \ No newline at end of file diff --git a/docs/user/RegionGenomicPlasticity/RGP.md b/docs/user/RegionGenomicPlasticity/RGP.md new file mode 100644 index 00000000..541a901f --- /dev/null +++ b/docs/user/RegionGenomicPlasticity/RGP.md @@ -0,0 +1,9 @@ + +Once partitions have been computed, you can predict the regions of genome plasticity. +This subcommand's options are about tuning parameters for the analysis. Details about each parameter can be found in the related [article](https://doi.org/10.1093/bioinformatics/btaa792). + +You can do it as such: + +`ppanggolin rgp -p pangenome.h5` + +This will predict RGPs and store results in the HDF5 file. If you want a list of RGPs for each genome, you can use `ppanggolin write -p pangenome.h5 --regions --output MYOUTPUTDIR`. It will provide the file 'plastic regions' whose format is described [here](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#plastic-regions) \ No newline at end of file diff --git a/docs/user/RegionGenomicPlasticity/RGPclust.md b/docs/user/RegionGenomicPlasticity/RGPclust.md new file mode 100644 index 00000000..f6fa9e7d --- /dev/null +++ b/docs/user/RegionGenomicPlasticity/RGPclust.md @@ -0,0 +1,25 @@ + +To cluster RGPs (Regions of Genome Plasticity) based on their gene families, you can use the command `panggolin rgp_cluster`. +The panggolin rgp_cluster command performs the following steps to cluster RGPs (Regions of Genome Plasticity) based on their gene families: + +1. Calculation of GRR (Gene Repertoire Relatedness): The command calculates the GRR values for all pairs of RGPs. 
The GRR metric evaluates the similarity between two RGPs by assessing their shared gene families. +2. Graph Construction: The command constructs a graph representation of the RGPs, where each RGP is represented as a node in the graph. The edges between the nodes are weighted using the GRR values, indicating the strength of the relationship between the RGPs. +3. Filtering GRR Values: GRR values below the `--grr_cutoff` threshold (default 0.8) are filtered out to remove noise from the analysis. +4. Louvain Communities Clustering: The Louvain communities clustering algorithm is then applied to the graph. This algorithm identifies clusters of RGPs with similar gene family relationships. + +There are three modes available for calculating the GRR value: `min_grr`, `max_grr`, or `incomplete_aware_grr`. +- `min_grr` mode: This mode computes the number of gene families shared between two RGPs and divides it by the smaller number of gene families among the two RGPs. +- `max_grr` mode: In this mode, the number of gene families shared between two RGPs is calculated and divided by the larger number of gene families among the two RGPs. +- `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGPs. + + +The resulting RGP clusters are stored in a tsv file with the following columns: + +| column | description | +|---------|------------------------------| +| RGP | The unique region identifier | +| cluster | The cluster id of the RGP | +| spot_id | the spot ID of the RGP | + + +The command also generates an RGP graph in the gexf format, which can be utilized to explore the RGP clusters along with their spots of insertion.
In this graph identical RGPs with the same family content and with the same spot are merged into a single node to simplify the graph representation. This feature can be disabled with the parameter `--no_identical_rgp_merging`. diff --git a/docs/user/RegionGenomicPlasticity/spot.md b/docs/user/RegionGenomicPlasticity/spot.md new file mode 100644 index 00000000..cc5e6dbd --- /dev/null +++ b/docs/user/RegionGenomicPlasticity/spot.md @@ -0,0 +1,10 @@ + +To study RGPs that are found in the same area in different genomes, we gather them into 'spots of insertion'. Those spots are groups of RGPs that do not necessarily have the same gene content but have similar bordering _persistent_ genes. We run those analyses to study the dynamic of gene turnover of large regions in bacterial genomes. In this way, spots of the same pangenome can be compared and their dynamic can be established by comparing their different metrics together. Those metrics are described in the [output wiki](https://github.com/labgem/PPanGGOLiN/wiki/Outputs). + +Spots can be computed once RGPs have been predicted. You can do that using: + +`ppanggolin spot -p pangenome.h5` + +For versions between 1.1.0 and 1.2.12, you can use the additional option '--draw_hotspots' which uses [genoplotR](http://genoplotr.r-forge.r-project.org/) to draw those spots in png figures. For versions above 1.2.12, you can use the dedicated subcommand [draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw), which uses the python library [bokeh](http://docs.bokeh.org/en/latest/) to draw interactive figures which can be visualized and modified directly in the browser. 
+ +Information about spots can then be written using `ppanggolin write -p pangenome --spots` which will provide a [file linking RGPs with their spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) and a [file showing multiple metrics for each spot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots) \ No newline at end of file diff --git a/docs/user/Regions-of-Genome-Plasticity.md b/docs/user/Regions-of-Genome-Plasticity.md index b7bfa771..9abd6998 100644 --- a/docs/user/Regions-of-Genome-Plasticity.md +++ b/docs/user/Regions-of-Genome-Plasticity.md @@ -1,3 +1,5 @@ +(RGP-section)= +# Regions of Genome Plasticity From version 1.1.0 and on, it is possible to predict and work with Regions of Genome Plasticity (RGP) using PPanGGOLiN. RGPs correspond roughly to genomic islands, plasmids, and regions that have been lost in multiple strains. They are areas of the genome where there is a stretch of _shell_ and _cloud_ genes, which can indicate a more plastic area than those made of only _persistent_ genes. @@ -5,56 +7,14 @@ Those analyses can be done using the `ppanggolin panrgp` command directly from y This analysis can also be run with dedicated subcommands. -# RGP +## RGP prediction +```{include} RegionGenomicPlasticity/RGP.md +``` -Once partitions have been computed, you can predict the regions of genome plasticity. -This subcommand's options are about tuning parameters for the analysis. Details about each parameter can be found in the related [article](https://doi.org/10.1093/bioinformatics/btaa792). - -You can do it as such: - -`ppanggolin rgp -p pangenome.h5` - -This will predict RGPs and store results in the HDF5 file. If you want a list of RGPs for each genome, you can use `ppanggolin write -p pangenome.h5 --regions --output MYOUTPUTDIR`. 
It will provide the file 'plastic regions' whose format is described [here](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#plastic-regions) - -# Spots of insertion - -To study RGPs that are found in the same area in different genomes, we gather them into 'spots of insertion'. Those spots are groups of RGPs that do not necessarily have the same gene content but have similar bordering _persistent_ genes. We run those analyses to study the dynamic of gene turnover of large regions in bacterial genomes. In this way, spots of the same pangenome can be compared and their dynamic can be established by comparing their different metrics together. Those metrics are described in the [output wiki](https://github.com/labgem/PPanGGOLiN/wiki/Outputs). - -Spots can be computed once RGPs have been predicted. You can do that using: - -`ppanggolin spot -p pangenome.h5` - -For versions between 1.1.0 and 1.2.12, you can use additional option '--draw_hotspots' which uses [genoplotR](http://genoplotr.r-forge.r-project.org/) to draw those spots in png figures. For versions above 1.2.12, you can use the dedicated subcommand [draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw), which uses the python library [bokeh](http://docs.bokeh.org/en/latest/) to draw interactive figures which can be visualized and modified directly in the browser. - -Information about spots can then be written using `ppanggolin write -p pangenome --spots` which will provide a [file linking RGPs with their spots](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#spots) and a [file showing multiple metrics for each spot](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#summarize-spots) - - - -# RGP cluster based on their gene families - -To cluster RGPs (Regions of Genome Plasticity) based on their gene families, you can use the command `panggolin rgp_cluster`. -The panggolin rgp_cluster command performs the following steps to cluster RGPs (Regions of Genome Plasticity) based on their gene families: - -1. 
Calculation of GRR (Gene Repertoire Relatedness): The command calculates the GRR values for all pairs of RGPs. The GRR metric evaluates the similarity between two RGPs by assessing their shared gene families. -2. Graph Construction: The command constructs a graph representation of the RGPs, where each RGP is represented as a node in the graph. The edges between the nodes are weighted using the GRR values, indicating the strength of the relationship between the RGPs. -3. Filtering GRR Values: GRR values below the `--grr_cutoff` threshold (default 0.8) are filtered out to remove noise from the analysis. -4. Louvain Communities Clustering: The Louvain communities clustering algorithm is then applied to the graph. This algorithm identifies clusters of RGPs with similar gene family relationships. - -There are three modes available for calculating the GRR value: `min_grr`, `max_grr`, or `incomplete_aware_grr`. -- `min_grr` mode: This mode computes the number of gene families shared between two RGPs and divides it by the smaller number of gene families among the two RGPs. -- `max_grr` mode: In this mode, the number of gene families shared between two RGPs is calculated and divided by the larger number of gene families among the two RGPs. -- `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGPs. - - -The resulting RGP clusters are stored in a tsv file with the folowing columns: - -| column | description | -|---------|------------------------------| -| RGP | The unique region identifier | -| cluster | The cluster id of the RGP | -| spot_id | the spot ID of the RGP | - - - -The command also generates an RGP graph in the gexf format, which can be utilized to explore the RGP clusters along with their spots of insertion. 
In this graph identical RGPs with the same family content and with the same spot are merged into a single node to simplify the graph representation. This feature can be disable with the parameter `--no_identical_rgp_merging`. +## Spots of insertion detection +```{include} RegionGenomicPlasticity/spot.md +``` +## RGP clustering based on their gene families +```{include} RegionGenomicPlasticity/RGPclust.md +``` \ No newline at end of file diff --git a/docs/user/_Sidebar.md b/docs/user/_Sidebar.md deleted file mode 100644 index e874b20b..00000000 --- a/docs/user/_Sidebar.md +++ /dev/null @@ -1,22 +0,0 @@ -* [Introduction](https://github.com/labgem/PPanGGOLiN/wiki/Introduction) -* [Installation instructions](https://github.com/labgem/PPanGGOLiN/wiki/Installation) -* [Basic usage and practical information](https://github.com/labgem/PPanGGOLiN/wiki/Basic-usage-and-practical-information) -* [step by step pangenome analysis](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis) - * [Annotation](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#annotation) - * [Cluster](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#clustering) - * [Graph](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#graph) - * [Partition](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#partition) -* [Regions of Genome Plasticity](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity) - * [RGP](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity#rgp) - * [Spots of insertion](https://github.com/labgem/PPanGGOLiN/wiki/Regions-of-Genome-Plasticity#spots-of-insertion) -* [Conserved modules](https://github.com/labgem/PPanGGOLiN/wiki/Conserved-modules) -* [Align](https://github.com/labgem/PPanGGOLiN/wiki/Align) -* [Genomic Context](https://github.com/labgem/PPanGGOLiN/wiki/Genomic-context) 
-* [Output](https://github.com/labgem/PPanGGOLiN/wiki/Outputs) - * [Draw](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#draw) - * [rarefaction](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#rarefaction) - * [Write](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#write) - * [Fasta](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#fasta) - * [MSA](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#MSA) - * [Info](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#info) - * [Metadata](https://github.com/labgem/PPanGGOLiN/wiki/Metadata) diff --git a/docs/user/graphOut/GEXF.md b/docs/user/graphOut/GEXF.md new file mode 100644 index 00000000..8a08729f --- /dev/null +++ b/docs/user/graphOut/GEXF.md @@ -0,0 +1,25 @@ +The Graph can be given through the .gexf and through the _light.gexf files. The _light.gexf file will contain the gene families as nodes and the edges between gene families describing their relationship, and the .gexf file will contain the same thing, but also include more informations about each gene and each relation between gene families. +We have made two different files representing the same graph because, while the non-light file is exhaustive, it can be very heavy to manipulate and most of the information in it are not of interest to everyone. The _light.gexf file should be the one you use to manipulate the pangenome graph most of the time. + +They can be manipulated and visualised through a software called [Gephi](https://gephi.org/), with which we have made extensive testings, or potentially any other softwares or libraries that can read gexf files such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. 
+ +Using Gephi, the layout can be tuned as illustrated below: + +![Gephi layout](https://github.com/labgem/PPanGGOLiN/raw/master/images/gephi.gif) + +We advise the Gephi "Force Atlas 2" algorithm to compute the graph layout with "Stronger Gravity: on" and "scaling: 4000" but don't hesitate to tinker with the layout parameters. + +In the _light.gexf file : +The nodes will contain the number of genes belonging to the gene family, the most common gene name (if you provided annotations), the most common product name (if you provided annotations), the partitions it belongs to, its average and median size in nucleotides, and the number of organisms that have this gene family. + +The edges contain the number of times they are present in the pangenome. + +The .gexf non-light file will contain in addition to this all the information about genes belonging to each gene family, their names, their product string, their sizes and all the information about the neighborhood relationships of each pair of genes described through the edges. + +The light gexf can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --light_gexf` + +while the gexf file can be generated as such : + +`ppanggolin write -p pangenome.h5 --gexf` \ No newline at end of file diff --git a/docs/user/graphOut/JSON.md b/docs/user/graphOut/JSON.md new file mode 100644 index 00000000..4f648889 --- /dev/null +++ b/docs/user/graphOut/JSON.md @@ -0,0 +1,5 @@ +The json's file content corresponds to the .gexf file content, but in json rather than gexf file format.
It follows the 'node-link' format as shown in [this example](https://observablehq.com/@d3/force-directed-graph) in javascript, or as used in the [networkx](https://networkx.github.io/documentation/stable/reference/readwrite/json_graph.html) python library and it should be usable with both [D3js](https://d3js.org/) and [networkx](https://networkx.github.io/documentation/stable/index.html), or any other software or library that supports this format. + +The json can be generated using the 'write' subcommand as such : + +`ppanggolin write -p pangenome.h5 --json` \ No newline at end of file diff --git a/docs/user/metadata.md b/docs/user/metadata.md index c9d92060..801fdd88 100644 --- a/docs/user/metadata.md +++ b/docs/user/metadata.md @@ -1,3 +1,5 @@ +# Adding metadata to pangenome elements + From version 2.0.0, it is possible to add metadata link to pangenome elements using PPanGGOLiN. Metadata can be associated with: genes, genomes, families, RGPs, spots and modules from a simple TSV file. To add metadata in your pangenome you can launch the command is as follows: @@ -7,7 +9,7 @@ To add metadata in your pangenome you can launch the command is as follows: - `--source` arguments corresponds to the origin of the metadata and will be used as the storage key in the pangenome. - `--assign` Choose to which pangenome elements who want to add metadata in the following list {families,genomes,genes,RGPs,spots,modules} -# Metadata format +## Metadata format PPanGGOLiN allows to use a highly flexible metadata file. Only one column name is mandatory, and it is identical to the assignment argument chosen by the user. 
@@ -24,21 +26,21 @@ For example the TSV file to assign metadata to gene families to functional annot *Note: As you can see in the above table, one element (here GF_2) can be associated with more than one metadata.* -## Command specifiq option details +### Command specific option details -### `--metadata` +#### `--metadata` PPanGGOLiN enables to give one TSV at a time to add metadata. Look at [Metadata Format]() -### `--source` +#### `--source` The source is the key use to access to metadata in pangenome. So if the name of the source already exist in the pangenome it can be overwritten only with `--force` option. This system allow to have multiple metadata source that can be read and use in PPanGGOLiN. -### `--assign` +#### `--assign` PPanGGOLiN allows to add metadata to all pangenome elements: families,genomes,genes,RGPs,spots,modules. But the user can only give one metadata file at a time as he can provide only source and so one type of pangenome element. -### `--omit` +#### `--omit` You can use this option to skip the error provide by an unfind ID in the pangenome. This could be useful if you are using a general TSV with element not in the pangenome, but must be used with carefully. diff --git a/docs/user/sequence/MSA.md b/docs/user/sequence/MSA.md new file mode 100644 index 00000000..e1fbb3a0 --- /dev/null +++ b/docs/user/sequence/MSA.md @@ -0,0 +1,28 @@ + +This command is available from 1.1.103 and on. +It is used to call [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to compute MSA of any partition of the pangenome. Using multiple cpus is recommended as it is quite demanding in computational resources. + +By default it will write the strict 'core' (genes that are present in absolutely all genomes) and remove any duplicated genes. Beware however that, if you have many genomes (over 1000), the core will likely be either very small or even empty if you have fragmented genomes. + +It will write one MSA for each family.
You can then provide the directory where the MSA are written to [IQ-TREE](https://github.com/Cibiv/IQ-TREE) for example, to do phylogenetic analysis. + +### partitions + +You can change the partition which is written, by using the --partition option. +`ppanggolin msa -p pangenome.h5 --partition persistent` for example will compute MSA for all the persistent gene families. + +Supported partitions are core, persistent, shell, cloud, softcore, accessory. If you wish to have additional filters, you can raise an issue with your request, or write a PR directly, most possibilities should be quite straightforward to add. + +### source + +You can specify whether to use dna or protein sequences for the MSA by using --source. It uses protein sequences by default. + +`ppanggolin msa -p pangenome.h5 --source dna` + +### phylo + +It is also possible to write a single whole genome MSA file, which many phylogenetic software tools accept as input, by using the --phylo option as such: + +`ppanggolin msa -p pangenome.h5 --phylo` + +This will concatenate all of the family MSA into a single MSA, with one sequence for each genome. \ No newline at end of file diff --git a/docs/user/sequence/fasta.md b/docs/user/sequence/fasta.md new file mode 100644 index 00000000..0283adde --- /dev/null +++ b/docs/user/sequence/fasta.md @@ -0,0 +1,53 @@ +This command is available from 1.1.98 and on. +This command can be used to write fasta sequences of the pangenome or of parts of the pangenome. Most options expect a partition to write. Available partitions are: +* 'all' for the entire pangenome. +* 'Persistent' for persistent families +* 'Shell' for shell genes or families +* 'Cloud' for cloud genes or families +* 'rgp' for genes or families found in RGPs +* 'core' for core genes or families +* 'softcore' for softcore genes or families + +When using the 'softcore' filter, the '--soft_core' option can be used to modify the threshold used to determine what is part of the soft core. It is at 0.95 by default.
+ +### Genes + +This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example: + +```ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all``` + +Or to write only the persistent genes: + +```ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent``` + + +### Protein families + +This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families: + +```ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all``` + +or for all of the shell families for example: + +```ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell``` + + +### Gene families + +This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such: + +```ppanggolin fasta -p pangenome.h5 --output MY_GENES --gene_families all``` + +or for the cloud families for example: + +```ppanggolin fasta -p pangenome.h5 --output MY_GENES --gene_families cloud``` + +### Regions + +This option can be used to write the nucleotide sequences of the detected RGPs. It requires the fasta sequences that were used to compute the pangenome as they were provided originally when you computed your pangenome. This command only has two filters: +* all, for all regions +* complete, for only the 'complete' regions which are not on a contig border + +It can be used as such: + +```ppanggolin fasta -p pangenome.h5 --output MYREGIONS --regions all --fasta organisms.fasta.list``` diff --git a/docs/user/step-by-step-pangenome-analysis.md b/docs/user/step-by-step-pangenome-analysis.md new file mode 100644 index 00000000..5e55c521 --- /dev/null +++ b/docs/user/step-by-step-pangenome-analysis.md @@ -0,0 +1,37 @@ +(step-by-step-section)= +# Pangenome graph building and partitioning +The workflow subcommand of PPanGGOLiN uses what you can use through the other subcommands. 
+In this section, the different steps will be described along with the inputs that can be given, and which subcommands to use to run those specific parts yourself. + +This is useful only if you want to customize your workflow parameters. +If you want to build the pangenome of your species without tuning parameters, you can use the subcommand `ppanggolin workflow` as described in the [introduction](#basic). + +(annotation)= +## Genomes annotation and storage + +The first PPanGGOLiN step consists in getting all information about the genomes and storing it in the pangenome file. +PPanGGOLiN will get information about: genes, RNA, contigs and genomes. + +```{include} step-by-step/annotation.md +``` + +## Clustering genes in gene families + +In order to build the pangenome graph, we need to compute clusters of similar genes called gene families. + +```{include} step-by-step/clustering.md +``` + +## Building the pangenome graph + +Now that we have the gene families we can build the pangenome graph. + +```{include} step-by-step/graph.md +``` + +## Pangenome graph partitioning + +The final step is to partition the pre-computed pangenome graph. + +```{include} step-by-step/partition.md +``` \ No newline at end of file diff --git a/docs/user/step-by-step/annotation.md b/docs/user/step-by-step/annotation.md new file mode 100644 index 00000000..581c8183 --- /dev/null +++ b/docs/user/step-by-step/annotation.md @@ -0,0 +1,65 @@ +### Annotate pangenome with fasta files + +As an input file, you can provide a list of .fasta files. +If you do so, the provided genomes will be annotated using the following tools: + +- [Prodigal](https://github.com/hyattpd/Prodigal) to annotate the CDS, +- [ARAGORN](http://130.235.244.92/ARAGORN/) to annotate the tRNA +- [Infernal](http://eddylab.org/infernal/) coupled with HMM of the bacterial and archaeal rRNAs downloaded from [RFAM](https://rfam.xfam.org/) to annotate the RNA command-line tools. 
+ +Then the CDS overlapping any RNA genes will be deleted as they are usually false positive calls. +You can prevent this filtering by using the `--allow_overlap` option. + +To run this part of the pipeline, you can do : + +``` +ppanggolin annotate --fasta ORGANISM_FASTA_LIST +``` + +The file ORGANISM_FASTA_LIST is a tab-separated file with the following organisation : + +1. The first column contains a unique organism name +2. The second column contains the path to the associated FASTA file +3. Circular contig identifiers are indicated in the following columns +4. Each line represents an organism + +You can check [this example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.fasta.list). + +### Annotate pangenome with annotation files + +You can also provide your annotation files. +They can be either gff3 files or .gbk files or .gbff files, or a mix of them, and should be provided through a list like [this example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list). +.gbk or .gbff files are preferred. + +```{note} +Using your own annotations is especially recommended if you already have functional annotations of your genome, + as they will be added to the pangenome +``` + +You can provide them using the following command : + +``` +ppanggolin annotate --anno ORGANISM_ANNOTATION_LIST +``` + +With ORGANISM_ANNOTATION_LIST being your file listing the organisms and the annotation file associated. +If your annotation files do not have the genome sequence in them, +you can use both options at the same time (to have both the gene annotations and the gene sequences) as such : + +``` +ppanggolin annotate --anno ORGANISM_ANNOTATION_LIST --fasta ORGANISM_FASTA_LIST +``` + +### Annotate command-line options + +You can tune the command that is run with the annotation pipeline, or read in your annotation files using various options described below. 
+ +| name | alias | default | type / choices | description | +|----------------------|-------|-----------|--------------------|-----------------------------------------------------------------------------------------------------------------| +| --allow_overlap | | False | bool | Use to not remove genes overlapping with RNA features | +| --norna | | False | bool | Use to avoid annotating RNA features | +| --kingdom | | bacteria | {bacteria,archaea} | Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation | +| --translation_table | | 11 | integer | Translation table (genetic code) to use | +| --basename | | pangenome | string | basename for the output file | +| --use_pseudo | | False | bool | In the context of provided annotation, use this option to read pseudogenes (Default behavior is to ignore them) | +| --prodigal_procedure | -p | None | {single,meta} | Allow to force the prodigal procedure. If nothing given, PPanGGOLiN will decide in function of contig length | \ No newline at end of file diff --git a/docs/user/step-by-step/clustering.md b/docs/user/step-by-step/clustering.md new file mode 100644 index 00000000..369e9787 --- /dev/null +++ b/docs/user/step-by-step/clustering.md @@ -0,0 +1,61 @@ + +### Computing gene families with PPanGGOLiN + +Once we have the genes, we need to compare them to know which are similar, and to build gene families through this information. + +If you provided .fasta files or annotation files with gene sequences in them, clustering can be run directly by providing the .h5 file that was generated, as such : + +``` +ppanggolin cluster -p pangenome.h5 +``` + +PPanGGOLiN will call [MMseqs2](https://github.com/soedinglab/MMseqs2) to run the clustering on all the protein sequences by searching for connected components for the clustering step. +You can tune its parameters using `--identity`(default 0.8) and `--coverage`(default 0.8). 
+You can use other clustering algorithms of MMseqs2 by using --mode (default 1). +Both protein sequences have to be covered by at least the proportion indicated by --coverage. + +(read-clustering)= +### Providing your gene families + +If you do not want to use MMseqs2 and would rather provide your own clusters (or gene families), you can do so only if you provided the annotations in the first step. +In the case of gff3 files, the 'ID' field in the 9th column is expected as a gene id. +In the case of gbff or gbk files, the 'locus_tag' is used as a gene id, except with files coming from MaGe or from SEED, where the id provided in the 'db_xref' field is used. + +You will need to provide a .tsv file. +The first column indicates the cluster id, and the second column indicates a unique gene id that is used in the annotation files. +There is a single gene id per line. + +You can do that through the command line : + +`ppanggolin cluster -p pangenome.h5 --clusters MY_CLUSTERS_FILE` + +An example of what MY_CLUSTERS_FILE should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv). + +There are other options that you can use to tune your clustering. +Most of them should be self-explanatory. +If not, do not hesitate to write an [issue](https://github.com/labgem/PPanGGOLiN/issues). +The only tricky option is the '--no_defrag' option. + + +### Defragmentation + +We noticed that most of the cloud genes in the pangenome are fragments of 'shell' or 'persistent' genes, and so not informative on the pangenome's diversity. +We added another workflow to reduce the number of gene families and reduce the computational load by trying to associate fragments to their original gene families. +It adds a step to the clustering described previously. +It will compare the representative protein sequences of all the gene families using the same identity threshold as the first step.
+It will also use the same coverage threshold, but only the shorter of the two protein sequences has to be covered by at least the value indicated by `--coverage`. + +After that, we build a similarity graph where the edges are the hits given by the comparison, and the nodes are the original gene families. +Then we iterate on all nodes and compare them to their neighbors. +If the neighbor of a node is more numerous (has more members in the cluster it represents) and its representative sequence is longer, that node (and all the genes associated) is associated with the neighbor. +The genes associated with this node are defined as 'fragments' of the gene family represented by the longer and more numerous neighboring node. + + +To skip the defragmentation step, you can run the following: + +``` +ppanggolin cluster -p pangenome.h5 --no_defrag +``` + + +In any case and whichever pipeline you use, in the end, the gene families will be saved in the 'pangenome.h5' given as input. \ No newline at end of file diff --git a/docs/user/step-by-step/graph.md b/docs/user/step-by-step/graph.md new file mode 100644 index 00000000..4b29536d --- /dev/null +++ b/docs/user/step-by-step/graph.md @@ -0,0 +1,17 @@ + +To partition a pangenome graph, you first need to build said pangenome graph. +This can be done through the `graph` subcommand. +This will take a pangenome .h5 file as input and compute edges to link gene families together based on the genomic neighborhood. +The graph is constructed using the following subcommand : + +``` +ppanggolin graph -p pangenome.h5 +``` + +This subcommand has only a single other option, which is `-r` or `--remove_high_copy_number`. +If used, it will remove the gene families that are too duplicated in your genomes. +This is useful if you want to visualize your pangenome afterward and want to remove the biggest hubs to have a clearer view. +It can also be used to limit the influence of very duplicated genes such as transposases or ABC transporters in the partition step.
+ + +The resulting pangenome graph is saved in the pangenome.h5 file given as input. \ No newline at end of file diff --git a/docs/user/step-by-step/partition.md b/docs/user/step-by-step/partition.md new file mode 100644 index 00000000..be0f8ae6 --- /dev/null +++ b/docs/user/step-by-step/partition.md @@ -0,0 +1,34 @@ +This is the step that will assign gene families to the 'persistent', 'shell', or 'cloud' partitions. + + +The 'persistent' partition will group genes that are present throughout the entire species. +They will be essential genes, genes required for important metabolic pathways and genes that define the metabolic and biosynthetic capabilities of the taxonomic group. + +The 'shell' partition groups genes that are present in only some individuals. +Those are often genes that were acquired through horizontal gene transfers and encode for functions involved in environmental adaptations, pathogenicity, virulence or encoding secondary metabolites for example. + +The 'cloud' partition groups genes that are very rare in the pangenome and found in one, or very few, individuals. +Most of the genes were associated with phage-related genes. +They probably all were acquired through horizontal gene transfers. +Antibiotic resistance genes were often found to be belonging to the cloud genome, as well as plasmid genes. + +It can be realized through the following subcommand : + +`ppanggolin partition -p pangenome.h5` + +It also has quite a few options. +Most of them are not self-explanatory. +If you want to know what they do, you should read the PPanGGOLiN paper (you can read it [here](https://journals.plos.org/ploscompbiol/article?rev=2&id=10.1371/journal.pcbi.1007732)) where the statistical methods used are thoroughly described. + +The one parameter that might be of importance is the '-K', or '--nb_of_partitions' parameter. +This will define the number of classes used to partition the pangenome. 
+This may be of use if you expect to have well-defined subpopulations in your pangenome, and you know exactly how many. +If not, that number is detected automatically through an ICL criterion. +The idea is that the most present partition will be 'persistent', the least present will be 'cloud', and all the others will be 'shell'. +The number of partitions corresponding to the shell will be the number of expected subpopulations in your pangenome. +(So if you expect 5 subpopulations, you could use -K 7: 5 'shell' partitions plus the 'persistent' and 'cloud' partitions). + + +In most cases, you should let the statistical criterion used by PPanGGOLiN find the optimal number of partitions for you. + +All the results will be added to the given 'pangenome.h5' input file. \ No newline at end of file diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index 63b9f2a8..7dacb6f4 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -382,14 +382,14 @@ def cluster_rgp_on_grr(graph: nx.Graph, clustering_attribute: str = "grr"): def get_spot_id(rgp: Region, rgp_to_spot: Dict[Region, int]) -> str: """ - Return Spot ID associated to an RGP. - It adds the prefix "spot_" to the spot ID. - When no spot is associated with the RGP, then the string "No spot" is return + Return Spot ID associated to an RGP. + It adds the prefix "spot" to the spot ID. When no spot is associated with the RGP, + then the string "No spot" is returned - :params rgp: RGP id - :params rgp_to_spot: A dictionary mapping an RGP to its spot. + :param rgp: RGP id + :param rgp_to_spot: A dictionary mapping an RGP to its spot. - :return: Spot ID of the given RGP with the prefix spot_ or "No spot".
""" if rgp in rgp_to_spot: return f"spot_{rgp_to_spot[rgp]}" diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 903f8aa6..6a1b0840 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -9,17 +9,19 @@ class Edge: - """The Edge class represents an edge between two gene families in the pangenome graph. It is associated with all the - organisms in which the neighborship is found, and all the involved genes as well. + """ + The Edge class represents an edge between two gene families in the pangenome graph. It is associated with all the + organisms in which the neighborship is found, and all the involved genes as well. + Methods: - - get_org_dict: Returns a dictionary with organisms as keys and an iterable of the pairs in genes as values. - - gene_pairs: Returns a list of all the gene pairs in the Edge. - - add_genes: Adds genes to the edge. They are supposed to be in the same organism. + - get_org_dict: Returns a dictionary with organisms as keys and an iterable of the pairs in genes as values. + - gene_pairs: Returns a list of all the gene pairs in the Edge. + - add_genes: Adds genes to the edge. They are supposed to be in the same organism. Fields: - - source: A GeneFamily object representing the source gene family of the edge. - - target: A GeneFamily object representing the target gene family of the edge. - - organisms: A defaultdict object representing the organisms in which the edge is found and the pairs of genes involved. + - source: A GeneFamily object representing the source gene family of the edge. + - target: A GeneFamily object representing the target gene family of the edge. + - organisms: A defaultdict object representing the organisms in which the edge is found and the pairs of genes involved. 
""" def __init__(self, source_gene: Gene, target_gene: Gene): diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index dd0123ca..30e19b2a 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -6,7 +6,7 @@ from collections import Counter, defaultdict import statistics from typing import Tuple, Union -import pkg_resources +from importlib.metadata import distribution # installed libraries from tqdm import tqdm @@ -394,7 +394,7 @@ def write_status(pangenome: Pangenome, h5f: tables.File): status_group._v_attrs.spots = True if pangenome.status["spots"] in ["Computed", "Loaded", "inFile"] else False status_group._v_attrs.modules = True if pangenome.status["modules"] in ["Computed", "Loaded", "inFile"] else False status_group._v_attrs.metadata = write_metadata_status(pangenome, h5f, status_group) - status_group._v_attrs.version = pkg_resources.get_distribution("ppanggolin").version + status_group._v_attrs.version = distribution("ppanggolin").version def write_info(pangenome: Pangenome, h5f: tables.File): diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index 4a9926c6..d63d0f79 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -8,7 +8,7 @@ from collections import Counter, defaultdict from pathlib import Path from typing import TextIO -import pkg_resources +from importlib.metadata import distribution from statistics import median, mean, stdev import os @@ -214,7 +214,7 @@ def write_gexf_header(gexf: TextIO, light: bool = True): gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') - gexf.write(f' PPanGGOLiN {pkg_resources.get_distribution("ppanggolin").version}\n') + gexf.write(f' PPanGGOLiN {distribution("ppanggolin").version}\n') gexf.write(' \n') diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 0bf13aef..2e075bda 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -19,6 +19,7 @@ class 
GeneFamily(MetaFeatures): """ This represents a single gene family. It will be a node in the pangenome graph, and be aware of its genes and edges. + Methods: - named_partition: returns a meaningful name for the partition associated with the family. - neighbors: returns all the GeneFamilies that are linked with an edge. diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 73e0affd..84033e8d 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -147,11 +147,14 @@ def parser_graph(parser: argparse.ArgumentParser): :param parser: parser for graph argument """ - parser.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") - parser.add_argument('-r', '--remove_high_copy_number', type=int, default=0, - help="Positive Number: Remove families having a number of copy of gene in a single organism " - "above or equal to this threshold in at least one organism " - "(0 or negative values are ignored).") + required = parser.add_argument_group(title="Required arguments", + description="Following arguments is required:") + required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file") + optional = parser.add_argument_group(title="Optional arguments") + optional.add_argument('-r', '--remove_high_copy_number', type=int, default=0, + help="Positive Number: Remove families having a number of copy of gene in a single organism " + "above or equal to this threshold in at least one organism " + "(0 or negative values are ignored).") if __name__ == '__main__': diff --git a/ppanggolin/main.py b/ppanggolin/main.py index a2de8535..2d94fc7b 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -4,11 +4,11 @@ # default libraries import sys -if sys.version_info < (3, 6): # minimum is python3.6 - raise AssertionError("Minimum python version to run PPanGGOLiN is 3.6. 
Your current python version is " + +if sys.version_info < (3, 8): # minimum is python3.8 + raise AssertionError("Minimum python version to run PPanGGOLiN is 3.8. Your current python version is " + ".".join(map(str, sys.version_info))) import argparse -import pkg_resources +from importlib.metadata import distribution # local modules import ppanggolin.pangenome @@ -32,6 +32,13 @@ from ppanggolin import SUBCOMMAND_TO_SUBPARSER +version = distribution("ppanggolin").version +epilog = f""" +PPanGGOLiN ({version}) is an opensource bioinformatic tools under CeCILL FREE SOFTWARE LICENSE AGREEMENT +LABGeM +Please cite: Gautreau G et al. (2020) PPanGGOLiN: Depicting microbial diversity via a partitioned pangenome graph. +PLOS Computational Biology 16(3): e1007732. https://doi.org/10.1371/journal.pcbi.1007732 +""" def cmd_line() -> argparse.Namespace: """ Manage the command line argument given by user @@ -80,19 +87,18 @@ def cmd_line() -> argparse.Namespace: desc += " \n" desc += " Utility command:\n" desc += " utils Helper side commands.\n" - desc += " \n" parser = argparse.ArgumentParser( description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors", - formatter_class=argparse.RawTextHelpFormatter) + formatter_class=argparse.RawTextHelpFormatter, + epilog=epilog) parser.add_argument('-v', '--version', action='version', - version='%(prog)s ' + pkg_resources.get_distribution("ppanggolin").version) + version='%(prog)s ' + version) subparsers = parser.add_subparsers(metavar="", dest="subcommand", title="subcommands", description=desc) subparsers.required = True # because python3 sent subcommands to hell apparently - # print help if no subcommand is specified if len(sys.argv) == 1: parser.print_help() diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index a6f52cc3..cfd067a7 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -13,15 +13,14 @@ class Metadata: """The Metadata class represents a metadata link to 
genes, gene families, organisms, regions, spot or modules. - Methods: + Methods: - number_of_attribute: Returns the number of attributes in the Metadata object. - get: Returns the value of a specific attribute, or None if the attribute does not exist. - fields: Returns a list of all the attributes in the Metadata object. - - Fields: + Fields: - source: A string representing the source of the metadata. - - **kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. + - kwargs: A dictionary of attributes and values representing the metadata. The attributes can be any string, and the values can be any type except None or NaN. """ def __init__(self, source: str, **kwargs): """Constructor Method diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 5581e55c..83dc1479 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -18,6 +18,7 @@ class Region(MetaFeatures): """ The 'Region' class represents a region of genomic plasticity. + Methods: - 'genes': the property that generates the genes in the region as they are ordered in contigs. - 'families': the property that generates the gene families in the region. @@ -304,6 +305,7 @@ def get_bordering_genes(self, n: int, multigenics: set) -> List[List[Gene], List class Spot(MetaFeatures): """ The 'Spot' class represents a region of genomic plasticity. + Methods: - 'regions': the property that generates the regions in the spot. - 'families': the property that generates the gene families in the spot. 
diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 28e19a1c..41f7bf26 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -12,7 +12,7 @@ from typing import TextIO, Union, BinaryIO, Tuple, List, Set, Iterable import networkx as nx -import pkg_resources +from importlib.metadata import distribution from numpy import repeat from collections.abc import Callable @@ -158,8 +158,7 @@ def set_verbosity_level(args): format=str_format, datefmt=datefmt) logging.getLogger("PPanGGOLiN").info("Command: " + " ".join([arg for arg in sys.argv])) - logging.getLogger("PPanGGOLiN").info( - "PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) + logging.getLogger("PPanGGOLiN").info(f"PPanGGOLiN version: {distribution('ppanggolin').version}") def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..530c6052 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,70 @@ +[build-system] +requires = [ + "setuptools", + "setuptools-scm", + "cython" +] +build-backend = "setuptools.build_meta" +py_modules=["ppanggolin"] + +[project] +name = "PPanGGOLiN" +description = "Pangenome analysis suite" +dynamic = ["version"] +authors = [ + {name = "Guillaume Gautreau"}, + {name = "Adelme Bazin"}, + {name = "Jérôme Arnoux", email = "jarnoux@genoscope.cns.fr"}, + {name = "Jean Mainguy"}, +] +maintainers = [ + {name = "Guillaume Gautreau"}, + {name = "Adelme Bazin"}, + {name = "Jérôme Arnoux", email = "jarnoux@genoscope.cns.fr"}, + {name = "Jean Mainguy"}, +] +readme = "README.md" +keywords = ["Pangenomics", "Comparative genomics", "Bioinformatics", "Prokaryote"] +classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)", + "Natural Language :: English", + "Operating System :: POSIX :: Linux", + 
"Programming Language :: Python :: 3", + "Topic :: Scientific/Engineering :: Bio-Informatics"] +requires-python = ">=3.8" +license = {file="LICENCE.txt"} + +[project.optional-dependencies] +doc = [ + "sphinx==6.2.1", + "sphinx_rtd_theme==1.2.2", + "readthedocs-sphinx-search==0.3.1", + "sphinx-autobuild==2021.3.14", + "myst-parser==1.0.0", + "docutils==0.18.1" +] +test = [ + "pytest>=7.0.0" +] +# +[project.urls] +Homepage = "https://labgem.genoscope.cns.fr/2023/04/27/ppanggolin/" +Repository = "https://github.com/labgem/PPanGGOLiN/" +#Changelog = "https://github.com/me/spam/blob/master/CHANGELOG.md" +#Documentation = "https://readthedocs.org" +# +# +[project.scripts] +ppanggolin = "ppanggolin.main:main" + +[tool.setuptools] +packages = ["ppanggolin"] + +#[tool.setuptools.package-data] +#mypkg = ["*.txt", "*.rst"] + +[tool.setuptools.dynamic] +version = {file = "VERSION"} \ No newline at end of file diff --git a/requirements_pip.txt b/requirements_pip.txt deleted file mode 100644 index 25c4a708..00000000 --- a/requirements_pip.txt +++ /dev/null @@ -1,11 +0,0 @@ -tqdm>=4.7.0 -pytables>=3.6.1 -networkx>=2.3 -dataclasses==0.8 -scipy>=1.5.3 -plotly>=4.14.3 -gmpy2>=2.1.0b5 -pandas>=0.25.3 -colorlover>=0.3 -numpy>=1.19.5 -bokeh>=2.3.3 diff --git a/setup.py b/setup.py index ada32680..dde043b5 100755 --- a/setup.py +++ b/setup.py @@ -1,31 +1,12 @@ #!/usr/bin/env python3 -import setuptools -import os - -from distutils.extension import Extension +from setuptools import Extension, setup NEM_DIR_PATH = "ppanggolin/nem/NEM/" -if __name__ == "__main__": - setuptools.setup( - name="ppanggolin", - version=open(os.path.join(os.path.dirname(__file__), "VERSION")).read().rstrip(), - url="https://github.com/labgem/PPanGGOLiN", - description="Pangenome analysis suite", - packages=setuptools.find_packages(), - setup_requires=["cython<3.0.0"], - install_requires=[], - package_data={'': ['rRNA_DB/*cm*']}, - classifiers=["Environment :: Console", - "Intended Audience :: Science/Research", - 
"License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)", - "Natural Language :: English", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python :: 3", - "Topic :: Scientific/Engineering :: Bio-Informatics"], - entry_points={"console_scripts": ["ppanggolin = ppanggolin.main:main"]}, - ext_modules=[Extension( +setup( + ext_modules=[ + Extension( extra_compile_args=['-fcommon'], name="nem_stats", sources=[NEM_DIR_PATH + 'nem_stats.pyx', @@ -38,4 +19,4 @@ NEM_DIR_PATH + 'nem_hlp.c', NEM_DIR_PATH + 'genmemo.c'], include_dirs=[NEM_DIR_PATH])] - ) +)