Merge pull request #272 from labgem/dev

Merge dev branch into master to release version 2.1.1
labgem · Aug 22, 2024 · 83c7d75 · 83c7d75
2 parents d49dd5d + 9abb654
commit 83c7d75
Show file tree

Hide file tree

Showing 73 changed files with 220 additions and 262 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -31,8 +31,8 @@ jobs:
     strategy:
       matrix:
         os: ['ubuntu-latest', 'macos-13']
-        python-version: ['3.8', '3.10']
-
+        python-version: ['3.8', '3.12']
+ 
     steps:
 
     # Get number of cpu available on the current runner
@@ -86,7 +86,9 @@ jobs:
         mkdir info_to_test
         ppanggolin all --cpu $NUM_CPUS --fasta genomes.fasta.list --output mybasicpangenome
         ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status > info_to_test/mybasicpangenome_info.yaml
-        cat info_to_test/mybasicpangenome_info.yaml
+        cat info_to_test/mybasicpangenome_info.yaml   
+        echo "$(grep 'mybasicpangenome/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  mybasicpangenome/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+        shasum -a 256 mybasicpangenome/gene_families.tsv > info_to_test/checksum.txt
         cd -
     # test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good.
     #--draw_hotspots option is problematic on macOS.
@@ -118,7 +120,10 @@ jobs:
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
         ppanggolin info --pangenome stepbystep/pangenome.h5 > info_to_test/stepbystep_info.yaml
         cat info_to_test/stepbystep_info.yaml
-        cd - 
+        gzip -d stepbystep/gene_families.tsv.gz
+        echo "$(grep 'stepbystep/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  stepbystep/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+        shasum -a 256 stepbystep/gene_families.tsv >> info_to_test/checksum.txt
+        cd -
     - name: gbff parsing and MSA computing
       shell: bash -l {0}
       run: |
@@ -127,6 +132,8 @@ jobs:
         ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy --cpu $NUM_CPUS
         ppanggolin info --pangenome myannopang/pangenome.h5 > info_to_test/myannopang_info.yaml
         cat info_to_test/myannopang_info.yaml
+        echo "$(grep 'myannopang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  myannopang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+        shasum -a 256 myannopang/gene_families.tsv >> info_to_test/checksum.txt
         cd -
     - name: clusters reading from external file
       shell: bash -l {0}
@@ -137,6 +144,8 @@ jobs:
         awk 'BEGIN{FS=OFS="\t"} {$1 = $1 OFS $1} 1' clusters.tsv > clusters_with_reprez.tsv;
         ppanggolin cluster --clusters clusters_with_reprez.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS
         ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS
+        echo "$(grep 'readclusterpang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  readclusterpang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+        shasum -a 256 readclusterpang/gene_families.tsv >> info_to_test/checksum.txt
         cd -
     - name: testing rgp_cluster command
       shell: bash -l {0}
@@ -186,6 +195,8 @@ jobs:
         ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
         cut -f1,2 clusters.tsv > clusters_without_frag.tsv
         ppanggolin panrgp  --anno genomes.gbff.list --cluster clusters_without_frag.tsv -o test_config --config panrgp_default_config.yaml --cpu $NUM_CPUS
+        echo "$(grep 'test_config/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1)  test_config/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+        shasum -a 256 test_config/gene_families.tsv >> info_to_test/checksum.txt        
         cd -
     - name: testing projection cmd
       shell: bash -l {0}

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.1.0
+2.1.1
diff --git a/docs/user/MSA.md b/docs/user/MSA.md
@@ -1,6 +1,6 @@
 # Multiple Sequence Alignment
 
-The commande `msa` compute multiple sequence alignement of any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus with the `--cpu` argument is recommended as multiple alignment can be quite demanding in computational resources.
+The `msa` command computes multiple sequence alignments of any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus with the `--cpu` argument is recommended as multiple alignment can be quite demanding in computational resources.
 
 This command can be used as follows:
 
@@ -34,10 +34,10 @@ ppanggolin msa -p pangenome.h5 --source dna
 
 ### Write a single whole MSA file with `--phylo` 
 
-It is also possible to write a single whole genome MSA file, which many phylogenetic softwares accept as input, by using the `--phylo` option as such:
+It is also possible to write a single whole genome MSA file, which many phylogenetic software accept as input, by using the `--phylo` option as such:
 
 ```bash
 ppanggolin msa -p pangenome.h5 --phylo
 ```
 
-This will contatenate all of the family MSA into a single MSA, with one sequence for each genome.
+This will concatenate all of the family MSA into a single MSA, with one sequence for each genome.
diff --git a/docs/user/Modules/moduleOutputs.md b/docs/user/Modules/moduleOutputs.md
@@ -112,7 +112,7 @@ Modules:
 	Number_of_modules: 380
 	Families_in_Modules: 2242
 	Partition_composition:
-		Persitent: 0.27
+		Persistent: 0.27
 		Shell: 37.69
 		Cloud: 62.04
 	Number_of_Families_per_Modules:
@@ -122,4 +122,3 @@ Modules:
 		mean: 5.9
 
 ```
-
diff --git a/docs/user/Modules/modulePrediction.md b/docs/user/Modules/modulePrediction.md
@@ -59,12 +59,12 @@ ppanggolin panmodule --fasta GENOME_LIST_FILE
 ```
 Replace `GENOME_LIST_FILE` with a tab-separated file listing the genome names, and the fasta file path of their genomic sequences as described [here](../PangenomeAnalyses/pangenomeAnnotation.md#annotate-from-fasta-files). Alternatively, you can provide a list of GFF/GBFF files as input by using the `--anno` parameter, similar to how it is used in the workflow and annotate commands.
 
-The panmodule workflow predicts modules using default parameters. To fine-tune the detection, you can use the `module` command on a partioned pangenome acquired through the `workflow` for example or use a configuration file, as described [here](../practicalInformation.md#configuration-file). 
+The panmodule workflow predicts modules using default parameters. To fine-tune the detection, you can use the `module` command on a partitioned pangenome acquired through the `workflow` for example or use a configuration file, as described [here](../practicalInformation.md#configuration-file). 
 
 
 ## Predict conserved module
 
-The `module` command predicts conserved modules on an partioned pangenome. The command has several options for tuning the prediction. Details about each parameter are available in the related [preprint](https://www.biorxiv.org/content/10.1101/2021.12.06.471380v1).
+The `module` command predicts conserved modules on a partitioned pangenome. The command has several options for tuning the prediction. Details about each parameter are available in the related [preprint](https://www.biorxiv.org/content/10.1101/2021.12.06.471380v1).
 
 The command can be used simply as such:
 

diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md
@@ -141,7 +141,7 @@ Family_C    Gene_6  Gene_6
 ```{mermaid}
 
 ---
-title: "Pangenome gene families when specifing representative gene"
+title: "Pangenome gene families when specifying representative gene"
 align: center
 ---
 

diff --git a/docs/user/PangenomeAnalyses/pangenomeGraphOut.md b/docs/user/PangenomeAnalyses/pangenomeGraphOut.md
@@ -3,7 +3,7 @@
 The pangenome graph can be given through the `.gexf` and through the `_light.gexf` files. The `_light.gexf` file will contain the gene families as nodes and the edges between gene families describing their relationship, and the `.gexf` file will contain the same things but also include more details about each gene and each relation between gene families. 
 We have made two different files representing the same graph because, while the non-light file is exhaustive, it can be very heavy to manipulate and most of its content is not of interest to everyone. The `_light.gexf` file should be the one you use to manipulate the pangenome graph most of the time.
 
-These files can be manipulated and visualized for example through a software called [Gephi](https://gephi.org/), with which we have made extensive testings, or potentially any other softwares or libraries able to read gexf files such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. Gephi also have a web version able to open small pangenome graphs [gephi-lite](https://gephi.org/gephi-lite/).
+These files can be manipulated and visualized for example through a software called [Gephi](https://gephi.org/), with which we have done extensive testing, or potentially any other software or libraries able to read gexf files such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. Gephi also has a web version able to open small pangenome graphs [gephi-lite](https://gephi.org/gephi-lite/).
 
 Using Gephi, the layout can be tuned as illustrated below:
 

diff --git a/docs/user/QuickUsage/quickWorkflow.md b/docs/user/QuickUsage/quickWorkflow.md
@@ -101,7 +101,7 @@ genome_updater.sh -d "refseq"  -o "B_japonicum_genomes" -M "gtdb" -T "s__Bradyrh
 ```
 
 
-After the completion of the `all` command, all of your genomes have had their genes predicted, the genes have been clustered into gene families, a pangenome graph has been successfully constructed and partitioned into three distinct paritions: **persistent**, **shell**, and **cloud**. Additionally, **RGP, spots, and modules** have been detected within your pangenome.
+After the completion of the `all` command, all of your genomes have had their genes predicted, the genes have been clustered into gene families, a pangenome graph has been successfully constructed and partitioned into three distinct partitions: **persistent**, **shell**, and **cloud**. Additionally, **RGP, spots, and modules** have been detected within your pangenome.
 
 The results of the workflow are saved in the **pangenome.h5** file, which is in the HDF-5 file format.
 When you run an analysis using this file as input, the results of that analysis will be added to the file to supplement the data that are already stored in it. 

diff --git a/docs/user/RGP/rgpClustering.md b/docs/user/RGP/rgpClustering.md
@@ -14,7 +14,7 @@ There are three modes available for calculating the GRR value: `min_grr`, `max_g
 - `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGP.
 
 
-The resulting RGP clusters are stored in a tsv file with the folowing columns:
+The resulting RGP clusters are stored in a tsv file with the following columns:
 
 | column  | description                  |
 |---------|------------------------------|

diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md
@@ -68,7 +68,7 @@ ppanggolin panrgp --fasta genomes.fasta.list
 ```
 
 Just like [workflow](../PangenomeAnalyses/pangenomeAnalyses.md#workflow), this command will deal with the [annotation](../PangenomeAnalyses/pangenomeAnalyses.md#annotation), [clustering](../PangenomeAnalyses/pangenomeAnalyses.md#compute-pangenome-gene-families), [graph](../PangenomeAnalyses/pangenomeAnalyses.md#graph) and [partition](../PangenomeAnalyses/pangenomeAnalyses.md#partition) commands by itself.
-Then, the RGP detection is ran using [rgp](#rgp-detection) after the pangenome partitionning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion using [spot](#spot-prediction).
+Then, the RGP detection is run using [rgp](#rgp-detection) after the pangenome partitioning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion using [spot](#spot-prediction).
 
 If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. If you wish to tune the spot detection, you can use the `spot` command after the `rgp` command. Additionally, you have the option to utilize a configuration file to customize each detection within the `panrgp` command.
 

diff --git a/docs/user/align.md b/docs/user/align.md
@@ -24,7 +24,7 @@ By default the command creates two output files:
 
 ### 2. 'input_to_pangenome_associations.blast-tab'
 
-'input_to_pangenome_associations.blast-tab' is a .tsv file that follows the tabular blast format which many alignment softwares (such as blast, diamond, mmseqs etc.) use, with two additional columns: the length of query sequence which was aligned, and the length of the subject sequence which was aligned (provided with qlen and slen with the softwares I previously named). You can find a detailed description of the format in [this blog post](https://www.metagenomics.wiki/tools/blast/blastn-output-format-6) for example (and there are many other descriptions of this format on internet, if you search for 'tabular blast format'). The query are the provided sequences, and the subjet are the pangenome gene families.
+'input_to_pangenome_associations.blast-tab' is a .tsv file that follows the tabular blast format which many alignment software (such as blast, diamond, mmseqs etc.) use, with two additional columns: the length of query sequence which was aligned, and the length of the subject sequence which was aligned (provided with qlen and slen with the software I previously named). You can find a detailed description of the format in [this blog post](https://www.metagenomics.wiki/tools/blast/blastn-output-format-6) for example (and there are many other descriptions of this format on the internet, if you search for 'tabular blast format'). The queries are the provided sequences, and the subjects are the pangenome gene families.
 
 
 ### 3. Optional outputs 

diff --git a/docs/user/install.md b/docs/user/install.md
@@ -1,7 +1,7 @@
 # Installation
 
-```{warning}
-Supported python version are 3.8, 3.9 and 3.10
+```{note}
+Supported Python versions are 3.8, 3.9, 3.10, 3.11 and 3.12
 ```
 
 ## Installing PPanGGOLiN with Conda (recommended)

diff --git a/docs/user/practicalInformation.md b/docs/user/practicalInformation.md
@@ -52,7 +52,7 @@ If you want, verbosity can be reduced in several ways.
 First, you can specify the verbosity level with the `--verbose` option. 
 `0` will show only warnings and errors, `1` will add the information (default value), and if you encounter any problem you can use the debug level with value `2`.
 Then you can also remove the progress bars with the option `--disable_prog_bar`
-Finaly, you can also save PPanGGOLiN logs in a file by indicating its path with the option `--log`.
+Finally, you can also save PPanGGOLiN logs in a file by indicating its path with the option `--log`.
 
 ## Configuration file
 

diff --git a/docs/user/projection.md b/docs/user/projection.md
@@ -58,13 +58,13 @@ For Gene Family and Partition of Input Genes:
 For RGPs and Spots:
 
 - `plastic_regions.tsv`: This file contains information about RGPs within the input genome. Its format follows [this output](RGP/rgpOutputs.md#rgp-outputs).
-- `input_genome_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this ouput](RGP/rgpOutputs.md#summarize-spots).
+- `input_genome_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this output](RGP/rgpOutputs.md#summarize-spots).
 
 Optionally, you can generate a graph of the spots using the `--spot_graph` option. This graph resembles the one produced by the `ppanggolin draw --spots` command, which is detailed [here](RGP/rgpOutputs.md#draw-spots).
 
 For Modules:
 
-- `modules_in_input_genome.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this ouput](Modules/moduleOutputs.md#module-outputs).
+- `modules_in_input_genome.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this output](Modules/moduleOutputs.md#module-outputs).
 
 
 
diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# coding:utf-8
 
 # default libraries
 import logging

diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# coding:utf-8
 
 # default libraries
 import logging
@@ -101,8 +100,7 @@ def genes(self):
         Return iterable of genes from all RGPs that are identical in families
         """
         for rgp in self.rgps:
-            for gene in rgp.genes:
-                yield gene
+            yield from rgp.genes
     @property
     def spots(self) -> Set[Spot]:
         """
@@ -141,7 +139,7 @@ def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily]
 
 def compute_jaccard_index(rgp_a_families: set, rgp_b_families: set) -> float:
     """
-    Compute jaccard index between two rgp based on their famillies.
+    Compute jaccard index between two rgp based on their families.
 
     :param rgp_a_families: Rgp A
     :param rgp_b_families: rgp B
@@ -283,15 +281,15 @@ def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List
                            name=identical_rgp_obj.name,
                            families_count=len(identical_rgp_obj.families),
                            identical_rgp_count=len(identical_rgp_obj.rgps),
-                           identical_rgp_names=';'.join([i_rgp.name for i_rgp in identical_rgp_obj.rgps]),
+                           identical_rgp_names=';'.join(i_rgp.name for i_rgp in identical_rgp_obj.rgps),
                            identical_rgp_genomes=';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}),
                            identical_rgp_contig_border_count=len(
                                [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]),
                            identical_rgp_whole_contig_count=len(
                                [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]),
                            identical_rgp_spots=";".join(spots_of_identical_rgp_obj),
                            spot_id=spots_of_identical_rgp_obj.pop() if len(
-                               spots_of_identical_rgp_obj) == 1 else "Mulitple spots",
+                               spots_of_identical_rgp_obj) == 1 else "Multiple spots",
                             modules = ';'.join({str(module) for module in identical_rgp_obj.modules}),
                            )
 
@@ -608,18 +606,18 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str,
         add_rgp_metadata_to_graph(grr_graph, rgps_in_graph)
 
     if "gexf" in graph_formats:
-        # writting graph in gexf format
+        # writing graph in gexf format
         graph_file_name = os.path.join(output, f"{basename}.gexf")
-        logging.info(f"Writting graph in gexf format in {graph_file_name}.")
+        logging.info(f"Writing graph in gexf format in {graph_file_name}.")
         nx.readwrite.gexf.write_gexf(grr_graph, graph_file_name)
 
     if "graphml" in graph_formats:
         graph_file_name = os.path.join(output, f"{basename}.graphml")
-        logging.info(f"Writting graph in graphml format in {graph_file_name}.")
+        logging.info(f"Writing graph in graphml format in {graph_file_name}.")
         nx.readwrite.graphml.write_graphml(grr_graph, graph_file_name)
 
     outfile = os.path.join(output, f"{basename}.tsv")
-    logging.info(f"Writting rgp clusters in tsv format in {outfile}")
+    logging.info(f"Writing rgp clusters in tsv format in {outfile}")
 
     write_rgp_cluster_table(
         outfile, grr_graph, rgps_in_graph, grr_metric, rgp_to_spot)

diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# coding:utf-8
 
 # default libraries
 import time