diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 7d068598..bcc3ef9c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,8 +31,8 @@ jobs: strategy: matrix: os: ['ubuntu-latest', 'macos-13'] - python-version: ['3.8', '3.10'] - + python-version: ['3.8', '3.12'] + steps: # Get number of cpu available on the current runner @@ -86,7 +86,9 @@ jobs: mkdir info_to_test ppanggolin all --cpu $NUM_CPUS --fasta genomes.fasta.list --output mybasicpangenome ppanggolin info --pangenome mybasicpangenome/pangenome.h5 --content --parameters --status > info_to_test/mybasicpangenome_info.yaml - cat info_to_test/mybasicpangenome_info.yaml + cat info_to_test/mybasicpangenome_info.yaml + echo "$(grep 'mybasicpangenome/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1) mybasicpangenome/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; } + shasum -a 256 mybasicpangenome/gene_families.tsv > info_to_test/checksum.txt cd - # test most options calls. If there is a change in the API somewhere that was not taken into account (whether in the options for the users, or the classes for the devs), this should fail, otherwise everything is probably good. #--draw_hotspots option is problematic on macOS. @@ -118,7 +120,10 @@ jobs: ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log ppanggolin info --pangenome stepbystep/pangenome.h5 > info_to_test/stepbystep_info.yaml cat info_to_test/stepbystep_info.yaml - cd - + gzip -d stepbystep/gene_families.tsv.gz + echo "$(grep 'stepbystep/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1) stepbystep/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; } + shasum -a 256 stepbystep/gene_families.tsv >> info_to_test/checksum.txt + cd - - name: gbff parsing and MSA computing shell: bash -l {0} run: | @@ -127,6 +132,8 @@ jobs: ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy --cpu $NUM_CPUS ppanggolin info --pangenome myannopang/pangenome.h5 > info_to_test/myannopang_info.yaml cat info_to_test/myannopang_info.yaml + echo "$(grep 'myannopang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1) myannopang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; } + shasum -a 256 myannopang/gene_families.tsv >> info_to_test/checksum.txt cd - - name: clusters reading from external file shell: bash -l {0} @@ -137,6 +144,8 @@ jobs: awk 'BEGIN{FS=OFS="\t"} {$1 = $1 OFS $1} 1' clusters.tsv > clusters_with_reprez.tsv; ppanggolin cluster --clusters clusters_with_reprez.tsv -p readclusters/pangenome.h5 --cpu $NUM_CPUS ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f --cpu $NUM_CPUS + echo "$(grep 'readclusterpang/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1) readclusterpang/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' 
>&2; exit 1; }
+          shasum -a 256 readclusterpang/gene_families.tsv >> info_to_test/checksum.txt
           cd -
       - name: testing rgp_cluster command
         shell: bash -l {0}
         run: |
@@ -186,6 +195,8 @@ jobs:
           ppanggolin utils --default_config panrgp -o panrgp_default_config.yaml
           cut -f1,2 clusters.tsv > clusters_without_frag.tsv
           ppanggolin panrgp --anno genomes.gbff.list --cluster clusters_without_frag.tsv -o test_config --config panrgp_default_config.yaml --cpu $NUM_CPUS
+          echo "$(grep 'test_config/gene_families.tsv' expected_info_files/checksum.txt | cut -d' ' -f1) test_config/gene_families.tsv" | shasum -a 256 -c - || { echo 'Checksum verification failed.' >&2; exit 1; }
+          shasum -a 256 test_config/gene_families.tsv >> info_to_test/checksum.txt
           cd -
       - name: testing projection cmd
         shell: bash -l {0}
         run: |
diff --git a/VERSION b/VERSION
index 7ec1d6db..3e3c2f1e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.1.0
+2.1.1
diff --git a/docs/user/MSA.md b/docs/user/MSA.md
index 04ea268b..3bf93278 100644
--- a/docs/user/MSA.md
+++ b/docs/user/MSA.md
@@ -1,6 +1,6 @@
 # Multiple Sequence Alignment
-The commande `msa` compute multiple sequence alignement of any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus with the `--cpu` argument is recommended as multiple alignment can be quite demanding in computational resources.
+The `msa` command computes multiple sequence alignments for any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus with the `--cpu` argument is recommended as multiple alignment can be quite demanding in computational resources.
 This command can be used as follow:
@@ -34,10 +34,10 @@ ppanggolin msa -p pangenome.h5 --source dna
 ### Write a single whole MSA file with `--phylo`
-It is also possible to write a single whole genome MSA file, which many phylogenetic softwares accept as input, by using the `--phylo` option as such:
+It is also possible to write a single whole genome MSA file, which many phylogenetic software packages accept as input, by using the `--phylo` option as such:
 ```bash
 ppanggolin msa -p pangenome.h5 --phylo
 ```
-This will contatenate all of the family MSA into a single MSA, with one sequence for each genome.
\ No newline at end of file
+This will concatenate all of the family MSAs into a single MSA, with one sequence for each genome.
\ No newline at end of file
diff --git a/docs/user/Modules/moduleOutputs.md b/docs/user/Modules/moduleOutputs.md
index e9941d54..8236b4bb 100644
--- a/docs/user/Modules/moduleOutputs.md
+++ b/docs/user/Modules/moduleOutputs.md
@@ -112,7 +112,7 @@ Modules:
     Number_of_modules: 380
     Families_in_Modules: 2242
     Partition_composition:
-        Persitent: 0.27
+        Persistent: 0.27
         Shell: 37.69
         Cloud: 62.04
     Number_of_Families_per_Modules:
@@ -122,4 +122,3 @@
         mean: 5.9
 ```
-
diff --git a/docs/user/Modules/modulePrediction.md b/docs/user/Modules/modulePrediction.md
index 6625a9fb..ad513c2a 100644
--- a/docs/user/Modules/modulePrediction.md
+++ b/docs/user/Modules/modulePrediction.md
@@ -59,12 +59,12 @@ ppanggolin panmodule --fasta GENOME_LIST_FILE
 ```
 Replace `GENOME_LIST_FILE` with a tab-separated file listing the genome names, and the fasta file path of their genomic sequences as described [here](../PangenomeAnalyses/pangenomeAnnotation.md#annotate-from-fasta-files).
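For instance, a minimal genome list could look like the sketch below (the genome names and FASTA paths are illustrative, assuming the two-column, tab-separated layout described above):

```bash
# Build a small genome list: one genome per line, genome name and FASTA path separated by a tab.
printf 'genome_A\t/path/to/genome_A.fna\n'  > genomes.fasta.list
printf 'genome_B\t/path/to/genome_B.fna\n' >> genomes.fasta.list
printf 'genome_C\t/path/to/genome_C.fna\n' >> genomes.fasta.list

# Run the panmodule workflow on this list (illustrative invocation; --output and --cpu are used the same way elsewhere in this diff).
ppanggolin panmodule --fasta genomes.fasta.list --output panmodule_out --cpu 4
```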
 Alternatively, you can provide a list of GFF/GBFF files as input by using the `--anno` parameter, similar to how it is used in the workflow and annotate commands.
-The panmodule workflow predicts modules using default parameters. To fine-tune the detection, you can use the `module` command on a partioned pangenome acquired through the `workflow` for example or use a configuration file, as described [here](../practicalInformation.md#configuration-file).
+The panmodule workflow predicts modules using default parameters. To fine-tune the detection, you can use the `module` command on a partitioned pangenome acquired through the `workflow` command, for example, or use a configuration file, as described [here](../practicalInformation.md#configuration-file).
 ## Predict conserved module
-The `module` command predicts conserved modules on an partioned pangenome. The command has several options for tuning the prediction. Details about each parameter are available in the related [preprint](https://www.biorxiv.org/content/10.1101/2021.12.06.471380v1).
+The `module` command predicts conserved modules on a partitioned pangenome. The command has several options for tuning the prediction. Details about each parameter are available in the related [preprint](https://www.biorxiv.org/content/10.1101/2021.12.06.471380v1).
 The command can be used simply as such:
diff --git a/docs/user/PangenomeAnalyses/pangenomeCluster.md b/docs/user/PangenomeAnalyses/pangenomeCluster.md
index bcaa5108..f543bf7b 100644
--- a/docs/user/PangenomeAnalyses/pangenomeCluster.md
+++ b/docs/user/PangenomeAnalyses/pangenomeCluster.md
@@ -141,7 +141,7 @@ Family_C Gene_6 Gene_6
 ```{mermaid}
 ---
-title: "Pangenome gene families when specifing representative gene"
+title: "Pangenome gene families when specifying representative gene"
 align: center
 ---
diff --git a/docs/user/PangenomeAnalyses/pangenomeGraphOut.md b/docs/user/PangenomeAnalyses/pangenomeGraphOut.md
index 0b792a3d..71fc93c2 100644
--- a/docs/user/PangenomeAnalyses/pangenomeGraphOut.md
+++ b/docs/user/PangenomeAnalyses/pangenomeGraphOut.md
@@ -3,7 +3,7 @@
 The pangneome graph can be given through the `.gexf` and through the `_light.gexf` files. The `_light.gexf` file will contain the gene families as nodes and the edges between gene families describing their relationship, and the `.gexf` file will contain the same things but also include more details about each gene and each relation between gene families. We have made two different files representing the same graph because, while the non-light file is exhaustive, it can be very heavy to manipulate and most of its content is not of interest to everyone. The `_light.gexf` file should be the one you use to manipulate the pangenome graph most of the time.
-These files can be manipulated and visualized for example through a software called [Gephi](https://gephi.org/), with which we have made extensive testings, or potentially any other softwares or libraries able to read gexf files such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. Gephi also have a web version able to open small pangenome graphs [gephi-lite](https://gephi.org/gephi-lite/).
+These files can be manipulated and visualized, for example, with [Gephi](https://gephi.org/), with which we have done extensive testing, or potentially with any other software or library able to read gexf files, such as [networkx](https://networkx.github.io/documentation/stable/index.html) or [gexf-js](https://github.com/raphv/gexf-js) among others. Gephi also has a web version, [gephi-lite](https://gephi.org/gephi-lite/), which can open small pangenome graphs.
 Using Gephi, the layout can be tuned as illustrated below:
diff --git a/docs/user/QuickUsage/quickWorkflow.md b/docs/user/QuickUsage/quickWorkflow.md
index 472593ae..e632fce3 100644
--- a/docs/user/QuickUsage/quickWorkflow.md
+++ b/docs/user/QuickUsage/quickWorkflow.md
@@ -101,7 +101,7 @@ genome_updater.sh -d "refseq" -o "B_japonicum_genomes" -M "gtdb" -T "s__Bradyrh
 ```
-After the completion of the `all` command, all of your genomes have had their genes predicted, the genes have been clustered into gene families, a pangenome graph has been successfully constructed and partitioned into three distinct paritions: **persistent**, **shell**, and **cloud**. Additionally, **RGP, spots, and modules** have been detected within your pangenome.
+After the completion of the `all` command, all of your genomes have had their genes predicted, the genes have been clustered into gene families, a pangenome graph has been successfully constructed and partitioned into three distinct partitions: **persistent**, **shell**, and **cloud**. Additionally, **RGP, spots, and modules** have been detected within your pangenome.
 The results of the workflow is saved in the **pangenome.h5** file, which is in the HDF-5 file format. When you run an analysis using this file as input, the results of that analysis will be added to the file to supplement the data that are already stored in it.
diff --git a/docs/user/RGP/rgpClustering.md b/docs/user/RGP/rgpClustering.md
index 62c42cfc..06c1653e 100644
--- a/docs/user/RGP/rgpClustering.md
+++ b/docs/user/RGP/rgpClustering.md
@@ -14,7 +14,7 @@ There are three modes available for calculating the GRR value: `min_grr`, `max_g
 - `incomplete_aware_grr` (default) mode: If at least one RGP is considered incomplete, which typically happens when it is located at the border of a contig, the `min_grr` mode is used. Otherwise, the `max_grr` mode is applied. This mode is useful to correctly cluster incomplete RGP.
-The resulting RGP clusters are stored in a tsv file with the folowing columns:
+The resulting RGP clusters are stored in a tsv file with the following columns:
 | column | description |
 |---------|------------------------------|
diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md
index 08aa704b..4fcb7cfc 100644
--- a/docs/user/RGP/rgpPrediction.md
+++ b/docs/user/RGP/rgpPrediction.md
@@ -68,7 +68,7 @@ ppanggolin panrgp --fasta genomes.fasta.list
 ```
 Just like [workflow](../PangenomeAnalyses/pangenomeAnalyses.md#workflow), this command will deal with the [annotation](../PangenomeAnalyses/pangenomeAnalyses.md#annotation), [clustering](../PangenomeAnalyses/pangenomeAnalyses.md#compute-pangenome-gene-families), [graph](../PangenomeAnalyses/pangenomeAnalyses.md#graph) and [partition](../PangenomeAnalyses/pangenomeAnalyses.md#partition) commands by itself.
-Then, the RGP detection is ran using [rgp](#rgp-detection) after the pangenome partitionning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion using [spot](#spot-prediction).
+Then, the RGP detection is run using [rgp](#rgp-detection) after the pangenome partitioning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion using [spot](#spot-prediction).
 If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. If you wish to tune the spot detection, you can use the `spot` command after the `rgp` command. Additionally, you have the option to utilize a configuration file to customize each detection within the `panrgp` command.
diff --git a/docs/user/align.md b/docs/user/align.md
index e3e9311a..3c7e6462 100644
--- a/docs/user/align.md
+++ b/docs/user/align.md
@@ -24,7 +24,7 @@ By default the command creates two output files:
 ### 2. 'input_to_pangenome_associations.blast-tab'
-'input_to_pangenome_associations.blast-tab' is a .tsv file that follows the tabular blast format which many alignment softwares (such as blast, diamond, mmseqs etc.) use, with two additional columns: the length of query sequence which was aligned, and the length of the subject sequence which was aligned (provided with qlen and slen with the softwares I previously named). You can find a detailed description of the format in [this blog post](https://www.metagenomics.wiki/tools/blast/blastn-output-format-6) for example (and there are many other descriptions of this format on internet, if you search for 'tabular blast format'). The query are the provided sequences, and the subjet are the pangenome gene families.
+'input_to_pangenome_associations.blast-tab' is a .tsv file that follows the tabular blast format which many alignment software packages (such as blast, diamond, mmseqs, etc.) use, with two additional columns: the length of the query sequence which was aligned, and the length of the subject sequence which was aligned (provided with qlen and slen by the software I previously named). You can find a detailed description of the format in [this blog post](https://www.metagenomics.wiki/tools/blast/blastn-output-format-6) for example (and there are many other descriptions of this format on the internet, if you search for 'tabular blast format'). The queries are the provided sequences, and the subjects are the pangenome gene families.
 ### 3. Optional outputs
diff --git a/docs/user/install.md b/docs/user/install.md
index 40172657..8ab44612 100644
--- a/docs/user/install.md
+++ b/docs/user/install.md
@@ -1,7 +1,7 @@
 # Installation
-```{warning}
-Supported python version are 3.8, 3.9 and 3.10
+```{note}
+Supported Python versions are 3.8, 3.9, 3.10, 3.11 and 3.12
 ```
 ## Installing PPanGGOLiN with Conda (recommended)
diff --git a/docs/user/practicalInformation.md b/docs/user/practicalInformation.md
index 84d2d8ab..0d9c5e67 100644
--- a/docs/user/practicalInformation.md
+++ b/docs/user/practicalInformation.md
@@ -52,7 +52,7 @@ If you want, verbosity can be reduced in several ways.
 First, you can specify the verbosity level with the `--verbose` option. With `0` will show only warnings and errors, `1` will add the information (default value), and if you encounter any problem you can use the debug level with value `2`.
 Then you can also remove the progress bars with the option `--disable_prog_bar`
-Finaly, you can also save PPanGGOLiN logs in a file by indicating its path with the option `--log`.
+Finally, you can also save PPanGGOLiN logs in a file by indicating its path with the option `--log`. ## Configuration file diff --git a/docs/user/projection.md b/docs/user/projection.md index 1c8b671c..8dda6bbc 100644 --- a/docs/user/projection.md +++ b/docs/user/projection.md @@ -58,13 +58,13 @@ For Gene Family and Partition of Input Genes: For RGPs and Spots: - `plastic_regions.tsv`: This file contains information about RGPs within the input genome. Its format follows [this output](RGP/rgpOutputs.md#rgp-outputs). -- `input_genome_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this ouput](RGP/rgpOutputs.md#summarize-spots). +- `input_genome_rgp_to_spot.tsv`: It provides information about the association between RGPs and insertion spots in the input genome. Its format follows [this output](RGP/rgpOutputs.md#summarize-spots). Optionally, you can generate a graph of the spots using the `--spot_graph` option. This graph resembles the one produced by the `ppanggolin draw --spots` command, which is detailed [here](RGP/rgpOutputs.md#draw-spots). For Modules: -- `modules_in_input_genome.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this ouput](Modules/moduleOutputs.md#module-outputs). +- `modules_in_input_genome.tsv`: This file lists the modules that have been found in the input genome. Its format follows [this output](Modules/moduleOutputs.md#module-outputs). diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 7a41a560..1ae7e87a 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging diff --git a/ppanggolin/RGP/rgp_cluster.py b/ppanggolin/RGP/rgp_cluster.py index 8d237a10..078dc19a 100644 --- a/ppanggolin/RGP/rgp_cluster.py +++ b/ppanggolin/RGP/rgp_cluster.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -101,8 +100,7 @@ def genes(self): Return iterable of genes from all RGPs that are identical in families """ for rgp in self.rgps: - for gene in rgp.genes: - yield gene + yield from rgp.genes @property def spots(self) -> Set[Spot]: """ @@ -141,7 +139,7 @@ def compute_grr(rgp_a_families: Set[GeneFamily], rgp_b_families: Set[GeneFamily] def compute_jaccard_index(rgp_a_families: set, rgp_b_families: set) -> float: """ - Compute jaccard index between two rgp based on their famillies. + Compute jaccard index between two rgp based on their families. 
:param rgp_a_families: Rgp A :param rgp_b_families: rgp B @@ -283,7 +281,7 @@ def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List name=identical_rgp_obj.name, families_count=len(identical_rgp_obj.families), identical_rgp_count=len(identical_rgp_obj.rgps), - identical_rgp_names=';'.join([i_rgp.name for i_rgp in identical_rgp_obj.rgps]), + identical_rgp_names=';'.join(i_rgp.name for i_rgp in identical_rgp_obj.rgps), identical_rgp_genomes=';'.join({i_rgp.organism.name for i_rgp in identical_rgp_obj.rgps}), identical_rgp_contig_border_count=len( [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_contig_border]), @@ -291,7 +289,7 @@ def add_info_to_identical_rgps(rgp_graph: nx.Graph, identical_rgps_objects: List [True for i_rgp in identical_rgp_obj.rgps if i_rgp.is_whole_contig]), identical_rgp_spots=";".join(spots_of_identical_rgp_obj), spot_id=spots_of_identical_rgp_obj.pop() if len( - spots_of_identical_rgp_obj) == 1 else "Mulitple spots", + spots_of_identical_rgp_obj) == 1 else "Multiple spots", modules = ';'.join({str(module) for module in identical_rgp_obj.modules}), ) @@ -608,18 +606,18 @@ def cluster_rgp(pangenome, grr_cutoff: float, output: str, basename: str, add_rgp_metadata_to_graph(grr_graph, rgps_in_graph) if "gexf" in graph_formats: - # writting graph in gexf format + # writing graph in gexf format graph_file_name = os.path.join(output, f"{basename}.gexf") - logging.info(f"Writting graph in gexf format in {graph_file_name}.") + logging.info(f"Writing graph in gexf format in {graph_file_name}.") nx.readwrite.gexf.write_gexf(grr_graph, graph_file_name) if "graphml" in graph_formats: graph_file_name = os.path.join(output, f"{basename}.graphml") - logging.info(f"Writting graph in graphml format in {graph_file_name}.") + logging.info(f"Writing graph in graphml format in {graph_file_name}.") nx.readwrite.graphml.write_graphml(grr_graph, graph_file_name) outfile = os.path.join(output, f"{basename}.tsv") - logging.info(f"Writting rgp clusters in tsv format in {outfile}") + logging.info(f"Writing rgp clusters in tsv format in {outfile}") write_rgp_cluster_table( outfile, grr_graph, rgps_in_graph, grr_metric, rgp_to_spot) diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index eee073b8..6b73fe7d 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import time diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index a2ffe485..187d4208 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -113,7 +112,7 @@ def extract_positions(string: str) -> Tuple[List[Tuple[int, int]], bool, bool]: """ Extracts start and stop positions from a string and determines whether it is complement and pseudogene. - Exemple of strings that the function is able to process: + Example of strings that the function is able to process: "join(190..7695,7695..12071)", "complement(join(4359800..4360707,4360707..4360962))", @@ -359,7 +358,7 @@ def combine_contigs_metadata(contig_to_metadata: Dict[str, Dict[str, str]]) -> T all_tag_to_value = [(tag, value) for source_info in contig_to_metadata.values() for (tag, value) in source_info.items() if isinstance(value, str)] - # Filter tags that would have a / as it is forbiden when writing the table in HDF5. 
Such tag can appear with db_xref formating + # Filter tags that would have a / as it is forbidden when writing the table in HDF5. Such tag can appear with db_xref formatting invalid_tag_names = [] for tag, _ in set(all_tag_to_value): try: @@ -691,7 +690,7 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b if fields_gff[gff_type] == 'region': # keep region attributes to add them as metadata of genome and contigs - # excluding some info as they are alredy contained in contig object. + # excluding some info as they are already contained in contig object. contig_name_to_region_info[fields_gff[gff_seqname]] = {tag.lower(): value for tag, value in attributes.items() if @@ -790,7 +789,7 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b rna_counter += 1 contig.add_rna(rna) - # Correct coordinates of genes that overlapp the edge of circulars contig + # Correct coordinates of genes that overlap the edge of circulars contig correct_putative_overlaps(org.contigs) # GET THE FASTA SEQUENCES OF THE GENES @@ -798,7 +797,7 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b contig_sequences = read_fasta(org, fasta_string.split('\n')) # _ is total contig length for contig in org.contigs: if contig.length != len(contig_sequences[contig.name]): - raise ValueError("The contig lenght defined is different than the sequence length") + raise ValueError("The contig length defined is different than the sequence length") for gene in contig.genes: gene.add_sequence(get_dna_sequence(contig_sequences[contig.name], gene)) diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 2997519e..6ad10365 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -10,7 +9,7 @@ from subprocess import Popen, PIPE import ast from collections import defaultdict -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union from pathlib import Path # install libraries @@ -46,7 +45,7 @@ def reverse_complement(seq: str): return rcseq -def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: +def launch_aragorn(fna_file: str, org: Organism, contig_to_length: Dict[str, int]) -> defaultdict: """ Launches Aragorn to annotate tRNAs. @@ -63,24 +62,30 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: file_data = p.communicate()[0].decode().split("\n")[:: -1] gene_objs = defaultdict(set) c = 0 - header = "" + contig_name = "" while len(file_data) != 0: line = file_data.pop() if line.startswith(">"): - header = line.replace(">", "").split()[0] + contig_name = line.replace(">", "").split()[0] file_data.pop() # then next line must be removed too. elif len(line) > 0: # if the line isn't empty, there's data to get. line_data = line.split() start, stop = map(int, ast.literal_eval(line_data[2].replace("c", ""))) if start < 1 or stop < 1: # In some case aragorn gives negative coordinates. This case is just ignored. - logging.warning(f'Aragorn gives non valide coordiates for a RNA gene: {line_data} This RNA is ignored.') + logging.warning(f'Aragorn gives non valid coordiates for a RNA gene in contig {contig_name}: {line_data}. This RNA is ignored.') continue + if start > contig_to_length[contig_name] or stop > contig_to_length[contig_name]: + logging.warning(f'Aragorn gives non valide coordiates for a RNA gene in contig {contig_name}. 
' + f'Gene coordinates exceed contig length ({contig_to_length[contig_name]}): ' + f'{line_data}. This RNA is ignored.') + continue + c += 1 gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(4)) gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+", gene_type="tRNA", product=line_data[1] + line_data[4]) - gene_objs[header].add(gene) + gene_objs[contig_name].add(gene) return gene_objs @@ -210,7 +215,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str, def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper: """ - Writes a temporary fna formated file and returns the file-like object. Useful in case of compressed input file. + Writes a temporary fna formatted file and returns the file-like object. Useful in case of compressed input file. The file will be deleted when close() is called. :param contigs: Contigs sequences of each contig @@ -250,13 +255,15 @@ def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, contig_sequenc # launching tools for syntaxic annotation genes = defaultdict(list) - for key, items in launch_prodigal(contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta).items(): - genes[key].extend(items) + for contig_name, genes_from_contig in launch_prodigal(contig_sequences=contig_sequences, org=org, code=code, use_meta=use_meta).items(): + genes[contig_name].extend(genes_from_contig) if not norna: - for key, items in launch_aragorn(fna_file=fasta_file.name, org=org).items(): - genes[key].extend(items) - for key, items in launch_infernal(fna_file=fasta_file.name, org=org, kingdom=kingdom, tmpdir=tmpdir).items(): - genes[key].extend(items) + contig_to_length = {contig_name:len(contig_seq) for contig_name, contig_seq in contig_sequences.items()} + + for contig_name, genes_from_contig in launch_aragorn(fna_file=fasta_file.name, org=org, contig_to_length= contig_to_length).items(): + genes[contig_name].extend(genes_from_contig) + for contig_name, genes_from_contig in launch_infernal(fna_file=fasta_file.name, org=org, kingdom=kingdom, tmpdir=tmpdir).items(): + genes[contig_name].extend(genes_from_contig) fasta_file.close() # closing either tmp file or original fasta file. return genes @@ -307,13 +314,13 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: # check contig coordinate is in scope of contig seq length highest_position = max((stop for _, stop in gene.coordinates)) assert highest_position <= len( - contig_seq), f"Gene coordinates exceed contig length. gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" + contig_seq), f"Coordinates of gene {gene} exceed length of the contig. 
Gene coordinates {gene.coordinates} vs contig length {len(contig_seq)}" # Extract gene seq seq = ''.join([contig_seq[start - 1:stop] for start, stop in gene.coordinates]) # check length of extracted seq - assert len(seq) == len(gene), ("The gene sequence extracted from the contig does not have the expected length: " + assert len(seq) == len(gene), (f"The gene sequence of {gene} extracted from the contig does not have the expected length: " f"extracted seq length {len(seq)}nt vs expected length based on gene coordinates ({gene.coordinates}) {len(gene)}nt ") if gene.strand == "+": @@ -324,7 +331,7 @@ def get_dna_sequence(contig_seq: str, gene: Union[Gene, RNA]) -> str: def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str], tmpdir: str, code: int = 11, norna: bool = False, kingdom: str = "bacteria", - allow_overlap: bool = False, procedure: str = None) -> Organism: + allow_overlap: bool = False, procedure: Optional[str] = None) -> Organism: """ Function to annotate a single organism diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 6dcedca1..b16c0a5f 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -95,12 +94,12 @@ def first_clustering(sequences: Path, tmpdir: Path, cpu: int = 1, code: int = 11 run_subprocess(cmd, msg="MMSeqs2 cluster failed with the following error:\n") logging.getLogger("PPanGGOLiN").info("Extracting cluster representatives...") repdb = tmpdir / 'representative_db' - cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb])) + cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb, "--threads", cpu])) run_subprocess(cmd, msg="MMSeqs2 result2repseq failed with the following error:\n") reprfa = tmpdir / 'representative_sequences.fasta' cmd = list(map(str, ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"])) run_subprocess(cmd, msg="MMSeqs2 result2flat failed with the following error:\n") - logging.getLogger("PPanGGOLiN").info("Writing gene to family informations") + logging.getLogger("PPanGGOLiN").info("Writing gene to family information") outtsv = tmpdir / 'families_tsv' cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) run_subprocess(cmd, msg="MMSeqs2 createtsv failed with the following error:\n") @@ -157,7 +156,7 @@ def read_tsv(tsv_file_name: Path) -> Tuple[Dict[str, Tuple[str, bool]], Dict[str :param tsv_file_name: path to the tsv - :return: two dictionnary which link genes and families + :return: two dictionaries which link genes and families """ genes2fam = {} fam2genes = defaultdict(set) diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index ef31ae4b..f86baf85 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -202,10 +201,10 @@ def get_gene_contexts(context_graph: nx.Graph, families_of_interest: Set[GeneFam # Connected component graph Filtering - # remove singleton famillies + # remove singleton families connected_components = (component for component in connected_components if len(component) > 1) - # remove component made only of famillies not initially requested + # remove component made only of families not initially requested connected_components = (component for component in 
connected_components if component & families_of_interest) gene_contexts = set() diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index 2d21f334..6b323eb1 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 # default libraries from collections import defaultdict @@ -50,8 +49,7 @@ def organisms(self) -> Generator[Organism, None, None]: :return: Generator with organisms as the key and an iterable of the gene pairs as value """ - for organism in self._organisms.keys(): - yield organism + yield from self._organisms.keys() @property def number_of_organisms(self) -> int: diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index ec90d1d7..458fb126 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries @@ -252,7 +251,7 @@ def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnD last_gene = genelist[-1] if is_gene_list_ordered(genelist): - # if the order has been inverted, positionning elements on the figure is different + # if the order has been inverted, positioning elements on the figure is different ordered = True start = first_gene.start else: @@ -460,7 +459,7 @@ def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, lis first_gene = genelist[0] last_gene = genelist[-1] if is_gene_list_ordered(genelist): - # if the order has been inverted, positionning elements on the figure is different + # if the order has been inverted, positioning elements on the figure is different width = abs(last_gene.stop_relative_to(first_gene ) - genelist[0].start) df["width"].append(width) else: @@ -627,7 +626,7 @@ def draw_selected_spots(selected_spots: Union[List[Spot], Set[Spot]], pangenome: left_border = [gene for gene in left_border_and_in_between_genes if gene.family.named_partition == "persistent" and gene.family not in multigenics] right_border = [gene for gene in right_border_and_in_between_genes if gene.family.named_partition == "persistent" and gene.family not in multigenics] - # in some rare case with plasmid left and rigth border can be made of the same genes + # in some rare case with plasmid left and right border can be made of the same genes # we use a set to only have one gene represented. 
consecutive_genes_lists = contig.get_ordered_consecutive_genes(set(left_border_and_in_between_genes + right_border_and_in_between_genes + list(rgp.genes))) diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index 98b684e2..41897dfc 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index bf5b1012..ce3f2cf9 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index e9f4b966..9d2f3609 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -152,8 +151,7 @@ def read_chunks(table: tables.Table, column: str = None, chunk: int = 10000): :param chunk: """ for i in range(0, table.nrows, chunk): - for row in table.read(start=i, stop=i + chunk, field=column): - yield row + yield from table.read(start=i, stop=i + chunk, field=column) def read_genedata(h5f: tables.File) -> Dict[int, Genedata]: diff --git a/ppanggolin/formats/writeAnnotations.py b/ppanggolin/formats/writeAnnotations.py index 998596cd..8e90540f 100644 --- a/ppanggolin/formats/writeAnnotations.py +++ b/ppanggolin/formats/writeAnnotations.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -140,7 +139,7 @@ def write_genes(pangenome: Pangenome, h5f: tables.File, annotation: tables.Grou :param gene_desc: Genes table description :param disable_bar: Allow to disable progress bar - :returns: Dictionnary linking genedata to gene identifier + :returns: Dictionary linking genedata to gene identifier """ global genedata_counter genedata2gene = {} @@ -188,7 +187,7 @@ def write_rnas(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group :param rna_desc: RNAs table description :param disable_bar: Allow to disable progress bar - :returns: Dictionnary linking genedata to RNA identifier + :returns: Dictionary linking genedata to RNA identifier """ global genedata_counter genedata2rna = {} @@ -295,11 +294,11 @@ def get_genedata(feature: Union[Gene, RNA]) -> Genedata: feature.product, genetic_code, coordinates = feature.coordinates) def write_gene_joined_coordinates(h5f, annotation, genes_with_joined_coordinates_2_id, disable_bar): - """Writting genedata information in pangenome file + """Writing genedata information in pangenome file :param h5f: Pangenome file :param annotation: Annotation group in Table - :param genedata2gene: Dictionnary linking genedata to gene identifier. + :param genedata2gene: Dictionary linking genedata to gene identifier. :param disable_bar: Allow disabling progress bar """ number_of_gene_pieces = sum([len(gene.coordinates) for gene in genes_with_joined_coordinates_2_id]) @@ -330,12 +329,12 @@ def write_gene_joined_coordinates(h5f, annotation, genes_with_joined_coordinates def write_genedata(pangenome: Pangenome, h5f: tables.File, annotation: tables.Group, genedata2gene: Dict[Genedata, int], disable_bar=False): - """Writting genedata information in pangenome file + """Writing genedata information in pangenome file :param pangenome: Pangenome object filled with annotation. 
:param h5f: Pangenome file :param annotation: Annotation group in Table - :param genedata2gene: Dictionnary linking genedata to gene identifier. + :param genedata2gene: Dictionary linking genedata to gene identifier. :param disable_bar: Allow disabling progress bar """ try: @@ -377,7 +376,7 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, rec_organisms: boo :param rec_contigs: Allow writing contigs in pangenomes :param rec_genes: Allow writing genes in pangenomes :param rec_rnas: Allow writing RNAs in pangenomes - :param disable_bar: Alow to disable progress bar + :param disable_bar: Allow to disable progress bar """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") @@ -429,7 +428,7 @@ def gene_sequences_desc(gene_id_len: int, gene_type_len: int) -> Dict[str, Union :param gene_id_len: Maximum size of gene sequence identifier :param gene_type_len: Maximum size of gene type - :return: Formated table + :return: Formatted table """ return { "gene": tables.StringCol(itemsize=gene_id_len), @@ -455,7 +454,7 @@ def sequence_desc(max_seq_len: int) -> Dict[str, Union[tables.UIntCol, tables.St """ Table description to save sequences :param max_seq_len: Maximum size of gene type - :return: Formated table + :return: Formatted table """ return { "seqid": tables.UInt32Col(), diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index a764a27f..735f13ae 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -60,13 +59,13 @@ def getmin(arg: iter) -> float: def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: """ - Create a formated table for gene families description + Create a formatted table for gene families description :param max_name_len: Maximum size of gene family name :param max_sequence_length: Maximum size of gene family representing gene sequences :param max_part_len: Maximum size of gene family partition - :return: Formated table + :return: Formatted table """ return { "name": tables.StringCol(itemsize=max_name_len), @@ -123,12 +122,12 @@ def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = Fa def gene_to_fam_desc(gene_fam_name_len: int, gene_id_len: int) -> dict: """ - Create a formated table for gene in gene families information + Create a formatted table for gene in gene families information :param gene_fam_name_len: Maximum size of gene family names :param gene_id_len: Maximum size of gene identifier - :return: formated table + :return: formatted table """ return { "geneFam": tables.StringCol(itemsize=gene_fam_name_len), @@ -180,11 +179,11 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa def graph_desc(max_gene_id_len): """ - Create a formated table for pangenome graph + Create a formatted table for pangenome graph :param max_gene_id_len: Maximum size of gene id - :return: formated table + :return: formatted table """ return { 'geneTarget': tables.StringCol(itemsize=max_gene_id_len), @@ -235,12 +234,12 @@ def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis def rgp_desc(max_rgp_len, max_gene_len): """ - Create a formated table for region of genomic plasticity + Create a formatted table for region of genomic plasticity :param max_rgp_len: Maximum size of RGP :param max_gene_len: Maximum sizez of gene - :return: formated table + :return: formatted table 
""" return { 'RGP': tables.StringCol(itemsize=max_rgp_len), @@ -293,11 +292,11 @@ def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disab def spot_desc(max_rgp_len): """ - Create a formated table for hotspot + Create a formatted table for hotspot :param max_rgp_len: Maximum size of RGP - :return: formated table + :return: formatted table """ return { 'spot': tables.UInt32Col(), @@ -347,11 +346,11 @@ def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, dis def mod_desc(gene_fam_name_len): """ - Create a formated table for hotspot + Create a formatted table for hotspot :param gene_fam_name_len: Maximum size of gene families name - :return: formated table + :return: formatted table """ return { "geneFam": tables.StringCol(itemsize=gene_fam_name_len), @@ -446,7 +445,7 @@ def write_info(pangenome: Pangenome, h5f: tables.File): if "/info" in h5f: info_group = h5f.root.info else: - info_group = h5f.create_group("/", "info", "Informations about the pangenome content") + info_group = h5f.create_group("/", "info", "Information about the pangenome content") if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: info_group._v_attrs.numberOfGenes = pangenome.number_of_genes info_group._v_attrs.numberOfGenomes = pangenome.number_of_organisms @@ -762,4 +761,4 @@ def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable h5f.close() logging.getLogger("PPanGGOLiN").info(f"Done writing the pangenome. It is in file : {filename}") - \ No newline at end of file + diff --git a/ppanggolin/formats/writeFlatGenomes.py b/ppanggolin/formats/writeFlatGenomes.py index 7f16cc91..232240c7 100644 --- a/ppanggolin/formats/writeFlatGenomes.py +++ b/ppanggolin/formats/writeFlatGenomes.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -395,10 +394,10 @@ def convert_overlapping_coordinates_for_gff(coordinates: List[Tuple[int, int]], start, stop = coordinates[0] new_coordinates = [(start, stop )] - # convert all coordinate that are at the begining + # convert all coordinates that are at the beginning # of the contig to the extent of the contig for start_n, stop_n in coordinates[1:]: - if start_n < start: # we are on the begining of the contig + if start_n < start: # we are on the beginning of the contig new_start = contig_length + start_n new_stop = contig_length + stop_n new_coordinates.append((new_start, new_stop)) diff --git a/ppanggolin/formats/writeFlatMetadata.py b/ppanggolin/formats/writeFlatMetadata.py index cfd58d12..68905f18 100644 --- a/ppanggolin/formats/writeFlatMetadata.py +++ b/ppanggolin/formats/writeFlatMetadata.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 5f49d807..ecfdd7e7 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -480,7 +479,7 @@ def summarize_genome(organism: Organism, :param pangenome_persistent_count: Count of persistent genes in the pangenome. :param pangenome_persistent_single_copy_families: Set of gene families considered as persistent single-copy in the pangenome. 
:param soft_core_families: soft core families of the pangenome - :parma exact_core_families: exact core families of the pangenome + :param exact_core_families: exact core families of the pangenome :param input_org_rgps: Number of regions of genomic plasticity in the input organism. None if not computed. :param input_org_spots: Number of spots in the input organism. None if not computed. :param input_org_modules: Number of modules in the input organism. None if not computed. @@ -636,7 +635,7 @@ def write_persistent_duplication_statistics(pangenome: Pangenome, output: Path, return single_copy_persistent def write_summaries_in_tsv(summaries: List[Dict[str, Any]], output_file: Path, - dup_margin:float, soft_core:float): + dup_margin:float, soft_core:float, compress:bool = False): """ Writes summaries of organisms stored in a dictionary into a Tab-Separated Values (TSV) file. @@ -644,6 +643,7 @@ def write_summaries_in_tsv(summaries: List[Dict[str, Any]], output_file: Path, :param output_file: The Path specifying the output TSV file location. :param soft_core: Soft core threshold used :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + :param compress: Compress the file in .gz """ # Flatten the nested dictionaries within the summaries dictionary flat_summaries = [flatten_nested_dict(summary_info) for summary_info in summaries] @@ -651,7 +651,7 @@ def write_summaries_in_tsv(summaries: List[Dict[str, Any]], output_file: Path, # Create a DataFrame from the flattened summaries df_summary = pd.DataFrame(flat_summaries) - with open(output_file, "w") as flout: + with write_compressed_or_not(output_file, compress) as flout: flout.write(f"#soft_core={round(soft_core, 3)}\n") flout.write(f"#duplication_margin={round(dup_margin, 3)}\n") @@ -702,7 +702,7 @@ def write_stats(output: Path, soft_core: float = 0.95, dup_margin: float = 0.05, summaries.append(organism_summary) - write_summaries_in_tsv(summaries, output_file= output / "genomes_statistics.tsv", dup_margin=dup_margin, soft_core=soft_core) + write_summaries_in_tsv(summaries, output_file= output / "genomes_statistics.tsv", dup_margin=dup_margin, soft_core=soft_core, compress=compress) logging.getLogger("PPanGGOLiN").info("Done writing genome per genome statistics") @@ -760,15 +760,20 @@ def write_gene_families_tsv(output: Path, compress: bool = False, disable_bar: b """ logging.getLogger("PPanGGOLiN").info( "Writing the file providing the association between genes and gene families...") - outname = output / "gene_families.tsv" - with write_compressed_or_not(outname, compress) as tsv: - for fam in tqdm(pan.gene_families, total=pan.number_of_gene_families, unit='family', disable=disable_bar): - for gene in fam.genes: - tsv.write("\t".join([fam.name, gene.ID, gene.local_identifier, - "F" if gene.is_fragment else ""]) + "\n") + outname = output / f"gene_families.tsv{'.gz' if compress else ''}" + out_list = [] + for fam in tqdm(pan.gene_families, total=pan.number_of_gene_families, unit='family', disable=disable_bar): + for gene in fam.genes: + out_list.append([fam.name, gene.ID, gene.local_identifier, "F" if gene.is_fragment else ""]) + out_df = pd.DataFrame(out_list, columns=["GeneFam", "Gene", "local_id", "is_frag"]) + out_df["count"] = out_df.groupby("GeneFam")["GeneFam"].transform('count') + out_df = out_df.sort_values(by=["count", "Gene", "local_id", "is_frag"], ascending=[False, True, True, True]) + out_df = out_df.drop(columns=['count']) + out_df.to_csv(outname, sep="\t", index=False, 
header=False, compression='infer' if compress else None) logging.getLogger("PPanGGOLiN").info("Done writing the file providing the association between genes and " f"gene families: '{outname}'") + def summarize_spots(spots: set, output: Path, compress: bool = False, file_name="summarize_spots.tsv"): """ Write a file providing summarize information about hotspots @@ -1059,7 +1064,7 @@ def write_pangenome_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, :param soft_core: Soft core threshold to use :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated :param csv: write csv file format as used by Roary - :param gene_pa: write gene presence abscence matrix + :param gene_pa: write gene presence absence matrix :param gexf: write pangenome graph in gexf format :param light_gexf: write pangenome graph with only gene families :param stats: write statistics about pangenome @@ -1210,7 +1215,7 @@ def parser_flat(parser: argparse.ArgumentParser): optional.add_argument("--gexf", required=False, action="store_true", help="write a gexf file with all the annotations and all the genes of each gene family") optional.add_argument("--light_gexf", required=False, action="store_true", - help="write a gexf file with the gene families and basic informations about them") + help="write a gexf file with the gene families and basic information about them") optional.add_argument("--json", required=False, action="store_true", help="Writes the graph in a json file format") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 03697169..cc72f001 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -123,7 +122,7 @@ def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory # get genes that are present in only one copy for our family in each organism. 
single_copy_genes = [] - for _, genes in family.get_org_dict().items(): + for genes in family.get_org_dict().values(): if len(genes) == 1: single_copy_genes.extend(genes) @@ -224,7 +223,7 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: Path :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA """ - # sort familes by ID, so the gene order is consistent + # sort families by ID, so the gene order is consistent families = sorted(families, key=lambda f: f.ID) phylo_dict = {} diff --git a/ppanggolin/formats/writeMetadata.py b/ppanggolin/formats/writeMetadata.py index ee7a5efe..207f9a0c 100644 --- a/ppanggolin/formats/writeMetadata.py +++ b/ppanggolin/formats/writeMetadata.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -58,7 +57,7 @@ def write_metadata_status(pangenome: Pangenome, h5f: tables.File, status_group: metadata_group._v_attrs.modules = True metasources_group._v_attrs.modules = metasources["modules"] - return True if any(metadata_group._v_attrs._f_list()) else False + return any(metadata_group._v_attrs._f_list()) def write_metadata_group(h5f: tables.File, metatype: str) -> tables.Group: @@ -81,9 +80,9 @@ def write_metadata_group(h5f: tables.File, metatype: str) -> tables.Group: def desc_metadata(max_len_dict: Dict[str, int], type_dict: Dict[str, tables.Col]) -> dict: - """Create a formated table for metadata description + """Create a formatted table for metadata description - :return: Formated table + :return: Formatted table """ desc_dict = {attr: tables.StringCol(itemsize=max_value) for attr, max_value in max_len_dict.items()} desc_dict.update({attr: col_type for attr, col_type in type_dict.items()}) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 30dc04ff..cb01d704 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index 82a88780..e75076d0 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import json diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 0ccb3887..71100a95 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 # default libraries from __future__ import annotations @@ -242,7 +241,7 @@ def named_partition(self) -> str: :raises ValueError: If the gene family has no partition assigned """ if self.partition == "": - raise ValueError("The gene family has not beed associated to a partition.") + raise ValueError("The gene family has not been associated to a partition.") if self.partition.startswith("P"): return "persistent" elif self.partition.startswith("C"): @@ -259,8 +258,7 @@ def edges(self) -> Generator[Edge, None, None]: :return: Edges of the gene family """ - for edge in self._edges_getter.values(): - yield edge + yield from self._edges_getter.values() @property def neighbors(self) -> Generator[GeneFamily, None, None]: @@ -268,8 +266,7 @@ def neighbors(self) -> Generator[GeneFamily, None, None]: :return: Neighbors """ - for neighbor in self._edges_getter.keys(): - yield neighbor + yield from self._edges_getter.keys() @property def genes(self): @@ -277,8 +274,7 @@ def genes(self): 
:return: Generator of genes """ - for gene in self._genes_getter.values(): - yield gene + yield from self._genes_getter.values() @property def organisms(self) -> Generator[Organism, None, None]: @@ -288,8 +284,7 @@ def organisms(self) -> Generator[Organism, None, None]: """ if len(self._genePerOrg) == 0: _ = self.get_org_dict() - for org in self._genePerOrg.keys(): - yield org + yield from self._genePerOrg.keys() @property def spots(self) -> Generator[Spot, None, None]: @@ -297,8 +292,7 @@ def spots(self) -> Generator[Spot, None, None]: :return: Generator of spots """ - for spot in self._spots: - yield spot + yield from self._spots @property def module(self) -> Module: @@ -446,8 +440,7 @@ def get_genes_per_org(self, org: Organism) -> Generator[Gene, None, None]: _ = self.get_org_dict() if org not in self._genePerOrg: raise KeyError(f"Genome does not have the gene family: {self.name}") - for gene in self._genePerOrg[org]: - yield gene + yield from self._genePerOrg[org] def is_single_copy(self, dup_margin: float, exclude_fragment: bool) -> bool: @@ -459,10 +452,7 @@ def is_single_copy(self, dup_margin: float, exclude_fragment: bool) -> bool: :return: A boolean indicating whether the gene family is single copy. """ - if self.duplication_ratio(exclude_fragment) < dup_margin: - return True - else: - return False + return self.duplication_ratio(exclude_fragment) < dup_margin def duplication_ratio(self, exclude_fragment: bool) -> bool: """ diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 19359a42..d5238f99 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 from __future__ import annotations import logging @@ -77,7 +76,7 @@ def __len__(self) -> int: try: return sum([(stop - start +1) for start, stop in self.coordinates ]) except TypeError: - raise ValueError(f"Cooridnates of gene {self} has not been defined. Geting is length is then impossible.") + raise ValueError(f"Coordinates of gene {self} have not been defined. 
Getting its length is then impossible.") @property def has_joined_coordinates(self) -> bool: @@ -230,7 +229,7 @@ def string_coordinates(self) -> str: """ Return a string representation of the coordinates """ - return ','.join([f'{start}..{stop}' for start, stop in self.coordinates]) + return ','.join(f'{start}..{stop}' for start, stop in self.coordinates) def start_relative_to(self, gene): """ @@ -311,7 +310,7 @@ def family(self, family): def RGP(self): """Return the RGP that gene belongs to - :return: RGP fo the Gene + :return: RGP of the Gene :rtype: Region """ return self._RGP diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 021ab21b..11d25ec6 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index f1baa640..969ceb99 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/main.py b/ppanggolin/main.py index d64be471..c5283977 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import sys diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py index cacf5534..659e2723 100644 --- a/ppanggolin/meta/meta.py +++ b/ppanggolin/meta/meta.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging diff --git a/ppanggolin/metadata.py b/ppanggolin/metadata.py index 3e61cd0b..90dc8c9a 100644 --- a/ppanggolin/metadata.py +++ b/ppanggolin/metadata.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 # default libraries import logging @@ -52,12 +51,12 @@ def __repr__(self): def __len__(self) -> int: """Get the number of attribute links to the metadata object - :return: Number of fields (atribute) of the metadata + :return: Number of fields (attribute) of the metadata """ return len(self.__dict__) - 1 def __getattr__(self, attr: str) -> Any: - """Get the value corresponding to the given attibute + """Get the value corresponding to the given attribute :return: Value of the attribute @@ -119,8 +118,7 @@ def metadata(self) -> Generator[Metadata, None, None]: """ for meta_dict in self._metadata_getter.values(): - for metadata in meta_dict.values(): - yield metadata + yield from meta_dict.values() @property def sources(self) -> Generator[str, None, None]: @@ -260,7 +258,7 @@ def has_metadata(self) -> bool: :return: True if it has metadata else False """ - return True if self.number_of_metadata > 0 else False + return self.number_of_metadata > 0 def has_source(self, source: str) -> bool: """Check if the source is in the metadata feature @@ -269,4 +267,4 @@ def has_source(self, source: str) -> bool: :return: True if the source is in the metadata feature else False """ - return True if source in self._metadata_getter else False + return source in self._metadata_getter diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index e760e966..b2b5b0c6 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index bed0e490..5b10a9a4 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# 
coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index ff30a974..e2d9595d 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -54,7 +53,7 @@ def compute_mod_graph(pangenome: Pangenome, t: int = 1, disable_bar: bool = Fals edge = g[gene.family][a_gene.family] add_gene(edge, gene) add_gene(edge, a_gene) - if j == i + t + 1 or i == 0: # if it's the last gene of the serie, or the first serie + if j == i + t + 1 or i == 0: # if it's the last gene of the series, or the first series add_gene(g.nodes[a_gene.family], a_gene, fam_split=False) return g diff --git a/ppanggolin/nem/NEM/nem_alg.c b/ppanggolin/nem/NEM/nem_alg.c index 6460042f..983e926b 100755 --- a/ppanggolin/nem/NEM/nem_alg.c +++ b/ppanggolin/nem/NEM/nem_alg.c @@ -1477,7 +1477,7 @@ static StatusET MakeRandomPara /* Get mean coordinates from drawn object non-NaN coordinates ; - or by uniform sampling if drawn object's coodinate is NaN */ + or by uniform sampling if drawn object's coordinate is NaN */ for ( d = 0 ; d < nd ; d ++ ) { /* Use drawn point's dth coordinate if not NaN */ @@ -2516,7 +2516,7 @@ static int ComputePartitionGEM /* +++ */ GetNeighFT* fGetNeigh ; /*V1.06-c*/ float* z_nk ; /* currently simulated partition */ - int* occur_nk ; /* occurence count for class h at site i */ + int* occur_nk ; /* occurrence count for class h at site i */ int icycle ; /* current Monte-Carlo cycle */ int kdraw ; /* drawn class : 0..nk-1 */ int ivis ; @@ -2551,7 +2551,7 @@ static int ComputePartitionGEM /* +++ */ LabelToClassVector( nk, kmap, &z_nk[ ipt * nk ] ) ; } - /* Initialize to zero all occurence counts */ + /* Initialize to zero all occurrence counts */ for ( ipt = 0 ; ipt < npt ; ipt ++ ) for ( k = 0 ; k < nk ; k ++ ) occur_nk[ ( ipt * nk ) + k ] = 0 ; diff --git a/ppanggolin/nem/NEM/nem_hlp.c b/ppanggolin/nem/NEM/nem_hlp.c index cb2f0d48..65596312 100755 --- a/ppanggolin/nem/NEM/nem_hlp.c +++ b/ppanggolin/nem/NEM/nem_hlp.c @@ -433,7 +433,7 @@ void PrintHelpFileIn( FILE* F ) fprintf( F , "\n" ) ; fprintf( F , " 4.c) file.m (option -s m)\n" ) ; fprintf( F , " -----------\n" ) ; - fprintf( F , " Contains the parameters (at begining or throughout the clustering process) of the mixture model separated by spaces\n" ) ; + fprintf( F , " Contains the parameters (at beginning or throughout the clustering process) of the mixture model separated by spaces\n" ) ; fprintf( F , "\n" ) ; fprintf( F , " If the parameters are just initialized by this file, the file start by 1, ortherwise if the parameters are fixed throughout the clustering process the file start by 2\n" ) ; fprintf( F , " Then :\n" ) ; @@ -547,7 +547,7 @@ void PrintHelpVersions( FILE* F ) fprintf( F , "Version 1.03 (02.10.1997) \n" ) ; fprintf( F , "------------\n" ) ; fprintf( F , "If a partial knowledge of the classification is available, the\n" ) ; - fprintf( F , "intial cluster centers are computed from the observations with\n" ) ; + fprintf( F , "initial cluster centers are computed from the observations with\n" ) ; fprintf( F , "known labels. \n" ) ; fprintf( F , "\n" ) ; fprintf( F , "The log file is made optional (option '-l y'). The ___.mf file now\n" ) ; @@ -567,7 +567,7 @@ void PrintHelpVersions( FILE* F ) fprintf( F , "The heuristics may be invoked with '-B heu_d' or '-B heu_l'. Their\n" ) ; fprintf( F , "default parameters may be changed with '-H ...'. 
\n" ) ; fprintf( F , "\n" ) ; - fprintf( F , "The final partition may now be printed to standard ouput instead of to a\n" ) ; + fprintf( F , "The final partition may now be printed to standard output instead of to a\n" ) ; fprintf( F , "file (option '-o -'). The result can thus be redirected as an \n" ) ; fprintf( F , "initial partition to another nem_exe session's input.\n" ) ; fprintf( F , "\n" ) ; diff --git a/ppanggolin/nem/NEM/nem_typ.h b/ppanggolin/nem/NEM/nem_typ.h index 43299b0a..ab198f62 100755 --- a/ppanggolin/nem/NEM/nem_typ.h +++ b/ppanggolin/nem/NEM/nem_typ.h @@ -348,7 +348,7 @@ typedef struct char NeighName[ LEN_FILENAME + 1 ] ; /* name of neighborhood file */ char LabelName[ LEN_FILENAME + 1 ] ; /* name of fixed labels file */ char RefName[ LEN_FILENAME + 1 ] ; /* name of reference labels file *//*V1.04-f*/ - char ParamName[ LEN_FILENAME + 1 ] ; /* name of initilization param file *//*V1.08-a*/ + char ParamName[ LEN_FILENAME + 1 ] ; /* name of initialization param file *//*V1.08-a*/ } NemParaT ; /* NEM running parameters */ diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 09116256..952ab583 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -38,7 +37,7 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di just_log_likelihood: bool = False) \ -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: """ - Main function to make partitionning + Main function to make partitioning :param nem_dir_path: Path to directory with nem files :param nb_org: Number of organisms @@ -166,27 +165,15 @@ def run_partitioning(nem_dir_path: Path, nb_org: int, beta: float = 2.5, free_di partitions_list[i] = "S_" # SHELL in case of doubt gene families is attributed to shell else: partitions_list[i] = parti[positions_max_prob.pop()] - except IOError: - logging.getLogger("PPanGGOLiN").debug( - "partitioning did not work (the number of genomes used is probably too low), " - "see logs here to obtain more details " + nem_dir_path.as_posix() + "/nem_file_" + - str(kval) + ".log") + except OSError: + logging.getLogger("PPanGGOLiN").warning("Partitioning did not work (the number of genomes used is probably too low), " + f"see logs here to obtain more details {nem_dir_path.as_posix()}") return {}, None, None # return empty objects + except ValueError: # return the default partitions_list which correspond to undefined pass - if not keep_files and no_nem is False: - os.remove(nem_dir_path / f"nem_file_{str(kval)}.uf") - os.remove(nem_dir_path / f"nem_file_{str(kval)}.mf") - os.remove(nem_dir_path / f"nem_file_{str(kval)}.log") - os.remove(nem_dir_path / f"nem_file_{str(kval)}.stderr") - os.remove(nem_dir_path / f"nem_file_init_{str(kval)}.m") - os.remove(nem_dir_path / "nem_file.index") - os.remove(nem_dir_path / "nem_file.dat") - os.remove(nem_dir_path / "nem_file.nei") - os.remove(nem_dir_path / "nem_file.str") - if just_log_likelihood: return kval, log_likelihood, entropy else: @@ -248,7 +235,7 @@ def write_nem_input_files(tmpdir: Path, organisms: set, sm_degree: int = 10) -> :param organisms: Set of organism from pangenome :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. 
- :return: total edge weigth to ponderate beta and number of families + :return: total edge weight to ponderate beta and number of families """ mk_outdir(tmpdir, force=False) total_edges_weight = 0 @@ -470,8 +457,15 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d check_pangenome_former_partition(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) organisms = set(pangenome.organisms) - tmp_dir = tempfile.TemporaryDirectory(dir=tmpdir) - tmp_path = Path(tmp_dir.name) + + if keep_tmp_files: + # Create a temporary directory without auto-cleanup + tmp_dir = tempfile.mkdtemp(dir=tmpdir) + tmp_path = Path(tmp_dir) + else: + # Create a temporary directory with auto-cleanup + tmp_dir = tempfile.TemporaryDirectory(dir=tmpdir) + tmp_path = Path(tmp_dir.name) if len(organisms) <= 10: logging.getLogger("PPanGGOLiN").warning(f"The number of selected genomes is too low ({len(organisms)} " @@ -487,7 +481,7 @@ def partition(pangenome: Pangenome, output: Path = None, beta: float = 2.5, sm_d pangenome.parameters["partition"]["chunk_size"] = chunk_size pangenome.parameters["partition"]["# computed nb of partitions"] = False - # the K value initally given by the user + # the K value initially given by the user pangenome.parameters["partition"]["nb_of_partitions"] = kval if kval < 2: pangenome.parameters["partition"]["# computed nb of partitions"] = True diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index cf8dcd41..04e2f638 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -49,7 +48,7 @@ def raref_nem(index: int, tmpdir: Path, beta: float = 2.5, sm_degree: int = 10, :param krange: Range of K values to test when detecting K automatically. 
:param seed: seed used to generate random numbers - :return: Count of each partition and paremeters for the given sample index + :return: Count of each partition and parameters for the given sample index """ samp = samples[index] currtmpdir = tmpdir / f"{str(index)}" @@ -147,7 +146,7 @@ def launch_raref_nem(args: Tuple[int, Path, float, int, bool, int, int, list, in :param args: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, chunk_size: int, kval: int, krange: list, seed: int} - :return: Count of each partition and paremeters for the given sample index + :return: Count of each partition and parameters for the given sample index """ return raref_nem(*args) @@ -391,7 +390,7 @@ def make_rarefaction_curve(pangenome: Pangenome, output: Path, tmpdir: Path = No all_samples = [] for i in range(min_sampling, max_sampling): # each point for _ in range(depth): # number of samples per points - all_samples.append(set(random.sample(set(pangenome.organisms), i + 1))) + all_samples.append(set(random.sample(list(pangenome.organisms), i + 1))) logging.getLogger("PPanGGOLiN").info(f"Done sampling genomes in the pan, there are {len(all_samples)} samples") samp_nb_per_part = [] diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 37cb3b84..ee12d09d 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 # default libraries import logging @@ -191,7 +190,7 @@ def max_fam_id(self): def max_fam_id(self, value): """Set the last family identifier - :param value: value of the maximum family identifer + :param value: value of the maximum family identifier """ self._max_fam_id = value @@ -201,8 +200,7 @@ def gene_families(self) -> Generator[GeneFamily, None, None]: :return: Generator of gene families """ - for family in self._fam_getter.values(): - yield family + yield from self._fam_getter.values() @property def number_of_gene_families(self) -> int: @@ -259,8 +257,7 @@ def edges(self) -> Generator[Edge, None, None]: :return: Generator of edge """ - for edge in self._edge_getter.values(): - yield edge + yield from self._edge_getter.values() def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: """ @@ -304,8 +301,7 @@ def organisms(self) -> Generator[Organism, None, None]: :return: Generator :class:`ppanggolin.genome.Organism` """ - for organism in self._org_getter.values(): - yield organism + yield from self._org_getter.values() @property def number_of_organisms(self) -> int: @@ -507,8 +503,7 @@ def regions(self) -> Generator[Region, None, None]: :return: list of RGP """ - for region in self._region_getter.values(): - yield region + yield from self._region_getter.values() def get_region(self, name: str) -> Region: """Returns a region with the given region_name. Creates it if it does not exist. diff --git a/ppanggolin/projection/projection.py b/ppanggolin/projection/projection.py index d731e92c..53aaf0b0 100644 --- a/ppanggolin/projection/projection.py +++ b/ppanggolin/projection/projection.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -263,7 +262,7 @@ def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], # multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin) # dup margin value here is specified in argument and is used to compute completeness. - # Thats mean it can be different than dup margin used in spot and RGPS. + # That means it can be different than dup margin used in spot and RGPS. 
pangenome_persistent_single_copy_families = pangenome.get_single_copy_persistent_families(dup_margin=dup_margin, exclude_fragments=True) @@ -330,7 +329,7 @@ def write_projection_results(pangenome: Pangenome, organisms: Set[Organism], write_summaries_in_tsv(summaries, output_file=output_file, dup_margin=dup_margin, - soft_core=soft_core) + soft_core=soft_core, compress=compress) def summarize_projected_genome(organism: Organism, @@ -703,7 +702,7 @@ def predict_RGP(pangenome: Pangenome, input_organisms: List[Organism], persisten for input_organism in input_organisms: rgps = compute_org_rgp(input_organism, multigenics, persistent_penalty, variable_gain, min_length, min_score, naming=name_scheme, disable_bar=disable_bar) - # turn on projected attribut in rgp objects + # turn on projected attribute in rgp objects # useful when associating spot to prevent failure when multiple spot are associated to a projected RGP for rgp in rgps: rgp.projected = True @@ -914,7 +913,7 @@ def predict_spots_in_input_organisms( # Check congruency with already computed spot and add spot id in node attributes check_spots_congruency(graph_spot, initial_spots) - new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 + new_spot_id_counter = max((s.ID for s in initial_spots)) + 1 if len(initial_spots) != 0 else 1 input_org_to_spots = {} for input_organism, rgps in input_org_2_rgps.items(): diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 5cc6fa2a..f33f0d82 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding: utf8 # default libraries from __future__ import annotations @@ -319,8 +318,7 @@ def genes(self) -> Generator[Gene, None, None]: :return: Genes in the region """ - for gene in sorted(self._genes_getter.values(), key=lambda x: x.position): - yield gene + yield from sorted(self._genes_getter.values(), key=lambda x: x.position) @property def families(self) -> Generator[GeneFamily, None, None]: @@ -432,7 +430,7 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ :return: A list of bordering genes in start and stop position """ genes_in_region = list(self.genes) - # Identifiying left border + # Identifying left border left_border = [] pos = self.starter.position init = pos @@ -458,7 +456,7 @@ def get_bordering_genes(self, n: int, multigenics: Set[GeneFamily], return_only_ if pos == init: break # looped around the contig - # Identifiying right border + # Identifying right border right_border = [] pos = self.stopper.position init = pos @@ -625,8 +623,7 @@ def regions(self) -> Generator[Region, None, None]: :return: Regions in the spot """ - for region in self._region_getter.values(): - yield region + yield from self._region_getter.values() @property def families(self) -> Generator[GeneFamily, None, None]: diff --git a/ppanggolin/utility/utils.py b/ppanggolin/utility/utils.py index 52037676..bf49951f 100644 --- a/ppanggolin/utility/utils.py +++ b/ppanggolin/utility/utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse @@ -12,7 +11,7 @@ WORKFLOW_SUBCOMMANDS, ALL_WORKFLOW_DEPENDENCIES, WRITE_PAN_FLAG_DEFAULT_IN_WF, WRITE_GENOME_FLAG_DEFAULT_IN_WF, DRAW_FLAG_DEFAULT_IN_WF from ppanggolin import SUBCOMMAND_TO_SUBPARSER -""" Utility scripts to help formating input files of PPanggolin.""" +""" Utility scripts to help formatting input files of PPanggolin.""" def split(list_object: list, chunk_count: int) -> List[List[int]]: @@ -36,9 +35,9 @@ def 
split_comment_string(comment_string: str, max_word_count: int = 20, prefix: :params comment_string: comment string to split :params max_word_count: maximum number of word per line - :params prefix: prefic used to start a new comment line + :params prefix: prefix used to start a new comment line - :return : the splited comment line. + :return : the split comment line. """ splitted_comment = comment_string.split() @@ -185,8 +184,8 @@ def launch_default_config(args: argparse.Namespace): if parser_action.dest in ALL_INPUT_PARAMS: if sub_command == initial_command: - # with worflow dependencies, we do not use their input params - # as input params are given by worflow cmds + # with workflow dependencies, we do not use their input params + # as input params are given by workflow cmds inputs_actions.append(parser_action) elif parser_action.dest in ALL_GENERAL_PARAMS: diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index b9e66fd3..88fd963f 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -72,9 +71,9 @@ def check_log(log_file: str) -> TextIO: if os.access(log_file, os.W_OK): return log_file else: - raise IOError(f"The given log file {log_file} is not writable. Please check if it is accessible.") + raise OSError(f"The given log file {log_file} is not writable. Please check if it is accessible.") else: - raise IOError(f"The given log file: {log_file} is a directory. Please provide a valid log file.") + raise OSError(f"The given log file: {log_file} is a directory. Please provide a valid log file.") # target does not exist, check perms on parent dir parent_dir = os.path.dirname(log_file) @@ -84,7 +83,7 @@ def check_log(log_file: str) -> TextIO: if os.access(parent_dir, os.W_OK): return log_file else: - raise IOError(f"The given log file {log_file} is not writable. Please check if it is accessible.") + raise OSError(f"The given log file {log_file} is not writable. Please check if it is accessible.") def check_tsv_sanity(tsv: Path): @@ -94,8 +93,8 @@ def check_tsv_sanity(tsv: Path): """ try: input_file = open(tsv, "r") - except IOError as ios_error: - raise IOError(ios_error) + except OSError as ios_error: + raise OSError(ios_error) except Exception as exception_error: raise Exception(f"The following unexpected error happened when opening the list of genomes path: " f"{exception_error}") @@ -166,7 +165,7 @@ def set_verbosity_level(args): logging.basicConfig(filename=args.log, level=level, format=str_format, datefmt=datefmt) - logging.getLogger("PPanGGOLiN").info("Command: " + " ".join([arg for arg in sys.argv])) + logging.getLogger("PPanGGOLiN").info("Command: " + " ".join(arg for arg in sys.argv)) logging.getLogger("PPanGGOLiN").info(f"PPanGGOLiN version: {distribution('ppanggolin').version}") @@ -276,7 +275,7 @@ def read_compressed_or_not(file_or_file_path: Union[Path, BinaryIO, TextIOWrappe return file_or_file_path -def write_compressed_or_not(file_path: Path, compress: bool = False) -> Union[gzip.GzipFile, TextIO]: +def write_compressed_or_not(file_path: Path, compress: bool = False) -> Union[gzip.GzipFile, TextIOWrapper]: """ Create a file-like object, compressed or not. @@ -652,7 +651,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ Manage command line and config arguments for the given subcommand. 
This function parse arguments from the cmd line and config file and set up the following priority: cli > config > default - When the subcommand is a workflow, the subcommand used in worflows are also parsed in the config. + When the subcommand is a workflow, the subcommand used in workflows are also parsed in the config. :params subcommand: Name of the subcommand. @@ -709,7 +708,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ params_that_differ = get_args_differing_from_default(default_args, args, input_params) if params_that_differ: - params_that_differ_str = ', '.join([f'{p}={v}' for p, v in params_that_differ.items()]) + params_that_differ_str = ', '.join(f'{p}={v}' for p, v in params_that_differ.items()) logging.getLogger("PPanGGOLiN").debug( f"{len(params_that_differ)} {subcommand} parameters have non-default value: {params_that_differ_str}") @@ -736,21 +735,25 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ # overwrite write and draw default when not specified in config if workflow_step == 'write_pangenome': for out_flag in WRITE_PAN_FLAG_DEFAULT_IN_WF: - setattr(default_step_args, out_flag, True) + if out_flag not in config[workflow_step]: + setattr(default_step_args, out_flag, True) + if workflow_step == 'write_genomes': for out_flag in WRITE_GENOME_FLAG_DEFAULT_IN_WF: - setattr(default_step_args, out_flag, True) + if out_flag not in config[workflow_step]: + setattr(default_step_args, out_flag, True) if workflow_step == "draw": for out_flag in DRAW_FLAG_DEFAULT_IN_WF: - setattr(default_step_args, out_flag, True) + if out_flag not in config[workflow_step]: + setattr(default_step_args, out_flag, True) step_args = overwrite_args(default_step_args, config_step_args, cli_args) step_params_that_differ = get_args_differing_from_default(default_step_args, step_args) if step_params_that_differ: - step_params_that_differ_str = ', '.join([f'{p}={v}' for p, v in step_params_that_differ.items()]) + step_params_that_differ_str = ', '.join(f'{p}={v}' for p, v in step_params_that_differ.items()) logging.getLogger("PPanGGOLiN").debug(f"{len(step_params_that_differ)} {workflow_step} parameters have " f"a non-default value: {step_params_that_differ_str}") @@ -760,7 +763,7 @@ def manage_cli_and_config_args(subcommand: str, config_file: str, subcommand_to_ params_that_differ.update(step_params_that_differ) - # Add args namespace of the step to the inital args namespace + # Add args namespace of the step to the initial args namespace setattr(args, workflow_step, step_args) if params_that_differ: @@ -816,7 +819,7 @@ def set_up_config_param_to_parser(config_param_val: dict) -> list: :params config_param_val: Dict with parameter name as key and parameter value as value. - :return: list of argument strings formated for an argparse.ArgumentParser object. + :return: list of argument strings formatted for an argparse.ArgumentParser object. 
""" arguments_to_parse = [] diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index e884f0bf..5c02bf73 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import logging @@ -209,7 +208,7 @@ def launch_workflow(args: argparse.Namespace, panrgp: bool = True, start_desc = time.time() - write_pangenome_arguments = ["csv", "Rtab", "gexf", "light_gexf", "projection", "stats", 'json', "families_tsv"] + write_pangenome_arguments = ["gexf", "light_gexf", 'json', "csv", "Rtab", "stats", "partitions", "families_tsv"] # Check that we don't ask write to output something not computed. borders, spots, spot_modules, modules, regions = (False, False, False, False, False) diff --git a/ppanggolin/workflow/panModule.py b/ppanggolin/workflow/panModule.py index faf99553..9105da05 100644 --- a/ppanggolin/workflow/panModule.py +++ b/ppanggolin/workflow/panModule.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/workflow/panRGP.py b/ppanggolin/workflow/panRGP.py index c810e9b2..7f024ec3 100644 --- a/ppanggolin/workflow/panRGP.py +++ b/ppanggolin/workflow/panRGP.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin/workflow/workflow.py b/ppanggolin/workflow/workflow.py index e6063ee7..bf2e44e4 100644 --- a/ppanggolin/workflow/workflow.py +++ b/ppanggolin/workflow/workflow.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# coding:utf-8 # default libraries import argparse diff --git a/ppanggolin_env.yaml b/ppanggolin_env.yaml index 25bb02a9..30b058fa 100644 --- a/ppanggolin_env.yaml +++ b/ppanggolin_env.yaml @@ -12,9 +12,9 @@ dependencies: - scipy=1 - plotly=5 - gmpy2=2 - - pandas=2.0 + - pandas=2 - colorlover=0.3 - - numpy=1.24 + - numpy=1 - bokeh=3 # Tool that are not in python - infernal=1 diff --git a/testingDataset/expected_info_files/checksum.txt b/testingDataset/expected_info_files/checksum.txt new file mode 100644 index 00000000..2291de3d --- /dev/null +++ b/testingDataset/expected_info_files/checksum.txt @@ -0,0 +1,5 @@ +9d6219523e890b08c467c936590b37436e915840a797ea161c9707041c3b821c mybasicpangenome/gene_families.tsv +9d6219523e890b08c467c936590b37436e915840a797ea161c9707041c3b821c stepbystep/gene_families.tsv +bd71918de23737aec5a262c883c1154a507ae616bdc47a152db0e76319e7484d readclusterpang/gene_families.tsv +b893343f249102dff23a9542752f83cdee93efd34bc951b48a0c986fcb5d26a4 myannopang/gene_families.tsv +41511d7c482c0c400a504d5c564605427081917e54aee1e5f37ffda220d2cdb9 test_config/gene_families.tsv diff --git a/testingDataset/expected_info_files/myannopang_info.yaml b/testingDataset/expected_info_files/myannopang_info.yaml index e6687f73..ca7dea34 100644 --- a/testingDataset/expected_info_files/myannopang_info.yaml +++ b/testingDataset/expected_info_files/myannopang_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: false Spots_Predicted: false Modules_Predicted: false - PPanGGOLiN_Version: 2.1.0 + PPanGGOLiN_Version: 2.1.1 Content: Genes: 47961 diff --git a/testingDataset/expected_info_files/mybasicpangenome_info.yaml b/testingDataset/expected_info_files/mybasicpangenome_info.yaml index 0d49fe30..48d59f36 100644 --- a/testingDataset/expected_info_files/mybasicpangenome_info.yaml +++ b/testingDataset/expected_info_files/mybasicpangenome_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.1.0 + 
PPanGGOLiN_Version: 2.1.1 Content: Genes: 45429 diff --git a/testingDataset/expected_info_files/stepbystep_info.yaml b/testingDataset/expected_info_files/stepbystep_info.yaml index fdb6dc6d..395b259a 100644 --- a/testingDataset/expected_info_files/stepbystep_info.yaml +++ b/testingDataset/expected_info_files/stepbystep_info.yaml @@ -8,7 +8,7 @@ Status: RGP_Predicted: true Spots_Predicted: true Modules_Predicted: true - PPanGGOLiN_Version: 2.1.0 + PPanGGOLiN_Version: 2.1.1 Content: Genes: 45429 diff --git a/tests/context/test_context.py b/tests/context/test_context.py index 7c331961..453a1f5d 100644 --- a/tests/context/test_context.py +++ b/tests/context/test_context.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from ppanggolin.context.searchGeneContext import (extract_contig_window, get_n_next_genes_index, diff --git a/tests/region/test_rgp_cluster.py b/tests/region/test_rgp_cluster.py index 3776cc33..51cf6eb7 100644 --- a/tests/region/test_rgp_cluster.py +++ b/tests/region/test_rgp_cluster.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from random import randint @@ -18,7 +17,7 @@ def genes() -> Generator[Set[Gene], None, None]: organism = Organism("organism") contig = Contig(0, "contig") genes = [] - for i in range(0, randint(11, 20)): + for i in range(randint(11, 20)): gene = Gene(f"gene_{str(i)}") gene.fill_annotations(start=10 * i + 1, stop=10 * (i + 1), strand='+', position=i, genetic_code=4) gene.fill_parents(organism, contig) diff --git a/tests/test_edge.py b/tests/test_edge.py index 95b2f3ef..0702cdc7 100644 --- a/tests/test_edge.py +++ b/tests/test_edge.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from typing import Generator, Tuple diff --git a/tests/test_genefamily.py b/tests/test_genefamily.py index a00ebc26..594ad8d5 100644 --- a/tests/test_genefamily.py +++ b/tests/test_genefamily.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from random import randint diff --git a/tests/test_genome.py b/tests/test_genome.py index 0c57ae25..6f519a74 100644 --- a/tests/test_genome.py +++ b/tests/test_genome.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from typing import Generator, Tuple diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 95928b05..dff4ad05 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from random import randint diff --git a/tests/test_pangenome.py b/tests/test_pangenome.py index 8fd13e5d..3a7f958a 100644 --- a/tests/test_pangenome.py +++ b/tests/test_pangenome.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from random import choices, randint diff --git a/tests/test_region.py b/tests/test_region.py index 9a41d024..dbc35bf9 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from typing import Generator, Set diff --git a/tests/utils/test_utilities.py b/tests/utils/test_utilities.py index 6e0b55a1..ceeef757 100644 --- a/tests/utils/test_utilities.py +++ b/tests/utils/test_utilities.py @@ -1,5 +1,4 @@ #! /usr/bin/env python3 -# coding: utf8 import pytest from pathlib import Path
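
Two illustrative sketches for the behavioural changes above (editorial examples, not part of the patch; helper and flag names such as `make_workdir` are hypothetical).

The `partition.py` hunk switches between `tempfile.mkdtemp`, which leaves the directory on disk, and `tempfile.TemporaryDirectory`, which removes it automatically, depending on whether the intermediate NEM files should be kept. A minimal sketch of that pattern, assuming only the standard library:

```python
import tempfile
from pathlib import Path


def make_workdir(base_dir: Path, keep_tmp_files: bool = False):
    """Return (path, handle); the handle is None when the directory is kept on disk."""
    if keep_tmp_files:
        # mkdtemp creates the directory and never deletes it
        return Path(tempfile.mkdtemp(dir=base_dir)), None
    # TemporaryDirectory removes the directory on .cleanup(), when used as a
    # context manager, or when the object is garbage collected
    handle = tempfile.TemporaryDirectory(dir=base_dir)
    return Path(handle.name), handle
```

The caller has to keep a reference to the `TemporaryDirectory` handle for as long as the files are needed, which is presumably why the hunk keeps `tmp_dir` alongside `tmp_path`.

The `utils.py` hunks guard the workflow write/draw defaults with `if out_flag not in config[workflow_step]`, so a flag the user explicitly disabled in the config file is no longer overwritten with `True`. The same guarded-default idea in isolation (the flag values below are made up):

```python
defaults = {"gexf": True, "csv": True}   # flags the workflow enables by default
user_config = {"csv": False}             # the user explicitly disabled csv output

# apply a default only when the user did not set the flag themselves
effective = {flag: user_config.get(flag, default) for flag, default in defaults.items()}
assert effective == {"gexf": True, "csv": False}
```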