Skip to content

Commit

Permalink
Merge branch 'dev' into bugfix-sept-23
Browse files Browse the repository at this point in the history
  • Loading branch information
axbazin authored Sep 30, 2023
2 parents ac77094 + 5513cd7 commit 291a324
Show file tree
Hide file tree
Showing 59 changed files with 7,153 additions and 4,142 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria --contig_filter 500
ppanggolin cluster -p stepbystep/pangenome.h5 --defrag --coverage 0.8 --identity 0.8
ppanggolin annotate --fasta organisms.fasta.list --output stepbystep --kingdom bacteria
ppanggolin cluster -p stepbystep/pangenome.h5 --coverage 0.8 --identity 0.8
ppanggolin graph -p stepbystep/pangenome.h5 -r 10
ppanggolin partition --output stepbystep -f -p stepbystep/pangenome.h5 --cpu 1 -b 2.6 -ms 10 -fd -ck 500 -Kmm 3 12 -im 0.04 --draw_ICL -se $RANDOM
ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
Expand All @@ -70,7 +70,7 @@ jobs:
ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
ppanggolin write -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --projection --stats --partitions --compress --json --regions --spots --borders --families_tsv --cpu 1
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta organisms.fasta.list
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --info_modules --no_print_info -f --log metrics.log
cd -
- name: gbff parsing and MSA computing
Expand All @@ -86,7 +86,7 @@ jobs:
cd testingDataset
ppanggolin panrgp --anno organisms.gbff.list --cluster clusters.tsv --output readclusterpang
ppanggolin annotate --anno organisms.gbff.list --output readclusters
ppanggolin cluster --cluster clusters.tsv -p readclusters/pangenome.h5
ppanggolin cluster --clusters clusters.tsv -p readclusters/pangenome.h5
ppanggolin msa --pangenome readclusterpang/pangenome.h5 --partition persistent --phylo -o readclusterpang/msa/ -f
cd -
- name: testing rgp_cluster command
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.132
1.2.187
47 changes: 22 additions & 25 deletions ppanggolin/RGP/genomicIsland.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
# default libraries
import logging
import argparse

from pathlib import Path
from typing import Set
# installed libraries
from tqdm import tqdm

Expand All @@ -30,7 +31,7 @@ def changes(self, score):
self.score = score if score >= 0 else 0


def extract_rgp(contig, node, rgp_id, naming):
def extract_rgp(contig, node, rgp_id, naming) -> Region:
"""
Extract the region from the given starting node
"""
Expand All @@ -40,7 +41,7 @@ def extract_rgp(contig, node, rgp_id, naming):
elif naming == "organism":
new_region = Region(node.gene.organism.name + "_" + contig.name + "_RGP_" + str(rgp_id))
while node.state:
new_region.append(node.gene)
new_region.add(node.gene)
node.state = 0
node.score = 0
node = node.prev
Expand Down Expand Up @@ -148,7 +149,7 @@ def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, varia


def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, min_score: int = 4,
persistent: int = 3, continuity: int = 1, naming: str = "contig") -> set:
persistent: int = 3, continuity: int = 1, naming: str = "contig") -> Set[Region]:
"""
Processing matrix and 'emptying' it to get the regions.
Expand All @@ -163,6 +164,7 @@ def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000,
:return:
"""

def max_index_node(lst):
"""gets the last node with the highest score from a list of matriceNode"""
if isinstance(lst, list):
Expand All @@ -182,18 +184,18 @@ def max_index_node(lst):
while val >= min_score:
new_region = extract_rgp(contig, matrix[index], len(contig_regions), naming)
new_region.score = val
if (new_region[0].stop - new_region[-1].start) > min_length:
if new_region.length > min_length:
contig_regions.add(new_region)
rewrite_matrix(contig, matrix, index, persistent, continuity, multi)
val, index = max_index_node(matrix)
return contig_regions


def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1,
min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> set:
min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> Set[Region]:
org_regions = set()
for contig in organism.contigs:
if len(contig.genes) != 0: # some contigs have no coding genes...
if contig.number_of_genes != 0: # some contigs have no coding genes...
# can definitely multiprocess this part, as not THAT much information is needed...
matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain)
org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty,
Expand All @@ -208,8 +210,9 @@ def naming_scheme(pangenome: Pangenome):
oldlen = len(contigsids)
contigsids.add(contig.name)
if oldlen == len(contigsids):
logging.getLogger().warning("You have contigs with identical identifiers in your assemblies. "
"identifiers will be supplemented with your provided organism names.")
logging.getLogger("PPanGGOLiN").warning("You have contigs with identical identifiers in your "
"assemblies. identifiers will be supplemented with your "
"provided organism names.")
return "organism"
return "contig"

Expand Down Expand Up @@ -247,14 +250,15 @@ def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain
check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True,
disable_bar=disable_bar)

logging.getLogger().info("Detecting multigenic families...")
logging.getLogger("PPanGGOLiN").info("Detecting multigenic families...")
multigenics = pangenome.get_multigenics(dup_margin)
logging.getLogger().info("Compute Regions of Genomic Plasticity ...")
logging.getLogger("PPanGGOLiN").info("Compute Regions of Genomic Plasticity ...")
name_scheme = naming_scheme(pangenome)
for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar):
pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
min_score, naming=name_scheme))
logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP")
for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms, unit="genomes", disable=disable_bar):
for region in compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length,
min_score, naming=name_scheme):
pangenome.add_region(region)
logging.getLogger("PPanGGOLiN").info(f"Predicted {pangenome.number_of_rgp} RGP")

# save parameters and save status
pangenome.parameters["RGP"] = {}
Expand Down Expand Up @@ -301,7 +305,7 @@ def parser_rgp(parser: argparse.ArgumentParser):
"""
required = parser.add_argument_group(title="Required arguments",
description="One of the following arguments is required :")
required.add_argument('-p', '--pangenome', required=False, type=str, help="The pangenome .h5 file")
required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")

optional = parser.add_argument_group(title="Optional arguments")
optional.add_argument('--persistent_penalty', required=False, type=int, default=3,
Expand All @@ -319,20 +323,13 @@ def parser_rgp(parser: argparse.ArgumentParser):

if __name__ == '__main__':
"""To test local change and allow using debugger"""
from ppanggolin.utils import check_log, set_verbosity_level
from ppanggolin.utils import set_verbosity_level, add_common_arguments

main_parser = argparse.ArgumentParser(
description="Depicting microbial species diversity via a Partitioned PanGenome Graph Of Linked Neighbors",
formatter_class=argparse.RawTextHelpFormatter)

parser_rgp(main_parser)
common = main_parser.add_argument_group(title="Common argument")
common.add_argument("--verbose", required=False, type=int, default=1, choices=[0, 1, 2],
help="Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug)")
common.add_argument("--log", required=False, type=check_log, default="stdout", help="log output file")
common.add_argument("-d", "--disable_prog_bar", required=False, action="store_true",
help="disables the progress bars")
common.add_argument('-f', '--force', action="store_true",
help="Force writing in output directory and in pangenome output file.")
add_common_arguments(main_parser)
set_verbosity_level(main_parser.parse_args())
launch(main_parser.parse_args())
Loading

0 comments on commit 291a324

Please sign in to comment.