Skip to content

Commit

Permalink
add completeness in projection
Browse files Browse the repository at this point in the history
  • Loading branch information
JeanMainguy committed Sep 29, 2023
1 parent 123dd48 commit d3299a0
Showing 1 changed file with 32 additions and 7 deletions.
39 changes: 32 additions & 7 deletions ppanggolin/projection/projection.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ def launch(args: argparse.Namespace):
if predict_rgp:
logging.getLogger('PPanGGOLiN').info('Detecting RGPs in input genomes.')

logging.getLogger('PPanGGOLiN').debug("Detecting multigenic families...")
multigenics = pangenome.get_multigenics(pangenome_params.rgp.dup_margin)

input_org_2_rgps = predict_RGP(pangenome, organisms, persistent_penalty=pangenome_params.rgp.persistent_penalty, variable_gain=pangenome_params.rgp.variable_gain,
Expand All @@ -197,15 +196,27 @@ def launch(args: argparse.Namespace):
input_orgs_to_modules = project_and_write_modules(pangenome, organisms, output_dir)

organism_2_summary = {}
# dup margin value here is specified in argument and is used to compute completeness.
# Thats mean it can be different than dup margin used in spot and RGPS.
single_copy_fams = set()

for fam in pangenome.gene_families:
if fam.named_partition == "persistent":
dup = len([genes for genes in fam.get_org_dict().values() if
len([gene for gene in genes if not gene.is_fragment]) > 1])

if (dup / fam.number_of_organisms) < args.dup_margin:
single_copy_fams.add(fam)

for organism in organisms:
# summarize projection for all input organisms
organism_2_summary[organism] = summarize_projection(organism, pangenome,
organism_2_summary[organism] = summarize_projection(organism, pangenome, single_copy_fams,
input_org_2_rgps.get(organism, None),
input_org_to_spots.get(organism, None),
input_orgs_to_modules.get(organism, None),
input_org_to_lonely_genes_count[organism], output_dir)
input_org_to_lonely_genes_count[organism])

write_summaries(organism_2_summary, output_dir)
write_summaries(organism_2_summary, output_dir)


def annotate_fasta_files(genome_name_to_fasta_path: Dict[str, dict], tmpdir: str, cpu: int = 1, translation_table: int = 11,
Expand Down Expand Up @@ -434,8 +445,8 @@ def write_summaries(organism_2_summary: Dict[Organism, Dict[str, Any]], output_d
df_summary.to_csv(output_dir / "summary_projection.tsv", sep='\t', index=False)


def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org_rgps:Region,
input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int, output_dir:Path):
def summarize_projection(input_organism:Organism, pangenome:Pangenome, single_copy_families:Set, input_org_rgps:Region,
input_org_spots:Spot, input_org_modules:Module, singleton_gene_count:int):
"""
:param singleton_gene_count: Number of genes that do not cluster with any of the gene families of the pangenome.
Expand All @@ -452,6 +463,13 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org
persistent_gene_count = len(partition_to_gene['persistent'])
shell_gene_count = len(partition_to_gene['shell'])
cloud_gene_count = len(partition_to_gene['cloud'])

completeness = "NA"

single_copy_markers_count = len(set(input_organism.families) & single_copy_families )
if len(single_copy_families) > 0:
completeness = round((single_copy_markers_count /
len(single_copy_families)) * 100, 2)

gene_count = persistent_gene_count + shell_gene_count + cloud_gene_count

Expand All @@ -475,6 +493,8 @@ def summarize_projection(input_organism:Organism, pangenome:Pangenome, input_org
"Persistent": {"genes":persistent_gene_count, "families":persistent_family_count},
"Shell": {"genes":shell_gene_count, "families":shell_family_count},
"Cloud": {"genes":cloud_gene_count, "families":cloud_family_count - singleton_gene_count, "specific families":singleton_gene_count},
"Completeness":completeness,
"Single copy markers":single_copy_markers_count,
"RGPs": rgp_count,
"Spots": spot_count,
"New spots": new_spot_count,
Expand Down Expand Up @@ -1214,7 +1234,12 @@ def parser_projection(parser: argparse.ArgumentParser):
optional.add_argument("--use_pseudo", required=False, action="store_true",
help="In the context of provided annotation, use this option to read pseudogenes. "
"(Default behavior is to ignore them)")


optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05,
help="minimum ratio of organisms in which the family must have multiple genes "
"for it to be considered 'duplicated'. "
"This metric is used to compute completeness and duplication of the input genomes")

optional.add_argument("--spot_graph", required=False, action="store_true",
help="Write the spot graph to a file, with pairs of blocks of single copy markers flanking RGPs "
"as nodes. This graph can be used to visualize nodes that have RGPs from the input organism.")
Expand Down

0 comments on commit d3299a0

Please sign in to comment.