From 1dab861ca57088675f4df3b8d5b260927cc9be15 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 25 Sep 2023 11:59:57 +0200 Subject: [PATCH] adapt proksee panorama code to ppanggolin --- ppanggolin/formats/proksee_template.json | 74 ++++++ ppanggolin/formats/writeFlat.py | 38 ++- ppanggolin/formats/write_proksee.py | 305 +++++++++++++++++++++++ ppanggolin/genome.py | 18 +- 4 files changed, 425 insertions(+), 10 deletions(-) create mode 100644 ppanggolin/formats/proksee_template.json create mode 100644 ppanggolin/formats/write_proksee.py diff --git a/ppanggolin/formats/proksee_template.json b/ppanggolin/formats/proksee_template.json new file mode 100644 index 00000000..a6952dbd --- /dev/null +++ b/ppanggolin/formats/proksee_template.json @@ -0,0 +1,74 @@ +{ + "cgview": { + "settings":{ + "format":"circular", + "geneticCode":11, + "backgroundColor":"rgba(255,255,255,1)", + "showShading":true, + "arrowHeadLength":0.3, + "minArcLength":1, + "initialMapThicknessProportion":0.1, + "maxMapThicknessProportion":0.5}, + "backbone":{"color":"rgba(128,128,128,1)", + "colorAlternate":"rgba(200,200,200,1)", + "thickness":5, + "decoration":"arrow" + }, + "ruler": { + "font":"sans-serif,plain,10", + "color":"rgba(0,0,0,1)"}, + "annotation":{"font":"monospace,plain,12", + "onlyDrawFavorites":false, + "visible":true + }, + "dividers":{ + "track": { + "color": "rgba(50,50,50,1)", + "thickness": 1, + "spacing": 1 + }, + "slot":{ + "visible":true, + "color":"rgba(0,0,0,1)", + "thickness":1, + "spacing":1 + } + }, + "highlighter":{ + "visible":true + }, + "captions":[ + { + "position":"bottom-center", + "textAlignment":"center", + "font":"sans-serif,plain,24", + "fontColor":"rgba(0,0,0,1)", + "backgroundColor":"rgba(255,255,255,0.4)" + } + ], + "legend":{ + "position":"top-right", + "textAlignment":"left", + "defaultFont":"sans-serif,plain,14", + "defaultFontColor":"rgba(0,0,0,1)", + "backgroundColor":"rgba(255,255,255,0.75)" + }, + "sequence": { + "color": "rgb(0,0,0)", + "font": "sans-serif,plain,14" + }, + "bookmarks": [ + { + "bbOffset": 86.75, + "bp": 5617, + "favorite": false, + "format": "circular", + "name": "Bookmark-1", + "shortcut": "1", + "zoom": 1.685 + } + ], + "plots": [ + ] + } +} \ No newline at end of file diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index c4d85ff3..84ae194e 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -7,7 +7,7 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging -from typing import TextIO, Dict +from typing import TextIO,List, Dict from pathlib import Path import pkg_resources from statistics import median, mean, stdev @@ -21,7 +21,7 @@ from ppanggolin.pangenome import Pangenome from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info - +from ppanggolin.formats.write_proksee import write_proksee_organism # global variable to store the pangenome pan = Pangenome() # TODO change to pangenome:Pangenome = Pangenome=() ? needAnnotations = False @@ -617,6 +617,23 @@ def write_projections(output: Path, compress: bool = False): write_org_file(org, outdir, compress) logging.getLogger("PPanGGOLiN").info("Done writing the projection files") + +def write_proksee(output: Path, compress: bool = False): + """ + """ + + features = ["all"] + template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") + + organism_with_rgp = {rgp.organism for rgp in pan.regions} + + for organism in organism_with_rgp : #pan.organisms: + write_proksee_organism(pan, organism, output, + template, features) + + + + def write_gff(output: str, compress: bool = False): """ Write the gff files for all organisms @@ -660,7 +677,6 @@ def write_gff_file(org: Organism, contig_to_rgp: Dict[Contig, Region], :param outdir: Path to the output directory where the GFF file will be written. :param compress: If True, compress the output GFF file using .gz format. :param annotation_sources: A dictionary that maps types of features to their source information. - :type annotation_sources: Dict[str, str] """ @@ -1060,7 +1076,7 @@ def write_rgp_modules(output: Path, compress: bool = False): def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, csv: bool = False, gene_pa: bool = False, gexf: bool = False, - light_gexf: bool = False, projection: bool = False, gff: bool = False, stats: bool = False, json: bool = False, + light_gexf: bool = False, projection: bool = False, gff: bool = False, proksee: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, modules: bool = False, spot_modules: bool = False, compress: bool = False, disable_bar: bool = False): @@ -1091,7 +1107,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core :param disable_bar: Disable progress bar """ # TODO Add force parameter to check if output already exist - if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, stats, json, partitions, regions, spots, borders, + if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, gff, proksee, stats, json, partitions, regions, spots, borders, families_tsv, modules, spot_modules]): raise Exception("You did not indicate what file you wanted to write.") @@ -1111,10 +1127,10 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core pan = pangenome if csv or gene_pa or gexf or light_gexf or projection or stats or json or partitions or regions or spots or \ - families_tsv or borders or modules or spot_modules or gff: + families_tsv or borders or modules or spot_modules or gff or proksee: needAnnotations = True needFamilies = True - if projection or stats or partitions or regions or spots or borders or gff: + if projection or stats or partitions or regions or spots or borders or gff or proksee: needPartitions = True if gexf or light_gexf or json: needGraph = True @@ -1132,7 +1148,7 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core needSpots = True if modules or spot_modules: # or projection: needModules = True - if projection or gff: + if projection or gff or proksee: needRegions = True if pan.status["predictedRGP"] == "inFile" else False needSpots = True if pan.status["spots"] == "inFile" else False needModules = True if pan.status["modules"] == "inFile" else False @@ -1155,6 +1171,8 @@ def write_flat_files(pangenome: Pangenome, output: Path, cpu: int = 1, soft_core processes.append(p.apply_async(func=write_projections, args=(output, compress))) if gff: processes.append(p.apply_async(func=write_gff, args=(output, compress))) + if proksee: + processes.append(p.apply_async(func=write_proksee, args=(output, compress))) if stats: processes.append(p.apply_async(func=write_stats, args=(output, soft_core, dup_margin, compress))) if json: @@ -1191,7 +1209,7 @@ def launch(args: argparse.Namespace): global pan pan.add_file(args.pangenome) write_flat_files(pan, args.output, cpu=args.cpu, soft_core=args.soft_core, dup_margin=args.dup_margin, csv=args.csv, - gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, + gene_pa=args.Rtab, gexf=args.gexf, light_gexf=args.light_gexf, projection=args.projection, gff=args.gff, proksee=args.proksee, stats=args.stats, json=args.json, partitions=args.partitions, regions=args.regions, families_tsv=args.families_tsv, spots=args.spots, borders=args.borders, modules=args.modules, spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) @@ -1242,6 +1260,8 @@ def parser_flat(parser: argparse.ArgumentParser): "on the organism") optional.add_argument("--gff", required=False, action="store_true", help="Generate a gff file for each organism containing pangenome annotations.") + optional.add_argument("--proksee", required=False, action="store_true", + help="Generate a json file for each organism containing pangenome annotations to be used to in proksee.") optional.add_argument("--stats", required=False, action="store_true", help="tsv files with some statistics for each organism and for each gene family") optional.add_argument("--partitions", required=False, action="store_true", diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py new file mode 100644 index 00000000..5474b320 --- /dev/null +++ b/ppanggolin/formats/write_proksee.py @@ -0,0 +1,305 @@ +#!/usr/bin/env python3 +# coding:utf-8 + +# default libraries +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime +import json +import logging +from pathlib import Path +from random import randint +from tqdm import tqdm +from typing import Dict, List, Tuple +from uuid import uuid4 + +# installed libraries +from bokeh.palettes import Category20 +from ppanggolin.genome import Organism, Contig, Gene +from ppanggolin.region import Spot + +# local libraries +from ppanggolin.pangenome import Pangenome + + +def palette() -> List[Tuple[int]]: + palette = [] + for hex_color in list(Category20[20]): + palette.append(tuple(int(hex_color.strip('#')[i:i + 2], 16) for i in (0, 2, 4))) + return palette + + +def read_settings(settings_data: dict): + if "format" not in settings_data: + settings_data["format"] = "circular" + if "geneticCode" not in settings_data: + # TODO Manage genetic code + settings_data["geneticCode"] = "11" + + +def write_legend_items(legend_data: dict, features: List[str]): #, sources: List[str]): + colors = palette() + legend_data["items"] = [#{"name": "CDS", "swatchColor": f"rgba({','.join(map(str, colors.pop(1)))},0.5)", "decoration": "arrow"}, + {"name": "persistent", "swatchColor": "rgba(229,156,4,1)", "decoration": "arrow"}, + {"name": "shell", "swatchColor": "rgba(60,254,91,1)", "decoration": "arrow"}, + {"name": "cloud", "swatchColor": f"rgba({','.join(map(str, colors.pop(17)))},1)", "decoration": "arrow"}, + {"name": "RNA", "swatchColor": "rgba(137,23,207,0.5)", "decoration": "arrow"},] + if "rgp" in features or "all" in features: + legend_data["items"].append({"name": "RGP", "swatchColor": f"rgba({','.join(map(str, colors.pop(6)))}, 1)", "decoration": "arc"}), + if "spots" in features or "all" in features: + legend_data["items"].append({"name": "Spot", "swatchColor": f"rgba({','.join(map(str, colors.pop(5)))}, 1)", "decoration": "arc"}) + if "modules" in features or "all" in features: + legend_data["items"].append({"name": "Module", "swatchColor": f"rgba({','.join(map(str, colors.pop(3)))},1)", "decoration": "arc"}) + # if "systems" in features or "all" in features: + # for source in sources: + # color = ','.join(map(str, colors.pop(randint(0, len(colors) - 1)))) + # legend_data["items"].append({"name": source, "decoration": "arc", "swatchColor": f"rgba({color},1)"}) + + +def write_tracks(features: List[str]): + + tracks = [{"name": "Gene", "separateFeaturesBy": "None", "position": "outside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "Gene"}, + {"name": "Partition", "separateFeaturesBy": "strand", "position": "outside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "partition"}] + + if "rgp" in features or "all" in features: + tracks.append({"name": "RGP", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "RGP"}), + # if "spots" in features or "all" in features: + # tracks.append({"name": "Spots", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + # "dataType": "feature", "dataMethod": "source", "dataKeys": "Spot"}) + if "modules" in features or "all" in features: + tracks.append({"name": "Module", "separateFeaturesBy": "None", "position": "inside", "thicknessRatio": 1, + "dataType": "feature", "dataMethod": "source", "dataKeys": "Module"}) + + return tracks + + +def read_data(template: Path, features: List[str], sources: List[str] = None) -> dict: + """ + """ + + with open(template, "r") as template_file: + proksee_data = json.load(template_file) + + now = datetime.now() + + if "created" in proksee_data["cgview"]: + proksee_data["cgview"]["updated"] = now.strftime("%Y-%m-%d %H:%M:%S") + last_version = proksee_data["cgview"]["version"].split('.') + proksee_data["cgview"]["version"] = ".".join(last_version[:-1] + last_version[-1] + 1) + else: + proksee_data["cgview"]["created"] = now.strftime("%Y-%m-%d %H:%M:%S") + proksee_data["cgview"]["version"] = "1.0" + + if "name" not in proksee_data["cgview"]: + proksee_data["cgview"]["name"] = "PPanGGOLiN annotations at genome levels" + proksee_data["cgview"]["id"] = uuid4().hex + + read_settings(proksee_data["cgview"]["settings"]) + + if "items" not in proksee_data["cgview"]["legend"]: + write_legend_items(proksee_data["cgview"]["legend"], features) + + if "tracks" not in proksee_data["cgview"]: + proksee_data["cgview"]["tracks"] = write_tracks(features) + return proksee_data + + +def write_contig(organism: Organism): + contigs_data_list = [] + for contig in tqdm(organism.contigs, unit="contig", disable=True): + + contigs_data_list.append({"name": contig.name, + "length": contig.length, + "orientation": "+", # "seq": "".join([gene.dna for gene in contig.genes]) + }) + return contigs_data_list + + +def write_genes(organism: Organism, sources: List[str]=None): + genes_data_list = [] + gf2gene = {} + + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): + + gf = gene.family + if gf.name in gf2gene: + gf2gene[gf.name].append(gene) + else: + gf2gene[gf.name] = [gene] + # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if + # source in sources} + genes_data_list.append({"name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [gene.family.named_partition, gene.family.name], + "source": "Gene", + "legend": gene.family.named_partition, + "meta": ""#annotations + }) + + for gene in tqdm(organism.rna_genes, total=organism.number_of_rnas(), unit="rnas", disable=True): + + if gf.name in gf2gene: + gf2gene[gf.name].append(gene) + else: + gf2gene[gf.name] = [gene] + # annotations = {source: "|".join(list(map(str, gf.get_source(source)))) for source in gf.sources if + # source in sources} + genes_data_list.append({"name": gene.name, + "type": "Gene", + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "strand": 1 if gene.strand == "+" else -1, + "product": gene.product, + "tags": [], + "source": "Gene", + "legend": "RNA", + "meta": ""#annotations + }) + + + return genes_data_list, gf2gene + + +def write_partition(organism: Organism): + partition_data_list = [] + c=0 + for gene in tqdm(organism.genes, total=organism.number_of_genes(), unit="genes", disable=True): + c += 1 + partition_data_list.append({"name": gene.family.name, + "presence": gene.family.named_partition, + "contig": gene.contig.name, + "start": gene.start, + "stop": gene.stop, + "source": "partition", + "legend": gene.family.named_partition, + "tags": ["partition"]}) + + + return partition_data_list + + +def write_rgp(pangenome: Pangenome, organism: Organism): + rgp_data_list = [] + for rgp in tqdm(pangenome.regions, unit="RGP", disable=True): + if rgp.organism == organism: + rgp_data_list.append({"name": rgp.name, + "contig": rgp.contig.name, + "start": rgp.start, + "stop": rgp.stop, + "legend": "RGP", + 'source':"RGP", + "tags": []}) + return rgp_data_list + + +def write_spots(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): + spots_data_list = [] + for spot in tqdm(pangenome.spots, unit="Spot", disable=True): + spot: Spot + spot_orgs = set() + + for gf in spot.families: + spot_orgs |= set(gf.organisms) + + if organism in spot_orgs: + gf_intersection = set(organism.families) & set(spot.families) + completion = round(len(gf_intersection) / spot.number_of_families, 2) + + for gf in gf_intersection: + for gene in gf2genes[gf.name]: + spots_data_list.append({"name": f"Spot_{spot.ID}", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": "Spot", + "source":"Spot", + "tags": [], + "meta": { + "completion": completion + }}) + return spots_data_list + + +def write_modules(pangenome: Pangenome, organism: Organism, gf2genes: Dict[str, List[Gene]]): + modules_data_list = [] + for module in tqdm(pangenome.modules, unit="Module", disable=True): + mod_orgs = set() + for gf in module.families: + mod_orgs |= set(gf.organisms) + if organism in mod_orgs: + gf_intersection = set(organism.families) & set(module.families) + completion = round(len(gf_intersection) / len(set(module.families)), 2) + for gf in gf_intersection: + for gene in gf2genes[gf.name]: + modules_data_list.append({"name": f"Module_{module.ID}", + "presence": "Module", + "start": gene.start, + "stop": gene.stop, + "contig": gene.contig.name, + "legend": "Module", + "source": "Module", + "tags": [], + "meta": { + "completion": completion + }}) + return modules_data_list + + +def write_proksee_organism(pangenome: Pangenome, organism: Organism, output: Path, template: Path, + features: List[str] = None, sources: List[str] = None): + + proksee_data = read_data(template=template, features=features, sources=sources) + + if "name" not in proksee_data["cgview"]["captions"]: + proksee_data["cgview"]["captions"][0]["name"] = f"{organism.name} annotated with PPanGGOLiN" + + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism) + + if "features" not in proksee_data["cgview"]: + proksee_data["cgview"]["features"] = [] + + genes_features, gf2genes = write_genes(organism, sources=sources) + print(len(genes_features)) + proksee_data["cgview"]["features"] += genes_features + proksee_data["cgview"]["features"] += write_partition(organism) + + if "rgp" in features or "all" in features: + proksee_data["cgview"]["features"] += write_rgp(pangenome=pangenome, organism=organism) + if "spots" in features or "all" in features: + proksee_data["cgview"]["features"] += write_spots(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + if "modules" in features or "all" in features: + proksee_data["cgview"]["features"] += write_modules(pangenome=pangenome, organism=organism, gf2genes=gf2genes) + + logging.debug(f"Write proksee for {organism.name}") + with open(output.joinpath(organism.name).with_suffix(".json"), "w") as out_json: + json.dump(proksee_data, out_json, indent=2) + + +def write_proksee(pangenome: Pangenome, output: Path, features: List[str] = None, sources: List[str] = None, + template: Path = None, organisms_list: List[str] = None, threads: int = 1, disable_bar: bool = False): + assert features is not None + if template is None: + template = Path(__file__).parent.joinpath("proksee_template").with_suffix(".json") + if organisms_list is not None: + organisms = [organism for organism in pangenome.organisms if organism.name in organisms_list] + else: + organisms = pangenome.organisms + with ThreadPoolExecutor(max_workers=threads) as executor: + with tqdm(total=len(organisms), unit='organism', disable=disable_bar) as progress: + futures = [] + for organism in organisms: + future = executor.submit(write_proksee_organism, pangenome, organism, output, + template, features, sources) + future.add_done_callback(lambda p: progress.update()) + futures.append(future) + + for future in futures: + future.result() diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index 0d3d8adb..69f9539e 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -655,12 +655,28 @@ def genes(self) -> Generator[Gene, None, None]: for contig in self.contigs: yield from contig.genes + @property + def rna_genes(self) -> Generator[RNA, None, None]: + """Generator to get genes in the organism + + :return: Generator of genes + """ + for contig in self.contigs: + yield from contig.RNAs + def number_of_genes(self) -> int: """ Get number of genes in the organism :return: Number of genes """ - return sum([contig.number_of_genes for contig in self.contigs]) + return sum((contig.number_of_genes for contig in self.contigs)) + + def number_of_rnas(self) -> int: + """ Get number of genes in the organism + + :return: Number of genes + """ + return sum((contig.number_of_rnas for contig in self.contigs)) @property def contigs(self) -> Generator[Contig, None, None]: