From 23afb59acb7db6e267026ce5a274e133de7af597 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 11 Mar 2024 10:36:25 +0100 Subject: [PATCH 1/5] fix missing metadata sep when using metadata in proksee --- ppanggolin/formats/writeFlatGenomes.py | 3 ++- ppanggolin/formats/write_proksee.py | 34 +++++++++++++++----------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/ppanggolin/formats/writeFlatGenomes.py b/ppanggolin/formats/writeFlatGenomes.py index 8c309603..771398f4 100644 --- a/ppanggolin/formats/writeFlatGenomes.py +++ b/ppanggolin/formats/writeFlatGenomes.py @@ -442,7 +442,7 @@ def mp_write_genomes_file(organism: Organism, output: Path, organisms_file: Path # Write ProkSee data for the organism write_proksee_organism(organism, output_file, features=['all'], genome_sequences=genome_sequences, - **{arg: kwargs[arg] for arg in kwargs.keys() & {'module_to_colors', 'compress'}}) + **{arg: kwargs[arg] for arg in kwargs.keys() & {'module_to_colors', 'compress', 'metadata_sep'}}) if gff: gff_outdir = output / "gff" @@ -532,6 +532,7 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa else: organism_args["annotation_sources"] = {} organism_args["metadata_sep"] = metadata_sep + if table: organism_args.update({"need_regions": need_dict['need_rgp'], "need_modules": need_dict['need_modules'], diff --git a/ppanggolin/formats/write_proksee.py b/ppanggolin/formats/write_proksee.py index 5596f519..6bbdb6dc 100644 --- a/ppanggolin/formats/write_proksee.py +++ b/ppanggolin/formats/write_proksee.py @@ -137,11 +137,12 @@ def initiate_proksee_data(features: List[str], organism: Organism, module_to_col return {"cgview": cgview_data} -def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> List[Dict]: +def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None, metadata_sep: str = "|") -> List[Dict]: """ Writes contig data for a given organism in proksee format. 
:param organism: The organism for which contig data will be written. + :param metadata_sep: The separator used to join multiple metadata values :param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None). :return: A list of contig data in a structured format. @@ -153,7 +154,7 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> "name": contig.name, "length": contig.length, "orientation": "+", - "meta": contig.formatted_metadata_dict() + "meta": contig.formatted_metadata_dict(metadata_sep) } if genome_sequences: @@ -164,11 +165,12 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> return contigs_data_list -def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]: +def write_genes(organism: Organism, metadata_sep: str = "|", disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]: """ Writes gene data for a given organism, including both protein-coding genes and RNA genes. :param organism: The organism for which gene data will be written. + :param metadata_sep: The separator used to join multiple metadata values :param disable_bar: A flag to disable the progress bar when processing genes (default: True). :return: List of gene data in a structured format and a dictionary mapping gene families to genes. 
@@ -181,8 +183,8 @@ def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict gf = gene.family gf2gene[gf.name].append(gene) - metadata_for_proksee = {f"gene_{k}": v for k, v in gene.formatted_metadata_dict().items()} - metadata_for_proksee.update({f"family_{k}": v for k, v in gene.family.formatted_metadata_dict().items()}) + metadata_for_proksee = {f"gene_{k}": v for k, v in gene.formatted_metadata_dict(metadata_sep).items()} + metadata_for_proksee.update({f"family_{k}": v for k, v in gene.family.formatted_metadata_dict(metadata_sep).items()}) genes_data_list.append({ "name": gene.name, "type": "Gene", @@ -210,16 +212,17 @@ def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict "tags": [], "source": "Gene", "legend": "RNA", - "meta": gene.formatted_metadata_dict() + "meta": gene.formatted_metadata_dict(metadata_sep) }) return genes_data_list, gf2gene -def write_rgp(organism: Organism): +def write_rgp(organism: Organism, metadata_sep:str = "|"): """ Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format. :param organism: The specific organism for which RGP data will be written. + :param metadata_sep: The separator used to join multiple metadata values :return: A list of RGP data in a structured format. """ @@ -236,17 +239,18 @@ def write_rgp(organism: Organism): "legend": "RGP", "source": "RGP", "tags": [rgp.spot.ID if rgp.spot else "No_spot"], - "meta": rgp.formatted_metadata_dict() + "meta": rgp.formatted_metadata_dict(metadata_sep) }) return rgp_data_list -def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]]): +def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]], metadata_sep:str = "|"): """ Writes module data in proksee format for a list of modules associated with a given organism. :param organism: The organism to which the modules are associated. :param gf2genes: A dictionary that maps gene families to the genes they contain. 
+ :param metadata_sep: The separator used to join multiple metadata values :return: A list of module data in a structured format. """ @@ -272,7 +276,7 @@ def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]]): "legend": f"module_{module.ID}", "source": "Module", "tags": [f'{completion}% complete'], - "meta": module.formatted_metadata_dict() + "meta": module.formatted_metadata_dict(metadata_sep) }) return modules_data_list @@ -282,6 +286,7 @@ def write_proksee_organism(organism: Organism, output_file: Path, features: List[str] = None, module_to_colors: Dict[Module, str] = None, genome_sequences: Dict[str, str] = None, + metadata_sep: str = "|", compress: bool = False): """ Writes ProkSee data for a given organism, including contig information, genes colored by partition, @@ -292,21 +297,22 @@ def write_proksee_organism(organism: Organism, output_file: Path, :param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"]. :param module_to_colors: A dictionary mapping modules to their assigned colors. :param genome_sequences: The genome sequences for the organism. 
+ :param metadata_sep: The separator used to join multiple metadata values :param compress: Compress the output file """ proksee_data = initiate_proksee_data(features, organism, module_to_colors) - proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences) + proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences, metadata_sep=metadata_sep) - genes_features, gf2genes = write_genes(organism) + genes_features, gf2genes = write_genes(organism, metadata_sep=metadata_sep) proksee_data["cgview"]["features"] = genes_features if ("rgp" in features or "all" in features) and organism.regions is not None: - proksee_data["cgview"]["features"] += write_rgp(organism=organism) + proksee_data["cgview"]["features"] += write_rgp(organism=organism, metadata_sep=metadata_sep) if module_to_colors is not None and ("modules" in features or "all" in features): - proksee_data["cgview"]["features"] += write_modules(organism=organism, gf2genes=gf2genes) + proksee_data["cgview"]["features"] += write_modules(organism=organism, gf2genes=gf2genes, metadata_sep=metadata_sep) logging.debug(f"Write ProkSee for {organism.name}") with write_compressed_or_not(output_file, compress=compress) as out_json: From df35cf09bfd9b1736dc5dc68ac8c44dfcf9ebeff Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 11 Mar 2024 11:02:45 +0100 Subject: [PATCH 2/5] add metadata sep in args dict of all output --- ppanggolin/formats/writeFlatGenomes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ppanggolin/formats/writeFlatGenomes.py b/ppanggolin/formats/writeFlatGenomes.py index 771398f4..3f5e5ea1 100644 --- a/ppanggolin/formats/writeFlatGenomes.py +++ b/ppanggolin/formats/writeFlatGenomes.py @@ -519,7 +519,9 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa organism2args = defaultdict(lambda: {"output": output, "table": table, "gff": gff, "proksee": proksee, "compress": compress}) for organism in 
organisms_list: - organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None} + organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None, + "metadata_sep": metadata_sep} + if proksee: organism_args["module_to_colors"] = {module: module_to_colors[module] for module in organism.modules} @@ -531,7 +533,6 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa "CDS": "external"} else: organism_args["annotation_sources"] = {} - organism_args["metadata_sep"] = metadata_sep if table: organism_args.update({"need_regions": need_dict['need_rgp'], From 9cf6ce08f06310c46a2eb0f7571bef068432cf38 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 11 Mar 2024 11:12:54 +0100 Subject: [PATCH 3/5] add proksee in metadata sep test line --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a73337a3..67c4ad2d 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -177,7 +177,7 @@ jobs: # Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here. ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \ --genomes genome_names.fasta.head.list \ - -f --gff --add_metadata --table --metadata_sep § + -f --gff --add_metadata --table --metadata_sep § --proksee # Pipe separatore is found in metadata source db1. if we don't require this source then the writting with pipe is work fine. 
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs_with_metadata -f --gff --proksee --table --add_metadata --metadata_sources db2 db3 db4 From 27612056023990187584db4102337151ddc8f586 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Mar 2024 10:56:24 +0100 Subject: [PATCH 4/5] add missing fasta cmd documentation --- docs/index.md | 1 + docs/user/writeFasta.md | 76 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 docs/user/writeFasta.md diff --git a/docs/index.md b/docs/index.md index 2ffd38c2..84ced103 100644 --- a/docs/index.md +++ b/docs/index.md @@ -73,6 +73,7 @@ user/PangenomeAnalyses/pangenomeAnalyses user/RGP/rgpAnalyses user/Modules/moduleAnalyses user/writeGenomes +user/writeFasta user/align user/projection user/genomicContext diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md new file mode 100644 index 00000000..ba7db52d --- /dev/null +++ b/docs/user/writeFasta.md @@ -0,0 +1,76 @@ + +# Fasta + +This command can be used to write fasta sequences of the pangenome or specific parts of the pangenome. + +Most options require a partition. + +Available partitions are: +* 'all' for the entire pangenome. +* 'Persistent' for persistent families +* 'Shell' for shell genes or families +* 'Cloud' for cloud genes or families +* 'rgp' for genes or families found in RGPs +* 'core' for core genes or families +* 'softcore' for softcore genes or families + +When using the 'softcore' filter, the '--soft_core' option can be used to modify the threshold used to determine what is part of the softcore. It is set to 0.95 by default. + +## Genes + +This option can be used to write the nucleotide CDS sequences. 
For example, to write all of the genes of the pangenome: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all +``` + +Or to write only the persistent genes: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent +``` + + +## Protein families + +This option can be used to write the protein sequence of the representative of each family. For example, for all families: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all +``` + +or for only the shell families: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell +``` + + +## Gene families + +This option can be used to write the gene sequence of the representative of each family. For example, for all families: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all +``` + +or for only the cloud families: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud +``` + +## Regions + +This option can be used to write the nucleotide sequences of the detected RGPs. +It requires the FASTA sequences of the genomes, provided with `--fasta` as they were when the pangenome was originally computed. 
+ +This option accepts only two filters: +* `all`, for all regions +* `complete`, for only the 'complete' regions, i.e. those which are not on a contig border + +It can be used as such: + +```bash +ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list +``` From fcb189c8f39a66b6864a30153bd1391e4ebea58b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Mar 2024 11:06:46 +0100 Subject: [PATCH 5/5] improve fasta command title --- docs/user/writeFasta.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md index ba7db52d..ac1ece26 100644 --- a/docs/user/writeFasta.md +++ b/docs/user/writeFasta.md @@ -1,20 +1,20 @@ -# Fasta +# Write pangenome sequences -This command can be used to write fasta sequences of the pangenome or specific parts of the pangenome. +The `fasta` command can be used to write sequences of the pangenome or specific parts of the pangenome in FASTA format. Most options require a partition. Available partitions are: -* 'all' for the entire pangenome. -* 'Persistent' for persistent families -* 'Shell' for shell genes or families -* 'Cloud' for cloud genes or families -* 'rgp' for genes or families found in RGPs -* 'core' for core genes or families -* 'softcore' for softcore genes or families - -When using the 'softcore' filter, the '--soft_core' option can be used to modify the threshold used to determine what is part of the softcore. It is set to 0.95 by default. +* `all` for the entire pangenome. +* `persistent` for persistent genes or families +* `shell` for shell genes or families +* `cloud` for cloud genes or families +* `rgp` for genes or families found in RGPs +* `core` for core genes or families +* `softcore` for softcore genes or families + +When using the `softcore` filter, the `--soft_core` option can be used to modify the threshold used to determine what is part of the softcore. It is set to 0.95 by default. ## Genes