Skip to content

Commit

Permalink
Merge pull request #193 from labgem/dev
Browse files Browse the repository at this point in the history
Merge dev into master to include documentation update
  • Loading branch information
axbazin authored Mar 15, 2024
2 parents 69750f2 + 42de260 commit 9e9acdf
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 18 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ jobs:
# Default separator is a pipe but a pipe is found in a value of metadata db1. That is why we use another separator here.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs \
--genomes genome_names.fasta.head.list \
-f --gff --add_metadata --table --metadata_sep §
-f --gff --add_metadata --table --metadata_sep § --proksee
# The pipe separator is found in metadata source db1. If we don't require this source, then writing with the pipe separator works fine.
ppanggolin write_genomes -p mybasicpangenome/pangenome.h5 --output mybasicpangenome/genomes_outputs_with_metadata -f --gff --proksee --table --add_metadata --metadata_sources db2 db3 db4
Expand Down
1 change: 1 addition & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ user/PangenomeAnalyses/pangenomeAnalyses
user/RGP/rgpAnalyses
user/Modules/moduleAnalyses
user/writeGenomes
user/writeFasta
user/align
user/projection
user/genomicContext
Expand Down
76 changes: 76 additions & 0 deletions docs/user/writeFasta.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

# Write pangenome sequences

The `fasta` command can be used to write sequences of the pangenome or specific parts of the pangenome in FASTA format.

Most options require a partition.

Available partitions are:
* `all` for the entire pangenome.
* `persistent` for persistent genes or families
* `shell` for shell genes or families
* `cloud` for cloud genes or families
* `rgp` for genes or families found in RGPs
* `core` for core genes or families
* `softcore` for softcore genes or families

When using the `softcore` filter, the `--soft_core` option can be used to modify the threshold used to determine what is part of the softcore. It is set to 0.95 by default.

## Genes

The `--genes` option writes the nucleotide CDS sequences of the genes in the given partition. For example, to write all of the genes of the pangenome:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all
```

Or to write only the persistent genes:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent
```


## Protein families

The `--prot_families` option writes the protein sequence of the representative gene of each family. For example, to write it for all families:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all
```

or for all of the shell families for example:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell
```


## Gene families

The `--gene_families` option writes the nucleotide sequence of the representative gene of each family. For example, to write it for all families:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all
```

or for the cloud families for example:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud
```

## Regions

The `--regions` option writes the nucleotide sequences of the detected RGPs.
It requires the genome FASTA sequences that were originally provided to compute the pangenome (passed with `--fasta` in the example below).

This option accepts only two filters:
* `all`, for all regions
* `complete`, for only the 'complete' regions, i.e. those which are not on a contig border

It can be used as such:

```bash
ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list
```
8 changes: 5 additions & 3 deletions ppanggolin/formats/writeFlatGenomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def mp_write_genomes_file(organism: Organism, output: Path, organisms_file: Path

# Write ProkSee data for the organism
write_proksee_organism(organism, output_file, features=['all'], genome_sequences=genome_sequences,
**{arg: kwargs[arg] for arg in kwargs.keys() & {'module_to_colors', 'compress'}})
**{arg: kwargs[arg] for arg in kwargs.keys() & {'module_to_colors', 'compress', 'metadata_sep'}})

if gff:
gff_outdir = output / "gff"
Expand Down Expand Up @@ -519,7 +519,9 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa
organism2args = defaultdict(lambda: {"output": output, "table": table, "gff": gff,
"proksee": proksee, "compress": compress})
for organism in organisms_list:
organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None}
organism_args = {"genome_file": org_dict[organism.name]['path'] if org_dict else None,
"metadata_sep": metadata_sep}

if proksee:
organism_args["module_to_colors"] = {module: module_to_colors[module] for module in organism.modules}

Expand All @@ -531,7 +533,7 @@ def write_flat_genome_files(pangenome: Pangenome, output: Path, table: bool = Fa
"CDS": "external"}
else:
organism_args["annotation_sources"] = {}
organism_args["metadata_sep"] = metadata_sep

if table:
organism_args.update({"need_regions": need_dict['need_rgp'],
"need_modules": need_dict['need_modules'],
Expand Down
34 changes: 20 additions & 14 deletions ppanggolin/formats/write_proksee.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,11 +137,12 @@ def initiate_proksee_data(features: List[str], organism: Organism, module_to_col
return {"cgview": cgview_data}


def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) -> List[Dict]:
def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None, metadata_sep: str = "|") -> List[Dict]:
"""
Writes contig data for a given organism in proksee format.
:param organism: The organism for which contig data will be written.
:param metadata_sep: The separator used to join multiple metadata values
:param genome_sequences: A dictionary mapping contig names to their DNA sequences (default: None).
:return: A list of contig data in a structured format.
Expand All @@ -153,7 +154,7 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) ->
"name": contig.name,
"length": contig.length,
"orientation": "+",
"meta": contig.formatted_metadata_dict()
"meta": contig.formatted_metadata_dict(metadata_sep)
}

if genome_sequences:
Expand All @@ -164,11 +165,12 @@ def write_contig(organism: Organism, genome_sequences: Dict[str, str] = None) ->
return contigs_data_list


def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]:
def write_genes(organism: Organism, metadata_sep: str = "|", disable_bar: bool = True) -> Tuple[List[Dict], Dict[str, List[Gene]]]:
"""
Writes gene data for a given organism, including both protein-coding genes and RNA genes.
:param organism: The organism for which gene data will be written.
:param metadata_sep: The separator used to join multiple metadata values
:param disable_bar: A flag to disable the progress bar when processing genes (default: True).
:return: List of gene data in a structured format and a dictionary mapping gene families to genes.
Expand All @@ -181,8 +183,8 @@ def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict
gf = gene.family
gf2gene[gf.name].append(gene)

metadata_for_proksee = {f"gene_{k}": v for k, v in gene.formatted_metadata_dict().items()}
metadata_for_proksee.update({f"family_{k}": v for k, v in gene.family.formatted_metadata_dict().items()})
metadata_for_proksee = {f"gene_{k}": v for k, v in gene.formatted_metadata_dict(metadata_sep).items()}
metadata_for_proksee.update({f"family_{k}": v for k, v in gene.family.formatted_metadata_dict(metadata_sep).items()})
genes_data_list.append({
"name": gene.name,
"type": "Gene",
Expand Down Expand Up @@ -210,16 +212,17 @@ def write_genes(organism: Organism, disable_bar: bool = True) -> Tuple[List[Dict
"tags": [],
"source": "Gene",
"legend": "RNA",
"meta": gene.formatted_metadata_dict()
"meta": gene.formatted_metadata_dict(metadata_sep)
})

return genes_data_list, gf2gene


def write_rgp(organism: Organism):
def write_rgp(organism: Organism, metadata_sep:str = "|"):
"""
Writes RGP (Region of Genomic Plasticity) data for a given organism in proksee format.
:param organism: The specific organism for which RGP data will be written.
:param metadata_sep: The separator used to join multiple metadata values
:return: A list of RGP data in a structured format.
"""
Expand All @@ -236,17 +239,18 @@ def write_rgp(organism: Organism):
"legend": "RGP",
"source": "RGP",
"tags": [rgp.spot.ID if rgp.spot else "No_spot"],
"meta": rgp.formatted_metadata_dict()
"meta": rgp.formatted_metadata_dict(metadata_sep)
})
return rgp_data_list


def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]]):
def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]], metadata_sep:str = "|"):
"""
Writes module data in proksee format for a list of modules associated with a given organism.
:param organism: The organism to which the modules are associated.
:param gf2genes: A dictionary that maps gene families to the genes they contain.
:param metadata_sep: The separator used to join multiple metadata values
:return: A list of module data in a structured format.
"""
Expand All @@ -272,7 +276,7 @@ def write_modules(organism: Organism, gf2genes: Dict[str, List[Gene]]):
"legend": f"module_{module.ID}",
"source": "Module",
"tags": [f'{completion}% complete'],
"meta": module.formatted_metadata_dict()
"meta": module.formatted_metadata_dict(metadata_sep)
})

return modules_data_list
Expand All @@ -282,6 +286,7 @@ def write_proksee_organism(organism: Organism, output_file: Path,
features: List[str] = None,
module_to_colors: Dict[Module, str] = None,
genome_sequences: Dict[str, str] = None,
metadata_sep: str = "|",
compress: bool = False):
"""
Writes ProkSee data for a given organism, including contig information, genes colored by partition,
Expand All @@ -292,21 +297,22 @@ def write_proksee_organism(organism: Organism, output_file: Path,
:param features: A list of features to include in the ProkSee data, e.g., ["rgp", "modules", "all"].
:param module_to_colors: A dictionary mapping modules to their assigned colors.
:param genome_sequences: The genome sequences for the organism.
:param metadata_sep: The separator used to join multiple metadata values
:param compress: Compress the output file
"""
proksee_data = initiate_proksee_data(features, organism, module_to_colors)

proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences)
proksee_data["cgview"]["sequence"]["contigs"] = write_contig(organism, genome_sequences, metadata_sep=metadata_sep)

genes_features, gf2genes = write_genes(organism)
genes_features, gf2genes = write_genes(organism, metadata_sep=metadata_sep)

proksee_data["cgview"]["features"] = genes_features

if ("rgp" in features or "all" in features) and organism.regions is not None:
proksee_data["cgview"]["features"] += write_rgp(organism=organism)
proksee_data["cgview"]["features"] += write_rgp(organism=organism, metadata_sep=metadata_sep)

if module_to_colors is not None and ("modules" in features or "all" in features):
proksee_data["cgview"]["features"] += write_modules(organism=organism, gf2genes=gf2genes)
proksee_data["cgview"]["features"] += write_modules(organism=organism, gf2genes=gf2genes, metadata_sep=metadata_sep)

logging.debug(f"Write ProkSee for {organism.name}")
with write_compressed_or_not(output_file, compress=compress) as out_json:
Expand Down

0 comments on commit 9e9acdf

Please sign in to comment.