From 96ec8651ec9e896101c6e07aec441930a0777d00 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 16 Sep 2024 17:27:56 +0200 Subject: [PATCH 01/14] write fasta by reading directly from the pangenome file --- ppanggolin/formats/readBinaries.py | 83 +++++++++++++++++++++++++++- ppanggolin/formats/writeSequences.py | 32 +++++++++-- 2 files changed, 108 insertions(+), 7 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 6c911fe7..588a4078 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -4,7 +4,7 @@ import logging from optparse import Option from pathlib import Path -from typing import Dict, Any, Set, List, Tuple, Optional +from typing import Dict, Any, Iterator, Set, List, Tuple, Optional from collections import defaultdict # installed libraries @@ -279,7 +279,7 @@ def get_non_redundant_gene_sequences_from_file(pangenome_filename: str, output: file_obj.write(f'{row["dna"].decode()}\n') -def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: iter = None, +def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Path, list_cds: Optional[Iterator] = None, add: str = '', compress: bool = False, disable_bar: bool = False): """ Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, @@ -309,6 +309,85 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa f"{output.absolute()}{'.gz' if compress else ''}") +def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str, + compress: bool = False, disable_bar=False): + """ + Write representative nucleotide sequences of gene families + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param gene_families: Selected partition of gene families + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ + + outpath = output / f"{partition}_nucleotide_families.fasta" + + parition_first_letter = partition[0].upper() + + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: + gene_fam_info_table = h5f.root.geneFamiliesInfo + family_to_write = set() + seq_id_to_seqs = defaultdict(list) + for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): + + if partition == "all" or row['partition'].decode().startswith(parition_first_letter): + family_to_write.add(row['name']) + + + gene_seq_table = h5f.root.annotations.geneSequences + for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): + + if row['gene'] in family_to_write: + seq_id_to_seqs[row['seqid']].append(row['gene'].decode()) + + with write_compressed_or_not(outpath, compress) as file_obj: + seq_table = h5f.root.annotations.sequences + for row in read_chunks(seq_table, chunk=20000): + if row["seqid"] in seq_id_to_seqs: + + for seq_name in seq_id_to_seqs[row["seqid"]]: + + file_obj.write(f">{seq_name}\n") + file_obj.write(row["dna"].decode() + "\n") + + logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}") + +def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str, + compress: bool = False, disable_bar=False): + """ + Write representative amino acid sequences of gene families. + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param prot_families: Selected partition of protein families + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ + + outpath = output / f"{partition}_protein_families.faa" + + parition_first_letter = partition[0].upper() + + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f, write_compressed_or_not(outpath, compress) as fasta: + + gene_fam_info_table = h5f.root.geneFamiliesInfo + + for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): + + if partition == "all" or row['partition'].decode().startswith(parition_first_letter): + + fasta.write(f">{row['name'].decode()}\n") + fasta.write(row['protein'].decode() + "\n") + + logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:" + f"'{outpath}{'.gz' if compress else ''}'") + + + def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ Read information about graph in pangenome hdf5 file to add in pangenome object diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index a7023812..a602b35e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -19,7 +19,7 @@ from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, restricted_float, detect_filetype, run_subprocess) -from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file +from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] @@ -577,10 +577,32 @@ def launch(args: argparse.Namespace): mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) - write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, proteins=args.proteins, - gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, - disable_bar=args.disable_prog_bar, **translate_kwgs) + + # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]): + if args.gene_families in ['all', 'persistent', 'shell', 'cloud']: + + logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " + "of the gene families by reading the pangenome file directly.") + + write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.gene_families, + output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) + args.gene_families = None + + if args.prot_families in ['all', 'persistent', 'shell', 'cloud']: + + + logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " + "of the gene families by reading the pangenome file directly.") + write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families, + output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) + + args.prot_families = None + + if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]): + write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, + regions=args.regions, genes=args.genes, proteins=args.proteins, + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, + disable_bar=args.disable_prog_bar, **translate_kwgs) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: From 1b3b35f22931bb50cd4d4e074cfd29dd882f1deb Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 17 Sep 2024 18:44:32 +0200 Subject: [PATCH 02/14] add module filter in gene_families writing --- ppanggolin/formats/readBinaries.py | 49 +++++++++++++++++++++------- ppanggolin/formats/writeSequences.py | 4 +-- 2 files changed, 39 insertions(+), 14 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 588a4078..d1295d0c 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -308,8 +308,21 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to " f"{output.absolute()}{'.gz' if compress else ''}") - -def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str, +def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str): + """ + + """ + family_to_write = set() + module_id = int(module_name[len("module_"):]) + print("MODULE", module_id) + module_table = h5f.root.modules + for row in read_chunks(module_table, chunk=20000): + if row['module'] == module_id: + family_to_write.add(row['geneFam']) + print("family_to_write", family_to_write) + return family_to_write + +def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, compress: bool = False, disable_bar=False): """ Write representative nucleotide sequences of gene families @@ -322,19 +335,31 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa :param disable_bar: Disable progress bar """ - outpath = output / f"{partition}_nucleotide_families.fasta" + outpath = output / f"{family_filter}_nucleotide_families.fasta" - parition_first_letter = partition[0].upper() + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - gene_fam_info_table = h5f.root.geneFamiliesInfo - family_to_write = set() + + seq_id_to_seqs = defaultdict(list) - for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): - - if partition == "all" or row['partition'].decode().startswith(parition_first_letter): - family_to_write.add(row['name']) + if family_filter in ["all", 'persistent', 'shell', 'cloud']: + family_to_write = set() + gene_fam_info_table = h5f.root.geneFamiliesInfo + parition_first_letter = family_filter[0].upper() + for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): + + if family_filter == "all" or row['partition'].decode().startswith(parition_first_letter): + family_to_write.add(row['name']) + + + if family_filter.startswith("module_"): + family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) + + if len(family_to_write) == 0: + logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") + return gene_seq_table = h5f.root.annotations.geneSequences for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): @@ -342,9 +367,9 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa if row['gene'] in family_to_write: seq_id_to_seqs[row['seqid']].append(row['gene'].decode()) - with write_compressed_or_not(outpath, compress) as file_obj: + with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: seq_table = h5f.root.annotations.sequences - for row in read_chunks(seq_table, chunk=20000): + for row in read_chunks(table=seq_table, chunk=20000): if row["seqid"] in seq_id_to_seqs: for seq_name in seq_id_to_seqs[row["seqid"]]: diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index a602b35e..4d903c9d 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -579,12 +579,12 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]): - if args.gene_families in ['all', 'persistent', 'shell', 'cloud']: + if args.gene_families and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families): logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " "of the gene families by reading the pangenome file directly.") - write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.gene_families, + write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter= args.gene_families, output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) args.gene_families = None From c14eac408bf455e80c335b400602b3babc218d77 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 17 Sep 2024 18:51:50 +0200 Subject: [PATCH 03/14] fix none manageemnt --- ppanggolin/formats/writeSequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 4d903c9d..e63d978b 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -579,7 +579,7 @@ def launch(args: argparse.Namespace): pangenome.add_file(args.pangenome) # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]): - if args.gene_families and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families): + if args.gene_families is not None and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families): logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " "of the gene families by reading the pangenome file directly.") From 1ac566c12b658e14229eca5938ab36d8bcd3ec10 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Sep 2024 16:04:58 +0200 Subject: [PATCH 04/14] write module and rgp family genes --- ppanggolin/formats/readBinaries.py | 36 +++++++++++++++++++++++----- ppanggolin/formats/writeSequences.py | 4 +--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index d1295d0c..dddac342 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -308,18 +308,37 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").debug("Gene sequences from pangenome file was written to " f"{output.absolute()}{'.gz' if compress else ''}") + +def read_rgp__genes_from_pangenome_file(h5f: tables.File) -> List[str]: + """ + """ + rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)] + + return rgp_genes + + +def get_families_from_genes(h5f: tables.File , genes:Set[str]): + """ + """ + families = set() + for row in read_chunks(h5f.root.geneFamilies, chunk=20000): + if row['gene'] in genes: + families.add(row["geneFam"]) + + return families + def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str): """ """ family_to_write = set() module_id = int(module_name[len("module_"):]) - print("MODULE", module_id) module_table = h5f.root.modules + for row in read_chunks(module_table, chunk=20000): if row['module'] == module_id: family_to_write.add(row['geneFam']) - print("family_to_write", family_to_write) + return family_to_write def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, @@ -337,11 +356,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa outpath = output / f"{family_filter}_nucleotide_families.fasta" - - with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - seq_id_to_seqs = defaultdict(list) if family_filter in ["all", 'persistent', 'shell', 'cloud']: @@ -354,18 +370,26 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa family_to_write.add(row['name']) - if family_filter.startswith("module_"): + elif family_filter.startswith("module_"): family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) + + elif family_filter == "rgp": + rgp_genes = read_rgp__genes_from_pangenome_file(h5f) + family_to_write = get_families_from_genes(h5f, rgp_genes) if len(family_to_write) == 0: logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") return gene_seq_table = h5f.root.annotations.geneSequences + match_count = 0 for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): if row['gene'] in family_to_write: seq_id_to_seqs[row['seqid']].append(row['gene'].decode()) + match_count += 1 + + assert match_count == len(family_to_write), f"Number of sequences found ({match_count}) does not match the number of expected family {len(family_to_write)}." with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: seq_table = h5f.root.annotations.sequences diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index e63d978b..fe8aada6 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -578,8 +578,7 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - # if all(element_to_write is None for element_to_write in [args.regions, args.genes, args.proteins]): - if args.gene_families is not None and args.gene_families in ['all', 'persistent', 'shell', 'cloud'] or module_regex.match(args.gene_families): + if args.gene_families is not None and (args.gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.gene_families)): logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " "of the gene families by reading the pangenome file directly.") @@ -590,7 +589,6 @@ def launch(args: argparse.Namespace): if args.prot_families in ['all', 'persistent', 'shell', 'cloud']: - logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " "of the gene families by reading the pangenome file directly.") write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families, From 25189429cc4cb4c6f96eb05c42bd293e047fa0e0 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Sep 2024 16:23:24 +0200 Subject: [PATCH 05/14] add possibility to write module and rgp in prot fasta --- ppanggolin/formats/readBinaries.py | 27 +++++++++++++++++++++------ ppanggolin/formats/writeSequences.py | 4 ++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index dddac342..4b3ebb4e 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -404,7 +404,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " f"of the gene families : '{outpath}{'.gz' if compress else ''}") -def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, partition: str, +def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, compress: bool = False, disable_bar=False): """ Write representative amino acid sequences of gene families. @@ -417,17 +417,32 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa :param disable_bar: Disable progress bar """ - outpath = output / f"{partition}_protein_families.faa" + outpath = output / f"{family_filter}_protein_families.faa" + + partition_filter = False + family_to_write = [] - parition_first_letter = partition[0].upper() - with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f, write_compressed_or_not(outpath, compress) as fasta: + if family_filter in ["all", 'persistent', 'shell', 'cloud']: + partition_filter = True + parition_first_letter = family_filter[0].upper() + + elif family_filter == "rgp": + rgp_genes = read_rgp__genes_from_pangenome_file(h5f) + family_to_write = get_families_from_genes(h5f, rgp_genes) + + elif family_filter.startswith("module_"): + family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) + gene_fam_info_table = h5f.root.geneFamiliesInfo for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): - - if partition == "all" or row['partition'].decode().startswith(parition_first_letter): + + partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter)) + family_match = row['name'] in family_to_write + + if partition_match or family_match: fasta.write(f">{row['name'].decode()}\n") fasta.write(row['protein'].decode() + "\n") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index fe8aada6..eaca363b 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -587,11 +587,11 @@ def launch(args: argparse.Namespace): output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) args.gene_families = None - if args.prot_families in ['all', 'persistent', 'shell', 'cloud']: + if args.prot_families is not None and (args.prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.prot_families)): logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " "of the gene families by reading the pangenome file directly.") - write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, partition = args.prot_families, + write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter = args.prot_families, output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) args.prot_families = None From 1157761987d9cec351588e78b6656321a1189272 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Sep 2024 17:27:35 +0200 Subject: [PATCH 06/14] Add possibility to write genes by reading pangenome file directly --- ppanggolin/formats/readBinaries.py | 135 +++++++++++++++++++++------ ppanggolin/formats/writeSequences.py | 12 ++- 2 files changed, 119 insertions(+), 28 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 4b3ebb4e..bb1a868b 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 # default libraries +import dis import logging from optparse import Option from pathlib import Path from typing import Dict, Any, Iterator, Set, List, Tuple, Optional from collections import defaultdict +import numpy # installed libraries from tqdm import tqdm @@ -341,6 +343,67 @@ def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str): return family_to_write +def get_families_matching_partition(h5f: tables.File, partition:str): + """ + """ + family_to_write = set() + + gene_fam_info_table = h5f.root.geneFamiliesInfo + parition_first_letter = partition[0].upper() + + for row in read_chunks(gene_fam_info_table, chunk=20000): + + if partition == "all" or row['partition'].decode().startswith(parition_first_letter): + family_to_write.add(row['name']) + + return family_to_write + +def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]): + """ + """ + matching_genes = set() + + gene_fam_table = h5f.root.geneFamilies + + for row in read_chunks(gene_fam_table, chunk=20000): + if row["geneFam"] in families: + matching_genes.add(row['gene']) + + return matching_genes + +def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False): + """ + """ + + seq_id_to_genes = defaultdict(list) + gene_seq_table = h5f.root.annotations.geneSequences + match_count = 0 + for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): + + if get_all_genes or row['gene'] in genes: + seq_id_to_genes[row['seqid']].append(row['gene'].decode()) + match_count += 1 + + assert match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}." + + return seq_id_to_genes + +def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes): + """ + """ + + with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: + seq_table = h5f.root.annotations.sequences + #TODO add tqm + for row in read_chunks(table=seq_table, chunk=20000): + if row["seqid"] in seq_id_to_genes: + + for seq_name in seq_id_to_genes[row["seqid"]]: + + file_obj.write(f">{seq_name}\n") + file_obj.write(row["dna"].decode() + "\n") + + def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, compress: bool = False, disable_bar=False): """ @@ -358,16 +421,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - seq_id_to_seqs = defaultdict(list) - if family_filter in ["all", 'persistent', 'shell', 'cloud']: - family_to_write = set() - gene_fam_info_table = h5f.root.geneFamiliesInfo - parition_first_letter = family_filter[0].upper() - for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): - - if family_filter == "all" or row['partition'].decode().startswith(parition_first_letter): - family_to_write.add(row['name']) + family_to_write = get_families_matching_partition(h5f, family_filter) elif family_filter.startswith("module_"): @@ -381,25 +436,9 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") return - gene_seq_table = h5f.root.annotations.geneSequences - match_count = 0 - for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): - - if row['gene'] in family_to_write: - seq_id_to_seqs[row['seqid']].append(row['gene'].decode()) - match_count += 1 - - assert match_count == len(family_to_write), f"Number of sequences found ({match_count}) does not match the number of expected family {len(family_to_write)}." + seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write) - with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: - seq_table = h5f.root.annotations.sequences - for row in read_chunks(table=seq_table, chunk=20000): - if row["seqid"] in seq_id_to_seqs: - - for seq_name in seq_id_to_seqs[row["seqid"]]: - - file_obj.write(f">{seq_name}\n") - file_obj.write(row["dna"].decode() + "\n") + write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " f"of the gene families : '{outpath}{'.gz' if compress else ''}") @@ -451,6 +490,48 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa f"'{outpath}{'.gz' if compress else ''}'") +def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str, + compress: bool = False, disable_bar=False): + """ + Write representative nucleotide sequences of gene families + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param gene_families: Selected partition of gene families + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ + + outpath = output / f"{gene_filter}_genes.fna" + get_all_genes = False + + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: + + if gene_filter in ['persistent', 'shell', 'cloud'] or gene_filter.startswith("module_"): + + if gene_filter.startswith("module_"): + families = read_module_families_from_pangenome_file(h5f, module_name=gene_filter) + + else: + families = get_families_matching_partition(h5f, gene_filter) + + genes_to_write = get_genes_from_families(h5f, families) + + elif gene_filter == "rgp": + genes_to_write = read_rgp__genes_from_pangenome_file(h5f) + + elif gene_filter == "all": + genes_to_write = [] + get_all_genes = True + + seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar) + + write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) + + logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " + f"of the gene families : '{outpath}{'.gz' if compress else ''}") + def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index eaca363b..108015c8 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -19,7 +19,7 @@ from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, restricted_float, detect_filetype, run_subprocess) -from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file +from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_genes_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] @@ -596,6 +596,16 @@ def launch(args: argparse.Namespace): args.prot_families = None + + if args.genes is not None and (args.genes in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.genes)): + + logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") + write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes, + output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) + + args.prot_families = None + + if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]): write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, regions=args.regions, genes=args.genes, proteins=args.proteins, From 3a6840c9f4fa3f034b1a96bbc83ab0061e7c3846 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Wed, 18 Sep 2024 17:30:45 +0200 Subject: [PATCH 07/14] reset genes var if it has been handle by writting fct --- ppanggolin/formats/writeSequences.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 108015c8..fe1ed13e 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -603,7 +603,7 @@ def launch(args: argparse.Namespace): write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes, output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) - args.prot_families = None + args.genes = None if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]): From 825e1e76a5c2da03df1bd8d0faff69bff6d71543 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Sep 2024 14:22:25 +0200 Subject: [PATCH 08/14] add softcore and core filter in gene_families output --- ppanggolin/formats/readBinaries.py | 49 ++++++++++++- ppanggolin/formats/writeSequences.py | 101 +++++++++++++++------------ 2 files changed, 103 insertions(+), 47 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index bb1a868b..82bb3791 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -394,7 +394,8 @@ def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: seq_table = h5f.root.annotations.sequences - #TODO add tqm + #TODO add tqdm + for row in read_chunks(table=seq_table, chunk=20000): if row["seqid"] in seq_id_to_genes: @@ -404,8 +405,37 @@ def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) file_obj.write(row["dna"].decode() + "\n") -def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, - compress: bool = False, disable_bar=False): +def get_gene_to_genome(h5f): + """ + """ + + contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } + + gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) } + + return gene_to_genome + + +def get_family_to_genome_count(h5f) -> Dict[bytes, int]: + """ + """ + + contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } + + gene_to_genome = {row["ID"]:contig_id_to_genome[row['contig'] ] for row in read_chunks( h5f.root.annotations.genes, chunk=20000) } + + family_to_genomes = defaultdict(set) + for row in read_chunks( h5f.root.geneFamilies, chunk=20000): + family_to_genomes[row['geneFam']].add(gene_to_genome[row["gene"]]) + + family_to_genome_count = {fam: len(genomes) for fam, genomes in family_to_genomes.items()} + + return family_to_genome_count + + + +def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float = 0.95, + compress: bool = False, disable_bar=False): """ Write representative nucleotide sequences of gene families @@ -432,6 +462,19 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa rgp_genes = read_rgp__genes_from_pangenome_file(h5f) family_to_write = get_families_from_genes(h5f, rgp_genes) + elif family_filter in ["softcore", "core"]: + family_to_genome_count = get_family_to_genome_count(h5f) + pangenome_info = read_info(h5f) + genome_count = pangenome_info["Content"]["Genomes"] + + if family_filter == "core": + family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if genome_count == fam_genome_count} + + elif family_filter == "softcore": + genome_count_threshold = genome_count * soft_core + family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold} + + if len(family_to_write) == 0: logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") return diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index fe1ed13e..947b17f1 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -548,21 +548,62 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, :param disable_bar: Disable progress bar """ - check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar) - if prot_families is not None: - write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) - if gene_families is not None: - write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) - if genes is not None: - write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) - if proteins is not None: - write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, - **translate_kwgs) - if regions is not None: - write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) + if gene_families is not None: #and (gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(gene_families)): + + logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " + "of the gene families by reading the pangenome file directly.") + + write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter= gene_families, + output=output, compress=compress, disable_bar=disable_bar) + gene_families = None + + if prot_families is not None and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(prot_families)): + + logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " + "of the gene families by reading the pangenome file directly.") + write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families, + output=output, compress=compress, disable_bar=disable_bar) + + prot_families = None + if genes is not None and (genes in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(genes)): + + logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") + write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes, + output=output, compress=compress, disable_bar=disable_bar) + + genes = None + + if proteins is not None and (proteins in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(proteins)): + pass + + # logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") + # write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, + # **translate_kwgs) + + # write_genes_from_pangenome_file(pangenome_filename=pangenome, gene_filter = genes, + # output=output, compress=compress, disable_bar=disable_bar) + + # proteins = None + + if any(x for x in [regions, genes, proteins, gene_families, prot_families]): + check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar) + + if prot_families is not None: + write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) + if gene_families is not None: + write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) + if genes is not None: + write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) + if proteins is not None: + write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, + **translate_kwgs) + if regions is not None: + write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) + + def launch(args: argparse.Namespace): """ Command launcher @@ -578,39 +619,11 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) - if args.gene_families is not None and (args.gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.gene_families)): - - logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " - "of the gene families by reading the pangenome file directly.") - - write_fasta_gene_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter= args.gene_families, - output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) - args.gene_families = None - - if args.prot_families is not None and (args.prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.prot_families)): - - logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " - "of the gene families by reading the pangenome file directly.") - write_fasta_prot_fam_from_pangenome_file(pangenome_filename=args.pangenome, family_filter = args.prot_families, - output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) - - args.prot_families = None - - - if args.genes is not None and (args.genes in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(args.genes)): - - logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") - write_genes_from_pangenome_file(pangenome_filename=args.pangenome, gene_filter = args.genes, - output=args.output, compress=args.compress, disable_bar=args.disable_prog_bar) - - args.genes = None - - if any(x for x in [args.regions, args.genes, args.proteins, args.gene_families, args.prot_families]): - write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, - regions=args.regions, genes=args.genes, proteins=args.proteins, - gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, - disable_bar=args.disable_prog_bar, **translate_kwgs) + write_sequence_files(pangenome, args.output, fasta=args.fasta, anno=args.anno, soft_core=args.soft_core, + regions=args.regions, genes=args.genes, proteins=args.proteins, + gene_families=args.gene_families, prot_families=args.prot_families, compress=args.compress, + disable_bar=args.disable_prog_bar, **translate_kwgs) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: From 25c61bc88cf6a2d641040b7bf1c6ed1039caa7ac Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Sep 2024 14:41:58 +0200 Subject: [PATCH 09/14] add softcore and core filter in prot_families output --- ppanggolin/formats/readBinaries.py | 32 ++++++++++++++++++---------- ppanggolin/formats/writeSequences.py | 4 ++-- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 82bb3791..90ce9e90 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -432,7 +432,16 @@ def get_family_to_genome_count(h5f) -> Dict[bytes, int]: return family_to_genome_count +def get_soft_core_families(h5f, soft_core:float): + """ + """ + family_to_genome_count = get_family_to_genome_count(h5f) + pangenome_info = read_info(h5f) + genome_count = pangenome_info["Content"]["Genomes"] + genome_count_threshold = genome_count * soft_core + soft_core_families = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold} + return soft_core_families def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float = 0.95, compress: bool = False, disable_bar=False): @@ -463,17 +472,10 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa family_to_write = get_families_from_genes(h5f, rgp_genes) elif family_filter in ["softcore", "core"]: - family_to_genome_count = get_family_to_genome_count(h5f) - pangenome_info = read_info(h5f) - genome_count = pangenome_info["Content"]["Genomes"] - if family_filter == "core": - family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if genome_count == fam_genome_count} - - elif family_filter == "softcore": - genome_count_threshold = genome_count * soft_core - family_to_write = {family for family, fam_genome_count in family_to_genome_count.items() if fam_genome_count >= genome_count_threshold} - + soft_core = 1.0 + + family_to_write = get_soft_core_families(h5f, soft_core) if len(family_to_write) == 0: logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") @@ -486,7 +488,8 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " f"of the gene families : '{outpath}{'.gz' if compress else ''}") -def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, + +def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Path, family_filter: str, soft_core:float=0.95, compress: bool = False, disable_bar=False): """ Write representative amino acid sequences of gene families. @@ -517,8 +520,15 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa elif family_filter.startswith("module_"): family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) + elif family_filter in ["softcore", "core"]: + + soft_core_to_apply = 1.0 if family_filter == "core" else soft_core + + family_to_write = get_soft_core_families(h5f, soft_core=soft_core_to_apply) + gene_fam_info_table = h5f.root.geneFamiliesInfo + # TODO improve tqdm with total being the number of family to write for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter)) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 947b17f1..2ea531f6 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -554,11 +554,11 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " "of the gene families by reading the pangenome file directly.") - write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter= gene_families, + write_fasta_gene_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter= gene_families, soft_core=soft_core, output=output, compress=compress, disable_bar=disable_bar) gene_families = None - if prot_families is not None and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(prot_families)): + if prot_families is not None: #and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(prot_families)): logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " "of the gene families by reading the pangenome file directly.") From baa9a0f90d9ed36ae86daf60608ca5233080b576 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Sep 2024 16:08:14 +0200 Subject: [PATCH 10/14] handle proteins in faster way --- ppanggolin/formats/readBinaries.py | 13 ++- ppanggolin/formats/writeSequences.py | 134 ++++++--------------------- 2 files changed, 37 insertions(+), 110 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 90ce9e90..4aec2c7a 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -384,11 +384,11 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F seq_id_to_genes[row['seqid']].append(row['gene'].decode()) match_count += 1 - assert match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}." + assert get_all_genes or match_count == len(genes), f"Number of sequences found ({match_count}) does not match the number of expected genes {len(genes)}." return seq_id_to_genes -def write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes): +def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath, compress, seq_id_to_genes): """ """ @@ -543,7 +543,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa f"'{outpath}{'.gz' if compress else ''}'") -def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str, +def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_filter: str, soft_core:float=0.95, compress: bool = False, disable_bar=False): """ Write representative nucleotide sequences of gene families @@ -561,11 +561,16 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: - if gene_filter in ['persistent', 'shell', 'cloud'] or gene_filter.startswith("module_"): + if gene_filter in ['persistent', 'shell', 'cloud', "softcore", "core"] or gene_filter.startswith("module_"): if gene_filter.startswith("module_"): families = read_module_families_from_pangenome_file(h5f, module_name=gene_filter) + + elif gene_filter in ["softcore", "core"]: + soft_core_to_apply = 1.0 if gene_filter == "core" else soft_core + families = get_soft_core_families(h5f, soft_core=soft_core_to_apply) + else: families = get_families_matching_partition(h5f, gene_filter) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 2ea531f6..83562ae6 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -39,81 +39,6 @@ def check_write_sequences_args(args: argparse.Namespace) -> None: "(You need to provide the same file used to compute the pangenome)") -def check_pangenome_to_write_sequences(pangenome: Pangenome, regions: str = None, genes: str = None, - genes_prot: str = None, gene_families: str = None, - prot_families: str = None, disable_bar: bool = False): - """Check and load required information from pangenome - - :param pangenome: Empty pangenome - :param regions: Check and load the RGP - :param genes: Check and load the genes - :param genes_prot: Write amino acid CDS sequences. - :param gene_families: Check and load the gene families to write representative nucleotide sequences. - :param prot_families: Check and load the gene families to write representative amino acid sequences. - :param disable_bar: Disable progress bar - - :raises AssertionError: if not any arguments to write any file is given - :raises ValueError: if the given filter is not recognized - :raises AttributeError: if the pangenome does not contain the required information - """ - if not any(x for x in [regions, genes, genes_prot, prot_families, gene_families]): - raise AssertionError("You did not indicate what file you wanted to write.") - - need_annotations = False - need_families = False - need_graph = False - need_partitions = False - need_spots = False - need_regions = False - need_modules = False - - if prot_families is not None: - need_families = True - if prot_families in ["core", "softcore"]: - need_annotations = True - - if any(x is not None for x in [regions, genes, genes_prot, gene_families]): - need_annotations = True - need_families = True - if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)): - need_annotations = True - need_regions = True - if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)): - need_partitions = True - for x in (genes, gene_families, prot_families): - if x is not None and 'module_' in x: - need_modules = True - - if not (need_annotations or need_families or need_graph or need_partitions or - need_spots or need_regions or need_modules): - # then nothing is needed, then something is wrong. - # find which filter was provided - provided_filter = '' - if genes is not None: - provided_filter = genes - if genes_prot is not None: - provided_filter = genes_prot - if gene_families is not None: - provided_filter = gene_families - if prot_families is not None: - provided_filter = prot_families - if regions is not None: - provided_filter = regions - raise ValueError(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " - f"{poss_values_log}") - - if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families): - raise AttributeError("The provided pangenome has no gene sequences. " - "This is not compatible with any of the following options : --genes, --gene_families") - if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families: - raise AttributeError("The provided pangenome has no gene families. This is not compatible with any of " - "the following options : --prot_families, --gene_families") - - check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, - need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, - need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) - - def write_gene_sequences_from_annotations(genes_to_write: Iterable[Gene], output: Path, add: str = '', compress: bool = False, disable_bar: bool = False): """ @@ -222,7 +147,7 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co return outpath -def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: str, soft_core: float = 0.95, +def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_filter: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, cpu: int = 1, code: int = 11, disable_bar: bool = False): """ Write all amino acid sequences from given genes in pangenome @@ -241,12 +166,17 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s with create_tmpdir(tmp if tmp is not None else Path(tempfile.gettempdir()), basename="translateGenes", keep_tmp=keep_tmp) as tmpdir: - write_gene_sequences(pangenome, tmpdir, proteins, soft_core, compress, disable_bar) + write_genes_from_pangenome_file(pangenome_filename=pangenome_filename, gene_filter = gene_filter, + output=tmpdir, compress=compress, disable_bar=disable_bar) - pangenome_sequences = tmpdir / f"{proteins}_genes.fna{'.gz' if compress else ''}" - translate_db = translate_genes(sequences=pangenome_sequences, tmpdir=tmpdir, + genes_sequence_tmp_file = tmpdir / f"{gene_filter}_genes.fna{'.gz' if compress else ''}" + translate_db = translate_genes(sequences=genes_sequence_tmp_file, tmpdir=tmpdir, cpu=cpu, is_single_line_fasta=True, code=code) - outpath = output / f"{proteins}_protein_genes.fna" + + outpath = output / f"{gene_filter}_protein_genes.faa" + + logging.getLogger("PPanGGOLiN").info("Translating nucleotide gene sequence in protein sequences with mmseqs convert2fasta") + cmd = list(map(str, ["mmseqs", "convert2fasta", translate_db, outpath])) run_subprocess(cmd, msg="MMSeqs convert2fasta failed with the following error:\n") if compress: @@ -254,9 +184,9 @@ def write_gene_protein_sequences(pangenome: Pangenome, output: Path, proteins: s with open(outpath) as sequence_file: shutil.copyfileobj(sequence_file, compress_file) outpath.unlink() - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}.gz'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}.gz'") else: - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}'") + logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}'") def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: @@ -549,7 +479,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, """ - if gene_families is not None: #and (gene_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(gene_families)): + if gene_families is not None: logging.getLogger("PPanGGOLiN").info("Writing the representative nucleotide sequences " "of the gene families by reading the pangenome file directly.") @@ -558,48 +488,40 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, output=output, compress=compress, disable_bar=disable_bar) gene_families = None - if prot_families is not None: #and (prot_families in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(prot_families)): + if prot_families is not None: logging.getLogger("PPanGGOLiN").info("Writing the representative protein sequences " "of the gene families by reading the pangenome file directly.") - write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families, + write_fasta_prot_fam_from_pangenome_file(pangenome_filename=pangenome.file, family_filter = prot_families, soft_core=soft_core, output=output, compress=compress, disable_bar=disable_bar) prot_families = None - if genes is not None and (genes in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(genes)): + if genes is not None: logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") - write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes, + write_genes_from_pangenome_file(pangenome_filename=pangenome.file, gene_filter = genes, soft_core=soft_core, output=output, compress=compress, disable_bar=disable_bar) genes = None - if proteins is not None and (proteins in ['all', 'persistent', 'shell', 'cloud', "rgp"] or module_regex.match(proteins)): - pass + if proteins is not None: + - # logging.getLogger("PPanGGOLiN").info("Writing gene nucleotide sequences by reading the pangenome file directly.") - # write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, - # **translate_kwgs) + logging.getLogger("PPanGGOLiN").info("Writing gene protein sequences by reading the pangenome file directly.") + write_gene_protein_sequences(pangenome_filename=pangenome.file, output=output, gene_filter=proteins, soft_core=soft_core, compress=compress, disable_bar=disable_bar, + **translate_kwgs) + + proteins = None - # write_genes_from_pangenome_file(pangenome_filename=pangenome, gene_filter = genes, - # output=output, compress=compress, disable_bar=disable_bar) + if regions is not None: + + # load pangenome when writing region sequence + check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_rgp=True, disable_bar=disable_bar) - # proteins = None - if any(x for x in [regions, genes, proteins, gene_families, prot_families]): - check_pangenome_to_write_sequences(pangenome, regions, genes, proteins, gene_families, prot_families, disable_bar) - if prot_families is not None: - write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) - if gene_families is not None: - write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) - if genes is not None: - write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) - if proteins is not None: - write_gene_protein_sequences(pangenome, output, proteins, soft_core, compress, disable_bar=disable_bar, - **translate_kwgs) if regions is not None: write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) From cfc9073aa4d797d068487fcd3e0e9d536d55d321 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Sep 2024 17:04:10 +0200 Subject: [PATCH 11/14] clean region seq writting --- ppanggolin/formats/writeSequences.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 83562ae6..90234462 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -516,14 +516,10 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None, proteins = None if regions is not None: - # load pangenome when writing region sequence check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_rgp=True, disable_bar=disable_bar) - - - if regions is not None: - write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) + write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) def launch(args: argparse.Namespace): From bd592d091a4e19468f377b4081fcc870083fbaa4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 19 Sep 2024 17:40:10 +0200 Subject: [PATCH 12/14] add docstring and clean fct not used --- ppanggolin/formats/readBinaries.py | 118 +++++++++++++++++----- ppanggolin/formats/writeSequences.py | 145 --------------------------- 2 files changed, 93 insertions(+), 170 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 4aec2c7a..5f1b79e9 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -311,17 +311,27 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa f"{output.absolute()}{'.gz' if compress else ''}") -def read_rgp__genes_from_pangenome_file(h5f: tables.File) -> List[str]: +def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> List[bytes]: """ + Retrieves a list of RGP genes from the pangenome file. + + :param h5f: The open HDF5 pangenome file containing RGP gene data. + :return: A list of gene names (as bytes) from the RGP. """ rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)] return rgp_genes -def get_families_from_genes(h5f: tables.File , genes:Set[str]): +def get_families_from_genes(h5f: tables.File, genes: Set[bytes]) -> Set[bytes]: """ + Retrieves gene families associated with a specified set of genes from the pangenome file. + + :param h5f: The open HDF5 pangenome file containing gene family data. + :param genes: A set of gene names (as bytes) for which to retrieve the associated families. + :return: A set of gene family names (as bytes) associated with the specified genes. """ + families = set() for row in read_chunks(h5f.root.geneFamilies, chunk=20000): if row['gene'] in genes: @@ -329,10 +339,17 @@ def get_families_from_genes(h5f: tables.File , genes:Set[str]): return families -def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str): +def read_module_families_from_pangenome_file(h5f: tables.File, module_name: str) -> Set[bytes]: """ - + Retrieves gene families associated with a specified module from the pangenome file. + + + :param h5f: The open HDF5 pangenome file containing module data. + :param module_name: The name of the module (as a string). The module ID is extracted from + the name by removing the "module_" prefix. + :return: A set of gene family names (as bytes) associated with the specified module. """ + family_to_write = set() module_id = int(module_name[len("module_"):]) module_table = h5f.root.modules @@ -343,9 +360,16 @@ def read_module_families_from_pangenome_file(h5f: tables.File, module_name:str): return family_to_write -def get_families_matching_partition(h5f: tables.File, partition:str): +def get_families_matching_partition(h5f: tables.File, partition: str) -> Set[bytes]: """ + Retrieves gene families that match the specified partition. + + :param h5f: The open HDF5 pangenome file containing gene family information. + :param partition: The partition name (as a string). If "all", all gene families are included. + Otherwise, it filters by the first letter of the partition. + :return: A set of gene family names (as bytes) that match the partition criteria. """ + family_to_write = set() gene_fam_info_table = h5f.root.geneFamiliesInfo @@ -358,9 +382,18 @@ def get_families_matching_partition(h5f: tables.File, partition:str): return family_to_write -def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]): +def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[bytes]: """ + Retrieves a set of genes that belong to the specified families. + + This function reads the gene family data from an HDF5 pangenome file and returns + a set of genes that are part of the given list of gene families. + + :param h5f: The open HDF5 pangenome file containing gene family data. + :param families: A list of gene families (as bytes) to filter genes by. + :return: A set of genes (as bytes) that belong to the specified families. """ + matching_genes = set() gene_fam_table = h5f.root.geneFamilies @@ -371,14 +404,22 @@ def get_genes_from_families(h5f: tables.File, families:List[numpy.byte]): return matching_genes -def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False): +def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False) -> Dict[int, List[str]]: """ + Creates a mapping of sequence IDs to gene names. + + :param h5f: The open HDF5 pangenome file containing gene sequence data. + :param genes: A list of gene names to include in the mapping (if `get_all_genes` is False). + :param get_all_genes: Boolean flag to indicate if all genes should be included in the mapping. + If set to True, all genes will be added regardless of the `genes` parameter. + :param disable_bar: Boolean flag to disable the progress bar if set to True. + :return: A dictionary mapping sequence IDs (integers) to lists of gene names (strings). """ seq_id_to_genes = defaultdict(list) gene_seq_table = h5f.root.annotations.geneSequences match_count = 0 - for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="family", disable=disable_bar): + for row in tqdm(read_chunks(gene_seq_table, chunk=20000), total=gene_seq_table.nrows, unit="gene", disable=disable_bar): if get_all_genes or row['gene'] in genes: seq_id_to_genes[row['seqid']].append(row['gene'].decode()) @@ -388,25 +429,42 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F return seq_id_to_genes -def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath, compress, seq_id_to_genes): +def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compress: bool, seq_id_to_genes: Dict[int, List[str]], disable_bar:bool): """ + Writes gene sequences from the pangenome file to an output file. + + Only sequences whose IDs match the ones in `seq_id_to_genes` will be written. + + :param h5f: The open HDF5 pangenome file containing sequence data. + :param outpath: The path to the output file where sequences will be written. + :param compress: Boolean flag to indicate whether output should be compressed. + :param seq_id_to_genes: A dictionary mapping sequence IDs to lists of gene names. + :param disable_bar: Boolean flag to disable the progress bar if set to True. """ with write_compressed_or_not(file_path=outpath, compress=compress) as file_obj: + seq_table = h5f.root.annotations.sequences - #TODO add tqdm - for row in read_chunks(table=seq_table, chunk=20000): - if row["seqid"] in seq_id_to_genes: + with tqdm(total=len(seq_id_to_genes), unit="sequence", disable=disable_bar) as pbar: - for seq_name in seq_id_to_genes[row["seqid"]]: - - file_obj.write(f">{seq_name}\n") - file_obj.write(row["dna"].decode() + "\n") + for row in read_chunks(table=seq_table, chunk=20000): + if row["seqid"] in seq_id_to_genes: + + for seq_name in seq_id_to_genes[row["seqid"]]: + file_obj.write(f">{seq_name}\n") + file_obj.write(row["dna"].decode() + "\n") + + + pbar.update(1) -def get_gene_to_genome(h5f): +def get_gene_to_genome(h5f: tables.File) -> Dict[bytes, bytes]: """ + Generates a mapping between gene IDs and their corresponding genome. + + :param h5f: The open HDF5 pangenome file containing contig and gene annotations. + :return: A dictionary mapping gene IDs to genome names. """ contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } @@ -416,8 +474,12 @@ def get_gene_to_genome(h5f): return gene_to_genome -def get_family_to_genome_count(h5f) -> Dict[bytes, int]: +def get_family_to_genome_count(h5f: tables.File) -> Dict[bytes, int]: """ + Computes the number of unique genomes associated with each gene family. + + :param h5f: The open HDF5 pangenome file containing contig, gene, and gene family data. + :return: A dictionary mapping gene family names (as bytes) to the count of unique genomes. """ contig_id_to_genome = {row["ID"]:row['genome'] for row in read_chunks( h5f.root.annotations.contigs, chunk=20000) } @@ -432,8 +494,14 @@ def get_family_to_genome_count(h5f) -> Dict[bytes, int]: return family_to_genome_count -def get_soft_core_families(h5f, soft_core:float): +def get_soft_core_families(h5f: tables.File, soft_core: float) -> Set[bytes]: """ + Identifies gene families that are present in at least a specified proportion of genomes. + + :param h5f: The open HDF5 pangenome file containing gene family and genome data. + :param soft_core: The proportion of genomes (between 0 and 1) that a gene family must be present in + to be considered a soft core family. + :return: A set of gene family names (as bytes) that are classified as soft core. """ family_to_genome_count = get_family_to_genome_count(h5f) pangenome_info = read_info(h5f) @@ -468,7 +536,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa family_to_write = read_module_families_from_pangenome_file(h5f, module_name=family_filter) elif family_filter == "rgp": - rgp_genes = read_rgp__genes_from_pangenome_file(h5f) + rgp_genes = read_rgp_genes_from_pangenome_file(h5f) family_to_write = get_families_from_genes(h5f, rgp_genes) elif family_filter in ["softcore", "core"]: @@ -483,7 +551,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write) - write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) + write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " f"of the gene families : '{outpath}{'.gz' if compress else ''}") @@ -514,7 +582,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa parition_first_letter = family_filter[0].upper() elif family_filter == "rgp": - rgp_genes = read_rgp__genes_from_pangenome_file(h5f) + rgp_genes = read_rgp_genes_from_pangenome_file(h5f) family_to_write = get_families_from_genes(h5f, rgp_genes) elif family_filter.startswith("module_"): @@ -528,7 +596,7 @@ def write_fasta_prot_fam_from_pangenome_file(pangenome_filename: str, output: Pa gene_fam_info_table = h5f.root.geneFamiliesInfo - # TODO improve tqdm with total being the number of family to write + for row in tqdm(read_chunks(gene_fam_info_table, chunk=20000), total=gene_fam_info_table.nrows, unit="family", disable=disable_bar): partition_match = partition_filter and (family_filter == "all" or row['partition'].decode().startswith(parition_first_letter)) @@ -577,7 +645,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ genes_to_write = get_genes_from_families(h5f, families) elif gene_filter == "rgp": - genes_to_write = read_rgp__genes_from_pangenome_file(h5f) + genes_to_write = read_rgp_genes_from_pangenome_file(h5f) elif gene_filter == "all": genes_to_write = [] @@ -585,7 +653,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar) - write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes) + write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar) logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " f"of the gene families : '{outpath}{'.gz' if compress else ''}") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 90234462..49ec4430 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -104,48 +104,6 @@ def translate_genes(sequences: Union[Path, Iterable[Path]], tmpdir: Path, cpu: i return seqdb -def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_core: float = 0.95, - compress: bool = False, disable_bar: bool = False) -> Path: - """ - Write all nucleotide CDS sequences - - :param pangenome: Pangenome object with gene families sequences - :param output: Path to output directory - :param genes: Selected partition of gene - :param soft_core: Soft core threshold to use - :param compress: Compress the file in .gz - :param disable_bar: Disable progress bar - - :raises AttributeError: If the pangenome does not contain gene sequences - """ - - logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...") - outpath = output / f"{genes}_genes.fna" - - genefams = set() - genes_to_write = [] - if genes == "rgp": - logging.getLogger("PPanGGOLiN").info("Writing the gene nucleotide sequences in RGPs...") - for region in pangenome.regions: - genes_to_write.extend(region.genes) - else: - genefams = select_families(pangenome, genes, "gene nucleotide sequences", soft_core) - - for fam in genefams: - genes_to_write.extend(fam.genes) - - logging.getLogger("PPanGGOLiN").info(f"There are {len(genes_to_write)} genes to write") - if pangenome.status["geneSequences"] in ["inFile"]: - write_gene_sequences_from_pangenome_file(pangenome.file, outpath, set([gene.ID for gene in genes_to_write]), - compress=compress, disable_bar=disable_bar) - elif pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - write_gene_sequences_from_annotations(genes_to_write, outpath, compress=compress, disable_bar=disable_bar) - else: - # this should never happen if the pangenome has been properly checked before launching this function. - raise AttributeError("The pangenome does not include gene sequences") - logging.getLogger("PPanGGOLiN").info(f"Done writing the gene sequences : '{outpath}{'.gz' if compress else ''}'") - return outpath - def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_filter: str, soft_core: float = 0.95, compress: bool = False, keep_tmp: bool = False, tmp: Path = None, @@ -189,109 +147,6 @@ def write_gene_protein_sequences(pangenome_filename: str, output: Path, gene_fil logging.getLogger("PPanGGOLiN").info(f"Done writing the gene protein sequences : '{outpath}'") -def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> Set[GeneFamily]: - """ - function used to filter down families to the given partition - - :param pangenome: Pangenome object - :param partition: Selected partition - :param type_name: Which type of sequence we want. Gene families, protein, gene - :param soft_core: Soft core threshold to use - - :return: Selected gene families - """ - genefams = set() - if partition == 'all': - logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...") - genefams = pangenome.gene_families - - elif partition in ['persistent', 'shell', 'cloud']: - logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...") - for fam in pangenome.gene_families: - if fam.named_partition == partition: - genefams.add(fam) - - elif partition == "rgp": - logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...") - for region in pangenome.regions: - genefams |= set(region.families) - - elif partition == "softcore": - logging.getLogger("PPanGGOLiN").info( - f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes") - threshold = pangenome.number_of_organisms * soft_core - for fam in pangenome.gene_families: - if fam.number_of_organisms >= threshold: - genefams.add(fam) - - elif partition == "core": - logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} " - "gene families...") - for fam in pangenome.gene_families: - if fam.number_of_organisms == pangenome.number_of_organisms: - genefams.add(fam) - - elif "module_" in partition: - logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...") - mod_id = int(partition.replace("module_", "")) - for mod in pangenome.modules: - # could be way more efficient with a dict structure instead of a set - if mod.ID == mod_id: - genefams |= set(mod.families) - break - return genefams - - -def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str, soft_core: float = 0.95, - compress: bool = False, disable_bar=False): - """ - Write representative nucleotide sequences of gene families - - :param pangenome: Pangenome object with gene families sequences - :param output: Path to output directory - :param gene_families: Selected partition of gene families - :param soft_core: Soft core threshold to use - :param compress: Compress the file in .gz - :param disable_bar: Disable progress bar - """ - - outpath = output / f"{gene_families}_nucleotide_families.fasta" - - genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families", - soft_core) - - write_gene_sequences_from_pangenome_file(pangenome.file, outpath, [fam.name for fam in genefams], - compress=compress, disable_bar=disable_bar) - - logging.getLogger("PPanGGOLiN").info("Done writing the representative nucleotide sequences " - f"of the gene families : '{outpath}{'.gz' if compress else ''}") - - -def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str, soft_core: float = 0.95, - compress: bool = False, disable_bar: bool = False): - """ - Write representative amino acid sequences of gene families. - - :param pangenome: Pangenome object with gene families sequences - :param output: Path to output directory - :param prot_families: Selected partition of protein families - :param soft_core: Soft core threshold to use - :param compress: Compress the file in .gz - :param disable_bar: Disable progress bar - """ - - outpath = output / f"{prot_families}_protein_families.faa" - - genefams = select_families(pangenome, prot_families, "representative amino acid sequences of the gene families", - soft_core) - - with write_compressed_or_not(outpath, compress) as fasta: - for fam in tqdm(genefams, unit="prot families", disable=disable_bar): - fasta.write('>' + fam.name + "\n") - fasta.write(fam.sequence + "\n") - logging.getLogger("PPanGGOLiN").info(f"Done writing the representative amino acid sequences of the gene families:" - f"'{outpath}{'.gz' if compress else ''}'") - def read_fasta_or_gff(file_path: Path) -> Dict[str, str]: """ From cd30d15c79491469fa78620713fd169a0e07f40e Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Sep 2024 11:33:09 +0200 Subject: [PATCH 13/14] clean import --- ppanggolin/formats/writeSequences.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 49ec4430..e69f3a5a 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -5,7 +5,7 @@ import logging import re from pathlib import Path -from typing import Dict, Set, Iterable, Union +from typing import Dict, Iterable, Union import tempfile import shutil @@ -14,12 +14,11 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.geneFamily import GeneFamily from ppanggolin.genome import Gene, Organism from ppanggolin.utils import (write_compressed_or_not, mk_outdir, create_tmpdir, read_compressed_or_not, restricted_float, detect_filetype, run_subprocess) -from ppanggolin.formats.readBinaries import check_pangenome_info, write_gene_sequences_from_pangenome_file, write_genes_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file +from ppanggolin.formats.readBinaries import check_pangenome_info, write_genes_from_pangenome_file, write_fasta_gene_fam_from_pangenome_file, write_fasta_prot_fam_from_pangenome_file module_regex = re.compile(r'^module_\d+') # \d == [0-9] poss_values = ['all', 'persistent', 'shell', 'cloud', 'rgp', 'softcore', 'core', module_regex] From bc18e15d565eacd0c9eb55556e3816aeadbe8c44 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 20 Sep 2024 11:33:34 +0200 Subject: [PATCH 14/14] use set instead of list to speed gene selection --- ppanggolin/formats/readBinaries.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 5f1b79e9..b9499fe8 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -1,13 +1,10 @@ #!/usr/bin/env python3 # default libraries -import dis import logging -from optparse import Option from pathlib import Path from typing import Dict, Any, Iterator, Set, List, Tuple, Optional from collections import defaultdict -import numpy # installed libraries from tqdm import tqdm @@ -311,14 +308,14 @@ def write_gene_sequences_from_pangenome_file(pangenome_filename: str, output: Pa f"{output.absolute()}{'.gz' if compress else ''}") -def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> List[bytes]: +def read_rgp_genes_from_pangenome_file(h5f: tables.File) -> Set[bytes]: """ Retrieves a list of RGP genes from the pangenome file. :param h5f: The open HDF5 pangenome file containing RGP gene data. :return: A list of gene names (as bytes) from the RGP. """ - rgp_genes = [row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)] + rgp_genes = {row["gene"] for row in read_chunks(h5f.root.RGP, chunk=20000)} return rgp_genes @@ -404,7 +401,7 @@ def get_genes_from_families(h5f: tables.File, families: List[bytes]) -> Set[byte return matching_genes -def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = False, disable_bar:bool = False) -> Dict[int, List[str]]: +def get_seqid_to_genes(h5f: tables.File, genes:Set[bytes], get_all_genes:bool = False, disable_bar:bool = False) -> Dict[int, List[str]]: """ Creates a mapping of sequence IDs to gene names. @@ -429,6 +426,7 @@ def get_seqid_to_genes(h5f: tables.File, genes:List[str], get_all_genes:bool = F return seq_id_to_genes + def write_genes_seq_from_pangenome_file(h5f: tables.File, outpath: Path, compress: bool, seq_id_to_genes: Dict[int, List[str]], disable_bar:bool): """ Writes gene sequences from the pangenome file to an output file. @@ -549,7 +547,7 @@ def write_fasta_gene_fam_from_pangenome_file(pangenome_filename: str, output: Pa logging.getLogger("PPanGGOLiN").warning(f"No families matching filter {family_filter}.") return - seq_id_to_genes = get_seqid_to_genes(h5f, family_to_write) + seq_id_to_genes = get_seqid_to_genes(h5f, set(family_to_write)) write_genes_seq_from_pangenome_file(h5f, outpath, compress, seq_id_to_genes, disable_bar=disable_bar) @@ -627,6 +625,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ outpath = output / f"{gene_filter}_genes.fna" get_all_genes = False + with tables.open_file(pangenome_filename, "r", driver_core_backing_store=0) as h5f: if gene_filter in ['persistent', 'shell', 'cloud', "softcore", "core"] or gene_filter.startswith("module_"): @@ -648,7 +647,7 @@ def write_genes_from_pangenome_file(pangenome_filename: str, output: Path, gene_ genes_to_write = read_rgp_genes_from_pangenome_file(h5f) elif gene_filter == "all": - genes_to_write = [] + genes_to_write = set() get_all_genes = True seq_id_to_genes = get_seqid_to_genes(h5f, genes_to_write, get_all_genes=get_all_genes, disable_bar=disable_bar)